Commit f9552033, authored by Benjamin Thomas Graham.

Initial commit (no parent commits).
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE_
#define TH_GENERIC_FILE_ "generic/CPU/MaxPooling.cpp"
#else
#include "MaxPooling.h"
extern "C" void scn_DR_(MaxPooling_updateOutput)(
THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *poolSize,
THLongTensor *poolStride, void **m, THTensor *input_features,
THTensor *output_features, long nFeaturesToDrop, void *rulesBuffer) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
uInt nPlanes = input_features->size[1] - nFeaturesToDrop;
auto _rules =
_m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
uInt nActive = _m.getNActive(outputSize);
THTensor_(resize2d)(output_features, nActive,
input_features->size[1] - nFeaturesToDrop);
THTensor_(zero)(output_features);
auto iF = THTensor_(data)(input_features) + nFeaturesToDrop;
auto oF = THTensor_(data)(output_features);
for (auto &r : _rules) {
uInt nHot = r.size() / 2;
MaxPooling_ForwardPass<real>(iF, oF, nPlanes, input_features->stride[0],
output_features->stride[0], &r[0], nHot);
}
}
extern "C" void scn_DR_(MaxPooling_updateGradInput)(
THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *poolSize,
THLongTensor *poolStride, void **m, THTensor *input_features,
THTensor *d_input_features, THTensor *output_features,
THTensor *d_output_features, long nFeaturesToDrop, void *rulesBuffer) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
uInt nPlanes = input_features->size[1] - nFeaturesToDrop;
auto _rules =
_m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
uInt nActive = _m.getNActive(outputSize);
THTensor_(resizeAs)(d_input_features, input_features);
THTensor_(zero)(d_input_features);
auto iF = THTensor_(data)(input_features);
auto oF = THTensor_(data)(output_features);
auto diF = THTensor_(data)(d_input_features);
auto doF = THTensor_(data)(d_output_features);
for (auto &r : _rules) {
uInt nHot = r.size() / 2;
MaxPooling_BackwardPass<real>(iF, diF, oF, doF, nPlanes,
input_features->stride[0],
output_features->stride[0], &r[0], nHot);
}
}
#endif
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CPU_MAXPOOLING_H
#define CPU_MAXPOOLING_H
#include "../SparseConvNet.h"
template <typename T>
void MaxPooling_ForwardPass(T *input_features, T *output_features,
uInt nPlanes, uInt input_stride,
uInt output_stride, uInt *rules, uInt nHot) {
for (uInt outSite = 0; outSite < nHot; outSite++) {
uInt i = rules[2 * outSite] * input_stride;
uInt o = rules[2 * outSite + 1] * output_stride;
for (uInt plane = 0; plane < nPlanes; plane++)
if (output_features[o + plane] < input_features[i + plane])
output_features[o + plane] = input_features[i + plane];
}
}
template <typename T>
void MaxPooling_BackwardPass(T *input_features, T *d_input_features,
T *output_features, T *d_output_features,
uInt nPlanes, uInt input_stride,
uInt output_stride, uInt *rules, uInt nHot) {
for (uInt outSite = 0; outSite < nHot; outSite++) {
uInt i = rules[2 * outSite] * input_stride;
uInt o = rules[2 * outSite + 1] * output_stride;
for (uInt plane = 0; plane < nPlanes; plane++)
if (output_features[o + plane] == input_features[i + plane])
d_input_features[i + plane] += d_output_features[o + plane];
}
}
#endif /* CPU_MAXPOOLING_H */
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/CPU/NetworkInNetwork.cpp"
#else
extern "C" double
scn_R_(NetworkInNetwork_updateOutput)(THTensor *input_features_,
THTensor *output_features_,
THTensor *weight_, THTensor *bias_) {
auto nActive = input_features_->size[0];
auto input_nPlanes = weight_->size[0];
auto output_nPlanes = weight_->size[1];
THTensor_(resize2d)(output_features_, nActive, output_nPlanes);
auto input_features = THTensor_(data)(input_features_);
auto output_features = THTensor_(data)(output_features_);
auto weight = THTensor_(data)(weight_);
if (bias_ != nullptr) {
// Set bias
auto bias = THTensor_(data)(bias_);
for (uInt row = 0; row < nActive; row++)
for (uInt column = 0; column < output_nPlanes; column++)
output_features[row * output_nPlanes + column] = bias[column];
// Do GEMM (note: gemm assumes column-major matrices)
// buffer is l*m (row-major)
// weight is r*m (row-major)
// output_features is l*r (row-major)
// buffer * T(weights) + bias -> output_features
THBlas_(gemm)('n', 'n',
output_nPlanes, // r
nActive, // l
input_nPlanes, // m
1, // alpha
weight, output_nPlanes, // r
input_features,
input_nPlanes, // m
1, // beta
output_features, output_nPlanes // r
);
} else {
THTensor_(zero)(output_features_);
THBlas_(gemm)('n', 'n',
output_nPlanes, // r
nActive, // l
input_nPlanes, // m
1, // alpha
weight, output_nPlanes, // r
input_features, input_nPlanes, // m
0, // beta
output_features, output_nPlanes // r
);
}
return nActive * input_nPlanes * output_nPlanes;
}
extern "C" void
scn_R_(NetworkInNetwork_updateGradInput)(THTensor *d_input_features_,
THTensor *d_output_features_,
THTensor *weight_) {
auto nActive = d_output_features_->size[0];
auto input_nPlanes = weight_->size[0];
auto output_nPlanes = weight_->size[1];
THTensor_(resize2d)(d_input_features_, nActive, input_nPlanes);
THTensor_(zero)(d_input_features_);
auto d_input_features = THTensor_(data)(d_input_features_);
auto d_output_features = THTensor_(data)(d_output_features_);
auto weight = THTensor_(data)(weight_);
// Do GEMM (note: gemm assumes column-major matrices)
// d_output_features is l*m (row-major)
// weights is m*r (row-major)
// d_buffer is l*r (row-major)
// d_output_features * weight -> d_buffer
THBlas_(gemm)('t', 'n',
input_nPlanes, // r
nActive, // l
output_nPlanes, // m
1, // alpha
weight, output_nPlanes, // m
d_output_features, output_nPlanes, // m
0, // beta
d_input_features, input_nPlanes // r
);
}
extern "C" void scn_R_(NetworkInNetwork_accGradParameters)(
THTensor *input_features_, THTensor *d_output_features_,
THTensor *d_weight_, THTensor *d_bias_) {
auto nActive = input_features_->size[0];
auto input_nPlanes = d_weight_->size[0];
auto output_nPlanes = d_weight_->size[1];
auto input_features = THTensor_(data)(input_features_);
auto d_output_features = THTensor_(data)(d_output_features_);
auto d_weight = THTensor_(data)(d_weight_);
auto d_bias = d_bias_ and THTensor_(data)(d_bias_);
// Do GEMM (note: gemm assumes column-major matrices)
// d_output_features is m*l (row-major)
// buffer is m*r (row-major)
// weights is l*r (row-major)
// T(d_output_features) * buffer -> d_weight
THBlas_(gemm)('n', 't',
output_nPlanes, // r
input_nPlanes, // l
nActive, // m
1, // alpha
d_output_features, output_nPlanes, // r
input_features, input_nPlanes, // l
1, // beta
d_weight, output_nPlanes // r
);
if (d_bias_) {
auto d_bias = THTensor_(data)(d_bias_);
for (uInt row = 0; row < nActive; row++)
for (uInt i = 0; i < output_nPlanes; i++)
d_bias[i] += d_output_features[row * output_nPlanes + i];
}
}
#endif
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CPU_NetworkInNetwork_H
#define CPU_NetworkInNetwork_H
#include "../SparseConvNet.h"
#include "Convolution.h"
// 1x1 convolution forward pass on a dense nActive x nPlanes feature matrix:
// output = input * weight (+ bias), weight row-major
// (input_nPlanes x output_nPlanes).
// FIX: the original body referenced `buffer` and `input_nPlanes *
// filterVolume` - names/parameters this function does not have (copy-paste
// from the spatial-convolution version), so it could not compile if
// instantiated. It now operates on its own input_features/input_nPlanes,
// mirroring the TH implementation of NetworkInNetwork_updateOutput.
template <typename T>
void NetworkInNetwork_ForwardPass(
    T *input_features, uInt input_nPlanes, T *output_features,
    uInt output_nPlanes, T *weight, T *bias, uInt output_nActive,
    void (*gemm)(char transa, char transb, long m, long n, long k, T alpha,
                 T *a, long lda, T *b, long ldb, T beta, T *c, long ldc)) {
  if (bias != nullptr) {
    // Broadcast the bias into every output row.
    for (uInt row = 0; row < output_nActive; row++)
      for (uInt column = 0; column < output_nPlanes; column++)
        output_features[row * output_nPlanes + column] = bias[column];
    // Do GEMM (note: gemm assumes column-major matrices)
    // input_features is l*m (row-major)
    // weight is m*r (row-major)
    // output_features is l*r (row-major)
    // input_features * weight + bias -> output_features
    (*gemm)('n', 'n',
            output_nPlanes,                 // r
            output_nActive,                 // l
            input_nPlanes,                  // m
            1,                              // alpha
            weight, output_nPlanes,         // r
            input_features, input_nPlanes,  // m
            1,                              // beta
            output_features, output_nPlanes // r
            );
  } else {
    // No bias: overwrite the output (beta == 0).
    (*gemm)('n', 'n',
            output_nPlanes,                 // r
            output_nActive,                 // l
            input_nPlanes,                  // m
            1,                              // alpha
            weight, output_nPlanes,         // r
            input_features, input_nPlanes,  // m
            0,                              // beta
            output_features, output_nPlanes // r
            );
  }
}
// Backward pass w.r.t. the input for a convolution expressed as a GEMM:
// d_output * weight^T -> d_buffer, then scatter d_buffer rows back into
// d_input_features using `rules` (uInt_MAX marks an empty rule slot).
// d_buffer must have size >= output_nActive * filterVolume * input_nPlanes.
template <typename T>
void NetworkInNetwork_BackwardPass(
    T *d_input_features, uInt input_nPlanes, T *d_output_features,
    uInt output_nPlanes, T *weight, uInt *rules, uInt filterVolume,
    uInt output_nActive, T *d_buffer,
    void (*gemm)(char transa, char transb, long m, long n, long k, T alpha,
                 T *a, long lda, T *b, long ldb, T beta, T *c, long ldc)) {
  // Do GEMM (note: gemm assumes column-major matrices)
  // d_output_features is l*m (row-major)
  // weights is m*r (row-major)
  // d_buffer is l*r (row-major)
  // d_output_features * weight -> d_buffer
  (*gemm)('t', 'n',
          input_nPlanes * filterVolume,          // r
          output_nActive,                        // l
          output_nPlanes,                        // m
          1,                                     // alpha
          weight, output_nPlanes,                // m
          d_output_features, output_nPlanes,     // m
          0,                                     // beta
          d_buffer, input_nPlanes * filterVolume // r
          );
  // Use rules and d_buffer to accumulate gradient information into d_input
  for (uInt row = 0; row < output_nActive * filterVolume; row++) {
    auto r = rules[row];
    if (r != uInt_MAX) // sentinel (2^32-1): no input site for this rule slot
      for (uInt i = 0; i < input_nPlanes; i++)
        d_input_features[r * input_nPlanes + i] +=
            d_buffer[row * input_nPlanes + i];
  }
}
// Gradients of the convolution parameters:
// d_weight += gathered_input^T * d_output, and (when d_bias is non-null)
// d_bias += column sums of d_output.
// `rules` maps each (outputSite, filterOffset) slot to an input row, with
// uInt_MAX marking an empty slot. buffer must have size >=
// output_nActive * filterVolume * input_nPlanes.
template <typename T>
void NetworkInNetwork_GradWeights(
    T *input_features, uInt input_nPlanes, T *d_output_features,
    uInt output_nPlanes, T *d_weight, T *d_bias, uInt *rules, uInt filterVolume,
    uInt output_nActive, T *buffer,
    void (*gemm)(char transa, char transb, long m, long n, long k, T alpha,
                 T *a, long lda, T *b, long ldb, T beta, T *c, long ldc)) {
  // d_weight
  // Use input_features and rules to fill buffer
  for (uInt row = 0; row < output_nActive * filterVolume; row++) {
    if (rules[row] == uInt_MAX) { // empty slot: contributes zeros
      std::memset(buffer + row * input_nPlanes, 0, sizeof(T) * input_nPlanes);
    } else {
      std::memcpy(buffer + row * input_nPlanes,
                  input_features + rules[row] * input_nPlanes,
                  sizeof(T) * input_nPlanes);
    }
  }
  // Do GEMM (note: gemm assumes column-major matrices)
  // d_output_features is m*l (row-major)
  // buffer is m*r (row-major)
  // weights is l*r (row-major)
  // T(d_output_features) * buffer -> d_weight
  (*gemm)('n', 't',
          output_nPlanes,                       // r
          input_nPlanes * filterVolume,         // l
          output_nActive,                       // m
          1,                                    // alpha
          d_output_features, output_nPlanes,    // r
          buffer, input_nPlanes * filterVolume, // l
          1,                                    // beta (accumulate)
          d_weight, output_nPlanes              // r
          );
  if (d_bias)
    for (uInt row = 0; row < output_nActive; row++)
      for (uInt i = 0; i < output_nPlanes; i++)
        d_bias[i] += d_output_features[row * output_nPlanes + i];
}
#endif /* CPU_NetworkInNetwork_H */
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE_
#define TH_GENERIC_FILE_ "generic/CPU/SparseToDense.cpp"
#else
#include "SparseToDense.h"
extern "C" void scn_DR_(SparseToDense_updateOutput)(THLongTensor *inputSize,
void **m,
THTensor *input_features,
THTensor *output_features,
void *rulesBuffer) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m) {
long sz[Dimension + 2];
sz[0] = _m.inputSGs->size();
sz[1] = input_features->size[1];
for (int i = 0; i < Dimension; i++) {
auto x = THLongTensor_data(inputSize)[i];
sz[i + 2] = x;
}
THTensor_(resizeNd)(output_features, Dimension + 2, sz, NULL);
THTensor_(zero)(output_features);
}
auto _rules = _m.getSparseToDenseRuleBook(inputSize, true);
auto spatialVolume = _rules.size();
uInt nPlanes = input_features->size[1];
auto iF = THTensor_(data)(input_features);
auto oF = THTensor_(data)(output_features);
for (auto &r : _rules) {
uInt nHot = r.size() / 2;
SparseToDense_ForwardPass<real>(iF, oF, nPlanes, spatialVolume, &r[0],
nHot);
oF += spatialVolume;
}
}
extern "C" void scn_DR_(SparseToDense_updateGradInput)(
THLongTensor *inputSize, void **m, THTensor *input_features,
THTensor *d_input_features, THTensor *d_output_features,
void *rulesBuffer) {
THTensor_(resizeAs)(d_input_features, input_features);
THTensor_(zero)(d_input_features);
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto _rules = _m.getSparseToDenseRuleBook(inputSize, true);
auto spatialVolume = _rules.size();
uInt nPlanes = d_input_features->size[1];
auto diF = THTensor_(data)(d_input_features);
auto doF = THTensor_(data)(d_output_features);
for (auto &r : _rules) {
uInt nHot = r.size() / 2;
SparseToDense_BackwardPass<real>(diF, doF, nPlanes, spatialVolume, &r[0],
nHot);
doF += spatialVolume;
}
}
#endif
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CPU_SPARSETODENSE_H
#define CPU_SPARSETODENSE_H
#include "../SparseConvNet.h"
// Copy each active site's feature row into the dense output tensor.
// rules holds nHot (inputRow, sampleIndex) pairs; the caller advances
// output_features by spatialVolume per spatial site.
// FIX: nHot was `int` but compared against unsigned `uInt outSite`
// (signed/unsigned mismatch), and every sibling CPU header takes
// `uInt nHot`; made consistent.
template <typename T>
void SparseToDense_ForwardPass(T *input_features, T *output_features,
                               uInt nPlanes, uInt spatialVolume,
                               uInt *rules, uInt nHot) {
  for (uInt outSite = 0; outSite < nHot; outSite++) {
    T *i = &input_features[rules[2 * outSite] * nPlanes];
    uInt sample = rules[2 * outSite + 1];
    for (uInt plane = 0; plane < nPlanes; plane++)
      output_features[(sample * nPlanes + plane) * spatialVolume] = i[plane];
  }
}
// Gather each active site's gradient row from the dense gradient tensor.
// rules holds nHot (inputRow, sampleIndex) pairs; the caller advances
// d_output_features by spatialVolume per spatial site.
// FIX: nHot was `int` but compared against unsigned `uInt outSite`
// (signed/unsigned mismatch), and every sibling CPU header takes
// `uInt nHot`; made consistent.
template <typename T>
void SparseToDense_BackwardPass(T *d_input_features, T *d_output_features,
                                uInt nPlanes, uInt spatialVolume,
                                uInt *rules, uInt nHot) {
  for (uInt outSite = 0; outSite < nHot; outSite++) {
    T *di = &d_input_features[rules[2 * outSite] * nPlanes];
    uInt sample = rules[2 * outSite + 1];
    for (uInt plane = 0; plane < nPlanes; plane++)
      di[plane] = d_output_features[(sample * nPlanes + plane) * spatialVolume];
  }
}
#endif /* CPU_SPARSETODENSE_H */
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
// Instantiate TH_GENERIC_FILE_ for Dimension = 1..10, each time for
// real = float and real = double (via THGenerateFloatTypes.h, which consumes
// and #undef's TH_GENERIC_FILE on every pass - hence the re-define before
// each include).
// FIX: removed the stray duplicate `#define TH_GENERIC_FILE TH_GENERIC_FILE_`
// that preceded the first `#define Dimension` - it was immediately repeated
// (identical redefinition) and served no purpose.
#ifndef TH_GENERIC_FILE_
#error "Define TH_GENERIC_FILE_ before including THGenerateDimFloatTypes.h"
#endif
#define Dimension 1
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateFloatTypes.h"
#undef Dimension
#define Dimension 2
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateFloatTypes.h"
#undef Dimension
#define Dimension 3
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateFloatTypes.h"
#undef Dimension
#define Dimension 4
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateFloatTypes.h"
#undef Dimension
#define Dimension 5
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateFloatTypes.h"
#undef Dimension
#define Dimension 6
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateFloatTypes.h"
#undef Dimension
#define Dimension 7
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateFloatTypes.h"
#undef Dimension
#define Dimension 8
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateFloatTypes.h"
#undef Dimension
#define Dimension 9
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateFloatTypes.h"
#undef Dimension
#define Dimension 10
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateFloatTypes.h"
#undef Dimension
#undef TH_GENERIC_FILE_
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
// Expand TH_GENERIC_FILE twice: once with real = float, once with
// real = double. TH_GENERIC_FILE is consumed (#undef'd) at the end, so the
// includer must re-define it before each use.
#ifndef TH_GENERIC_FILE
#error "You must define TH_GENERIC_FILE before including THGenerateFloatTypes.h"
#endif
// float pass
#define real float
#define accreal double
#define Real Float
#define TH_REAL_IS_FLOAT
#line 1 TH_GENERIC_FILE
#include TH_GENERIC_FILE
#undef accreal
#undef real
#undef Real
#undef TH_REAL_IS_FLOAT
// double pass
#define real double
#define accreal double
#define Real Double
#define TH_REAL_IS_DOUBLE
#line 1 TH_GENERIC_FILE
#include TH_GENERIC_FILE
#undef accreal
#undef real
#undef Real
#undef TH_REAL_IS_DOUBLE
#undef TH_GENERIC_FILE
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE_
#define TH_GENERIC_FILE_ "generic/GPU/ActivePooling.cu"
#else
#include "ActivePooling.h"
extern "C" void scn_DR_(ActivePooling_updateOutput)(
THLongTensor *inputSize, void **m, THCTensor *input_features,
THCTensor *output_features, THCITensor *rulesBuffer, bool average) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
uInt nPlanes = input_features->size[1];
auto _rules = _m.getActivePoolingRuleBook(inputSize);
uInt batchSize = _rules[1][0];
uInt maxActive = _rules[1][1];
THCTensor_(resize2d)(state, output_features, batchSize, nPlanes);
THCTensor_(zero)(state, output_features);
if (THCITensor_nElement(state, rulesBuffer) < 1 << 22)
THCITensor_resize1d(state, rulesBuffer, 1 << 22);
uInt *rb = (uInt *)THCITensor_data(state, rulesBuffer);
uInt rowBatchSize = std::min((uInt)32768, (1 << 22) / (maxActive + 1));
THAssert(rowBatchSize > 0);
auto iF = THCTensor_(data)(state, input_features);
auto oF = THCTensor_(data)(state, output_features);
for (uInt o = 0; o < batchSize; o += rowBatchSize) {
uInt batchSize_ = std::min(rowBatchSize, (uInt)(batchSize - o));
cudaMemcpy(rb, &_rules[0][o * (maxActive + 1)],
sizeof(uInt) * (maxActive + 1) * batchSize_,
cudaMemcpyHostToDevice);
ActivePooling_ForwardPass<real>(iF, oF + o * nPlanes, batchSize_, maxActive,
nPlanes, rb, average);
}
}
extern "C" void scn_DR_(ActivePooling_updateGradInput)(
THLongTensor *inputSize, void **m, THCTensor *input_features,
THCTensor *d_input_features, THCTensor *d_output_features,
THCITensor *rulesBuffer, bool average) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
uInt nPlanes = input_features->size[1];
auto _rules = _m.getActivePoolingRuleBook(inputSize);
uInt batchSize = _rules[1][0];
uInt maxActive = _rules[1][1];
THCTensor_(resizeAs)(state, d_input_features, input_features);
THCTensor_(zero)(state, d_input_features);
if (THCITensor_nElement(state, rulesBuffer) < 1 << 22)
THCITensor_resize1d(state, rulesBuffer, 1 << 22);
uInt *rb = (uInt *)THCITensor_data(state, rulesBuffer);
uInt rowBatchSize = std::min((uInt)32768, (1 << 22) / (maxActive + 1));
THAssert(rowBatchSize > 0);
auto diF = THCTensor_(data)(state, d_input_features);
auto doF = THCTensor_(data)(state, d_output_features);
for (uInt o = 0; o < batchSize; o += rowBatchSize) {
uInt batchSize_ = std::min(rowBatchSize, (uInt)(batchSize - o));
cudaMemcpy(rb, &_rules[0][o * (maxActive + 1)],
sizeof(uInt) * (maxActive + 1) * batchSize_,
cudaMemcpyHostToDevice);
ActivePooling_BackwardPass<real>(diF, doF + o * nPlanes, batchSize_,
maxActive, nPlanes, rb, average);
}
}
#endif
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef GPU_ACTIVEPOOLING_H
#define GPU_ACTIVEPOOLING_H
// One CUDA block per pooled output row; threads stride over planes in steps
// of 32. Per-row rule layout (padded to maxActive + 1 entries):
// [nActive, inputRow_0, ..., inputRow_{nActive-1}].
template <typename T>
__global__ void ActivePooling_fp(T *input_features, T *output_features,
                                 uInt maxActive, uInt nPlanes, uInt *rules,
                                 bool average) {
  T *out = &output_features[blockIdx.x * nPlanes];
  uInt *r = &rules[blockIdx.x * (maxActive + 1)];
  uInt nActive = *r++; // first entry is the count of active input sites
  // Mean instead of sum when averaging (guard against empty rows).
  T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
  while (nActive-- > 0) {
    T *inp = &input_features[(*r++) * nPlanes];
    for (uInt plane = threadIdx.x; plane < nPlanes; plane += 32)
      out[plane] += inp[plane] * multiplier;
  }
}
// Launch ActivePooling_fp: one block per batch sample, at most 32 threads.
template <typename T>
void ActivePooling_ForwardPass(T *input_features, T *output_features,
                               uInt batchSize, uInt maxActive, uInt nPlanes,
                               uInt *rules, bool average) {
  uInt kernelBlockDim = std::min(nPlanes, (uInt)32);
  ActivePooling_fp<T> << <batchSize, kernelBlockDim, 0,
                          THCState_getCurrentStream(state)>>>
      (input_features, output_features, maxActive, nPlanes, rules, average);
}
// Backward of active pooling: every active input site of a sample receives
// that sample's output-gradient row (scaled by 1/nActive when averaging).
// Uses plain assignment rather than += (NOTE(review): assumes each input row
// appears in exactly one sample's rule list - confirm against the
// rule-book builder).
template <typename T>
__global__ void ActivePooling_bp(T *d_input_features, T *d_output_features,
                                 uInt maxActive, uInt nPlanes, uInt *rules,
                                 bool average) {
  T *out = &d_output_features[blockIdx.x * nPlanes];
  uInt *r = &rules[blockIdx.x * (maxActive + 1)];
  uInt nActive = *r++; // first entry is the count of active input sites
  T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
  while (nActive-- > 0) {
    T *inp = &d_input_features[(*r++) * nPlanes];
    for (uInt plane = threadIdx.x; plane < nPlanes; plane += 32)
      inp[plane] = out[plane] * multiplier;
  }
}
// Launch ActivePooling_bp: one block per batch sample, at most 32 threads.
template <typename T>
void ActivePooling_BackwardPass(T *d_input_features, T *d_output_features,
                                uInt batchSize, uInt maxActive, uInt nPlanes,
                                uInt *rules, bool average) {
  uInt kernelBlockDim = std::min(nPlanes, (uInt)32);
  ActivePooling_bp<T> << <batchSize, kernelBlockDim, 0,
                          THCState_getCurrentStream(state)>>>
      (d_input_features, d_output_features, maxActive, nPlanes, rules, average);
}
#endif /* GPU_ACTIVEPOOLING_H */
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/GPU/AffineReluTrivialConvolution.cu"
#else
#include "AffineReluTrivialConvolution.h"
#include <algorithm>
#include <iostream>
extern "C" void scn_R_(AffineReluTrivialConvolution_updateOutput)(
THCTensor *input_features, THCTensor *output_features,
THCTensor *affineWeight, THCTensor *affineBias, THCTensor *convWeight) {
THCTensor_(resize2d)(state, output_features, input_features->size[0],
convWeight->size[1]);
dAffineReluTrivialConvolution_forward<real>(
THCTensor_(data)(state, input_features),
THCTensor_(data)(state, output_features),
THCTensor_(data)(state, affineWeight),
THCTensor_(data)(state, affineBias), THCTensor_(data)(state, convWeight),
convWeight->size[0], input_features->stride[0], convWeight->size[1],
output_features->size[1], input_features->size[0]);
}
extern "C" void scn_R_(AffineReluTrivialConvolution_backward)(
THCTensor *input_features, THCTensor *d_input_features,
THCTensor *d_output_features, THCTensor *affineWeight,
THCTensor *d_affineWeight, THCTensor *affineBias, THCTensor *d_affineBias,
THCTensor *convWeight, THCTensor *d_convWeight, bool additiveGrad) {
THCTensor_(resizeAs)(state, d_input_features, input_features);
dAffineReluTrivialConvolution_backward_dW<real>(
THCTensor_(data)(state, input_features),
THCTensor_(data)(state, d_input_features),
THCTensor_(data)(state, d_output_features),
THCTensor_(data)(state, affineWeight),
THCTensor_(data)(state, d_affineWeight),
THCTensor_(data)(state, affineBias),
THCTensor_(data)(state, d_affineBias),
THCTensor_(data)(state, convWeight),
THCTensor_(data)(state, d_convWeight), convWeight->size[0],
input_features->stride[0], convWeight->size[1],
d_output_features->stride[0], input_features->size[0], additiveGrad);
}
#endif
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef GPU_AFFINERELUTRIVIALCONVOLUTION_H
#define GPU_AFFINERELUTRIVIALCONVOLUTION_H
// check if A+B is faster than just B
// check if loading affineBias into shared memory is faster than loading
// multiple times (if not try 64,16 backwards case)
template <typename T, uInt K, uInt V>
__global__ void dAffineReluTrivialConvolution_forwardA(
    T *inFeatures, T *outFeatures, T *affineWeight, T *affineBias,
    T *convWeight, uInt input_nPlanes, uInt input_stride, uInt output_nPlanes,
    uInt output_stride, uInt nActive) {
  // nActive must be a multiple of K!!
  // Input x Weight -> Output
  // blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
  // K is a multiple of V,
  // nActive x KM -> nActive x KN - parallel over N,nActive - loop over M
  uInt M = input_nPlanes / K;
  // N = gridDim.y == output_nPlanes/K
  uInt n = blockIdx.y;
  outFeatures += n * K; // this block's K output planes
  convWeight += n * K;
  T O[V]; // per-thread accumulators: V rows x this thread's output column
  __shared__ T I[K][K];  // tile of affine+ReLU-transformed input
  __shared__ T AW[K];    // affine scale for the current K input planes
  __shared__ T AB[K];    // affine shift for the current K input planes
  __shared__ T CW[K][K]; // K x K tile of the convolution weight
  const uInt tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int m = 0; m < M; m++) {
    // Read affineWeight, affineBias and convWeight
    if (ty[0] == 0) {
      AW[tx] = affineWeight[tx];
      AB[tx] = affineBias[tx];
    }
#pragma unroll
    for (int v = 0; v < V; v++)
      CW[ty[v]][tx] = convWeight[ty[v] * output_nPlanes + tx];
    __syncthreads();
    for (uInt s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
      // Read input, do affine + relu, set O[]
#pragma unroll
      for (int v = 0; v < V; v++) {
        T i = inFeatures[(s + ty[v]) * input_stride + tx] * AW[tx] + AB[tx];
        I[ty[v]][tx] = (i > 0) ? i : 0;
        if (m == 0) {
          O[v] = 0;
        } else {
          // Later input-plane chunks accumulate on top of the partial sums
          // already written to outFeatures.
          O[v] = outFeatures[(s + ty[v]) * output_stride + tx];
        }
      }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          O[v] += I[ty[v]][k] * CW[k][tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        outFeatures[(s + ty[v]) * output_stride + tx] = O[v];
      __syncthreads();
    }
    // Advance to the next K input planes.
    affineWeight += K;
    affineBias += K;
    convWeight += K * output_nPlanes;
    inFeatures += K;
  }
}
// Variant of forwardA with row-bound checks so it can handle a ragged tail
// where nActive is not a multiple of K.
template <typename T, uInt K, uInt V>
__global__ void dAffineReluTrivialConvolution_forwardB(
    T *inFeatures, T *outFeatures, T *affineWeight, T *affineBias,
    T *convWeight, uInt input_nPlanes, uInt input_stride, uInt output_nPlanes,
    uInt output_stride, uInt nActive) {
  // Input x Weight -> Output
  // blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
  // K is a multiple of V,
  // nActive x KM -> nActive x KN - parallel over N,nActive - loop over M
  uInt M = input_nPlanes / K;
  // N = gridDim.y == output_nPlanes/K
  uInt n = blockIdx.y;
  outFeatures += n * K;
  convWeight += n * K;
  T O[V];
  __shared__ T I[K][K]; // zz try K+1 trick A+B+backwards
  __shared__ T AW[K];
  __shared__ T AB[K];
  __shared__ T CW[K][K];
  const uInt tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int m = 0; m < M; m++) {
    // Read affineWeight, affineBias and convWeight
    if (ty[0] == 0) {
      AW[tx] = affineWeight[tx];
      AB[tx] = affineBias[tx];
    }
#pragma unroll
    for (int v = 0; v < V; v++)
      CW[ty[v]][tx] = convWeight[ty[v] * output_nPlanes + tx];
    __syncthreads();
    for (uInt s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
      // Read input, do affine + relu, set O[]
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (s + ty[v] < nActive) { // guard the ragged tail
          T i = inFeatures[(s + ty[v]) * input_stride + tx] * AW[tx] + AB[tx];
          I[ty[v]][tx] = (i > 0) ? i : 0;
          if (m == 0) {
            O[v] = 0;
          } else {
            O[v] = outFeatures[(s + ty[v]) * output_stride + tx];
          }
        }
      }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          O[v] += I[ty[v]][k] * CW[k][tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nActive) // only write rows that exist
          outFeatures[(s + ty[v]) * output_stride + tx] = O[v];
      __syncthreads();
    }
    // Advance to the next K input planes.
    affineWeight += K;
    affineBias += K;
    convWeight += K * output_nPlanes;
    inFeatures += K;
  }
}
template <typename T>
void dAffineReluTrivialConvolution_forward(T *inFeatures, T *outFeatures,
T *affineWeight, T *affineBias,
T *convWeight, uInt input_nPlanes,
uInt input_stride,
uInt output_nPlanes,
uInt output_stride, uInt nActive) {
{
const uInt K = 64;
const uInt V = 16;
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {
uInt o = (nActive / K) * K;
if (o > 0)
dAffineReluTrivialConvolution_forwardA<
T, K, V><<<dim3(std::min(o / K, (uInt)512), output_nPlanes / K),
dim3(K, K / V), 0, THCState_getCurrentStream(state)>>>(
inFeatures, outFeatures, affineWeight, affineBias, convWeight,
input_nPlanes, input_stride, output_nPlanes, output_stride, o);
if (nActive > o)
dAffineReluTrivialConvolution_forwardB<
T, K, V><<<dim3(1, output_nPlanes / K), dim3(K, K / V), 0,
THCState_getCurrentStream(state)>>>(
inFeatures + o * input_stride, outFeatures + o * output_stride,
affineWeight, affineBias, convWeight, input_nPlanes, input_stride,
output_nPlanes, output_stride, nActive - o);
return;
}
}
{
const uInt K = 32;
const uInt V = 4;
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {
uInt o = (nActive / K) * K;
if (o > 0)
dAffineReluTrivialConvolution_forwardA<
T, K, V><<<dim3(std::min(o / K, (uInt)512), output_nPlanes / K),
dim3(K, K / V), 0, THCState_getCurrentStream(state)>>>(
inFeatures, outFeatures, affineWeight, affineBias, convWeight,
input_nPlanes, input_stride, output_nPlanes, output_stride, o);
if (nActive > o)
dAffineReluTrivialConvolution_forwardB<
T, K, V><<<dim3(1, output_nPlanes / K), dim3(K, K / V), 0,
THCState_getCurrentStream(state)>>>(
inFeatures + o * input_stride, outFeatures + o * output_stride,
affineWeight, affineBias, convWeight, input_nPlanes, input_stride,
output_nPlanes, output_stride, nActive - o);
return;
}
}
{
const uInt K = 16;
const uInt V = 4;
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {
uInt o = (nActive / K) * K;
if (o > 0)
dAffineReluTrivialConvolution_forwardA<
T, K, V><<<dim3(std::min(o / K, (uInt)512), output_nPlanes / K),
dim3(K, K / V), 0, THCState_getCurrentStream(state)>>>(
inFeatures, outFeatures, affineWeight, affineBias, convWeight,
input_nPlanes, input_stride, output_nPlanes, output_stride, o);
if (nActive > o)
dAffineReluTrivialConvolution_forwardB<
T, K, V><<<dim3(1, output_nPlanes / K), dim3(K, K / V), 0,
THCState_getCurrentStream(state)>>>(
inFeatures + o * input_stride, outFeatures + o * output_stride,
affineWeight, affineBias, convWeight, input_nPlanes, input_stride,
output_nPlanes, output_stride, nActive - o);
return;
}
}
{
const uInt K = 8;
const uInt V = 2;
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {
uInt o = (nActive / K) * K;
if (o > 0)
dAffineReluTrivialConvolution_forwardA<
T, K, V><<<dim3(std::min(o / K, (uInt)512), output_nPlanes / K),
dim3(K, K / V), 0, THCState_getCurrentStream(state)>>>(
inFeatures, outFeatures, affineWeight, affineBias, convWeight,
input_nPlanes, input_stride, output_nPlanes, output_stride, o);
if (nActive > o)
dAffineReluTrivialConvolution_forwardB<
T, K, V><<<dim3(1, output_nPlanes / K), dim3(K, K / V), 0,
THCState_getCurrentStream(state)>>>(
inFeatures + o * input_stride, outFeatures + o * output_stride,
affineWeight, affineBias, convWeight, input_nPlanes, input_stride,
output_nPlanes, output_stride, nActive - o);
return;
}
}
assert(false);
}
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
// Fused backward kernel: in one sweep it writes dI into dInFeatures and
// accumulates dConvWeight / dAffineWeight / dAffineBias via atomicAdd.
template <typename T, uInt K, uInt V>
__global__ void dAffineReluTrivialConvolution_backward_dW_A(
    T *inFeatures, T *dInFeatures, T *dOutFeatures, T *affineWeight,
    T *dAffineWeight, T *affineBias, T *dAffineBias, T *convWeight,
    T *dConvWeight, uInt input_nPlanes, uInt input_stride, uInt output_nPlanes,
    uInt output_stride, uInt nActive, bool additiveGrad) {
  // M = gridDim.y == input_nPlanes / K
  uInt N = output_nPlanes / K;
  uInt m = blockIdx.y;
  // Shift all pointers to this block's K input planes.
  inFeatures += m * K;
  dInFeatures += m * K;
  convWeight += m * K * output_nPlanes;
  dConvWeight += m * K * output_nPlanes;
  affineWeight += m * K;
  dAffineWeight += m * K;
  affineBias += m * K;
  dAffineBias += m * K;
  T dI[V];   // per-thread input-gradient accumulators
  T dCW[V];  // per-thread conv-weight-gradient accumulators
  T i[V];    // raw (pre-affine) input values
  T dAW = 0; // thread-local dAffineWeight sum
  T dAB = 0; // thread-local dAffineBias sum
  __shared__ T I[K][K];  // affine+ReLU-transformed input tile
  __shared__ T dO[K][K]; // dOutput tile
  __shared__ T AW[K];
  __shared__ T AB[K];
  __shared__ T CW[K][K];
  const uInt tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  if (ty[0] == 0) {
    AW[tx] = affineWeight[tx];
    AB[tx] = affineBias[tx];
  }
  for (int n = 0; n < N; n++) {
    // Read w, reset dW
#pragma unroll
    for (int v = 0; v < V; v++) {
      CW[ty[v]][tx] = convWeight[ty[v] * output_nPlanes + tx];
      dCW[v] = 0;
    }
    __syncthreads();
    for (uInt s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
#pragma unroll
      for (int v = 0; v < V; v++)
        dI[v] = 0;
      __syncthreads();
      // Read input and dOutput
#pragma unroll
      for (int v = 0; v < V; v++) {
        T i_ = inFeatures[(s + ty[v]) * input_stride + tx];
        i[v] = i_;
        i_ = i_ * AW[tx] + AB[tx];
        I[ty[v]][tx] = (i_ > 0) ? i_ : 0;
        dO[ty[v]][tx] = dOutFeatures[(s + ty[v]) * output_stride + tx];
      }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++) {
          dI[v] += dO[ty[v]][k] * CW[tx][k];
          dCW[v] += I[k][ty[v]] * dO[k][tx];
        }
#pragma unroll
      for (int v = 0; v < V; v++) {
        // Gate by the ReLU: gradient flows only where the activation was > 0.
        dI[v] = (I[ty[v]][tx] > 0) ? dI[v] : 0;
        dAW += i[v] * dI[v];
        dAB += dI[v];
        if (additiveGrad)
          dInFeatures[(s + ty[v]) * input_stride + tx] += dI[v];
        else
          dInFeatures[(s + ty[v]) * input_stride + tx] = dI[v];
      }
      __syncthreads();
    }
#pragma unroll
    for (int v = 0; v < V; v++)
      atomicAdd(&dConvWeight[ty[v] * output_nPlanes + tx], dCW[v]);
    // Advance to the next K output planes.
    convWeight += K;
    dConvWeight += K;
    dOutFeatures += K;
    __syncthreads();
  }
  atomicAdd(&dAffineWeight[tx], dAW);
  atomicAdd(&dAffineBias[tx], dAB);
}
// Backward pass of the fused affine + ReLU + "trivial" (1x1) convolution:
//   dOutput x W^T -> dInput and
//   Input^T x dOutput -> dW
// Variant B: bounds-checked (s + ty[v] < nActive), used by the host
// dispatcher to handle the final partial tile of rows when nActive is not a
// multiple of K; out-of-range lanes contribute zeros.
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template <typename T, uInt K, uInt V>
__global__ void dAffineReluTrivialConvolution_backward_dW_B(
    T *inFeatures, T *dInFeatures, T *dOutFeatures, T *affineWeight,
    T *dAffineWeight, T *affineBias, T *dAffineBias, T *convWeight,
    T *dConvWeight, uInt input_nPlanes, uInt input_stride, uInt output_nPlanes,
    uInt output_stride, uInt nActive, bool additiveGrad) {
  // M = gridDim.y == input_nPlanes / K
  uInt N = output_nPlanes / K;
  uInt m = blockIdx.y;
  // Advance all pointers to this block's K-plane slice of the input planes.
  inFeatures += m * K;
  dInFeatures += m * K;
  convWeight += m * K * output_nPlanes;
  dConvWeight += m * K * output_nPlanes;
  affineWeight += m * K;
  dAffineWeight += m * K;
  affineBias += m * K;
  dAffineBias += m * K;
  T dI[V];    // per-thread dInput accumulator (each thread covers V rows)
  T dCW[V];   // per-thread dConvWeight accumulator
  T i[V];     // cached pre-affine input values
  T dAW = 0;  // dAffineWeight accumulator for plane tx
  T dAB = 0;  // dAffineBias accumulator for plane tx
  __shared__ T I[K][K];   // recomputed affine+ReLU forward activations
  __shared__ T dO[K][K];  // tile of dOutput
  __shared__ T AW[K];     // affine weights for this plane slice
  __shared__ T AB[K];     // affine biases for this plane slice
  __shared__ T CW[K][K];  // tile of convolution weights
  const uInt tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  if (ty[0] == 0) {
    AW[tx] = affineWeight[tx];
    AB[tx] = affineBias[tx];
  }
  // Iterate over the N slices of K output planes.
  for (int n = 0; n < N; n++) {
// Read w, reset dW
#pragma unroll
    for (int v = 0; v < V; v++) {
      CW[ty[v]][tx] = convWeight[ty[v] * output_nPlanes + tx];
      dCW[v] = 0;
    }
    __syncthreads();
    // Grid-stride loop over active rows, K rows per iteration.
    for (uInt s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
#pragma unroll
      for (int v = 0; v < V; v++)
        dI[v] = 0;
      __syncthreads();
// Read input and dOutput
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nActive) {
          T i_ = inFeatures[(s + ty[v]) * input_stride + tx];
          i[v] = i_;
          i_ = i_ * AW[tx] + AB[tx];
          I[ty[v]][tx] = (i_ > 0) ? i_ : 0;  // recomputed forward activation
          dO[ty[v]][tx] = dOutFeatures[(s + ty[v]) * output_stride + tx];
        } else {
          // Past the last row: zero-fill so the matrix products are unchanged.
          i[v] = 0;
          I[ty[v]][tx] = 0;
          dO[ty[v]][tx] = 0;
        }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++) {
          dI[v] += dO[ty[v]][k] * CW[tx][k];  // dOutput x W^T
          dCW[v] += I[k][ty[v]] * dO[k][tx];  // Input^T x dOutput
        }
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nActive) {
          // ReLU gate: gradient only flows where the activation was > 0.
          dI[v] = (I[ty[v]][tx] > 0) ? dI[v] : 0;
          dAW += i[v] * dI[v];
          dAB += dI[v];
          if (additiveGrad)
            dInFeatures[(s + ty[v]) * input_stride + tx] += dI[v];
          else
            dInFeatures[(s + ty[v]) * input_stride + tx] = dI[v];
        }
      __syncthreads();
    }
#pragma unroll
    for (int v = 0; v < V; v++)
      atomicAdd(&dConvWeight[ty[v] * output_nPlanes + tx], dCW[v]);
    convWeight += K;
    dConvWeight += K;
    dOutFeatures += K;
    __syncthreads();
  }
  atomicAdd(&dAffineWeight[tx], dAW);
  atomicAdd(&dAffineBias[tx], dAB);
}
// Host-side dispatcher for the fused backward pass. Tries tile sizes
// K = 32, 16, 8 (largest first) that divide both plane counts; launches
// variant A (no bounds checks) on the first (nActive/K)*K rows and variant B
// (bounds-checked) on the remaining partial tile.
// Fix: like the forward dispatcher, fail loudly with assert(false) when no
// supported tile size divides the plane counts, instead of silently
// returning without computing any gradients.
template <typename T>
void dAffineReluTrivialConvolution_backward_dW(
    T *inFeatures, T *dInFeatures, T *dOutFeatures, T *affineWeight,
    T *dAffineWeight, T *affineBias, T *dAffineBias, T *convWeight,
    T *dConvWeight, uInt input_nPlanes, uInt input_stride, uInt output_nPlanes,
    uInt output_stride, uInt nActive, bool additiveGrad) {
  {
    const uInt K = 32;
    const uInt V = 8;
    if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {
      uInt o = (nActive / K) * K;
      if (o > 0)
        dAffineReluTrivialConvolution_backward_dW_A<
            T, K, V><<<dim3(std::min(o / K, (uInt)512), input_nPlanes / K),
                       dim3(K, K / V), 0, THCState_getCurrentStream(state)>>>(
            inFeatures, dInFeatures, dOutFeatures, affineWeight, dAffineWeight,
            affineBias, dAffineBias, convWeight, dConvWeight, input_nPlanes,
            input_stride, output_nPlanes, output_stride, o, additiveGrad);
      if (nActive > o)
        dAffineReluTrivialConvolution_backward_dW_B<
            T, K, V><<<dim3(1, input_nPlanes / K), dim3(K, K / V), 0,
                       THCState_getCurrentStream(state)>>>(
            inFeatures + o * input_stride, dInFeatures + o * input_stride,
            dOutFeatures + o * output_stride, affineWeight, dAffineWeight,
            affineBias, dAffineBias, convWeight, dConvWeight, input_nPlanes,
            input_stride, output_nPlanes, output_stride, nActive - o,
            additiveGrad);
      return;
    }
  }
  {
    const uInt K = 16;
    const uInt V = 4;
    if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {
      uInt o = (nActive / K) * K;
      if (o > 0)
        dAffineReluTrivialConvolution_backward_dW_A<
            T, K, V><<<dim3(std::min(o / K, (uInt)512), input_nPlanes / K),
                       dim3(K, K / V), 0, THCState_getCurrentStream(state)>>>(
            inFeatures, dInFeatures, dOutFeatures, affineWeight, dAffineWeight,
            affineBias, dAffineBias, convWeight, dConvWeight, input_nPlanes,
            input_stride, output_nPlanes, output_stride, o, additiveGrad);
      if (nActive > o)
        dAffineReluTrivialConvolution_backward_dW_B<
            T, K, V><<<dim3(1, input_nPlanes / K), dim3(K, K / V), 0,
                       THCState_getCurrentStream(state)>>>(
            inFeatures + o * input_stride, dInFeatures + o * input_stride,
            dOutFeatures + o * output_stride, affineWeight, dAffineWeight,
            affineBias, dAffineBias, convWeight, dConvWeight, input_nPlanes,
            input_stride, output_nPlanes, output_stride, nActive - o,
            additiveGrad);
      return;
    }
  }
  {
    const uInt K = 8;
    const uInt V = 2;
    if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {
      uInt o = (nActive / K) * K;
      if (o > 0)
        dAffineReluTrivialConvolution_backward_dW_A<
            T, K, V><<<dim3(std::min(o / K, (uInt)512), input_nPlanes / K),
                       dim3(K, K / V), 0, THCState_getCurrentStream(state)>>>(
            inFeatures, dInFeatures, dOutFeatures, affineWeight, dAffineWeight,
            affineBias, dAffineBias, convWeight, dConvWeight, input_nPlanes,
            input_stride, output_nPlanes, output_stride, o, additiveGrad);
      if (nActive > o)
        dAffineReluTrivialConvolution_backward_dW_B<
            T, K, V><<<dim3(1, input_nPlanes / K), dim3(K, K / V), 0,
                       THCState_getCurrentStream(state)>>>(
            inFeatures + o * input_stride, dInFeatures + o * input_stride,
            dOutFeatures + o * output_stride, affineWeight, dAffineWeight,
            affineBias, dAffineBias, convWeight, dConvWeight, input_nPlanes,
            input_stride, output_nPlanes, output_stride, nActive - o,
            additiveGrad);
      return;
    }
  }
  // Unsupported plane counts: mirror the forward dispatcher's behavior.
  assert(false);
}
#endif
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE_
#define TH_GENERIC_FILE_ "generic/GPU/AveragePooling.cu"
#else
#include "AveragePooling.h"
#include "RuleBookIterator.h"
// GPU forward average pooling. Builds (or fetches the cached) rule book
// mapping input sites to pooled output sites, resizes/zeroes the output to
// (nActive, nPlanes), and accumulates each pool region's average.
// nFeaturesToDrop: number of leading feature planes to skip in the input.
extern "C" void scn_DR_(AveragePooling_updateOutput)(
    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *poolSize,
    THLongTensor *poolStride, void **m, THCTensor *input_features,
    THCTensor *output_features, long nFeaturesToDrop, THCITensor *rulesBuffer) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  uInt nPlanes = input_features->size[1] - nFeaturesToDrop;
  auto _rules =
      _m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  uInt nActive = _m.getNActive(outputSize);
  THCTensor_(resize2d)(state, output_features, nActive,
                       input_features->size[1] - nFeaturesToDrop);
  THCTensor_(zero)(state, output_features);
  // Offset past the dropped feature planes; strides stay size[1].
  auto iF = THCTensor_(data)(state, input_features) + nFeaturesToDrop;
  auto oF = THCTensor_(data)(state, output_features);
  // RULEBOOKITERATOR (RuleBookIterator.h) runs the body per rule-book chunk,
  // presumably providing rbB (device rules) and nHotB (entry count) — see
  // that header for the exact contract.
  RULEBOOKITERATOR(AveragePooling_ForwardPass<real>(
                       THCState_getCurrentStream(state), iF, oF, nPlanes,
                       input_features->size[1], output_features->size[1], rbB,
                       nHotB, _rules.size());
                   , )
}
// GPU backward average pooling: scatters each output gradient back to the
// input sites in its pool region, scaled by 1/filterVolume (inside the
// backward-pass kernel). d_input_features is resized/zeroed here.
extern "C" void scn_DR_(AveragePooling_updateGradInput)(
    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *poolSize,
    THLongTensor *poolStride, void **m, THCTensor *input_features,
    THCTensor *d_input_features, THCTensor *d_output_features,
    long nFeaturesToDrop, THCITensor *rulesBuffer) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  uInt nPlanes = input_features->size[1] - nFeaturesToDrop;
  auto _rules =
      _m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  uInt nActive = _m.getNActive(outputSize);
  THCTensor_(resizeAs)(state, d_input_features, input_features);
  THCTensor_(zero)(state, d_input_features);
  // Offset past the dropped feature planes; strides stay size[1].
  auto diF = THCTensor_(data)(state, d_input_features) + nFeaturesToDrop;
  auto doF = THCTensor_(data)(state, d_output_features);
  RULEBOOKITERATOR(AveragePooling_BackwardPass<real>(
                       THCState_getCurrentStream(state), diF, doF, nPlanes,
                       input_features->size[1], d_output_features->size[1], rbB,
                       nHotB, _rules.size());
                   , )
}
#endif
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef GPU_AVERAGEPOOLING_H
#define GPU_AVERAGEPOOLING_H
// NTX must be >=2 so r is filled properly
// Forward average-pooling kernel. rules is a flat array of (input, output)
// site pairs; each block grid-strides over the pairs, NTY at a time, staging
// them in shared memory, then atomically adds alpha * input row into the
// output row (alpha = 1/filterVolume turns the sum into an average).
// Fix: the staging guard was `i < 2 * (n - nHot)`; since n < nHot the uInt
// subtraction underflowed to a huge value, so the guard never limited the
// load and the last tile read past the end of `rules`. The correct bound on
// remaining entries is `2 * (nHot - n)`.
template <typename T, uInt NTX, uInt NTY>
__global__ void AveragePooling_fp(T *input_features, T *output_features,
                                  uInt nPlanes, uInt input_stride,
                                  uInt output_stride, uInt *rules, uInt nHot,
                                  T alpha) {
  __shared__ uInt r[NTY * 2];
  for (uInt n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
    {
      // Stage up to NTY rule pairs; only nHot - n entries remain.
      uInt i = threadIdx.x + NTX * threadIdx.y;
      if (i < NTY * 2 and i < 2 * (nHot - n))
        r[i] = rules[2 * n + i];
    }
    __syncthreads();
    if (n + threadIdx.y < nHot) {
      uInt i = r[2 * threadIdx.y] * input_stride;
      uInt o = r[2 * threadIdx.y + 1] * output_stride;
      for (uInt plane = threadIdx.x; plane < nPlanes; plane += NTX)
        atomicAdd(&output_features[o + plane],
                  alpha * input_features[i + plane]);
    }
    __syncthreads();
  }
}
// Launch the forward average-pooling kernel on the given stream. A fixed
// grid of 32 blocks of 32x32 threads grid-strides over the nHot rule pairs;
// alpha = 1/filterVolume converts the accumulated sum into an average.
template <typename T>
void AveragePooling_ForwardPass(cudaStream_t stream, T *input_features,
                                T *output_features, uInt nPlanes,
                                uInt input_stride, uInt output_stride,
                                uInt *rules, uInt nHot, uInt filterVolume) {
  const T scale = 1.0 / filterVolume;
  const dim3 threads(32, 32);
  AveragePooling_fp<T, 32, 32><<<32, threads, 0, stream>>>(
      input_features, output_features, nPlanes, input_stride, output_stride,
      rules, nHot, scale);
}
// Backward average-pooling kernel: for each (input, output) rule pair, adds
// alpha * d_output row into the d_input row. No atomics needed on the input
// side here; pairs are staged in shared memory as in the forward kernel.
// Fix: same uInt-underflow bug as the forward kernel — the staging guard
// `i < 2 * (n - nHot)` never limited the load (n < nHot wraps to a huge
// value) and the final tile read past the end of `rules`; the remaining
// entry count is `2 * (nHot - n)`.
template <typename T, uInt NTX, uInt NTY>
__global__ void AveragePooling_bp(T *d_input_features, T *d_output_features,
                                  uInt nPlanes, uInt input_stride,
                                  uInt output_stride, uInt *rules, uInt nHot,
                                  T alpha) {
  __shared__ uInt r[NTY * 2];
  for (uInt n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
    {
      // Stage up to NTY rule pairs; only nHot - n entries remain.
      uInt i = threadIdx.x + NTX * threadIdx.y;
      if (i < NTY * 2 and i < 2 * (nHot - n))
        r[i] = rules[2 * n + i];
    }
    __syncthreads();
    if (n + threadIdx.y < nHot) {
      uInt i = r[2 * threadIdx.y] * input_stride;
      uInt o = r[2 * threadIdx.y + 1] * output_stride;
      for (uInt plane = threadIdx.x; plane < nPlanes; plane += NTX)
        d_input_features[i + plane] += alpha * d_output_features[o + plane];
    }
    __syncthreads();
  }
}
// Launch the backward average-pooling kernel on the given stream, mirroring
// the forward launch configuration (32 blocks of 32x32 threads).
template <typename T>
void AveragePooling_BackwardPass(cudaStream_t stream, T *d_input_features,
                                 T *d_output_features, uInt nPlanes,
                                 uInt input_stride, uInt output_stride,
                                 uInt *rules, uInt nHot, uInt filterVolume) {
  const T scale = 1.0 / filterVolume;
  const dim3 threads(32, 32);
  AveragePooling_bp<T, 32, 32><<<32, threads, 0, stream>>>(
      d_input_features, d_output_features, nPlanes, input_stride,
      output_stride, rules, nHot, scale);
}
#endif /* GPU_AVERAGEPOOLING_H */
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/GPU/BatchNormalization.cu"
#else
#include "BatchNormalization.h"
// Dispatch helper: when the template tile width N divides nPlanes, launch
// BatchNormalization_ForwardPass<real, N, 64>. Used below in an else-if
// chain trying N = 16, 12, 8, 4, 1 (largest first).
#define BN_F_MACRO(N)                                                          \
  if (nPlanes % N == 0) {                                                      \
    BatchNormalization_ForwardPass<real, N, 64>(                               \
        THCTensor_(data)(state, input_features),                               \
        THCTensor_(data)(state, output_features), nPlanes, input_stride,       \
        output_stride, nActive, THCTensor_(data)(state, saveMean),             \
        THCTensor_(data)(state, saveInvStd),                                   \
        THCTensor_(data)(state, runningMean),                                  \
        THCTensor_(data)(state, runningVar),                                   \
        weight ? THCTensor_(data)(state, weight) : 0,                          \
        bias ? THCTensor_(data)(state, bias) : 0, eps, momentum, train,        \
        leakiness);                                                            \
  }
// Batch normalization forward (out-of-place): resizes output_features and
// dispatches to the widest kernel tile that divides nPlanes.
extern "C" void scn_R_(BatchNormalization_updateOutput)(
    THCTensor *input_features, THCTensor *output_features, THCTensor *saveMean,
    THCTensor *saveInvStd, THCTensor *runningMean, THCTensor *runningVar,
    THCTensor *weight, THCTensor *bias, real eps, real momentum, bool train,
    real leakiness) {
  THCTensor_(resizeAs)(state, output_features, input_features);
  auto nActive = input_features->size[0];
  auto nPlanes = input_features->size[1];
  auto input_stride = input_features->stride[0];
  auto output_stride = output_features->stride[0];
  BN_F_MACRO(16)
  else BN_F_MACRO(12) else BN_F_MACRO(8) else BN_F_MACRO(4) else BN_F_MACRO(1)
}
// Same as BatchNormalization_updateOutput but without resizing the output —
// for writing into a pre-sized tensor (strides may exceed nPlanes, acting on
// a subset of the output's columns).
extern "C" void scn_R_(BatchNormalizationInTensor_updateOutput)(
    THCTensor *input_features, THCTensor *output_features, THCTensor *saveMean,
    THCTensor *saveInvStd, THCTensor *runningMean, THCTensor *runningVar,
    THCTensor *weight, THCTensor *bias, real eps, real momentum, bool train,
    real leakiness) {
  auto nActive = input_features->size[0];
  auto nPlanes = input_features->size[1];
  auto input_stride = input_features->stride[0];
  auto output_stride = output_features->stride[0];
  BN_F_MACRO(16)
  else BN_F_MACRO(12) else BN_F_MACRO(8) else BN_F_MACRO(4) else BN_F_MACRO(1)
}
#undef BN_F_MACRO
// Dispatch helper for the backward pass, same N-selection scheme as
// BN_F_MACRO: launch BatchNormalization_BackwardPass<real, N, 64> when N
// divides nPlanes.
#define BN_B_MACRO(N)                                                          \
  if (nPlanes % N == 0) {                                                      \
    BatchNormalization_BackwardPass<real, N, 64>(                              \
        THCTensor_(data)(state, input_features),                               \
        THCTensor_(data)(state, d_input_features),                             \
        THCTensor_(data)(state, output_features),                              \
        THCTensor_(data)(state, d_output_features), nPlanes, input_stride,     \
        output_stride, nActive, THCTensor_(data)(state, saveMean),             \
        THCTensor_(data)(state, saveInvStd),                                   \
        THCTensor_(data)(state, runningMean),                                  \
        THCTensor_(data)(state, runningVar),                                   \
        weight ? THCTensor_(data)(state, weight) : 0,                          \
        bias ? THCTensor_(data)(state, bias) : 0,                              \
        d_weight ? THCTensor_(data)(state, d_weight) : 0,                      \
        d_bias ? THCTensor_(data)(state, d_bias) : 0, leakiness);              \
  }
// Batch normalization backward: resizes d_input_features and dispatches to
// the widest kernel tile that divides nPlanes. Note: the backward kernel
// overwrites d_output_features in place with the leaky-ReLU-gated gradient.
extern "C" void scn_R_(BatchNormalization_backward)(
    THCTensor *input_features, THCTensor *d_input_features,
    THCTensor *output_features, THCTensor *d_output_features,
    THCTensor *saveMean, THCTensor *saveInvStd, THCTensor *runningMean,
    THCTensor *runningVar, THCTensor *weight, THCTensor *bias,
    THCTensor *d_weight, THCTensor *d_bias, real leakiness) {
  THCTensor_(resizeAs)(state, d_input_features, d_output_features);
  auto nActive = input_features->size[0];
  auto nPlanes = input_features->size[1];
  auto input_stride = input_features->stride[0];
  auto output_stride = output_features->stride[0];
  BN_B_MACRO(16)
  else BN_B_MACRO(12) else BN_B_MACRO(8) else BN_B_MACRO(4) else BN_B_MACRO(1)
}
#endif
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef GPU_BATCHNORMALIZATION_H
#define GPU_BATCHNORMALIZATION_H
#include "../SparseConvNet.h"
#include <cassert>
// input_stride and output_stride are normally the same as nPlanes; allow larger
// values to act on a subset of columns, i.e. an inplace DenseNet blocks
// NTX ~ 16 - nPlanes must be a multiple of this
// NTY ~ 64 - at least 4
// Training-mode batchnorm forward (+ leaky ReLU). Each NTX-wide column of
// threads owns one plane: compute batch mean/inv-std over nActive rows,
// update the running statistics, then write the normalized + affine + leaky
// ReLU output.
template <typename T, uInt NTX, uInt NTY>
__global__ void
BatchNormalization_f_train(T *input_features, T *output_features, uInt nPlanes,
                           uInt input_stride, uInt output_stride, uInt nActive,
                           T *saveMean, T *saveInvStd, T *runningMean,
                           T *runningVar, T *weight, T *bias, T eps, T momentum,
                           T leakiness) {
  __shared__ T t[NTY][NTX];   // partial sums of x per (row-group, plane)
  __shared__ T t2[NTY][NTX];  // partial sums of x^2
  for (uInt plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
       plane += gridDim.x * NTX) {
    t[threadIdx.y][threadIdx.x] = 0;
    t2[threadIdx.y][threadIdx.x] = 0;
    for (uInt row = threadIdx.y, c = plane + threadIdx.y * input_stride;
         row < nActive; row += NTY, c += input_stride * NTY) {
      T i = input_features[c];
      t[threadIdx.y][threadIdx.x] += i;
      t2[threadIdx.y][threadIdx.x] += i * i;
    }
    __syncthreads();
    // Reduce the NTY partial sums (every thread redundantly computes this).
    T _saveMean = 0;
    T _saveInvStd = 0;
    for (uInt row = 0; row < NTY; row++) {
      _saveMean += t[row][threadIdx.x];
      _saveInvStd += t2[row][threadIdx.x];
    }
    _saveMean /= nActive;
    // _saveInvStd now holds the centered sum of squares: sum(x^2) - n*mean^2.
    _saveInvStd = _saveInvStd - _saveMean * _saveMean * nActive;
    if (threadIdx.y == 0) {
      saveMean[plane] = _saveMean;
      runningMean[plane] =
          momentum * runningMean[plane] + (1 - momentum) * _saveMean;
      // Running variance uses the unbiased (n-1) estimator.
      runningVar[plane] = momentum * runningVar[plane] +
                          (1 - momentum) * _saveInvStd / (nActive - 1);
    }
    // Convert to 1/sqrt(var + eps).
    _saveInvStd = pow(_saveInvStd / nActive + eps, -0.5);
    if (threadIdx.y == 0)
      saveInvStd[plane] = _saveInvStd;
    __syncthreads();
    // Fold normalization and the optional affine into one scale W and shift B.
    if (threadIdx.y == 0) {
      t[0][threadIdx.x] = _saveInvStd * (weight ? weight[plane] : 1);
      t[1][threadIdx.x] =
          -_saveMean * t[0][threadIdx.x] + (bias ? bias[plane] : 0);
    }
    __syncthreads();
    T W = t[0][threadIdx.x];
    T B = t[1][threadIdx.x];
    for (uInt row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
              co = plane + threadIdx.y * output_stride;
         row < nActive;
         row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
      T out = W * input_features[ci] + B;
      output_features[co] = (out > 0) ? out : (out * leakiness);
    }
    __syncthreads();
  }
}
// Inference-mode batchnorm forward (+ leaky ReLU): normalize with the
// running statistics, folding normalization and the optional affine into a
// per-plane scale W and shift B.
// Fix: the per-thread copies of W/B were declared `float`, silently losing
// precision when T == double; use T instead.
template <typename T, uInt NTX, uInt NTY>
__global__ void
BatchNormalization_f_test(T *input_features, T *output_features, uInt nPlanes,
                          uInt input_stride, uInt output_stride, uInt nActive,
                          T *saveMean, T *saveInvStd, T *runningMean,
                          T *runningVar, T *weight, T *bias, T eps, T momentum,
                          T leakiness) {
  __shared__ T W[NTX];  // per-plane effective scale
  __shared__ T B[NTX];  // per-plane effective shift
  for (uInt plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
       plane += gridDim.x * NTX) {
    if (threadIdx.y == 0) {
      W[threadIdx.x] =
          pow(runningVar[plane] + eps, -0.5) * (weight ? weight[plane] : 1);
      B[threadIdx.x] =
          (bias ? bias[plane] : 0) - runningMean[plane] * W[threadIdx.x];
    }
    __syncthreads();
    T w = W[threadIdx.x], b = B[threadIdx.x];
    for (uInt row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
              co = plane + threadIdx.y * output_stride;
         row < nActive;
         row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
      T out = w * input_features[ci] + b;
      output_features[co] = (out > 0) ? out : (out * leakiness);
    }
    __syncthreads();
  }
}
// Host launcher for batchnorm forward: picks the train- or test-mode kernel.
// Grid: at most 16 blocks of NTX x NTY threads; the kernels grid-stride over
// planes, so nPlanes / NTX blocks suffice when that is smaller.
template <typename T, uInt NTX, uInt NTY>
void BatchNormalization_ForwardPass(T *input_features, T *output_features,
                                    uInt nPlanes, uInt input_stride,
                                    uInt output_stride, uInt nActive,
                                    T *saveMean, T *saveInvStd, T *runningMean,
                                    T *runningVar, T *weight, T *bias, T eps,
                                    T momentum, bool train, T leakiness) {
  uInt nBlocks = std::min((uInt)16, nPlanes / NTX);
  dim3 threads(NTX, NTY);
  auto stream = THCState_getCurrentStream(state);
  if (train)
    BatchNormalization_f_train<T, NTX, NTY><<<nBlocks, threads, 0, stream>>>(
        input_features, output_features, nPlanes, input_stride, output_stride,
        nActive, saveMean, saveInvStd, runningMean, runningVar, weight, bias,
        eps, momentum, leakiness);
  else
    BatchNormalization_f_test<T, NTX, NTY><<<nBlocks, threads, 0, stream>>>(
        input_features, output_features, nPlanes, input_stride, output_stride,
        nActive, saveMean, saveInvStd, runningMean, runningVar, weight, bias,
        eps, momentum, leakiness);
}
// Batchnorm backward kernel (with leaky ReLU folded in). Per plane: gate the
// output gradient through the leaky ReLU (writing it back into
// d_output_features in place), reduce the gradient sums, emit d_weight /
// d_bias, then compute d_input. Reuses t[0..2] as a broadcast buffer, so it
// relies on NTY >= 3 (the header note says NTY is at least 4).
template <typename T, uInt NTX, uInt NTY>
__global__ void
BatchNormalization_b(T *input_features, T *d_input_features, T *output_features,
                     T *d_output_features, uInt nPlanes, uInt input_stride,
                     uInt output_stride, uInt nActive, T *saveMean,
                     T *saveInvStd, T *runningMean, T *runningVar, T *weight,
                     T *bias, T *d_weight, T *d_bias, T leakiness) {
  __shared__ T t[NTY][NTX];   // partial sums of gated dOut
  __shared__ T t2[NTY][NTX];  // partial sums of (x - mean) * gated dOut
  for (uInt plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
       plane += gridDim.x * NTX) {
    // Broadcast the saved statistics and weight for this plane.
    if (threadIdx.y == 0) {
      t[0][threadIdx.x] = saveMean[plane];
      t[1][threadIdx.x] = saveInvStd[plane];
      t[2][threadIdx.x] = (weight ? weight[plane] : 1);
    }
    __syncthreads();
    T _saveMean = t[0][threadIdx.x];
    T _saveInvStd = t[1][threadIdx.x];
    T _weight = t[2][threadIdx.x];
    __syncthreads();
    t[threadIdx.y][threadIdx.x] = 0;
    t2[threadIdx.y][threadIdx.x] = 0;
    for (uInt row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
              co = plane + threadIdx.y * output_stride;
         row < nActive;
         row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
      T d = d_output_features[co];
      // Leaky ReLU gate, decided from the stored forward output.
      d = (output_features[co] > 0) ? d : (d * leakiness);
      d_output_features[co] = d;  // in-place: later reads see the gated value
      t[threadIdx.y][threadIdx.x] += d;
      t2[threadIdx.y][threadIdx.x] += (input_features[ci] - _saveMean) * d;
    }
    __syncthreads();
    // Reduce the NTY partial sums (every thread redundantly computes this).
    T gradMean = 0;
    T dotp = 0;
    for (int row = 0; row < NTY; row++) {
      gradMean += t[row][threadIdx.x];
      dotp += t2[row][threadIdx.x];
    }
    __syncthreads();
    if (d_weight)
      d_weight[plane] = dotp * _saveInvStd;
    if (d_bias)
      d_bias[plane] = gradMean; // sum really
    gradMean /= nActive;
    T k = dotp * _saveInvStd * _saveInvStd / nActive;
    for (uInt row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
              co = plane + threadIdx.y * output_stride;
         row < nActive;
         row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
      d_input_features[ci] = (d_output_features[co] - gradMean -
                              (input_features[ci] - _saveMean) * k) *
                             _saveInvStd * _weight;
    }
    __syncthreads();
  }
}
// Host launcher for batchnorm backward, with the same grid sizing as the
// forward launcher (at most 16 blocks of NTX x NTY threads).
template <typename T, uInt NTX, uInt NTY>
void BatchNormalization_BackwardPass(T *input_features, T *d_input_features,
                                     T *output_features, T *d_output_features,
                                     uInt nPlanes, uInt input_stride,
                                     uInt output_stride, uInt nActive,
                                     T *saveMean, T *saveInvStd, T *runningMean,
                                     T *runningVar, T *weight, T *bias,
                                     T *d_weight, T *d_bias, T leakiness) {
  uInt nBlocks = std::min((uInt)16, nPlanes / NTX);
  dim3 threads(NTX, NTY);
  auto stream = THCState_getCurrentStream(state);
  BatchNormalization_b<T, NTX, NTY><<<nBlocks, threads, 0, stream>>>(
      input_features, d_input_features, output_features, d_output_features,
      nPlanes, input_stride, output_stride, nActive, saveMean, saveInvStd,
      runningMean, runningVar, weight, bias, d_weight, d_bias, leakiness);
}
#undef NTX
#undef NTY
#endif /* GPU_BATCHNORMALIZATION_H */
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/GPU/BatchwiseMultiplicativeDropout.cu"
#else
#include "BatchwiseMultiplicativeDropout.h"
// Dispatch helper: when NTX divides nPlanes, launch the forward dropout
// kernel with an (NTX, NTY) thread block and return. Tried below from the
// widest tile (32) down to 1.
#define SPARSECONVNET_FOO(NTX, NTY)                                            \
  {                                                                            \
    if (nPlanes % NTX == 0) {                                                  \
      BatchwiseMultiplicativeDropout_fp<real, NTX, NTY> << <                   \
          dim3(std::min(16L, nPlanes / NTX), 16), dim3(NTX, NTY), 0,           \
          THCState_getCurrentStream(state)>>>                                  \
          (THCTensor_(data)(state, input_features),                            \
           THCTensor_(data)(state, output_features),                           \
           THCTensor_(data)(state, noise), nActive, nPlanes, nPlanes, nPlanes, \
           alpha);                                                             \
      return;                                                                  \
    }                                                                          \
  }
// Batchwise multiplicative dropout forward: multiply every row by the
// per-plane noise vector (with leaky slope alpha on negative inputs).
// Supports in-place operation when input_features == output_features.
extern "C" void scn_R_(BatchwiseMultiplicativeDropout_updateOutput)(
    THCTensor *input_features, THCTensor *output_features, THCTensor *noise,
    float alpha) {
  if (input_features != output_features)
    THCTensor_(resizeAs)(state, output_features, input_features);
  auto nActive = input_features->size[0];
  auto nPlanes = input_features->size[1];
  // Each macro returns if its NTX divides nPlanes; NTX=1 always matches.
  SPARSECONVNET_FOO(32, 32)
  SPARSECONVNET_FOO(24, 32)
  SPARSECONVNET_FOO(16, 64)
  SPARSECONVNET_FOO(12, 64)
  SPARSECONVNET_FOO(8, 64)
  SPARSECONVNET_FOO(4, 64)
  SPARSECONVNET_FOO(1, 64)
}
#undef SPARSECONVNET_FOO
// Backward-pass counterpart of the dispatch helper above: launch the
// backward dropout kernel when NTX divides nPlanes, then return.
#define SPARSECONVNET_FOO(NTX, NTY)                                            \
  {                                                                            \
    if (nPlanes % NTX == 0) {                                                  \
      BatchwiseMultiplicativeDropout_bp<real, NTX, NTY> << <                   \
          dim3(std::min(16L, nPlanes / NTX), 16), dim3(NTX, NTY), 0,           \
          THCState_getCurrentStream(state)>>>                                  \
          (THCTensor_(data)(state, input_features),                            \
           THCTensor_(data)(state, d_input_features),                          \
           THCTensor_(data)(state, d_output_features),                         \
           THCTensor_(data)(state, noise), nActive, nPlanes, nPlanes, nPlanes, \
           alpha);                                                             \
      return;                                                                  \
    }                                                                          \
  }
// Batchwise multiplicative dropout backward: scale the output gradient by
// the same per-plane noise (and alpha on negative forward inputs).
// Supports in-place operation when d_input_features == d_output_features.
extern "C" void scn_R_(BatchwiseMultiplicativeDropout_updateGradInput)(
    THCTensor *input_features, THCTensor *d_input_features,
    THCTensor *d_output_features, THCTensor *noise, float alpha) {
  if (d_input_features != d_output_features)
    THCTensor_(resizeAs)(state, d_input_features, d_output_features);
  auto nActive = input_features->size[0];
  auto nPlanes = input_features->size[1];
  // Each macro returns if its NTX divides nPlanes; NTX=1 always matches.
  SPARSECONVNET_FOO(32, 32)
  SPARSECONVNET_FOO(24, 32)
  SPARSECONVNET_FOO(16, 64)
  SPARSECONVNET_FOO(12, 64)
  SPARSECONVNET_FOO(8, 64)
  SPARSECONVNET_FOO(4, 64)
  SPARSECONVNET_FOO(1, 64)
}
#undef SPARSECONVNET_FOO
#endif
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef GPU_BATCHWISEMULTIPLICATIVEDROPOUT_H
#define GPU_BATCHWISEMULTIPLICATIVEDROPOUT_H
// Forward dropout kernel: out[row][plane] = in * noise[plane], with the
// factor alpha applied additionally where the input is <= 0 (leaky-ReLU
// style). Threads grid-stride over both planes (x) and rows (y); the noise
// value for each plane column is cached in shared memory.
template <typename T, uInt NTX, uInt NTY>
__global__ void BatchwiseMultiplicativeDropout_fp(T *input_features,
                                                  T *output_features, T *noise,
                                                  uInt nActive, uInt nPlanes,
                                                  uInt input_stride,
                                                  uInt output_stride, T alpha) {
  __shared__ T nz[NTX];
  for (uInt plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
       plane += gridDim.x * NTX) {
    if (threadIdx.y == 0)
      nz[threadIdx.x] = noise[plane];
    __syncthreads();
    for (uInt row = threadIdx.y + blockIdx.y * NTY; row < nActive;
         row += gridDim.y * NTY) {
      uInt i = row * input_stride + plane;
      uInt o = row * output_stride + plane;
      output_features[o] = input_features[i] * nz[threadIdx.x] *
                           ((input_features[i] > 0) ? 1 : alpha);
    }
    __syncthreads();
  }
}
// Backward dropout kernel: dIn = dOut * noise[plane], with alpha applied
// where the stored forward input was <= 0 — the exact derivative of the
// forward kernel above. Same grid-stride layout and shared-memory caching.
template <typename T, uInt NTX, uInt NTY>
__global__ void
BatchwiseMultiplicativeDropout_bp(T *input_features, T *d_input_features,
                                  T *d_output_features, T *noise, uInt nActive,
                                  uInt nPlanes, uInt input_stride,
                                  uInt output_stride, T alpha) {
  __shared__ T nz[NTX];
  for (uInt plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
       plane += gridDim.x * NTX) {
    if (threadIdx.y == 0)
      nz[threadIdx.x] = noise[plane];
    __syncthreads();
    for (uInt row = threadIdx.y + blockIdx.y * NTY; row < nActive;
         row += gridDim.y * NTY) {
      uInt i = row * input_stride + plane;
      uInt o = row * output_stride + plane;
      d_input_features[i] = d_output_features[o] * nz[threadIdx.x] *
                            ((input_features[i] > 0) ? 1 : alpha);
    }
    __syncthreads();
  }
}
#endif /* GPU_BATCHWISEMULTIPLICATIVEDROPOUT_H */
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE_
#define TH_GENERIC_FILE_ "generic/GPU/Convolution.cu"
#else
#include "Convolution.h"
#include "RuleBookIterator.h"
#include <algorithm>
#include <cstring>
// Sparse convolution forward. Resizes the output to (nActive, outPlanes),
// optionally pre-fills it with the bias, then accumulates one matrix product
// per filter offset via the rule book. Returns the multiply-add count so the
// caller can report FLOPs.
extern "C" double scn_DR_(Convolution_updateOutput)(
    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *filterSize,
    THLongTensor *filterStride, void **m, THCTensor *input_features,
    THCTensor *output_features, THCTensor *weight, THCTensor *bias,
    long filterVolume, THCITensor *rulesBuffer) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto _rules =
      _m.getRuleBook(inputSize, outputSize, filterSize, filterStride, true);
  uInt nActive = _m.getNActive(outputSize);
  THCTensor_(resize2d)(state, output_features, nActive, weight->size[1]);
  // With a bias the output is initialized by the bias kernel below instead
  // of being zeroed.
  if (not bias)
    THCTensor_(zero)(state, output_features);
  auto iF = THCTensor_(data)(state, input_features);
  auto oF = THCTensor_(data)(state, output_features);
  auto ip = input_features->size[1];
  auto op = output_features->size[1];
  auto w = THCTensor_(data)(state, weight);
  double flops = 0;
  if (bias) {
    auto b = THCTensor_(data)(state, bias);
    // Broadcast the bias into the output, 32 planes per launch.
    for (uInt i = 0; i < op; i += 32) {
      uInt blockDim = min(32L, op - i);
      uInt gridDim = min(4096, nActive);
      Convolution_fp_bias
          << <gridDim, blockDim, 0, THCState_getCurrentStream(state)>>>
          (oF + i, b + i, op, op, nActive);
    }
  }
  uInt c = ip * op;  // weights per filter offset
  // Per rule-book chunk: one GEMM-like pass; then advance the weight pointer
  // to the next filter offset and count the multiply-adds.
  RULEBOOKITERATOR(
      dConvolution_forward2<real>(iF, oF, w, rbB, nHotB, ip, ip, op, op,
                                  THCState_getCurrentStream(state));
      , w += c; flops += nHotB * c;)
  return flops;
}
// Sparse convolution backward: accumulates d_input and d_weight per filter
// offset via the rule book, then reduces d_bias from d_output if requested.
extern "C" void scn_DR_(Convolution_backward)(
    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *filterSize,
    THLongTensor *filterStride, void **m, THCTensor *input_features,
    THCTensor *d_input_features, THCTensor *d_output_features,
    THCTensor *weight, THCTensor *d_weight, THCTensor *d_bias,
    long filterVolume, THCITensor *rulesBuffer) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto _rules =
      _m.getRuleBook(inputSize, outputSize, filterSize, filterStride, true);
  uInt nActive = _m.getNActive(outputSize);
  THCTensor_(resizeAs)(state, d_input_features, input_features);
  THCTensor_(zero)(state, d_input_features);
  auto iF = THCTensor_(data)(state, input_features);
  auto diF = THCTensor_(data)(state, d_input_features);
  auto doF = THCTensor_(data)(state, d_output_features);
  auto ip = input_features->size[1];
  auto op = d_output_features->size[1];
  auto w = THCTensor_(data)(state, weight);
  auto dw = THCTensor_(data)(state, d_weight);
  uInt c = ip * op;  // weights per filter offset
  // Advance both the weight and weight-gradient pointers per rule chunk.
  RULEBOOKITERATOR(
      dConvolution_backward_dW2<real>(iF, diF, doF, w, dw, rbB, nHotB, ip, ip,
                                      op, op, THCState_getCurrentStream(state));
      , w += c; dw += c;)
  if (d_bias) {
    auto db = THCTensor_(data)(state, d_bias);
    Convolution_bp_bias(doF, db, op, op, nActive,
                        THCState_getCurrentStream(state));
  }
}
// "Valid" sparse convolution forward: same as Convolution_updateOutput but
// the output has the same active set as the input (nActive = input rows) and
// the rule book comes from getValidRuleBook. Returns the multiply-add count.
extern "C" double scn_DR_(ValidConvolution_updateOutput)(
    THLongTensor *inputSize, THLongTensor *filterSize, void **m,
    THCTensor *input_features, THCTensor *output_features, THCTensor *weight,
    THCTensor *bias, long filterVolume, THCITensor *rulesBuffer) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto _rules = _m.getValidRuleBook(inputSize, filterSize, true);
  uInt nActive = input_features->size[0];
  THCTensor_(resize2d)(state, output_features, nActive, weight->size[1]);
  // With a bias the output is initialized by the bias kernel below instead
  // of being zeroed.
  if (not bias)
    THCTensor_(zero)(state, output_features);
  auto iF = THCTensor_(data)(state, input_features);
  auto oF = THCTensor_(data)(state, output_features);
  auto ip = input_features->size[1];
  auto op = output_features->size[1];
  auto w = THCTensor_(data)(state, weight);
  double flops = 0;
  if (bias) {
    auto b = THCTensor_(data)(state, bias);
    // Broadcast the bias into the output, 32 planes per launch.
    for (uInt i = 0; i < op; i += 32) {
      uInt blockDim = min(32L, op - i);
      uInt gridDim = min(4096, nActive);
      Convolution_fp_bias
          << <gridDim, blockDim, 0, THCState_getCurrentStream(state)>>>
          (oF + i, b + i, op, op, nActive);
    }
  }
  uInt c = ip * op;  // weights per filter offset
  RULEBOOKITERATOR(
      dConvolution_forward2<real>(iF, oF, w, rbB, nHotB, ip, ip, op, op,
                                  THCState_getCurrentStream(state));
      , w += c; flops += nHotB * c;)
  return flops;
}
// Backward pass for the "valid" sparse convolution; mirrors
// Convolution_backward but uses the valid rule book.
extern "C" void scn_DR_(ValidConvolution_backward)(
    THLongTensor *inputSize, THLongTensor *filterSize, void **m,
    THCTensor *input_features, THCTensor *d_input_features,
    THCTensor *d_output_features, THCTensor *weight, THCTensor *d_weight,
    THCTensor *d_bias, long filterVolume, THCITensor *rulesBuffer) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto _rules = _m.getValidRuleBook(inputSize, filterSize, true);
  uInt nActive = input_features->size[0];
  THCTensor_(resizeAs)(state, d_input_features, input_features);
  THCTensor_(zero)(state, d_input_features);
  auto iF = THCTensor_(data)(state, input_features);
  auto diF = THCTensor_(data)(state, d_input_features);
  auto doF = THCTensor_(data)(state, d_output_features);
  auto ip = input_features->size[1];
  auto op = d_output_features->size[1];
  auto w = THCTensor_(data)(state, weight);
  auto dw = THCTensor_(data)(state, d_weight);
  uInt c = ip * op;  // weights per filter offset
  // Advance both the weight and weight-gradient pointers per rule chunk.
  RULEBOOKITERATOR(
      dConvolution_backward_dW2<real>(iF, diF, doF, w, dw, rbB, nHotB, ip, ip,
                                      op, op, THCState_getCurrentStream(state));
      , w += c; dw += c;)
  if (d_bias) {
    auto db = THCTensor_(data)(state, d_bias);
    Convolution_bp_bias(doF, db, op, op, nActive,
                        THCState_getCurrentStream(state));
  }
}
#endif
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment