Commit 2c4ed608 authored by Benjamin Thomas Graham

Goodbye THNN. Hello ATen!

parent 6d4475db
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CPU_Deconvolution_H
#define CPU_Deconvolution_H
#include "../SparseConvNet.h"
#include <cstring>
// buffer must have size >= nHot * (nIn+nOut)
// Deconvolution forward pass (CPU).
// input_nPlanes / output_nPlanes: number of feature planes actually used.
// input_nPLANES / output_nPLANES: row strides of the feature matrices (the
// used planes may live inside wider rows, hence separate strides).
// rules: one rule vector per filter offset; each vector stores pairs
// (output row, input row) -- see the r[2*row] / r[2*row+1] indexing below.
// gemm follows the BLAS convention (column-major matrices).
template <typename T>
void Deconvolution_ForwardPass(
    T *input_features, uInt input_nPlanes, uInt input_nPLANES,
    T *output_features, uInt output_nPlanes, uInt output_nPLANES, T *weight,
    T *bias, RuleBook &rules, uInt output_nActive,
    void (*gemm)(char transa, char transb, long m, long n, long k, T alpha,
                 T *a, long lda, T *b, long ldb, T beta, T *c, long ldc)) {
  if (bias != nullptr) // Set bias
    for (uInt row = 0; row < output_nActive; row++)
      for (uInt column = 0; column < output_nPlanes; column++)
        output_features[row * output_nPLANES + column] = bias[column];
  std::vector<T> input_buffer, output_buffer;
  for (auto &r : rules) {
    uInt nHot = r.size() / 2;
    input_buffer.resize(nHot * input_nPlanes);
    output_buffer.resize(nHot * output_nPlanes);
    // Gather the active input rows into a contiguous buffer.
    for (uInt row = 0; row < nHot; row++)
      std::memcpy(&input_buffer[row * input_nPlanes],
                  input_features + r[2 * row + 1] * input_nPLANES,
                  sizeof(T) * input_nPlanes);
    // Do GEMM (note: gemm assumes column-major matrices)
    // input_buffer is l*m (row-major)
    // weight is m*r (row-major)
    // output_buffer is l*r (row-major)
    // buffer * weights -> output_buffers
    (*gemm)('n', 'n',
            output_nPlanes, // r
            nHot,           // l
            input_nPlanes,  // m
            1,              // alpha
            weight, output_nPlanes,           // r
            &input_buffer[0], input_nPlanes,  // m
            0,                                // beta
            &output_buffer[0], output_nPlanes // r
            );
    // Advance to the weight matrix of the next filter offset.
    weight += input_nPlanes * output_nPlanes;
    // Scatter-add the GEMM result into the (strided) output feature rows.
    for (uInt row = 0; row < nHot; row++) {
      T *b = &output_buffer[row * output_nPlanes];
      T *o = &output_features[r[2 * row] * output_nPLANES];
      for (uInt k = 0; k < output_nPlanes; k++)
        o[k] += b[k];
    }
  }
}
// Deconvolution backward pass (CPU): accumulates d_input_features, d_weight
// and (optionally) d_bias.
// nPlanes = planes used, nPLANES = row stride, as in the forward pass.
// Rule pairs are (output row, input row).
template <typename T>
void Deconvolution_BackwardPass(
    T *input_features, T *d_input_features, uInt input_nPlanes,
    uInt input_nPLANES, T *d_output_features, uInt output_nPlanes,
    uInt output_nPLANES, T *weight, T *d_weight, T *d_bias, RuleBook &rules,
    uInt output_nActive,
    void (*gemm)(char transa, char transb, long m, long n, long k, T alpha,
                 T *a, long lda, T *b, long ldb, T beta, T *c, long ldc)) {
  if (d_bias) // d_bias accumulates the column sums of d_output_features
    for (uInt row = 0; row < output_nActive; row++)
      for (uInt i = 0; i < output_nPlanes; i++)
        d_bias[i] += d_output_features[row * output_nPLANES + i];
  std::vector<T> input_buffer, output_buffer;
  for (auto &r : rules) {
    uInt nHot = r.size() / 2;
    input_buffer.resize(nHot * input_nPlanes);
    output_buffer.resize(nHot * output_nPlanes);
    // Gather the active output-gradient rows into a contiguous buffer.
    for (uInt row = 0; row < nHot; row++)
      std::memcpy(&output_buffer[row * output_nPlanes],
                  &d_output_features[r[2 * row] * output_nPLANES],
                  sizeof(T) * output_nPlanes);
    // Do GEMM (note: gemm assumes column-major matrices)
    // output_buffer is l*m (row-major)
    // weights is r*m (row-major)
    // input_buffer is l*r (row-major)
    // output_buffer * T(weight) -> input_buffer
    (*gemm)('t', 'n',
            input_nPlanes,  // r
            nHot,           // l
            output_nPlanes, // m
            1,              // alpha
            weight, output_nPlanes,            // m
            &output_buffer[0], output_nPlanes, // m
            0,                                 // beta
            &input_buffer[0], input_nPlanes    // r
            );
    weight += input_nPlanes * output_nPlanes;
    // Scatter-add the input gradients back to their (strided) rows.
    for (uInt row = 0; row < nHot; row++) {
      T *b = &input_buffer[row * input_nPlanes];
      T *i = &d_input_features[r[2 * row + 1] * input_nPLANES];
      for (uInt k = 0; k < input_nPlanes; k++)
        i[k] += b[k];
    }
    // Re-use input_buffer: gather the forward input rows for the d_weight GEMM.
    for (uInt row = 0; row < nHot; row++)
      std::memcpy(&input_buffer[row * input_nPlanes],
                  input_features + r[2 * row + 1] * input_nPLANES,
                  sizeof(T) * input_nPlanes);
    // Do GEMM (note: gemm assumes column-major matrices)
    // input_buffer is m*l (row-major)
    // output_buffer is m*r (row-major)
    // d_weights is l*r (row-major)
    // T(input_buffer) * output_buffer -> d_weight (accumulated, beta = 1)
    (*gemm)('n', 't',
            output_nPlanes, // r
            input_nPlanes,  // l
            nHot,           // m
            1,              // alpha
            &output_buffer[0], output_nPlanes, // r
            &input_buffer[0], input_nPlanes,   // l
            1,                                 // beta
            d_weight, output_nPlanes           // r
            );
    d_weight += input_nPlanes * output_nPlanes;
  }
}
#endif /* CPU_Deconvolution_H */
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE_
#define TH_GENERIC_FILE_ "generic/CPU/IOLayers.cpp"
#else
#include "IOLayers.h"
// Build the input-layer rule book from (coordinates, features) and produce
// the initial sparse feature matrix (one row per active site).
// Metadata layout used here: rules[0][1] = maxActive (max input rows merged
// into one active site), rules[0][3] = nRows.
// mode == 0: one feature row per site already; just copy.
// Otherwise duplicate coordinates are merged; the `mode == 4` flag is
// forwarded as a boolean to InputLayer_ForwardPass (presumably "average
// instead of sum" -- confirm against InputLayer_ForwardPass).
extern "C" void scn_DR_(InputLayer_updateOutput)(
    void **m, THLongTensor *spatialSize, THLongTensor *input_coords,
    THTensor *input_features, THTensor *output_features, long batchSize,
    long mode) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  _m.inputLayer(spatialSize, input_coords, batchSize, mode);
  auto nPlanes = input_features->size[1];
  auto &rules = _m.inputLayerRuleBook;
  auto maxActive = rules[0][1];
  auto nRows = rules[0][3];
  if (mode == 0) {
    THTensor_(resizeAs)(output_features, input_features);
    THTensor_(copy)(output_features, input_features);
  } else {
    // One output row per active spatial location.
    THTensor_(resize2d)(output_features, *_m.inputNActive, nPlanes);
    THTensor_(zero)(output_features);
    InputLayer_ForwardPass<real>(THTensor_(data)(input_features),
                                 THTensor_(data)(output_features), nRows,
                                 maxActive, nPlanes, &rules[1][0], mode == 4);
  }
}
// Input-layer backward pass: scatter d_output back onto the original input
// rows using the rule book recorded by updateOutput.
// rules[0][0] = mode, rules[0][1] = maxActive, rules[0][2] = number of
// original input rows (resize target), rules[0][3] = nRows.
extern "C" void scn_DR_(InputLayer_updateGradInput)(void **m,
                                                    THTensor *d_input_features,
                                                    THTensor *d_output_features) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto &rules = _m.inputLayerRuleBook;
  auto nPlanes = d_output_features->size[1];
  auto mode = rules[0][0];
  auto maxActive = rules[0][1];
  auto nRows = rules[0][3];
  if (mode == 0) {
    // Forward was a plain copy, so the backward pass is too.
    THTensor_(resizeAs)(d_input_features, d_output_features);
    THTensor_(copy)(d_input_features, d_output_features);
  } else {
    THTensor_(resize2d)(d_input_features, rules[0][2], nPlanes);
    THTensor_(zero)(d_input_features);
    InputLayer_BackwardPass<real>(THTensor_(data)(d_input_features),
                                  THTensor_(data)(d_output_features), nRows,
                                  maxActive, nPlanes, &rules[1][0], mode == 4);
  }
}
// Output layer forward pass: maps active-site features back to one row per
// original input point. Reuses the input layer's rule book, running
// InputLayer_BackwardPass in the "forward" direction (averaging disabled).
extern "C" void scn_DR_(OutputLayer_updateOutput)(void **m,
                                                  THTensor *input_features,
                                                  THTensor *output_features) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto &rules = _m.inputLayerRuleBook;
  auto nPlanes = input_features->size[1];
  auto mode = rules[0][0];
  auto maxActive = rules[0][1];
  auto nRows = rules[0][3];
  if (mode == 0) {
    THTensor_(resizeAs)(output_features, input_features);
    THTensor_(copy)(output_features, input_features);
  } else {
    // rules[0][2] = number of original input rows.
    THTensor_(resize2d)(output_features, rules[0][2], nPlanes);
    THTensor_(zero)(output_features);
    InputLayer_BackwardPass<real>(THTensor_(data)(output_features),
                                  THTensor_(data)(input_features), nRows,
                                  maxActive, nPlanes, &rules[1][0], false);
  }
}
// Output layer backward pass: gathers d_output rows back onto the active
// sites (the mirror of updateOutput) via InputLayer_ForwardPass.
extern "C" void
scn_DR_(OutputLayer_updateGradInput)(void **m, THTensor *d_input_features,
                                     THTensor *d_output_features) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto &rules = _m.inputLayerRuleBook;
  auto nPlanes = d_output_features->size[1];
  auto mode = rules[0][0];
  auto maxActive = rules[0][1];
  auto nRows = rules[0][3];
  if (mode == 0) {
    THTensor_(resizeAs)(d_input_features, d_output_features);
    THTensor_(copy)(d_input_features, d_output_features);
  } else {
    THTensor_(resize2d)(d_input_features, nRows, nPlanes);
    THTensor_(zero)(d_input_features);
    InputLayer_ForwardPass<real>(THTensor_(data)(d_output_features),
                                 THTensor_(data)(d_input_features), nRows,
                                 maxActive, nPlanes, &rules[1][0], false);
  }
}
// Batch/length ("BL") input layer: like InputLayer_updateOutput, but the
// input is 3d (presumably batch x length x nPlanes -- size[2] is nPlanes).
// Metadata layout used here: rules[0][1] = maxActive, rules[0][4] = nRows.
extern "C" void scn_DR_(BLInputLayer_updateOutput)(
    void **m, THLongTensor *spatialSize, THLongTensor *input_coords,
    THTensor *input_features, THTensor *output_features, long mode) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  _m.blLayer(spatialSize, input_coords, mode);
  auto nPlanes = input_features->size[2];
  auto &rules = _m.blLayerRuleBook;
  auto maxActive = rules[0][1];
  auto nRows = rules[0][4];
  if (mode == 0) {
    // Copy, then flatten the 3d input into the 2d active-site matrix.
    THTensor_(resizeAs)(output_features, input_features);
    THTensor_(copy)(output_features, input_features);
    THTensor_(resize2d)(output_features, *_m.inputNActive, nPlanes);
  } else {
    THTensor_(resize2d)(output_features, *_m.inputNActive, nPlanes);
    THTensor_(zero)(output_features);
    InputLayer_ForwardPass<real>(THTensor_(data)(input_features),
                                 THTensor_(data)(output_features), nRows,
                                 maxActive, nPlanes, &rules[1][0], mode == 4);
  }
}
// BL input layer backward pass: scatter d_output back into the original 3d
// shape rules[0][2] x rules[0][3] x nPlanes recorded by updateOutput.
extern "C" void
scn_DR_(BLInputLayer_updateGradInput)(void **m, THTensor *d_input_features,
                                      THTensor *d_output_features) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto &rules = _m.blLayerRuleBook;
  auto nPlanes = d_output_features->size[1];
  auto mode = rules[0][0];
  auto maxActive = rules[0][1];
  auto nRows = rules[0][4];
  if (mode == 0) {
    // Forward was a copy; restore the 3d input shape.
    THTensor_(resizeAs)(d_input_features, d_output_features);
    THTensor_(copy)(d_input_features, d_output_features);
    THTensor_(resize3d)(d_input_features, rules[0][2], rules[0][3], nPlanes);
  } else {
    THTensor_(resize3d)(d_input_features, rules[0][2], rules[0][3], nPlanes);
    THTensor_(zero)(d_input_features);
    InputLayer_BackwardPass<real>(THTensor_(data)(d_input_features),
                                  THTensor_(data)(d_output_features), nRows,
                                  maxActive, nPlanes, &rules[1][0], mode == 4);
  }
}
// BL output layer forward pass: maps active-site features back to the 3d
// shape rules[0][2] x rules[0][3] x nPlanes, reusing the BL rule book via
// InputLayer_BackwardPass (averaging disabled).
extern "C" void scn_DR_(BLOutputLayer_updateOutput)(void **m,
                                                    THTensor *input_features,
                                                    THTensor *output_features) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto &rules = _m.blLayerRuleBook;
  auto nPlanes = input_features->size[1];
  auto mode = rules[0][0];
  auto maxActive = rules[0][1];
  auto nRows = rules[0][4];
  if (mode == 0) {
    THTensor_(resizeAs)(output_features, input_features);
    THTensor_(copy)(output_features, input_features);
    THTensor_(resize3d)(output_features, rules[0][2], rules[0][3], nPlanes);
  } else {
    THTensor_(resize3d)(output_features, rules[0][2], rules[0][3], nPlanes);
    THTensor_(zero)(output_features);
    InputLayer_BackwardPass<real>(THTensor_(data)(output_features),
                                  THTensor_(data)(input_features), nRows,
                                  maxActive, nPlanes, &rules[1][0], false);
  }
}
// BL output layer backward pass: gathers the 3d d_output (size[2] is
// nPlanes) back onto the 2d active-site matrix via InputLayer_ForwardPass.
extern "C" void
scn_DR_(BLOutputLayer_updateGradInput)(void **m, THTensor *d_input_features,
                                       THTensor *d_output_features) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto &rules = _m.blLayerRuleBook;
  auto nPlanes = d_output_features->size[2];
  auto mode = rules[0][0];
  auto maxActive = rules[0][1];
  auto nRows = rules[0][4];
  if (mode == 0) {
    // Forward was a copy; flatten back to the 2d active-site matrix.
    THTensor_(resizeAs)(d_input_features, d_output_features);
    THTensor_(copy)(d_input_features, d_output_features);
    THTensor_(resize2d)(d_input_features, nRows, nPlanes);
  } else {
    THTensor_(resize2d)(d_input_features, nRows, nPlanes);
    THTensor_(zero)(d_input_features);
    InputLayer_ForwardPass<real>(THTensor_(data)(d_output_features),
                                 THTensor_(data)(d_input_features), nRows,
                                 maxActive, nPlanes, &rules[1][0], false);
  }
}
#endif
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/CPU/LeakyReLU.cpp"
#else
// Elementwise leaky ReLU: out = in when in > 0, otherwise alpha * in.
// Operates in place when input_features == output_features.
extern "C" void scn_R_(LeakyReLU_updateOutput)(THTensor *input_features,
                                               THTensor *output_features,
                                               float alpha) {
  if (input_features != output_features)
    THTensor_(resizeAs)(output_features, input_features);
  auto src = THTensor_(data)(input_features);
  auto dst = THTensor_(data)(output_features);
  auto count = THTensor_(nElement)(input_features);
  for (uInt idx = 0; idx < count; ++idx) {
    auto v = src[idx];
    dst[idx] = (v > 0) ? v : v * alpha;
  }
}
// Leaky ReLU backward pass: d_in = d_out where the forward input was
// positive, alpha * d_out otherwise.
// Operates in place when d_input_features == d_output_features.
extern "C" void scn_R_(LeakyReLU_updateGradInput)(THTensor *input_features,
                                                  THTensor *d_input_features,
                                                  THTensor *d_output_features,
                                                  float alpha) {
  if (d_input_features != d_output_features)
    THTensor_(resizeAs)(d_input_features, d_output_features);
  auto fwdIn = THTensor_(data)(input_features);
  auto gradIn = THTensor_(data)(d_input_features);
  auto gradOut = THTensor_(data)(d_output_features);
  auto count = THTensor_(nElement)(d_input_features);
  for (uInt idx = 0; idx < count; ++idx)
    gradIn[idx] = (fwdIn[idx] > 0) ? gradOut[idx] : gradOut[idx] * alpha;
}
#endif
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE_
#define TH_GENERIC_FILE_ "generic/CPU/MaxPooling.cpp"
#else
#include "MaxPooling.h"
// Sparse max-pooling forward pass. The first nFeaturesToDrop feature planes
// are skipped (iF is advanced past them); stride[0] lets the kernels walk
// rows of possibly-wider matrices.
extern "C" void scn_DR_(MaxPooling_updateOutput)(
    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *poolSize,
    THLongTensor *poolStride, void **m, THTensor *input_features,
    THTensor *output_features, long nFeaturesToDrop) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  uInt nPlanes = input_features->size[1] - nFeaturesToDrop;
  auto _rules =
      _m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  uInt nActive = _m.getNActive(outputSize);
  THTensor_(resize2d)(output_features, nActive,
                      input_features->size[1] - nFeaturesToDrop);
  THTensor_(zero)(output_features);
  auto iF = THTensor_(data)(input_features) + nFeaturesToDrop;
  auto oF = THTensor_(data)(output_features);
  // One rule vector per pooling-filter offset; pairs index (in, out) rows.
  for (auto &r : _rules) {
    uInt nHot = r.size() / 2;
    MaxPooling_ForwardPass<real>(iF, oF, nPlanes, input_features->stride[0],
                                 output_features->stride[0], &r[0], nHot);
  }
}
// Sparse max-pooling backward pass: routes each output gradient to the
// input row that won the forward max (the kernel compares iF/oF values).
extern "C" void scn_DR_(MaxPooling_updateGradInput)(
    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *poolSize,
    THLongTensor *poolStride, void **m, THTensor *input_features,
    THTensor *d_input_features, THTensor *output_features,
    THTensor *d_output_features, long nFeaturesToDrop) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  uInt nPlanes = input_features->size[1] - nFeaturesToDrop;
  auto _rules =
      _m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  // NOTE(review): nActive is unused here; kept in case getNActive has
  // required side effects -- confirm.
  uInt nActive = _m.getNActive(outputSize);
  THTensor_(resizeAs)(d_input_features, input_features);
  THTensor_(zero)(d_input_features);
  auto iF = THTensor_(data)(input_features);
  auto oF = THTensor_(data)(output_features);
  auto diF = THTensor_(data)(d_input_features);
  auto doF = THTensor_(data)(d_output_features);
  for (auto &r : _rules) {
    uInt nHot = r.size() / 2;
    MaxPooling_BackwardPass<real>(iF, diF, oF, doF, nPlanes,
                                  input_features->stride[0],
                                  output_features->stride[0], &r[0], nHot);
  }
}
// Max pooling with randomized strides: identical to MaxPooling_updateOutput
// except the rule book comes from getRandomizedStrideRuleBook.
extern "C" void scn_DR_(RandomizedStrideMaxPooling_updateOutput)(
    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *poolSize,
    THLongTensor *poolStride, void **m, THTensor *input_features,
    THTensor *output_features, long nFeaturesToDrop) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  uInt nPlanes = input_features->size[1] - nFeaturesToDrop;
  auto _rules =
      _m.getRandomizedStrideRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  uInt nActive = _m.getNActive(outputSize);
  THTensor_(resize2d)(output_features, nActive,
                      input_features->size[1] - nFeaturesToDrop);
  THTensor_(zero)(output_features);
  auto iF = THTensor_(data)(input_features) + nFeaturesToDrop;
  auto oF = THTensor_(data)(output_features);
  for (auto &r : _rules) {
    uInt nHot = r.size() / 2;
    MaxPooling_ForwardPass<real>(iF, oF, nPlanes, input_features->stride[0],
                                 output_features->stride[0], &r[0], nHot);
  }
}
// Backward pass for randomized-stride max pooling: identical to
// MaxPooling_updateGradInput except for the rule-book source.
extern "C" void scn_DR_(RandomizedStrideMaxPooling_updateGradInput)(
    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *poolSize,
    THLongTensor *poolStride, void **m, THTensor *input_features,
    THTensor *d_input_features, THTensor *output_features,
    THTensor *d_output_features, long nFeaturesToDrop) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  uInt nPlanes = input_features->size[1] - nFeaturesToDrop;
  auto _rules =
      _m.getRandomizedStrideRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  // NOTE(review): nActive is unused here; kept in case getNActive has
  // required side effects -- confirm.
  uInt nActive = _m.getNActive(outputSize);
  THTensor_(resizeAs)(d_input_features, input_features);
  THTensor_(zero)(d_input_features);
  auto iF = THTensor_(data)(input_features);
  auto oF = THTensor_(data)(output_features);
  auto diF = THTensor_(data)(d_input_features);
  auto doF = THTensor_(data)(d_output_features);
  for (auto &r : _rules) {
    uInt nHot = r.size() / 2;
    MaxPooling_BackwardPass<real>(iF, diF, oF, doF, nPlanes,
                                  input_features->stride[0],
                                  output_features->stride[0], &r[0], nHot);
  }
}
#endif
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/CPU/NetworkInNetwork.cpp"
#else
// 1x1 "network in network" convolution: output = input * weight (+ bias).
// weight_ is input_nPlanes x output_nPlanes (row-major).
// Returns nActive * input_nPlanes * output_nPlanes (apparently a
// multiply-count for the caller's accounting -- confirm at call sites).
extern "C" double
scn_R_(NetworkInNetwork_updateOutput)(THTensor *input_features_,
                                      THTensor *output_features_,
                                      THTensor *weight_, THTensor *bias_) {
  auto nActive = input_features_->size[0];
  auto input_nPlanes = weight_->size[0];
  auto output_nPlanes = weight_->size[1];
  THTensor_(resize2d)(output_features_, nActive, output_nPlanes);
  auto input_features = THTensor_(data)(input_features_);
  auto output_features = THTensor_(data)(output_features_);
  auto weight = THTensor_(data)(weight_);
  if (bias_ != nullptr) {
    // Set bias
    auto bias = THTensor_(data)(bias_);
    for (uInt row = 0; row < nActive; row++)
      for (uInt column = 0; column < output_nPlanes; column++)
        output_features[row * output_nPlanes + column] = bias[column];
    // Do GEMM (note: gemm assumes column-major matrices)
    // input_features is l*m (row-major)
    // weight is m*r (row-major)
    // output_features is l*r (row-major)
    // input_features * weight + bias -> output_features (beta = 1 keeps bias)
    THBlas_(gemm)('n', 'n',
                  output_nPlanes, // r
                  nActive,        // l
                  input_nPlanes,  // m
                  1,              // alpha
                  weight, output_nPlanes, // r
                  input_features,
                  input_nPlanes, // m
                  1,             // beta
                  output_features, output_nPlanes // r
                  );
  } else {
    // No bias: beta = 0 overwrites the output.
    THTensor_(zero)(output_features_);
    THBlas_(gemm)('n', 'n',
                  output_nPlanes, // r
                  nActive,        // l
                  input_nPlanes,  // m
                  1,              // alpha
                  weight, output_nPlanes,        // r
                  input_features, input_nPlanes, // m
                  0,                             // beta
                  output_features, output_nPlanes // r
                  );
  }
  return nActive * input_nPlanes * output_nPlanes;
}
// Backward pass of the 1x1 convolution w.r.t. the input:
// d_input = d_output * T(weight).
extern "C" void
scn_R_(NetworkInNetwork_updateGradInput)(THTensor *d_input_features_,
                                         THTensor *d_output_features_,
                                         THTensor *weight_) {
  auto nActive = d_output_features_->size[0];
  auto input_nPlanes = weight_->size[0];
  auto output_nPlanes = weight_->size[1];
  THTensor_(resize2d)(d_input_features_, nActive, input_nPlanes);
  THTensor_(zero)(d_input_features_);
  auto d_input_features = THTensor_(data)(d_input_features_);
  auto d_output_features = THTensor_(data)(d_output_features_);
  auto weight = THTensor_(data)(weight_);
  // Do GEMM (note: gemm assumes column-major matrices)
  // d_output_features is l*m (row-major)
  // weight is r*m (row-major)
  // d_input_features is l*r (row-major)
  // d_output_features * T(weight) -> d_input_features
  THBlas_(gemm)('t', 'n',
                input_nPlanes,  // r
                nActive,        // l
                output_nPlanes, // m
                1,              // alpha
                weight, output_nPlanes,            // m
                d_output_features, output_nPlanes, // m
                0,                                 // beta
                d_input_features, input_nPlanes    // r
                );
}
// Accumulate parameter gradients for the 1x1 convolution:
// d_weight += T(input) * d_output, and, when d_bias_ is given,
// d_bias += column sums of d_output.
extern "C" void scn_R_(NetworkInNetwork_accGradParameters)(
    THTensor *input_features_, THTensor *d_output_features_,
    THTensor *d_weight_, THTensor *d_bias_) {
  auto nActive = input_features_->size[0];
  auto input_nPlanes = d_weight_->size[0];
  auto output_nPlanes = d_weight_->size[1];
  auto input_features = THTensor_(data)(input_features_);
  auto d_output_features = THTensor_(data)(d_output_features_);
  auto d_weight = THTensor_(data)(d_weight_);
  // Do GEMM (note: gemm assumes column-major matrices)
  // input_features is l*m (row-major), d_output_features is l*r (row-major)
  // T(input_features) * d_output_features -> d_weight (beta = 1 accumulates)
  THBlas_(gemm)('n', 't',
                output_nPlanes, // r
                input_nPlanes,  // l
                nActive,        // m
                1,              // alpha
                d_output_features, output_nPlanes, // r
                input_features, input_nPlanes,     // l
                1,                                 // beta
                d_weight, output_nPlanes           // r
                );
  if (d_bias_) {
    auto d_bias = THTensor_(data)(d_bias_);
    for (uInt row = 0; row < nActive; row++)
      for (uInt i = 0; i < output_nPlanes; i++)
        d_bias[i] += d_output_features[row * output_nPlanes + i];
  }
}
#endif
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CPU_NetworkInNetwork_H
#define CPU_NetworkInNetwork_H
#include "../SparseConvNet.h"
#include "Convolution.h"
// buffer must have size >= output_nActive * filterVolume * input_nPlanes
template <typename T>
void NetworkInNetwork_ForwardPass(
T *input_features, uInt input_nPlanes, T *output_features,
uInt output_nPlanes, T *weight, T *bias, uInt output_nActive,
void (*gemm)(char transa, char transb, long m, long n, long k, T alpha,
T *a, long lda, T *b, long ldb, T beta, T *c, long ldc)) {
if (bias != nullptr) {
// Set bias
for (uInt row = 0; row < output_nActive; row++)
for (uInt column = 0; column < output_nPlanes; column++)
output_features[row * output_nPlanes + column] = bias[column];
// Do GEMM (note: gemm assumes column-major matrices)
// buffer is l*m (row-major)
// weight is r*m (row-major)
// output_features is l*r (row-major)
// buffer * T(weights) + bias -> output_features
(*gemm)('n', 'n',
output_nPlanes, // r
output_nActive, // l
input_nPlanes * filterVolume, // m
1, // alpha
weight, output_nPlanes, // r
buffer,
input_nPlanes * filterVolume, // m
1, // beta
output_features, output_nPlanes // r
);
} else {
(*gemm)('n', 'n',
output_nPlanes, // r
output_nActive, // l
input_nPlanes * filterVolume, // m
1, // alpha
weight, output_nPlanes, // r
buffer, input_nPlanes * filterVolume, // m
0, // beta
output_features, output_nPlanes // r
);
}
}
// Backward pass w.r.t. the input: d_buffer = d_output * T(weight), then
// scatter-add d_buffer rows into d_input via `rules`.
// rules maps each (output row, filter offset) slot to an input row; uInt_MAX
// marks an inactive slot. d_buffer must hold
// output_nActive * filterVolume * input_nPlanes elements.
template <typename T>
void NetworkInNetwork_BackwardPass(
    T *d_input_features, uInt input_nPlanes, T *d_output_features,
    uInt output_nPlanes, T *weight, uInt *rules, uInt filterVolume,
    uInt output_nActive, T *d_buffer,
    void (*gemm)(char transa, char transb, long m, long n, long k, T alpha,
                 T *a, long lda, T *b, long ldb, T beta, T *c, long ldc)) {
  // Do GEMM (note: gemm assumes column-major matrices)
  // d_output_features is l*m (row-major)
  // weights is m*r (row-major)
  // d_buffer is l*r (row-major)
  // d_output_features * weight -> d_buffer
  (*gemm)('t', 'n',
          input_nPlanes * filterVolume, // r
          output_nActive,               // l
          output_nPlanes,               // m
          1,                            // alpha
          weight, output_nPlanes,            // m
          d_output_features, output_nPlanes, // m
          0,                                 // beta
          d_buffer, input_nPlanes * filterVolume // r
          );
  // Use rules and d_buffer to accumulate gradient information into d_input
  for (uInt row = 0; row < output_nActive * filterVolume; row++) {
    auto r = rules[row];
    if (r != uInt_MAX) // 2^32-1
      for (uInt i = 0; i < input_nPlanes; i++)
        d_input_features[r * input_nPlanes + i] +=
            d_buffer[row * input_nPlanes + i];
  }
}
// Accumulate d_weight (and optionally d_bias). First gathers the active
// input rows into `buffer` according to `rules` (zero-filling inactive
// slots), then d_weight += T(buffer) * d_output.
// buffer must hold output_nActive * filterVolume * input_nPlanes elements.
template <typename T>
void NetworkInNetwork_GradWeights(
    T *input_features, uInt input_nPlanes, T *d_output_features,
    uInt output_nPlanes, T *d_weight, T *d_bias, uInt *rules, uInt filterVolume,
    uInt output_nActive, T *buffer,
    void (*gemm)(char transa, char transb, long m, long n, long k, T alpha,
                 T *a, long lda, T *b, long ldb, T beta, T *c, long ldc)) {
  // d_weight
  // Use input_features and rules to fill buffer
  for (uInt row = 0; row < output_nActive * filterVolume; row++) {
    if (rules[row] == uInt_MAX) { // 2^32-1 marks an inactive slot
      std::memset(buffer + row * input_nPlanes, 0, sizeof(T) * input_nPlanes);
    } else {
      std::memcpy(buffer + row * input_nPlanes,
                  input_features + rules[row] * input_nPlanes,
                  sizeof(T) * input_nPlanes);
    }
  }
  // Do GEMM (note: gemm assumes column-major matrices)
  // d_output_features is m*l (row-major)
  // buffer is m*r (row-major)
  // weights is l*r (row-major)
  // T(d_output_features) * buffer -> d_weight (beta = 1 accumulates)
  (*gemm)('n', 't',
          output_nPlanes,               // r
          input_nPlanes * filterVolume, // l
          output_nActive,               // m
          1,                            // alpha
          d_output_features, output_nPlanes,     // r
          buffer, input_nPlanes * filterVolume,  // l
          1,                                     // beta
          d_weight, output_nPlanes               // r
          );
  if (d_bias) // d_bias accumulates the column sums of d_output
    for (uInt row = 0; row < output_nActive; row++)
      for (uInt i = 0; i < output_nPlanes; i++)
        d_bias[i] += d_output_features[row * output_nPlanes + i];
}
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE_
#define TH_GENERIC_FILE_ "generic/CPU/SparseToDense.cpp"
#else
#include "SparseToDense.h"
// Convert sparse active-site features to a dense tensor of shape
// (batchSize, nPlanes, spatialSize...). Locations with no active site stay
// zero. The nDimension == 2 check skips empty inputs -- presumably an
// uninitialized feature tensor is not 2d; confirm.
extern "C" void scn_DR_(SparseToDense_updateOutput)(
    THLongTensor *inputSize, void **m, THTensor *input_features,
    THTensor *output_features, long nPlanes) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  {
    long sz[Dimension + 2];
    sz[0] = _m.grids.begin()->second.size(); // batch size
    sz[1] = nPlanes;
    std::memcpy(sz + 2, THLongTensor_data(inputSize), sizeof(long) * Dimension);
    THTensor_(resizeNd)(output_features, Dimension + 2, sz, NULL);
    THTensor_(zero)(output_features);
  }
  if (input_features->nDimension == 2) {
    auto _rules = _m.getSparseToDenseRuleBook(inputSize, true);
    uInt _nPlanes = input_features->size[1];
    auto iF = THTensor_(data)(input_features);
    auto oF = THTensor_(data)(output_features);
    long spatialVolume = THLongTensor_prodall(inputSize);
    // One rule vector per batch sample; advance oF one dense sample at a time.
    for (auto &r : _rules) {
      uInt nHot = r.size() / 2;
      SparseToDense_ForwardPass<real>(iF, oF, _nPlanes, spatialVolume, &r[0],
                                      nHot);
      oF += _nPlanes * spatialVolume;
    }
  }
}
// Backward pass of SparseToDense: gather gradients from the dense tensor
// back onto the sparse active-site rows.
extern "C" void scn_DR_(SparseToDense_updateGradInput)(
    THLongTensor *inputSize, void **m, THTensor *input_features,
    THTensor *d_input_features, THTensor *d_output_features) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  THTensor_(resizeAs)(d_input_features, input_features);
  THTensor_(zero)(d_input_features);
  if (input_features->nDimension == 2) {
    auto _rules = _m.getSparseToDenseRuleBook(inputSize, true);
    long spatialVolume = THLongTensor_prodall(inputSize);
    uInt _nPlanes = d_input_features->size[1];
    auto diF = THTensor_(data)(d_input_features);
    auto doF = THTensor_(data)(d_output_features);
    // One rule vector per batch sample; advance doF one dense sample at a time.
    for (auto &r : _rules) {
      uInt nHot = r.size() / 2;
      SparseToDense_BackwardPass<real>(diF, doF, _nPlanes, spatialVolume, &r[0],
                                       nHot);
      doF += _nPlanes * spatialVolume;
    }
  }
}
#endif
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
// X-macro driver: instantiate the "generic" source file named by
// TH_GENERIC_FILE_ once per spatial Dimension 1..10.
// THGenerateFloatTypes.h #includes TH_GENERIC_FILE for float and double and
// #undefs TH_GENERIC_FILE afterwards, so TH_GENERIC_FILE must be re-#defined
// before every include.
#ifndef TH_GENERIC_FILE_
#error "Define TH_GENERIC_FILE_ before including THGenerateDimFloatTypes.h"
#endif
// NOTE(review): this first #define is redundant (repeated immediately below
// before the first include); harmless, as the redefinition is identical.
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#define Dimension 1
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateFloatTypes.h"
#undef Dimension
#define Dimension 2
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateFloatTypes.h"
#undef Dimension
#define Dimension 3
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateFloatTypes.h"
#undef Dimension
#define Dimension 4
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateFloatTypes.h"
#undef Dimension
#define Dimension 5
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateFloatTypes.h"
#undef Dimension
#define Dimension 6
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateFloatTypes.h"
#undef Dimension
#define Dimension 7
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateFloatTypes.h"
#undef Dimension
#define Dimension 8
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateFloatTypes.h"
#undef Dimension
#define Dimension 9
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateFloatTypes.h"
#undef Dimension
#define Dimension 10
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateFloatTypes.h"
#undef Dimension
#undef TH_GENERIC_FILE_
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
// Compile the current TH_GENERIC_FILE twice: once with real = float and once
// with real = double (accreal is double in both cases). TH_GENERIC_FILE is
// #undef'd at the end, so callers must re-#define it before each include.
#ifndef TH_GENERIC_FILE
#error "You must define TH_GENERIC_FILE before including THGenerateFloatTypes.h"
#endif
#define real float
#define accreal double
#define Real Float
#define TH_REAL_IS_FLOAT
// #line resets reported file/line to the generic file for diagnostics.
#line 1 TH_GENERIC_FILE
#include TH_GENERIC_FILE
#undef accreal
#undef real
#undef Real
#undef TH_REAL_IS_FLOAT
#define real double
#define accreal double
#define Real Double
#define TH_REAL_IS_DOUBLE
#line 1 TH_GENERIC_FILE
#include TH_GENERIC_FILE
#undef accreal
#undef real
#undef Real
#undef TH_REAL_IS_DOUBLE
#undef TH_GENERIC_FILE
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE_
#define TH_GENERIC_FILE_ "generic/CPU/UnPooling.cpp"
#else
#include "UnPooling.h"
// Sparse unpooling forward pass. Note the swapped (outputSize, inputSize)
// argument order to getRuleBook: unpooling apparently reuses the pooling
// rule book in the reverse direction -- confirm against getRuleBook.
// The first nFeaturesToDrop input planes are skipped.
extern "C" void scn_DR_(UnPooling_updateOutput)(
    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *poolSize,
    THLongTensor *poolStride, void **m, THTensor *input_features,
    THTensor *output_features, long nFeaturesToDrop) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  uInt nPlanes = input_features->size[1] - nFeaturesToDrop;
  auto _rules =
      _m.getRuleBook(outputSize, inputSize, poolSize, poolStride, true);
  uInt nActive = _m.getNActive(outputSize);
  THTensor_(resize2d)(output_features, nActive,
                      input_features->size[1] - nFeaturesToDrop);
  THTensor_(zero)(output_features);
  auto iF = THTensor_(data)(input_features) + nFeaturesToDrop;
  auto oF = THTensor_(data)(output_features);
  for (auto &r : _rules) {
    uInt nHot = r.size() / 2;
    UnPooling_ForwardPass<real>(iF, oF, nPlanes, input_features->size[1],
                                output_features->size[1], &r[0], nHot,
                                _rules.size());
  }
}
// Sparse unpooling backward pass: accumulate d_output back onto the input
// rows using the same (reversed) rule book as the forward pass.
extern "C" void scn_DR_(UnPooling_updateGradInput)(
    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *poolSize,
    THLongTensor *poolStride, void **m, THTensor *input_features,
    THTensor *d_input_features, THTensor *d_output_features,
    long nFeaturesToDrop) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  uInt nPlanes = input_features->size[1] - nFeaturesToDrop;
  auto _rules =
      _m.getRuleBook(outputSize, inputSize, poolSize, poolStride, true);
  // NOTE(review): nActive is unused here; kept in case getNActive has
  // required side effects -- confirm.
  uInt nActive = _m.getNActive(outputSize);
  THTensor_(resizeAs)(d_input_features, input_features);
  THTensor_(zero)(d_input_features);
  auto diF = THTensor_(data)(d_input_features) + nFeaturesToDrop;
  auto doF = THTensor_(data)(d_output_features);
  for (auto &r : _rules) {
    uInt nHot = r.size() / 2;
    UnPooling_BackwardPass<real>(diF, doF, nPlanes, input_features->size[1],
                                 d_output_features->size[1], &r[0], nHot,
                                 _rules.size());
  }
}
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE_
#define TH_GENERIC_FILE_ "generic/GPU/ActivePooling.cu"
#else
#include "ActivePooling.h"
// GPU active pooling: pool every active site of each sample into a single
// feature row (sum, or mean when `average`). Rules live on the host; a
// 2^22-int device tensor is used as a staging buffer, and rule rows are
// copied host->device in batches of up to 32768 samples (each sample uses
// maxActive + 1 ints of the buffer).
extern "C" void scn_DR_(ActivePooling_updateOutput)(
    THLongTensor *inputSize, void **m, THCTensor *input_features,
    THCTensor *output_features, bool average) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  uInt nPlanes = input_features->size[1];
  auto _rules = _m.getActivePoolingRuleBook(inputSize);
  uInt batchSize = _rules[1][0];
  uInt maxActive = _rules[1][1];
  THCTensor_(resize2d)(state, output_features, batchSize, nPlanes);
  THCTensor_(zero)(state, output_features);
  auto rulesBuffer = THCITensor_(new)(state);
  if (THCITensor_(nElement)(state, rulesBuffer) < 1 << 22)
    THCITensor_(resize1d)(state, rulesBuffer, 1 << 22);
  uInt *rb = (uInt *)THCITensor_(data)(state, rulesBuffer);
  uInt rowBatchSize = std::min((uInt)32768, (1 << 22) / (maxActive + 1));
  THAssert(rowBatchSize > 0);
  auto iF = THCTensor_(data)(state, input_features);
  auto oF = THCTensor_(data)(state, output_features);
  for (uInt o = 0; o < batchSize; o += rowBatchSize) {
    uInt batchSize_ = std::min(rowBatchSize, (uInt)(batchSize - o));
    cudaMemcpy(rb, &_rules[0][o * (maxActive + 1)],
               sizeof(uInt) * (maxActive + 1) * batchSize_,
               cudaMemcpyHostToDevice);
    ActivePooling_ForwardPass<real>(iF, oF + o * nPlanes, batchSize_, maxActive,
                                    nPlanes, rb, average);
  }
  THCITensor_(free)(state, rulesBuffer);
}
// GPU active pooling backward pass: distribute each sample's pooled
// gradient row back over its active sites, using the same host->device
// rule staging scheme as updateOutput.
extern "C" void scn_DR_(ActivePooling_updateGradInput)(
    THLongTensor *inputSize, void **m, THCTensor *input_features,
    THCTensor *d_input_features, THCTensor *d_output_features,
    bool average) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  uInt nPlanes = input_features->size[1];
  auto _rules = _m.getActivePoolingRuleBook(inputSize);
  uInt batchSize = _rules[1][0];
  uInt maxActive = _rules[1][1];
  THCTensor_(resizeAs)(state, d_input_features, input_features);
  THCTensor_(zero)(state, d_input_features);
  auto rulesBuffer = THCITensor_(new)(state);
  if (THCITensor_(nElement)(state, rulesBuffer) < 1 << 22)
    THCITensor_(resize1d)(state, rulesBuffer, 1 << 22);
  uInt *rb = (uInt *)THCITensor_(data)(state, rulesBuffer);
  uInt rowBatchSize = std::min((uInt)32768, (1 << 22) / (maxActive + 1));
  THAssert(rowBatchSize > 0);
  auto diF = THCTensor_(data)(state, d_input_features);
  auto doF = THCTensor_(data)(state, d_output_features);
  for (uInt o = 0; o < batchSize; o += rowBatchSize) {
    uInt batchSize_ = std::min(rowBatchSize, (uInt)(batchSize - o));
    cudaMemcpy(rb, &_rules[0][o * (maxActive + 1)],
               sizeof(uInt) * (maxActive + 1) * batchSize_,
               cudaMemcpyHostToDevice);
    ActivePooling_BackwardPass<real>(diF, doF + o * nPlanes, batchSize_,
                                     maxActive, nPlanes, rb, average);
  }
  THCITensor_(free)(state, rulesBuffer);
}
#endif
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/GPU/AffineReluTrivialConvolution.cu"
#else
#include "AffineReluTrivialConvolution.h"
#include <algorithm>
extern "C" void scn_R_(AffineReluTrivialConvolution_updateOutput)(
THCTensor *input_features, THCTensor *output_features,
THCTensor *affineWeight, THCTensor *affineBias, THCTensor *convWeight) {
THCTensor_(resize2d)(state, output_features, input_features->size[0],
convWeight->size[1]);
dAffineReluTrivialConvolution_forward<real>(
THCTensor_(data)(state, input_features),
THCTensor_(data)(state, output_features),
THCTensor_(data)(state, affineWeight),
THCTensor_(data)(state, affineBias), THCTensor_(data)(state, convWeight),
convWeight->size[0], input_features->stride[0], convWeight->size[1],
output_features->size[1], input_features->size[0]);
}
extern "C" void scn_R_(AffineReluTrivialConvolution_backward)(
THCTensor *input_features, THCTensor *d_input_features,
THCTensor *d_output_features, THCTensor *affineWeight,
THCTensor *d_affineWeight, THCTensor *affineBias, THCTensor *d_affineBias,
THCTensor *convWeight, THCTensor *d_convWeight, bool additiveGrad) {
THCTensor_(resizeAs)(state, d_input_features, input_features);
dAffineReluTrivialConvolution_backward_dW<real>(
THCTensor_(data)(state, input_features),
THCTensor_(data)(state, d_input_features),
THCTensor_(data)(state, d_output_features),
THCTensor_(data)(state, affineWeight),
THCTensor_(data)(state, d_affineWeight),
THCTensor_(data)(state, affineBias),
THCTensor_(data)(state, d_affineBias),
THCTensor_(data)(state, convWeight),
THCTensor_(data)(state, d_convWeight), convWeight->size[0],
input_features->stride[0], convWeight->size[1],
d_output_features->stride[0], input_features->size[0], additiveGrad);
}
#endif
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE_
#define TH_GENERIC_FILE_ "generic/GPU/AveragePooling.cu"
#else
#include "AveragePooling.h"
#include "RuleBookIterator.h"
extern "C" void scn_DR_(AveragePooling_updateOutput)(
THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *poolSize,
THLongTensor *poolStride, void **m, THCTensor *input_features,
THCTensor *output_features, long nFeaturesToDrop) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
uInt nPlanes = input_features->size[1] - nFeaturesToDrop;
auto _rules =
_m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
uInt nActive = _m.getNActive(outputSize);
THCTensor_(resize2d)(state, output_features, nActive,
input_features->size[1] - nFeaturesToDrop);
THCTensor_(zero)(state, output_features);
auto iF = THCTensor_(data)(state, input_features) + nFeaturesToDrop;
auto oF = THCTensor_(data)(state, output_features);
RULEBOOKITERATOR(AveragePooling_ForwardPass<real>(
THCState_getCurrentStream(state), iF, oF, nPlanes,
input_features->size[1], output_features->size[1], rbB,
nHotB, _rules.size());
, )
}
extern "C" void scn_DR_(AveragePooling_updateGradInput)(
THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *poolSize,
THLongTensor *poolStride, void **m, THCTensor *input_features,
THCTensor *d_input_features, THCTensor *d_output_features,
long nFeaturesToDrop) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
uInt nPlanes = input_features->size[1] - nFeaturesToDrop;
auto _rules =
_m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
uInt nActive = _m.getNActive(outputSize);
THCTensor_(resizeAs)(state, d_input_features, input_features);
THCTensor_(zero)(state, d_input_features);
auto diF = THCTensor_(data)(state, d_input_features) + nFeaturesToDrop;
auto doF = THCTensor_(data)(state, d_output_features);
RULEBOOKITERATOR(AveragePooling_BackwardPass<real>(
THCState_getCurrentStream(state), diF, doF, nPlanes,
input_features->size[1], d_output_features->size[1], rbB,
nHotB, _rules.size());
, )
}
#endif
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/GPU/BatchNormalization.cu"
#else
#include "BatchNormalization.h"
// Forward-pass dispatch helper: expands to `if (nPlanes % N == 0) { ... }` so
// a chain of BN_F_MACRO(16) else BN_F_MACRO(12) ... picks the largest vector
// width N (template parameter) that divides nPlanes. Expects nPlanes,
// input_stride, output_stride and nActive in the enclosing scope.
#define BN_F_MACRO(N)                                                          \
  if (nPlanes % N == 0) {                                                      \
    BatchNormalization_ForwardPass<real, N, 64>(                               \
        THCTensor_(data)(state, input_features),                               \
        THCTensor_(data)(state, output_features), nPlanes, input_stride,       \
        output_stride, nActive, THCTensor_(data)(state, saveMean),             \
        THCTensor_(data)(state, saveInvStd),                                   \
        THCTensor_(data)(state, runningMean),                                  \
        THCTensor_(data)(state, runningVar),                                   \
        weight ? THCTensor_(data)(state, weight) : 0,                          \
        bias ? THCTensor_(data)(state, bias) : 0, eps, momentum, train,        \
        leakiness);                                                            \
  }
extern "C" void scn_R_(BatchNormalization_updateOutput)(
THCTensor *input_features, THCTensor *output_features, THCTensor *saveMean,
THCTensor *saveInvStd, THCTensor *runningMean, THCTensor *runningVar,
THCTensor *weight, THCTensor *bias, real eps, real momentum, bool train,
real leakiness) {
THCTensor_(resizeAs)(state, output_features, input_features);
if (input_features->nDimension == 2) {
auto nActive = input_features->size[0];
auto nPlanes = input_features->size[1];
auto input_stride = input_features->stride[0];
auto output_stride = output_features->stride[0];
BN_F_MACRO(16)
else BN_F_MACRO(12) else BN_F_MACRO(8) else BN_F_MACRO(4) else BN_F_MACRO(1)
}
}
extern "C" void scn_R_(BatchNormalizationInTensor_updateOutput)(
THCTensor *input_features, THCTensor *output_features, THCTensor *saveMean,
THCTensor *saveInvStd, THCTensor *runningMean, THCTensor *runningVar,
THCTensor *weight, THCTensor *bias, real eps, real momentum, bool train,
real leakiness) {
if (input_features->nDimension == 2) {
auto nActive = input_features->size[0];
auto nPlanes = input_features->size[1];
auto input_stride = input_features->stride[0];
auto output_stride = output_features->stride[0];
BN_F_MACRO(16)
else BN_F_MACRO(12) else BN_F_MACRO(8) else BN_F_MACRO(4) else BN_F_MACRO(1)
}
}
#undef BN_F_MACRO
// Backward-pass analogue of BN_F_MACRO: dispatches
// BatchNormalization_BackwardPass on the largest vector width N dividing
// nPlanes. d_weight/d_bias may be null (0 is passed to the kernel).
#define BN_B_MACRO(N)                                                          \
  if (nPlanes % N == 0) {                                                      \
    BatchNormalization_BackwardPass<real, N, 64>(                              \
        THCTensor_(data)(state, input_features),                               \
        THCTensor_(data)(state, d_input_features),                             \
        THCTensor_(data)(state, output_features),                              \
        THCTensor_(data)(state, d_output_features), nPlanes, input_stride,     \
        output_stride, nActive, THCTensor_(data)(state, saveMean),             \
        THCTensor_(data)(state, saveInvStd),                                   \
        THCTensor_(data)(state, runningMean),                                  \
        THCTensor_(data)(state, runningVar),                                   \
        weight ? THCTensor_(data)(state, weight) : 0,                          \
        bias ? THCTensor_(data)(state, bias) : 0,                              \
        d_weight ? THCTensor_(data)(state, d_weight) : 0,                      \
        d_bias ? THCTensor_(data)(state, d_bias) : 0, leakiness);              \
  }
extern "C" void scn_R_(BatchNormalization_backward)(
THCTensor *input_features, THCTensor *d_input_features,
THCTensor *output_features, THCTensor *d_output_features,
THCTensor *saveMean, THCTensor *saveInvStd, THCTensor *runningMean,
THCTensor *runningVar, THCTensor *weight, THCTensor *bias,
THCTensor *d_weight, THCTensor *d_bias, real leakiness) {
THCTensor_(resizeAs)(state, d_input_features, d_output_features);
if (input_features->nDimension == 2) {
auto nActive = input_features->size[0];
auto nPlanes = input_features->size[1];
auto input_stride = input_features->stride[0];
auto output_stride = output_features->stride[0];
BN_B_MACRO(16)
else BN_B_MACRO(12) else BN_B_MACRO(8) else BN_B_MACRO(4) else BN_B_MACRO(1)
}
}
#endif
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE_
#define TH_GENERIC_FILE_ "generic/GPU/Convolution.cu"
#else
#include "Convolution.h"
#include "RuleBookIterator.h"
#include <algorithm>
#include <cstring>
extern "C" double scn_DR_(Convolution_updateOutput)(
THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *filterSize,
THLongTensor *filterStride, void **m, THCTensor *input_features,
THCTensor *output_features, THCTensor *weight, THCTensor *bias,
long filterVolume) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto _rules =
_m.getRuleBook(inputSize, outputSize, filterSize, filterStride, true);
uInt nActive = _m.getNActive(outputSize);
THCTensor_(resize2d)(state, output_features, nActive, weight->size[1]);
if (not bias)
THCTensor_(zero)(state, output_features);
double flops = 0;
if (nActive) {
auto iF = THCTensor_(data)(state, input_features);
auto oF = THCTensor_(data)(state, output_features);
auto ip = input_features->size[1];
auto op = output_features->size[1];
auto w = THCTensor_(data)(state, weight);
if (bias) {
auto b = THCTensor_(data)(state, bias);
for (uInt i = 0; i < op; i += 32) {
uInt blockDim = min(32L, op - i);
uInt gridDim = min(4096, nActive);
Convolution_fp_bias<<<gridDim, blockDim, 0,
THCState_getCurrentStream(state)>>>(
oF + i, b + i, op, op, nActive);
}
}
uInt c = ip * op;
RULEBOOKITERATOR(
dConvolution_forward2<real>(iF, oF, w, rbB, nHotB, ip, ip, op, op,
THCState_getCurrentStream(state));
, w += c; flops += nHotB * c;)
}
return flops;
}
extern "C" void scn_DR_(Convolution_backward)(
THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *filterSize,
THLongTensor *filterStride, void **m, THCTensor *input_features,
THCTensor *d_input_features, THCTensor *d_output_features,
THCTensor *weight, THCTensor *d_weight, THCTensor *d_bias,
long filterVolume) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto _rules =
_m.getRuleBook(inputSize, outputSize, filterSize, filterStride, true);
uInt nActive = _m.getNActive(outputSize);
THCTensor_(resizeAs)(state, d_input_features, input_features);
THCTensor_(zero)(state, d_input_features);
if (nActive) {
auto iF = THCTensor_(data)(state, input_features);
auto diF = THCTensor_(data)(state, d_input_features);
auto doF = THCTensor_(data)(state, d_output_features);
auto ip = input_features->size[1];
auto op = d_output_features->size[1];
auto w = THCTensor_(data)(state, weight);
auto dw = THCTensor_(data)(state, d_weight);
uInt c = ip * op;
RULEBOOKITERATOR(dConvolution_backward_dW2<real>(
iF, diF, doF, w, dw, rbB, nHotB, ip, ip, op, op,
THCState_getCurrentStream(state));
, w += c; dw += c;)
if (d_bias) {
auto db = THCTensor_(data)(state, d_bias);
Convolution_bp_bias(doF, db, op, op, nActive,
THCState_getCurrentStream(state));
}
}
}
extern "C" double scn_DR_(SubmanifoldConvolution_updateOutput)(
THLongTensor *inputSize, THLongTensor *filterSize, void **m,
THCTensor *input_features, THCTensor *output_features, THCTensor *weight,
THCTensor *bias, long filterVolume) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto _rules = _m.getSubmanifoldRuleBook(inputSize, filterSize, true);
uInt nActive = _m.getNActive(inputSize);
THCTensor_(resize2d)(state, output_features, nActive, weight->size[1]);
if (not bias)
THCTensor_(zero)(state, output_features);
double flops = 0;
if (nActive) {
auto iF = THCTensor_(data)(state, input_features);
auto oF = THCTensor_(data)(state, output_features);
auto ip = input_features->size[1];
auto op = output_features->size[1];
auto w = THCTensor_(data)(state, weight);
if (bias) {
auto b = THCTensor_(data)(state, bias);
for (uInt i = 0; i < op; i += 32) {
uInt blockDim = min(32L, op - i);
uInt gridDim = min(4096, nActive);
Convolution_fp_bias<<<gridDim, blockDim, 0,
THCState_getCurrentStream(state)>>>(
oF + i, b + i, op, op, nActive);
}
}
uInt c = ip * op;
RULEBOOKITERATOR(
dConvolution_forward2<real>(iF, oF, w, rbB, nHotB, ip, ip, op, op,
THCState_getCurrentStream(state));
, w += c; flops += nHotB * c;)
}
return flops;
}
extern "C" void scn_DR_(SubmanifoldConvolution_backward)(
THLongTensor *inputSize, THLongTensor *filterSize, void **m,
THCTensor *input_features, THCTensor *d_input_features,
THCTensor *d_output_features, THCTensor *weight, THCTensor *d_weight,
THCTensor *d_bias, long filterVolume) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto _rules = _m.getSubmanifoldRuleBook(inputSize, filterSize, true);
uInt nActive = _m.getNActive(inputSize);
THCTensor_(resizeAs)(state, d_input_features, input_features);
THCTensor_(zero)(state, d_input_features);
if (nActive) {
auto iF = THCTensor_(data)(state, input_features);
auto diF = THCTensor_(data)(state, d_input_features);
auto doF = THCTensor_(data)(state, d_output_features);
auto ip = input_features->size[1];
auto op = d_output_features->size[1];
auto w = THCTensor_(data)(state, weight);
auto dw = THCTensor_(data)(state, d_weight);
uInt c = ip * op;
RULEBOOKITERATOR(dConvolution_backward_dW2<real>(
iF, diF, doF, w, dw, rbB, nHotB, ip, ip, op, op,
THCState_getCurrentStream(state));
, w += c; dw += c;)
if (d_bias) {
auto db = THCTensor_(data)(state, d_bias);
Convolution_bp_bias(doF, db, op, op, nActive,
THCState_getCurrentStream(state));
}
}
}
extern "C" double scn_DR_(FullConvolution_updateOutput)(
THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *filterSize,
THLongTensor *filterStride, void **mIn, void **mOut,
THCTensor *input_features, THCTensor *output_features, THCTensor *weight,
THCTensor *bias, long filterVolume, THCITensor *rulesBuffer) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, mIn)
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, mOut)
auto _rules = _mIn.getFullConvolutionRuleBook(
inputSize, outputSize, filterSize, filterStride, _mOut);
uInt nActive = _mOut.getNActive(outputSize);
THCTensor_(resize2d)(state, output_features, nActive, weight->size[1]);
if (not bias)
THCTensor_(zero)(state, output_features);
double flops = 0;
if (nActive) {
auto iF = THCTensor_(data)(state, input_features);
auto oF = THCTensor_(data)(state, output_features);
auto ip = input_features->size[1];
auto op = output_features->size[1];
auto w = THCTensor_(data)(state, weight);
if (bias) {
auto b = THCTensor_(data)(state, bias);
for (uInt i = 0; i < op; i += 32) {
uInt blockDim = min(32L, op - i);
uInt gridDim = min(4096, nActive);
Convolution_fp_bias<<<gridDim, blockDim, 0,
THCState_getCurrentStream(state)>>>(
oF + i, b + i, op, op, nActive);
}
}
uInt c = ip * op;
RULEBOOKITERATOR(
dConvolution_forward2<real>(iF, oF, w, rbB, nHotB, ip, ip, op, op,
THCState_getCurrentStream(state));
, w += c; flops += nHotB * c;)
}
return flops;
}
extern "C" void scn_DR_(FullConvolution_backward)(
THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *filterSize,
THLongTensor *filterStride, void **mIn, void **mOut,
THCTensor *input_features, THCTensor *d_input_features,
THCTensor *d_output_features, THCTensor *weight, THCTensor *d_weight,
THCTensor *d_bias, long filterVolume) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, mIn)
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, mOut)
auto _rules = _mIn.getFullConvolutionRuleBook(
inputSize, outputSize, filterSize, filterStride, _mOut);
uInt nActive = _mOut.getNActive(outputSize);
THCTensor_(resizeAs)(state, d_input_features, input_features);
THCTensor_(zero)(state, d_input_features);
if (nActive) {
auto iF = THCTensor_(data)(state, input_features);
auto diF = THCTensor_(data)(state, d_input_features);
auto doF = THCTensor_(data)(state, d_output_features);
auto ip = input_features->size[1];
auto op = d_output_features->size[1];
auto w = THCTensor_(data)(state, weight);
auto dw = THCTensor_(data)(state, d_weight);
uInt c = ip * op;
RULEBOOKITERATOR(dConvolution_backward_dW2<real>(
iF, diF, doF, w, dw, rbB, nHotB, ip, ip, op, op,
THCState_getCurrentStream(state));
, w += c; dw += c;)
if (d_bias) {
auto db = THCTensor_(data)(state, d_bias);
Convolution_bp_bias(doF, db, op, op, nActive,
THCState_getCurrentStream(state));
}
}
}
extern "C" double scn_DR_(RandomizedStrideConvolution_updateOutput)(
THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *filterSize,
THLongTensor *filterStride,
void **m, THCTensor *input_features, THCTensor *output_features,
THCTensor *weight, THCTensor *bias, long filterVolume,
THCITensor *rulesBuffer) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto _rules =
_m.getRandomizedStrideRuleBook(inputSize, outputSize, filterSize, filterStride, true);
uInt nActive = _m.getNActive(outputSize);
THCTensor_(resize2d)(state, output_features, nActive, weight->size[1]);
if (not bias)
THCTensor_(zero)(state, output_features);
double flops = 0;
if (nActive) {
auto iF = THCTensor_(data)(state, input_features);
auto oF = THCTensor_(data)(state, output_features);
auto ip = input_features->size[1];
auto op = output_features->size[1];
auto w = THCTensor_(data)(state, weight);
if (bias) {
auto b = THCTensor_(data)(state, bias);
for (uInt i = 0; i < op; i += 32) {
uInt blockDim = min(32L, op - i);
uInt gridDim = min(4096, nActive);
Convolution_fp_bias<<<gridDim, blockDim, 0,
THCState_getCurrentStream(state)>>>(
oF + i, b + i, op, op, nActive);
}
}
uInt c = ip * op;
RULEBOOKITERATOR(
dConvolution_forward2<real>(iF, oF, w, rbB, nHotB, ip, ip, op, op,
THCState_getCurrentStream(state));
, w += c; flops += nHotB * c;)
}
return flops;
}
extern "C" void scn_DR_(RandomizedStrideConvolution_backward)(
THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *filterSize,
THLongTensor *filterStride,
void **m, THCTensor *input_features, THCTensor *d_input_features,
THCTensor *d_output_features, THCTensor *weight, THCTensor *d_weight,
THCTensor *d_bias, long filterVolume) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto _rules =
_m.getRandomizedStrideRuleBook(inputSize, outputSize, filterSize, filterStride, true);
uInt nActive = _m.getNActive(outputSize);
THCTensor_(resizeAs)(state, d_input_features, input_features);
THCTensor_(zero)(state, d_input_features);
if (nActive) {
auto iF = THCTensor_(data)(state, input_features);
auto diF = THCTensor_(data)(state, d_input_features);
auto doF = THCTensor_(data)(state, d_output_features);
auto ip = input_features->size[1];
auto op = d_output_features->size[1];
auto w = THCTensor_(data)(state, weight);
auto dw = THCTensor_(data)(state, d_weight);
uInt c = ip * op;
RULEBOOKITERATOR(dConvolution_backward_dW2<real>(
iF, diF, doF, w, dw, rbB, nHotB, ip, ip, op, op,
THCState_getCurrentStream(state));
, w += c; dw += c;)
if (d_bias) {
auto db = THCTensor_(data)(state, d_bias);
Convolution_bp_bias(doF, db, op, op, nActive,
THCState_getCurrentStream(state));
}
}
}
#endif
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE_
#define TH_GENERIC_FILE_ "generic/GPU/Deconvolution.cu"
#else
#include "Convolution.h"
#include "Deconvolution.h"
#include <algorithm>
extern "C" double scn_DR_(Deconvolution_updateOutput)(
THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *filterSize,
THLongTensor *filterStride, void **m, THCTensor *input_features,
THCTensor *output_features, THCTensor *weight, THCTensor *bias,
long filterVolume) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto _rules =
_m.getRuleBook(outputSize, inputSize, filterSize, filterStride, true);
uInt nActive = _m.getNActive(outputSize);
THCTensor_(resize2d)(state, output_features, nActive, weight->size[1]);
if (not bias)
THCTensor_(zero)(state, output_features);
auto iF = THCTensor_(data)(state, input_features);
auto oF = THCTensor_(data)(state, output_features);
auto ip = input_features->size[1];
auto op = output_features->size[1];
auto w = THCTensor_(data)(state, weight);
double flops = 0;
if (bias) {
auto b = THCTensor_(data)(state, bias);
for (uInt i = 0; i < op; i += 32) {
uInt blockDim = min(32L, op - i);
uInt gridDim = min(4096, nActive);
Convolution_fp_bias
<< <gridDim, blockDim, 0, THCState_getCurrentStream(state)>>>
(oF + i, b + i, op, op, nActive);
}
}
uInt c = ip * op;
RULEBOOKITERATOR(
dDeconvolution_forward2<real>(iF, oF, w, rbB, nHotB, ip, ip, op, op,
THCState_getCurrentStream(state));
, w += c; flops += nHotB * c;)
return flops;
}
extern "C" void scn_DR_(Deconvolution_backward)(
THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *filterSize,
THLongTensor *filterStride, void **m, THCTensor *input_features,
THCTensor *d_input_features, THCTensor *d_output_features,
THCTensor *weight, THCTensor *d_weight, THCTensor *d_bias,
long filterVolume) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto _rules =
_m.getRuleBook(outputSize, inputSize, filterSize, filterStride, true);
uInt nActive = _m.getNActive(outputSize);
THCTensor_(resizeAs)(state, d_input_features, input_features);
THCTensor_(zero)(state, d_input_features);
auto iF = THCTensor_(data)(state, input_features);
auto diF = THCTensor_(data)(state, d_input_features);
auto doF = THCTensor_(data)(state, d_output_features);
auto ip = input_features->size[1];
auto op = d_output_features->size[1];
auto w = THCTensor_(data)(state, weight);
auto dw = THCTensor_(data)(state, d_weight);
uInt c = ip * op;
RULEBOOKITERATOR(dDeconvolution_backward_dW2<real>(
iF, diF, doF, w, dw, rbB, nHotB, ip, ip, op, op,
THCState_getCurrentStream(state));
, w += c; dw += c;)
if (d_bias) {
auto db = THCTensor_(data)(state, d_bias);
Convolution_bp_bias(doF, db, op, op, nActive,
THCState_getCurrentStream(state));
}
}
#endif
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE_
#define TH_GENERIC_FILE_ "generic/GPU/IOLayers.cu"
#else
#include "IOLayers.h"
extern "C" void scn_DR_(InputLayer_updateOutput)(
void **m, THLongTensor *spatialSize, THLongTensor *input_coords,
THCTensor *input_features, THCTensor *output_features, long batchSize,
long mode) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
_m.inputLayer(spatialSize, input_coords, batchSize, mode);
uInt nPlanes = input_features->size[1];
auto &rules = _m.inputLayerRuleBook;
uInt maxActive = rules[0][1];
uInt nRows = rules[0][3];
if (mode == 0) {
THCTensor_(resizeAs)(state, output_features, input_features);
THCTensor_(copy)(state, output_features, input_features);
} else {
THCTensor_(resize2d)(state, output_features, *_m.inputNActive, nPlanes);
THCTensor_(zero)(state, output_features);
auto rulesBuffer = THCITensor_(new)(state);
THCITensor_(resize1d)(state, rulesBuffer, sizeof(uInt) * rules[1].size());
auto iF = THCTensor_(data)(state, input_features);
auto oF = THCTensor_(data)(state, output_features);
auto rb = (uInt *)THCITensor_(data)(state, rulesBuffer);
cudaMemcpy(rb, &rules[1][0], sizeof(uInt) * rules[1].size(),
cudaMemcpyHostToDevice);
InputLayer_fp<real><<<std::min(nRows, 32768U), std::min(nPlanes, 32U), 0,
THCState_getCurrentStream(state)>>>(
iF, oF, nRows, maxActive, nPlanes, rb, mode == 4);
THCITensor_(free)(state, rulesBuffer);
}
}
extern "C" void
scn_DR_(InputLayer_updateGradInput)(void **m, THCTensor *d_input_features,
THCTensor *d_output_features) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto &rules = _m.inputLayerRuleBook;
uInt nPlanes = d_output_features->size[1];
auto mode = rules[0][0];
uInt maxActive = rules[0][1];
uInt nRows = rules[0][3];
if (mode == 0) {
THCTensor_(resizeAs)(state, d_input_features, d_output_features);
THCTensor_(copy)(state, d_input_features, d_output_features);
} else {
THCTensor_(resize2d)(state, d_input_features, rules[0][2], nPlanes);
THCTensor_(zero)(state, d_input_features);
auto rulesBuffer = THCITensor_(new)(state);
THCITensor_(resize1d)(state, rulesBuffer, sizeof(uInt) * rules[1].size());
auto diF = THCTensor_(data)(state, d_input_features);
auto doF = THCTensor_(data)(state, d_output_features);
auto rb = (uInt *)THCITensor_(data)(state, rulesBuffer);
cudaMemcpy(rb, &rules[1][0], sizeof(uInt) * rules[1].size(),
cudaMemcpyHostToDevice);
InputLayer_bp<real><<<std::min(nRows, 32768U), std::min(nPlanes, 32U), 0,
THCState_getCurrentStream(state)>>>(
diF, doF, nRows, maxActive, nPlanes, rb, mode == 4);
THCITensor_(free)(state, rulesBuffer);
}
}
extern "C" void scn_DR_(OutputLayer_updateOutput)(void **m,
THCTensor *input_features,
THCTensor *output_features) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto &rules = _m.inputLayerRuleBook;
uInt nPlanes = input_features->size[1];
auto mode = rules[0][0];
auto maxActive = rules[0][1];
auto nRows = rules[0][3];
if (mode == 0) {
THCTensor_(resizeAs)(state, output_features, input_features);
THCTensor_(copy)(state, output_features, input_features);
} else {
THCTensor_(resize2d)(state, output_features, rules[0][2], nPlanes);
THCTensor_(zero)(state, output_features);
auto rulesBuffer = THCITensor_(new)(state);
THCITensor_(resize1d)(state, rulesBuffer, sizeof(uInt) * rules[1].size());
auto iF = THCTensor_(data)(state, input_features);
auto oF = THCTensor_(data)(state, output_features);
auto rb = (uInt *)THCITensor_(data)(state, rulesBuffer);
cudaMemcpy(rb, &rules[1][0], sizeof(uInt) * rules[1].size(),
cudaMemcpyHostToDevice);
InputLayer_bp<real><<<std::min(nRows, 32768U), std::min(nPlanes, 32U), 0,
THCState_getCurrentStream(state)>>>(
oF, iF, nRows, maxActive, nPlanes, rb, false);
THCITensor_(free)(state, rulesBuffer);
}
}
extern "C" void
scn_DR_(OutputLayer_updateGradInput)(void **m, THCTensor *d_input_features,
THCTensor *d_output_features) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto &rules = _m.inputLayerRuleBook;
uInt nPlanes = d_output_features->size[1];
auto mode = rules[0][0];
auto maxActive = rules[0][1];
auto nRows = rules[0][3];
if (mode == 0) {
THCTensor_(resizeAs)(state, d_input_features, d_output_features);
THCTensor_(copy)(state, d_input_features, d_output_features);
} else {
THCTensor_(resize2d)(state, d_input_features, nRows, nPlanes);
THCTensor_(zero)(state, d_input_features);
auto rulesBuffer = THCITensor_(new)(state);
THCITensor_(resize1d)(state, rulesBuffer, sizeof(uInt) * rules[1].size());
auto diF = THCTensor_(data)(state, d_input_features);
auto doF = THCTensor_(data)(state, d_output_features);
auto rb = (uInt *)THCITensor_(data)(state, rulesBuffer);
cudaMemcpy(rb, &rules[1][0], sizeof(uInt) * rules[1].size(),
cudaMemcpyHostToDevice);
InputLayer_fp<real><<<std::min(nRows, 32768U), std::min(nPlanes, 32U), 0,
THCState_getCurrentStream(state)>>>(
doF, diF, nRows, maxActive, nPlanes, rb, false);
THCITensor_(free)(state, rulesBuffer);
}
}
extern "C" void scn_DR_(BLInputLayer_updateOutput)(
void **m, THLongTensor *spatialSize, THLongTensor *input_coords,
THCTensor *input_features, THCTensor *output_features, long mode) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
_m.blLayer(spatialSize, input_coords, mode);
uInt nPlanes = input_features->size[2];
THCTensor_(resize2d)(state, output_features, *_m.inputNActive, nPlanes);
THCTensor_(zero)(state, output_features);
auto &rules = _m.blLayerRuleBook;
uInt maxActive = rules[0][1];
uInt nRows = rules[0][4];
if (mode == 0) {
THCTensor_(resizeAs)(state, output_features, input_features);
THCTensor_(copy)(state, output_features, input_features);
THCTensor_(resize2d)(state, output_features, *_m.inputNActive, nPlanes);
} else {
auto rulesBuffer = THCITensor_(new)(state);
THCITensor_(resize1d)(state, rulesBuffer, sizeof(uInt) * rules[1].size());
auto iF = THCTensor_(data)(state, input_features);
auto oF = THCTensor_(data)(state, output_features);
auto rb = (uInt *)THCITensor_(data)(state, rulesBuffer);
cudaMemcpy(rb, &rules[1][0], sizeof(uInt) * rules[1].size(),
cudaMemcpyHostToDevice);
InputLayer_fp<real><<<std::min(nRows, 32768U), std::min(nPlanes, 32U), 0,
THCState_getCurrentStream(state)>>>(
iF, oF, nRows, maxActive, nPlanes, rb, mode == 4);
THCITensor_(free)(state, rulesBuffer);
}
}
extern "C" void
scn_DR_(BLInputLayer_updateGradInput)(void **m, THCTensor *d_input_features,
THCTensor *d_output_features) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto &rules = _m.blLayerRuleBook;
uInt nPlanes = d_output_features->size[1];
uInt mode = rules[0][0];
uInt maxActive = rules[0][1];
uInt nRows = rules[0][4];
if (mode == 0) {
THCTensor_(resizeAs)(state, d_input_features, d_output_features);
THCTensor_(copy)(state, d_input_features, d_output_features);
THCTensor_(resize3d)(state, d_input_features, rules[0][2], rules[0][3],
nPlanes);
} else {
THCTensor_(resize3d)(state, d_input_features, rules[0][2], rules[0][3],
nPlanes);
THCTensor_(zero)(state, d_input_features);
auto rulesBuffer = THCITensor_(new)(state);
THCITensor_(resize1d)(state, rulesBuffer, sizeof(uInt) * rules[1].size());
auto diF = THCTensor_(data)(state, d_input_features);
auto doF = THCTensor_(data)(state, d_output_features);
auto rb = (uInt *)THCITensor_(data)(state, rulesBuffer);
cudaMemcpy(rb, &rules[1][0], sizeof(uInt) * rules[1].size(),
cudaMemcpyHostToDevice);
InputLayer_bp<real><<<std::min(nRows, 32768U), std::min(nPlanes, 32U), 0,
THCState_getCurrentStream(state)>>>(
diF, doF, nRows, maxActive, nPlanes, rb, mode == 4);
THCITensor_(free)(state, rulesBuffer);
}
}
extern "C" void scn_DR_(BLOutputLayer_updateOutput)(void **m,
THCTensor *input_features,
THCTensor *output_features) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto &rules = _m.blLayerRuleBook;
uInt nPlanes = input_features->size[1];
auto mode = rules[0][0];
uInt maxActive = rules[0][1];
uInt nRows = rules[0][4];
if (mode == 0) {
THCTensor_(resizeAs)(state, output_features, input_features);
THCTensor_(copy)(state, output_features, input_features);
THCTensor_(resize3d)(state, output_features, rules[0][2], rules[0][3],
nPlanes);
} else {
THCTensor_(resize3d)(state, output_features, rules[0][2], rules[0][3],
nPlanes);
THCTensor_(zero)(state, output_features);
auto rulesBuffer = THCITensor_(new)(state);
THCITensor_(resize1d)(state, rulesBuffer, sizeof(uInt) * rules[1].size());
auto iF = THCTensor_(data)(state, input_features);
auto oF = THCTensor_(data)(state, output_features);
auto rb = (uInt *)THCITensor_(data)(state, rulesBuffer);
cudaMemcpy(rb, &rules[1][0], sizeof(uInt) * rules[1].size(),
cudaMemcpyHostToDevice);
InputLayer_bp<real><<<std::min(nRows, 32768U), std::min(nPlanes, 32U), 0,
THCState_getCurrentStream(state)>>>(
oF, iF, nRows, maxActive, nPlanes, rb, false);
THCITensor_(free)(state, rulesBuffer);
}
}
// Backward pass of the "BL" output layer: gathers gradients from the dense
// (b x l x nPlanes) d_output_features back into the flat (nRows x nPlanes)
// sparse gradient matrix, mirroring updateOutput above.
extern "C" void
scn_DR_(BLOutputLayer_updateGradInput)(void **m, THCTensor *d_input_features,
                                       THCTensor *d_output_features) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto &rules = _m.blLayerRuleBook;
  uInt nPlanes = d_output_features->size[2];
  uInt mode = rules[0][0];
  uInt maxActive = rules[0][1];
  uInt nRows = rules[0][4];
  if (mode == 0) {
    // Mode 0: dense layout — copy gradients and flatten back to 2-D.
    THCTensor_(resizeAs)(state, d_input_features, d_output_features);
    THCTensor_(copy)(state, d_input_features, d_output_features);
    THCTensor_(resize2d)(state, d_input_features, nRows, nPlanes);
  } else {
    THCTensor_(resize2d)(state, d_input_features, nRows, nPlanes);
    THCTensor_(zero)(state, d_input_features);
    // Device-side staging buffer for the rules.
    // NOTE(review): as in updateOutput, sizeof(uInt) * size is passed as an
    // ELEMENT count to resize1d — likely over-allocation; verify.
    auto rulesBuffer = THCITensor_(new)(state);
    THCITensor_(resize1d)(state, rulesBuffer, sizeof(uInt) * rules[1].size());
    auto diF = THCTensor_(data)(state, d_input_features);
    auto doF = THCTensor_(data)(state, d_output_features);
    auto rb = (uInt *)THCITensor_(data)(state, rulesBuffer);
    cudaMemcpy(rb, &rules[1][0], sizeof(uInt) * rules[1].size(),
               cudaMemcpyHostToDevice);
    // Backward of the output layer = forward (gather) of the input layer.
    InputLayer_fp<real><<<std::min(nRows, 32768U), std::min(nPlanes, 32U), 0,
                          THCState_getCurrentStream(state)>>>(
        doF, diF, nRows, maxActive, nPlanes, rb, false);
    THCITensor_(free)(state, rulesBuffer);
  }
}
#endif
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/GPU/LeakyReLU.cu"
#else
#include "LeakyReLU.h"
// Elementwise LeakyReLU forward pass on the GPU. Works in place when
// input_features == output_features; otherwise output is resized to match.
extern "C" void scn_R_(LeakyReLU_updateOutput)(THCTensor *input_features,
                                               THCTensor *output_features,
                                               float alpha) {
  // Only allocate separate storage in the out-of-place case.
  if (output_features != input_features)
    THCTensor_(resizeAs)(state, output_features, input_features);
  auto nElements = THCTensor_(nElement)(state, input_features);
  auto inPtr = THCTensor_(data)(state, input_features);
  auto outPtr = THCTensor_(data)(state, output_features);
  LeakyReLU_fp<real><<<16, 1024, 0, THCState_getCurrentStream(state)>>>(
      inPtr, outPtr, nElements, alpha);
}
// Elementwise LeakyReLU backward pass: computes d_input from d_output using
// the saved forward input to pick between slope 1 and slope alpha. Works in
// place when d_input_features == d_output_features.
extern "C" void scn_R_(LeakyReLU_updateGradInput)(THCTensor *input_features,
                                                  THCTensor *d_input_features,
                                                  THCTensor *d_output_features,
                                                  float alpha) {
  // Only allocate separate storage in the out-of-place case.
  if (d_output_features != d_input_features)
    THCTensor_(resizeAs)(state, d_input_features, d_output_features);
  auto nElements = THCTensor_(nElement)(state, d_input_features);
  auto inPtr = THCTensor_(data)(state, input_features);
  auto dInPtr = THCTensor_(data)(state, d_input_features);
  auto dOutPtr = THCTensor_(data)(state, d_output_features);
  LeakyReLU_bp<real><<<16, 1024, 0, THCState_getCurrentStream(state)>>>(
      inPtr, dInPtr, dOutPtr, nElements, alpha);
}
#endif
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE_
#define TH_GENERIC_FILE_ "generic/GPU/MaxPooling.cu"
#else
#include "MaxPooling.h"
#include "RuleBookIterator.h"
// Sparse max-pooling forward pass. Builds (or fetches) the pooling rule book
// from the metadata, sizes the output to (nActive x nPlanes), and runs the
// CUDA forward kernel chunk-by-chunk via RULEBOOKITERATOR.
// The first nFeaturesToDrop feature planes of each input row are skipped
// (note the pointer offset on iF; row stride stays input_features->size[1]).
extern "C" void scn_DR_(MaxPooling_updateOutput)(
    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *poolSize,
    THLongTensor *poolStride, void **m, THCTensor *input_features,
    THCTensor *output_features, long nFeaturesToDrop) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  uInt nPlanes = input_features->size[1] - nFeaturesToDrop;
  auto _rules =
      _m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  uInt nActive = _m.getNActive(outputSize);
  // Zero-fill so output sites with no pooling rule read as zero.
  THCTensor_(resize2d)(state, output_features, nActive, nPlanes);
  THCTensor_(zero)(state, output_features);
  auto iF = THCTensor_(data)(state, input_features) + nFeaturesToDrop;
  auto oF = THCTensor_(data)(state, output_features);
  // rbB / nHotB are supplied by the RULEBOOKITERATOR macro expansion.
  RULEBOOKITERATOR(
      MaxPooling_ForwardPass<real>(THCState_getCurrentStream(state), iF, oF,
                                   nPlanes, input_features->size[1],
                                   output_features->size[1], rbB, nHotB);
      , )
}
// Sparse max-pooling backward pass. Routes each output gradient back to the
// input element that won the max (the kernel compares input against the saved
// forward output to find it). d_input is zeroed first so non-winning inputs
// and dropped planes get zero gradient.
extern "C" void scn_DR_(MaxPooling_updateGradInput)(
    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *poolSize,
    THLongTensor *poolStride, void **m, THCTensor *input_features,
    THCTensor *d_input_features, THCTensor *output_features,
    THCTensor *d_output_features, long nFeaturesToDrop) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  uInt nPlanes = input_features->size[1] - nFeaturesToDrop;
  auto _rules =
      _m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  // getNActive may update cached metadata; result itself is unused here.
  uInt nActive = _m.getNActive(outputSize);
  THCTensor_(resizeAs)(state, d_input_features, input_features);
  THCTensor_(zero)(state, d_input_features);
  auto iF = THCTensor_(data)(state, input_features);
  auto oF = THCTensor_(data)(state, output_features);
  auto diF = THCTensor_(data)(state, d_input_features);
  auto doF = THCTensor_(data)(state, d_output_features);
  // rbB / nHotB are supplied by the RULEBOOKITERATOR macro expansion.
  RULEBOOKITERATOR(
      MaxPooling_BackwardPass<real>(THCState_getCurrentStream(state), iF, diF,
                                    oF, doF, nPlanes, input_features->size[1],
                                    d_output_features->size[1], rbB, nHotB);
      , )
}
// Forward pass of sparse max pooling with randomized strides. Identical to
// MaxPooling_updateOutput except the rule book comes from
// getRandomizedStrideRuleBook. The first nFeaturesToDrop feature planes of
// each input row are skipped via the pointer offset; the row stride remains
// input_features->size[1].
extern "C" void scn_DR_(RandomizedStrideMaxPooling_updateOutput)(
    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *poolSize,
    THLongTensor *poolStride, void **m, THCTensor *input_features,
    THCTensor *output_features, long nFeaturesToDrop) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto _rules = _m.getRandomizedStrideRuleBook(inputSize, outputSize, poolSize,
                                               poolStride, true);
  uInt nActive = _m.getNActive(outputSize);
  uInt nPlanes = input_features->size[1] - nFeaturesToDrop;
  // One output row per active output site; zero-fill so sites without a
  // pooling rule stay defined.
  THCTensor_(resize2d)(state, output_features, nActive, nPlanes);
  THCTensor_(zero)(state, output_features);
  auto inPtr = THCTensor_(data)(state, input_features) + nFeaturesToDrop;
  auto outPtr = THCTensor_(data)(state, output_features);
  // rbB / nHotB come from the RULEBOOKITERATOR macro expansion.
  RULEBOOKITERATOR(
      MaxPooling_ForwardPass<real>(THCState_getCurrentStream(state), inPtr,
                                   outPtr, nPlanes, input_features->size[1],
                                   output_features->size[1], rbB, nHotB);
      , )
}
// Backward pass of sparse max pooling with randomized strides. Identical to
// MaxPooling_updateGradInput except the rule book comes from
// getRandomizedStrideRuleBook.
extern "C" void scn_DR_(RandomizedStrideMaxPooling_updateGradInput)(
    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *poolSize,
    THLongTensor *poolStride, void **m, THCTensor *input_features,
    THCTensor *d_input_features, THCTensor *output_features,
    THCTensor *d_output_features, long nFeaturesToDrop) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto _rules = _m.getRandomizedStrideRuleBook(inputSize, outputSize, poolSize,
                                               poolStride, true);
  // getNActive may update cached metadata; its value is unused below.
  uInt nActive = _m.getNActive(outputSize);
  uInt nPlanes = input_features->size[1] - nFeaturesToDrop;
  // Input gradient matches the input's full shape; zero it so inputs that did
  // not win any max (and dropped planes) get zero gradient.
  THCTensor_(resizeAs)(state, d_input_features, input_features);
  THCTensor_(zero)(state, d_input_features);
  auto inPtr = THCTensor_(data)(state, input_features);
  auto outPtr = THCTensor_(data)(state, output_features);
  auto dInPtr = THCTensor_(data)(state, d_input_features);
  auto dOutPtr = THCTensor_(data)(state, d_output_features);
  // rbB / nHotB come from the RULEBOOKITERATOR macro expansion.
  RULEBOOKITERATOR(
      MaxPooling_BackwardPass<real>(THCState_getCurrentStream(state), inPtr,
                                    dInPtr, outPtr, dOutPtr, nPlanes,
                                    input_features->size[1],
                                    d_output_features->size[1], rbB, nHotB);
      , )
}
#endif
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/GPU/NetworkInNetwork.cu"
#else
#include "Convolution.h"
#include <algorithm>
// 1x1 ("network in network") convolution forward pass:
//   output = input * weight (+ bias, broadcast over rows).
// input_features_: (nActive x input_nPlanes) row-major,
// weight_: (input_nPlanes x output_nPlanes) row-major,
// bias_: (output_nPlanes) or nullptr.
// Returns the multiply count nActive * input_nPlanes * output_nPlanes for the
// caller's FLOP accounting.
extern "C" double
scn_R_(NetworkInNetwork_updateOutput)(THCTensor *input_features_,
                                      THCTensor *output_features_,
                                      THCTensor *weight_, THCTensor *bias_) {
  auto nActive = input_features_->size[0];
  auto input_nPlanes = weight_->size[0];
  auto output_nPlanes = weight_->size[1];
  THCTensor_(resize2d)(state, output_features_, nActive, output_nPlanes);
  auto input_features = THCTensor_(data)(state, input_features_);
  auto output_features = THCTensor_(data)(state, output_features_);
  auto weight = THCTensor_(data)(state, weight_);
  if (bias_ != nullptr) {
    // First write the bias into every output row, 32 planes per launch, then
    // accumulate the GEMM on top of it (beta = 1).
    auto bias = THCTensor_(data)(state, bias_);
    for (uInt i = 0; i < output_nPlanes; i += 32) {
      uInt blockDim = min(32L, output_nPlanes - i);
      uInt gridDim = min(4096L, nActive);
      Convolution_fp_bias<<<gridDim, blockDim, 0,
                            THCState_getCurrentStream(state)>>>(
          output_features + i, bias + i, output_nPlanes, output_nPlanes,
          nActive);
    }
    // Do GEMM (note: gemm assumes column-major matrices)
    // input_features is l*m (row-major)
    // weight is m*r (row-major)
    // output_features is l*r (row-major)
    // input_features * weight + bias -> output_features
    THBLAS_GEMM(state, 'n', 'n',
                output_nPlanes, // r
                nActive,        // l
                input_nPlanes,  // m
                1,              // alpha
                weight,
                output_nPlanes, // r
                input_features,
                input_nPlanes, // m
                1,             // beta
                output_features,
                output_nPlanes // r
                );
  } else {
    // No bias: with beta = 0 the GEMM overwrites every element of
    // output_features (BLAS does not read C when beta == 0, even if k == 0),
    // so the previous explicit zero-fill pass was redundant and is omitted.
    THBLAS_GEMM(state, 'n', 'n',
                output_nPlanes, // r
                nActive,        // l
                input_nPlanes,  // m
                1,              // alpha
                weight,
                output_nPlanes, // r
                input_features,
                input_nPlanes, // m
                0,             // beta
                output_features,
                output_nPlanes // r
                );
  }
  return nActive * input_nPlanes * output_nPlanes;
}
// 1x1 convolution backward pass w.r.t. the input:
//   d_input = d_output * T(weight).
// d_output_features_: (nActive x output_nPlanes) row-major,
// weight_: (input_nPlanes x output_nPlanes) row-major.
extern "C" void
scn_R_(NetworkInNetwork_updateGradInput)(THCTensor *d_input_features_,
                                         THCTensor *d_output_features_,
                                         THCTensor *weight_) {
  auto nActive = d_output_features_->size[0];
  auto input_nPlanes = weight_->size[0];
  auto output_nPlanes = weight_->size[1];
  THCTensor_(resize2d)(state, d_input_features_, nActive, input_nPlanes);
  THCTensor_(zero)(state, d_input_features_);
  auto dInPtr = THCTensor_(data)(state, d_input_features_);
  auto dOutPtr = THCTensor_(data)(state, d_output_features_);
  auto wPtr = THCTensor_(data)(state, weight_);
  // GEMM assumes column-major storage, so the row-major operands are handed
  // over transposed; the 't' on the weight realizes the T(weight) product.
  //   d_output_features : l*m (row-major)
  //   weight            : r*m (row-major, i.e. input_nPlanes x output_nPlanes)
  //   d_input_features  : l*r (row-major)
  // d_output_features * T(weight) -> d_input_features
  THBLAS_GEMM(state, 't', 'n',
              input_nPlanes,  // r
              nActive,        // l
              output_nPlanes, // m
              1,              // alpha
              wPtr, output_nPlanes,    // m
              dOutPtr, output_nPlanes, // m
              0,                       // beta
              dInPtr, input_nPlanes    // r
              );
}
// 1x1 convolution parameter-gradient accumulation:
//   d_weight += T(input) * d_output, and (optionally) d_bias += column sums of
//   d_output. Both accumulate (beta = 1 / bias kernel adds in place).
extern "C" void scn_R_(NetworkInNetwork_accGradParameters)(
    THCTensor *input_features_, THCTensor *d_output_features_,
    THCTensor *d_weight_, THCTensor *d_bias_) {
  auto nActive = input_features_->size[0];
  auto input_nPlanes = d_weight_->size[0];
  auto output_nPlanes = d_weight_->size[1];
  auto inPtr = THCTensor_(data)(state, input_features_);
  auto dOutPtr = THCTensor_(data)(state, d_output_features_);
  auto dWeightPtr = THCTensor_(data)(state, d_weight_);
  // GEMM assumes column-major storage; the 't' on the input realizes the
  // T(input) factor.
  //   input_features    : m*l (row-major)
  //   d_output_features : m*r (row-major)
  //   d_weight          : l*r (row-major)
  // T(input_features) * d_output_features -> d_weight (accumulated, beta = 1)
  THBLAS_GEMM(state, 'n', 't',
              output_nPlanes, // r
              input_nPlanes,  // l
              nActive,        // m
              1,              // alpha
              dOutPtr, output_nPlanes, // r
              inPtr, input_nPlanes,    // l
              1,                       // beta
              dWeightPtr, output_nPlanes // r
              );
  if (d_bias_) {
    // Reduce d_output over all active rows into the bias gradient.
    auto dBiasPtr = THCTensor_(data)(state, d_bias_);
    Convolution_bp_bias(dOutPtr, dBiasPtr, output_nPlanes, output_nPlanes,
                        nActive, THCState_getCurrentStream(state));
  }
}
#endif
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment