"googlemock/include/vscode:/vscode.git/clone" did not exist on "0599a7b8410dc5cfdb477900b280475ae775d7f9"
Commit 2c4ed608 authored by Benjamin Thomas Graham

Goodbye THNN. Hello ATen!

parent 6d4475db
@@ -6,7 +6,7 @@
#ifndef CPU_BATCHNORMALIZATION_H
#define CPU_BATCHNORMALIZATION_H
#include "../SparseConvNet.h"
#include <vector>
// in/output_stride is normally the same as nPlanes; allow other values to act
@@ -14,28 +14,28 @@
template <typename T>
void BatchNormalization_ForwardPass(T *input_features, T *output_features,
- uInt nPlanes, uInt input_stride,
- uInt output_stride, uInt nActive,
+ Int nPlanes, Int input_stride,
+ Int output_stride, Int nActive,
T *saveMean, T *saveInvStd, T *runningMean,
T *runningVar, T *weight, T *bias, T eps,
T momentum, bool train, T leakiness) {
if (train) {
std::memset(saveMean, 0, nPlanes * sizeof(T));
std::memset(saveInvStd, 0, nPlanes * sizeof(T));
- for (uInt row = 0, ci = 0; row < nActive;
+ for (Int row = 0, ci = 0; row < nActive;
row++, ci += input_stride - nPlanes) {
- for (uInt plane = 0; plane < nPlanes; plane++, ci++) {
+ for (Int plane = 0; plane < nPlanes; plane++, ci++) {
saveMean[plane] += input_features[ci];
}
}
- for (uInt plane = 0; plane < nPlanes; plane++) {
+ for (Int plane = 0; plane < nPlanes; plane++) {
saveMean[plane] /= nActive;
runningMean[plane] =
momentum * runningMean[plane] + (1 - momentum) * saveMean[plane];
}
- for (uInt row = 0, ci = 0; row < nActive;
+ for (Int row = 0, ci = 0; row < nActive;
row++, ci += input_stride - nPlanes) {
- for (uInt plane = 0; plane < nPlanes; plane++, ci++) {
+ for (Int plane = 0; plane < nPlanes; plane++, ci++) {
saveInvStd[plane] +=
(input_features[ci] - saveMean[plane]) *
(input_features[ci] - saveMean[plane]); // accumulate sum-squares
@@ -43,26 +43,26 @@ void BatchNormalization_ForwardPass(T *input_features, T *output_features,
// rooting
}
}
- for (uInt plane = 0; plane < nPlanes; plane++) {
+ for (Int plane = 0; plane < nPlanes; plane++) {
runningVar[plane] = momentum * runningVar[plane] +
(1 - momentum) * saveInvStd[plane] / (nActive - 1);
saveInvStd[plane] = powf(saveInvStd[plane] / nActive + eps, -0.5);
}
} else {
- for (uInt plane = 0; plane < nPlanes; plane++) {
+ for (Int plane = 0; plane < nPlanes; plane++) {
saveMean[plane] = runningMean[plane];
saveInvStd[plane] = powf(runningVar[plane] + eps, -0.5);
}
}
std::vector<T> w(nPlanes);
std::vector<T> b(nPlanes);
- for (uInt plane = 0; plane < nPlanes; plane++) {
+ for (Int plane = 0; plane < nPlanes; plane++) {
w[plane] = saveInvStd[plane] * (weight ? weight[plane] : 1);
b[plane] = -saveMean[plane] * w[plane] + (bias ? bias[plane] : 0);
}
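// w and b fold the whole batch norm into a single affine map per plane:
// with w = invStd * weight and b = -mean * w + bias,
// input * w + b == (input - mean) * invStd * weight + bias, so the loop
// below applies normalize, scale, shift and leaky ReLU in one pass.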
- for (uInt row = 0, ci = 0, co = 0; row < nActive;
+ for (Int row = 0, ci = 0, co = 0; row < nActive;
row++, ci += input_stride - nPlanes, co += output_stride - nPlanes) {
- for (uInt plane = 0; plane < nPlanes; plane++, ci++, co++) {
+ for (Int plane = 0; plane < nPlanes; plane++, ci++, co++) {
T out = input_features[ci] * w[plane] + b[plane];
out = (out > 0) ? out : (out * leakiness);
output_features[co] = out;
@@ -73,17 +73,17 @@ void BatchNormalization_ForwardPass(T *input_features, T *output_features,
template <typename T>
void BatchNormalization_BackwardPass(T *input_features, T *d_input_features,
T *output_features, T *d_output_features,
- uInt nPlanes, uInt input_stride,
- uInt output_stride, uInt nActive,
+ Int nPlanes, Int input_stride,
+ Int output_stride, Int nActive,
T *saveMean, T *saveInvStd, T *runningMean,
T *runningVar, T *weight, T *bias,
T *d_weight, T *d_bias, T leakiness) {
std::vector<T> gradMean(nPlanes);
std::vector<T> dotp(nPlanes);
std::vector<T> k(nPlanes);
- for (uInt row = 0, ci = 0, co = 0; row < nActive;
+ for (Int row = 0, ci = 0, co = 0; row < nActive;
row++, ci += input_stride - nPlanes, co += output_stride - nPlanes) {
- for (uInt plane = 0; plane < nPlanes; plane++, ci++, co++) {
+ for (Int plane = 0; plane < nPlanes; plane++, ci++, co++) {
T d = d_output_features[co];
d = (output_features[co] > 0) ? d : (d * leakiness);
d_output_features[co] = d;
@@ -91,15 +91,15 @@ void BatchNormalization_BackwardPass(T *input_features, T *d_input_features,
dotp[plane] += (input_features[ci] - saveMean[plane]) * d;
}
}
- for (uInt plane = 0; plane < nPlanes; plane++) {
+ for (Int plane = 0; plane < nPlanes; plane++) {
if (d_bias)
d_bias[plane] = gradMean[plane]; // sum of grads, really, until ...
gradMean[plane] /= nActive; // ...now
k[plane] = dotp[plane] * saveInvStd[plane] * saveInvStd[plane] / nActive;
}
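// This is the standard batch norm backward identity:
// dx = (dy - mean(dy) - xhat * dot(dy, xhat) / n) * invStd * weight,
// with k = dotp * invStd^2 / nActive supplying the projection term; the
// trailing invStd * weight factor sits in the lines elided from this hunk.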
- for (uInt row = 0, ci = 0, co = 0; row < nActive;
+ for (Int row = 0, ci = 0, co = 0; row < nActive;
row++, ci += input_stride - nPlanes, co += output_stride - nPlanes) {
- for (uInt plane = 0; plane < nPlanes; plane++, ci++, co++) {
+ for (Int plane = 0; plane < nPlanes; plane++, ci++, co++) {
d_input_features[ci] =
(d_output_features[co] - gradMean[plane] -
(input_features[ci] - saveMean[plane]) * k[plane]) *
@@ -107,7 +107,7 @@ void BatchNormalization_BackwardPass(T *input_features, T *d_input_features,
}
}
if (d_weight)
- for (uInt plane = 0; plane < nPlanes; plane++) {
+ for (Int plane = 0; plane < nPlanes; plane++) {
d_weight[plane] = dotp[plane] * saveInvStd[plane];
}
}
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
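// Batchwise multiplicative dropout: noise holds one multiplier per feature
// plane, shared by every active site in the batch; negative inputs are
// additionally scaled by alpha, giving a leaky-ReLU-shaped response.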
template <typename T>
void cpu_BatchwiseMultiplicativeDropout_updateOutput(
/*float*/ at::Tensor input_features, /*float*/ at::Tensor output_features,
/*float*/ at::Tensor noise, float alpha) {
output_features.resize_as_(input_features);
auto nActive = input_features.size(0);
auto nPlanes = input_features.size(1);
auto iF = input_features.data<T>();
auto oF = output_features.data<T>();
auto nz = noise.data<T>();
for (Int row = 0; row < nActive; row++)
for (Int plane = 0, o = row * nPlanes, i = row * nPlanes; plane < nPlanes;
plane++, o++, i++)
oF[o] = (iF[i] > 0) ? iF[i] * nz[plane] : iF[i] * nz[plane] * alpha;
}
template <typename T>
void cpu_BatchwiseMultiplicativeDropout_updateGradInput(
/*float*/ at::Tensor input_features, /*float*/ at::Tensor d_input_features,
/*float*/ at::Tensor d_output_features, /*float*/ at::Tensor noise,
float alpha) {
d_input_features.resize_as_(d_output_features);
auto nActive = input_features.size(0);
auto nPlanes = input_features.size(1);
auto iF = input_features.data<T>();
auto diF = d_input_features.data<T>();
auto doF = d_output_features.data<T>();
auto nz = noise.data<T>();
for (Int row = 0; row < nActive; row++)
for (Int plane = 0, o = row * nPlanes, i = row * nPlanes; plane < nPlanes;
plane++, o++, i++)
diF[i] = (iF[i] > 0) ? doF[o] * nz[plane] : doF[o] * nz[plane] * alpha;
}
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include <cstring>
template <typename T>
void rule_index_select(at::Tensor target, at::Tensor src, Int nRules,
Int *rules) {
auto t_ptr = target.data<T>();
auto s_ptr = src.data<T>();
auto n = target.size(1);
for (int i = 0; i < nRules; ++i)
std::memcpy(t_ptr + i * n, s_ptr + rules[2 * i] * n, sizeof(T) * n);
}
template <typename T>
void rule_index_add_(at::Tensor target, at::Tensor src, Int nRules,
Int *rules) {
auto t_ptr = target.data<T>();
auto s_ptr = src.data<T>();
auto n = target.size(1);
for (int i = 0; i < nRules; ++i) {
auto t = t_ptr + rules[2 * i] * n;
auto s = s_ptr + i * n;
for (int j = 0; j < n; ++j)
t[j] += s[j];
}
}
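// Rulebook convention used by all the convolutions below: each rule vector
// stores flat (gather row, scatter row) pairs, so rules[2 * i] indexes the
// source row and rules[2 * i + 1] the destination row. rule_index_select
// gathers rows into a dense matrix and rule_index_add_ scatter-adds them
// back, replacing the commented-out index_select / index_add_ tensor calls.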
template <typename T, Int Dimension>
double cpu_Convolution_updateOutput(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor filterSize,
/*long*/ at::Tensor filterStride, Metadata<Dimension> &m,
/*float*/ at::Tensor input_features,
/*float*/ at::Tensor output_features, /*float*/ at::Tensor weight,
/*float*/ at::Tensor bias) {
auto _rules =
m.getRuleBook(inputSize, outputSize, filterSize, filterStride, true);
Int nActive = m.getNActive(outputSize);
output_features.resize_({nActive, weight.size(2)});
if (bias.numel() and nActive)
output_features.copy_(bias);
else
output_features.zero_();
double flops = 0;
auto ip = weight.size(1);
auto op = weight.size(2);
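// One rulebook entry per filter offset: gather the matching input rows,
// multiply by that offset's slice of the weight tensor, scatter-add into
// the output. flops counts one multiply-accumulate per gathered element.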
for (Int i = 0; i < (Int)_rules.size(); i++) {
auto r = _rules[i];
int nRules = r.size() / 2;
if (nRules) {
flops += nRules * ip * op;
// auto rt = torch::CPU(at_kINT).tensorFromBlob(&r[0], {nRules, 2});
// auto input_rows = input_features.index_select(0, rt.select(1, 0));
// auto w = weight.select(0, i);
// auto output_rows = at::mm(input_rows, w);
// output_features.index_add_(0, rt.select(1, 1), output_rows);
auto input_rows = input_features.type().tensor({nRules, ip});
rule_index_select<T>(input_rows, input_features, nRules, &r[0]);
auto w = weight.select(0, i);
auto output_rows = at::mm(input_rows, w);
rule_index_add_<T>(output_features, output_rows, nRules, &r[1]);
}
}
return flops;
}
template <typename T, Int Dimension>
void cpu_Convolution_backward(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor filterSize,
/*long*/ at::Tensor filterStride, Metadata<Dimension> &m,
/*float*/ at::Tensor input_features,
/*float*/ at::Tensor d_input_features,
/*float*/ at::Tensor d_output_features, /*float*/ at::Tensor weight,
/*float*/ at::Tensor d_weight, /*float*/ at::Tensor d_bias) {
auto _rules =
m.getRuleBook(inputSize, outputSize, filterSize, filterStride, true);
Int nActive = m.getNActive(inputSize);
d_input_features.resize_as_(input_features);
d_input_features.zero_();
if (nActive and d_bias.numel())
at::sum_out(d_bias, d_output_features, {0}, false);
auto ip = weight.size(1);
auto op = weight.size(2);
for (Int i = 0; i < (Int)_rules.size(); i++) {
auto r = _rules[i];
int nRules = r.size() / 2;
if (nRules) {
auto w = weight.select(0, i);
auto dw = d_weight.select(0, i);
// auto rt = torch::CPU(at_kINT).tensorFromBlob(&r[0], {nRules, 2});
// auto input_rows = input_features.index_select(0, rt.select(1, 0));
// auto d_output_rows = d_output_features.index_select(0, rt.select(1,
// 1));
// at::mm_out(dw, input_rows.t(), d_output_rows);
// auto d_input_rows = at::mm(d_output_rows, w.t());
// d_input_features.index_add_(0, rt.select(1, 0), d_input_rows);
auto input_rows = input_features.type().tensor({nRules, ip});
rule_index_select<T>(input_rows, input_features, nRules, &r[0]);
auto d_output_rows = d_output_features.type().tensor({nRules, op});
rule_index_select<T>(d_output_rows, d_output_features, nRules, &r[1]);
at::mm_out(dw, input_rows.t(), d_output_rows);
auto d_input_rows = at::mm(d_output_rows, w.t());
rule_index_add_<T>(d_input_features, d_input_rows, nRules, &r[0]);
}
}
}
template <typename T, Int Dimension>
double cpu_SubmanifoldConvolution_updateOutput(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor filterSize,
Metadata<Dimension> &m,
/*float*/ at::Tensor input_features, /*float*/ at::Tensor output_features,
/*float*/ at::Tensor weight,
/*float*/ at::Tensor bias) {
auto _rules = m.getSubmanifoldRuleBook(inputSize, filterSize, true);
Int nActive = m.getNActive(inputSize);
output_features.resize_({nActive, weight.size(2)});
if (bias.numel() and nActive)
output_features.copy_(bias);
else
output_features.zero_();
double flops = 0;
auto ip = weight.size(1);
auto op = weight.size(2);
for (Int i = 0; i < (Int)_rules.size(); i++) {
auto r = _rules[i];
int nRules = r.size() / 2;
if (nRules) {
flops += nRules * ip * op;
// auto rt = torch::CPU(at_kINT).tensorFromBlob(&r[0], {nRules, 2});
// auto input_rows = input_features.index_select(0, rt.select(1, 0));
// auto w = weight.select(0, i);
// auto output_rows = at::mm(input_rows, w);
// output_features.index_add_(0, rt.select(1, 1), output_rows);
auto input_rows = input_features.type().tensor({nRules, ip});
rule_index_select<T>(input_rows, input_features, nRules, &r[0]);
auto w = weight.select(0, i);
auto output_rows = at::mm(input_rows, w);
rule_index_add_<T>(output_features, output_rows, nRules, &r[1]);
}
}
return flops;
}
template <typename T, Int Dimension>
void cpu_SubmanifoldConvolution_backward(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor filterSize,
Metadata<Dimension> &m,
/*float*/ at::Tensor input_features,
/*float*/ at::Tensor d_input_features,
/*float*/ at::Tensor d_output_features, /*float*/ at::Tensor weight,
/*float*/ at::Tensor d_weight,
/*float*/ at::Tensor d_bias) {
auto _rules = m.getSubmanifoldRuleBook(inputSize, filterSize, true);
Int nActive = m.getNActive(inputSize);
d_input_features.resize_as_(input_features);
d_input_features.zero_();
if (nActive and d_bias.numel())
at::sum_out(d_bias, d_output_features, {0}, false);
auto ip = weight.size(1);
auto op = weight.size(2);
for (Int i = 0; i < (Int)_rules.size(); i++) {
auto r = _rules[i];
int nRules = r.size() / 2;
if (nRules) {
auto w = weight.select(0, i);
auto dw = d_weight.select(0, i);
// auto rt = torch::CPU(at_kINT).tensorFromBlob(&r[0], {nRules, 2});
// auto input_rows = input_features.index_select(0, rt.select(1, 0));
// auto d_output_rows = d_output_features.index_select(0, rt.select(1,
// 1));
// at::mm_out(dw, input_rows.t(), d_output_rows);
// auto d_input_rows = at::mm(d_output_rows, w.t());
// d_input_features.index_add_(0, rt.select(1, 0), d_input_rows);
auto input_rows = input_features.type().tensor({nRules, ip});
rule_index_select<T>(input_rows, input_features, nRules, &r[0]);
auto d_output_rows = d_output_features.type().tensor({nRules, op});
rule_index_select<T>(d_output_rows, d_output_features, nRules, &r[1]);
at::mm_out(dw, input_rows.t(), d_output_rows);
auto d_input_rows = at::mm(d_output_rows, w.t());
rule_index_add_<T>(d_input_features, d_input_rows, nRules, &r[0]);
}
}
}
template <typename T, Int Dimension>
double cpu_FullConvolution_updateOutput(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor filterSize,
/*long*/ at::Tensor filterStride, Metadata<Dimension> &mIn,
Metadata<Dimension> &mOut,
/*float*/ at::Tensor input_features, /*float*/ at::Tensor output_features,
/*float*/ at::Tensor weight,
/*float*/ at::Tensor bias) {
auto _rules = mIn.getFullConvolutionRuleBook(inputSize, outputSize,
filterSize, filterStride, mOut);
Int nActive = mOut.getNActive(outputSize);
output_features.resize_({nActive, weight.size(2)});
if (bias.numel() and nActive)
output_features.copy_(bias);
else
output_features.zero_();
double flops = 0;
auto ip = weight.size(1);
auto op = weight.size(2);
for (Int i = 0; i < (Int)_rules.size(); i++) {
auto r = _rules[i];
int nRules = r.size() / 2;
if (nRules) {
flops += nRules * ip * op;
// auto rt = torch::CPU(at_kINT).tensorFromBlob(&r[0], {nRules, 2});
// auto input_rows = input_features.index_select(0, rt.select(1, 0));
// auto w = weight.select(0, i);
// auto output_rows = at::mm(input_rows, w);
// output_features.index_add_(0, rt.select(1, 1), output_rows);
auto input_rows = input_features.type().tensor({nRules, ip});
rule_index_select<T>(input_rows, input_features, nRules, &r[0]);
auto w = weight.select(0, i);
auto output_rows = at::mm(input_rows, w);
rule_index_add_<T>(output_features, output_rows, nRules, &r[1]);
}
}
return flops;
}
template <typename T, Int Dimension>
void cpu_FullConvolution_backward(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor filterSize,
/*long*/ at::Tensor filterStride, Metadata<Dimension> &mIn,
Metadata<Dimension> &mOut,
/*float*/ at::Tensor input_features,
/*float*/ at::Tensor d_input_features,
/*float*/ at::Tensor d_output_features, /*float*/ at::Tensor weight,
/*float*/ at::Tensor d_weight,
/*float*/ at::Tensor d_bias) {
auto _rules = mIn.getFullConvolutionRuleBook(inputSize, outputSize,
filterSize, filterStride, mOut);
Int nActive = mOut.getNActive(inputSize);
d_input_features.resize_as_(input_features);
d_input_features.zero_();
if (nActive and d_bias.numel())
at::sum_out(d_bias, d_output_features, {0}, false);
auto ip = weight.size(1);
auto op = weight.size(2);
for (Int i = 0; i < (Int)_rules.size(); i++) {
auto r = _rules[i];
int nRules = r.size() / 2;
if (nRules) {
auto w = weight.select(0, i);
auto dw = d_weight.select(0, i);
// auto rt = torch::CPU(at_kINT).tensorFromBlob(&r[0], {nRules, 2});
// auto input_rows = input_features.index_select(0, rt.select(1, 0));
// auto d_output_rows = d_output_features.index_select(0, rt.select(1,
// 1));
// at::mm_out(dw, input_rows.t(), d_output_rows);
// auto d_input_rows = at::mm(d_output_rows, w.t());
// d_input_features.index_add_(0, rt.select(1, 0), d_input_rows);
auto input_rows = input_features.type().tensor({nRules, ip});
rule_index_select<T>(input_rows, input_features, nRules, &r[0]);
auto d_output_rows = d_output_features.type().tensor({nRules, op});
rule_index_select<T>(d_output_rows, d_output_features, nRules, &r[1]);
at::mm_out(dw, input_rows.t(), d_output_rows);
auto d_input_rows = at::mm(d_output_rows, w.t());
rule_index_add_<T>(d_input_features, d_input_rows, nRules, &r[0]);
}
}
}
template <typename T, Int Dimension>
double cpu_RandomizedStrideConvolution_updateOutput(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor filterSize,
/*long*/ at::Tensor filterStride, Metadata<Dimension> &m,
/*float*/ at::Tensor input_features,
/*float*/ at::Tensor output_features, /*float*/ at::Tensor weight,
/*float*/ at::Tensor bias) {
auto _rules = m.getRandomizedStrideRuleBook(inputSize, outputSize, filterSize,
filterStride, true);
Int nActive = m.getNActive(outputSize);
output_features.resize_({nActive, weight.size(2)});
if (bias.numel() and nActive)
output_features.copy_(bias);
else
output_features.zero_();
double flops = 0;
auto ip = weight.size(1);
auto op = weight.size(2);
for (Int i = 0; i < (Int)_rules.size(); i++) {
auto r = _rules[i];
int nRules = r.size() / 2;
if (nRules) {
flops += nRules * ip * op;
// auto rt = torch::CPU(at_kINT).tensorFromBlob(&r[0], {nRules, 2});
// auto input_rows = input_features.index_select(0, rt.select(1, 0));
// auto w = weight.select(0, i);
// auto output_rows = at::mm(input_rows, w);
// output_features.index_add_(0, rt.select(1, 1), output_rows);
auto input_rows = input_features.type().tensor({nRules, ip});
rule_index_select<T>(input_rows, input_features, nRules, &r[0]);
auto w = weight.select(0, i);
auto output_rows = at::mm(input_rows, w);
rule_index_add_<T>(output_features, output_rows, nRules, &r[1]);
}
}
return flops;
}
template <typename T, Int Dimension>
void cpu_RandomizedStrideConvolution_backward(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor filterSize,
/*long*/ at::Tensor filterStride, Metadata<Dimension> &m,
/*float*/ at::Tensor input_features,
/*float*/ at::Tensor d_input_features,
/*float*/ at::Tensor d_output_features, /*float*/ at::Tensor weight,
/*float*/ at::Tensor d_weight, /*float*/ at::Tensor d_bias) {
auto _rules = m.getRandomizedStrideRuleBook(inputSize, outputSize, filterSize,
filterStride, true);
Int nActive = m.getNActive(inputSize);
d_input_features.resize_as_(input_features);
d_input_features.zero_();
if (nActive and d_bias.numel())
at::sum_out(d_bias, d_output_features, {0}, false);
auto ip = weight.size(1);
auto op = weight.size(2);
for (Int i = 0; i < (Int)_rules.size(); i++) {
auto r = _rules[i];
int nRules = r.size() / 2;
if (nRules) {
auto w = weight.select(0, i);
auto dw = d_weight.select(0, i);
// auto rt = torch::CPU(at_kINT).tensorFromBlob(&r[0], {nRules, 2});
// auto input_rows = input_features.index_select(0, rt.select(1, 0));
// auto d_output_rows = d_output_features.index_select(0, rt.select(1,
// 1));
// at::mm_out(dw, input_rows.t(), d_output_rows);
// auto d_input_rows = at::mm(d_output_rows, w.t());
// d_input_features.index_add_(0, rt.select(1, 0), d_input_rows);
auto input_rows = input_features.type().tensor({nRules, ip});
rule_index_select<T>(input_rows, input_features, nRules, &r[0]);
auto d_output_rows = d_output_features.type().tensor({nRules, op});
rule_index_select<T>(d_output_rows, d_output_features, nRules, &r[1]);
at::mm_out(dw, input_rows.t(), d_output_rows);
auto d_input_rows = at::mm(d_output_rows, w.t());
rule_index_add_<T>(d_input_features, d_input_rows, nRules, &r[0]);
}
}
}
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
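// Deconvolution reuses the convolution rulebook with input and output roles
// swapped: getRuleBook is called as (outputSize, inputSize, ...) and the
// gather/scatter halves of each rule pair (&r[1], &r[0]) are exchanged
// relative to cpu_Convolution_updateOutput above.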
template <typename T, Int Dimension>
double cpu_Deconvolution_updateOutput(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor filterSize,
/*long*/ at::Tensor filterStride, Metadata<Dimension> &m,
/*float*/ at::Tensor input_features,
/*float*/ at::Tensor output_features, /*float*/ at::Tensor weight,
/*float*/ at::Tensor bias) {
auto _rules =
m.getRuleBook(outputSize, inputSize, filterSize, filterStride, true);
Int nActive = m.getNActive(outputSize);
output_features.resize_({nActive, weight.size(2)});
if (bias.numel() and nActive)
output_features.copy_(bias);
else
output_features.zero_();
double flops = 0;
auto ip = weight.size(1);
auto op = weight.size(2);
for (Int i = 0; i < (Int)_rules.size(); i++) {
auto r = _rules[i];
int nRules = r.size() / 2;
if (nRules) {
flops += nRules * ip * op;
// auto rt = torch::CPU(at_kINT).tensorFromBlob(&r[0], {nRules, 2});
// auto input_rows = input_features.index_select(0, rt.select(1, 1));
// auto w = weight.select(0, i);
// auto output_rows = at::mm(input_rows, w);
// output_features.index_add_(0, rt.select(1, 0), output_rows);
auto input_rows = input_features.type().tensor({nRules, ip});
rule_index_select<T>(input_rows, input_features, nRules, &r[1]);
auto w = weight.select(0, i);
auto output_rows = at::mm(input_rows, w);
rule_index_add_<T>(output_features, output_rows, nRules, &r[0]);
}
}
return flops;
}
template <typename T, Int Dimension>
void cpu_Deconvolution_backward(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor filterSize,
/*long*/ at::Tensor filterStride, Metadata<Dimension> &m,
/*float*/ at::Tensor input_features,
/*float*/ at::Tensor d_input_features,
/*float*/ at::Tensor d_output_features, /*float*/ at::Tensor weight,
/*float*/ at::Tensor d_weight, /*float*/ at::Tensor d_bias) {
auto _rules =
m.getRuleBook(outputSize, inputSize, filterSize, filterStride, true);
Int nActive = m.getNActive(inputSize);
d_input_features.resize_as_(input_features);
d_input_features.zero_();
if (nActive and d_bias.numel())
at::sum_out(d_bias, d_output_features, {0}, false);
auto ip = weight.size(1);
auto op = weight.size(2);
for (Int i = 0; i < (Int)_rules.size(); i++) {
auto r = _rules[i];
int nRules = r.size() / 2;
if (nRules) {
auto w = weight.select(0, i);
auto dw = d_weight.select(0, i);
// auto rt = torch::CPU(at_kINT).tensorFromBlob(&r[0], {nRules, 2});
// auto input_rows = input_features.index_select(0, rt.select(1, 1));
// auto d_output_rows = d_output_features.index_select(0, rt.select(1,
// 0));
// at::mm_out(dw, input_rows.t(), d_output_rows);
// auto d_input_rows = at::mm(d_output_rows, w.t());
// d_input_features.index_add_(0, rt.select(1, 1), d_input_rows);
auto input_rows = input_features.type().tensor({nRules, ip});
rule_index_select<T>(input_rows, input_features, nRules, &r[1]);
auto d_output_rows = d_output_features.type().tensor({nRules, op});
rule_index_select<T>(d_output_rows, d_output_features, nRules, &r[0]);
at::mm_out(dw, input_rows.t(), d_output_rows);
auto d_input_rows = at::mm(d_output_rows, w.t());
rule_index_add_<T>(d_input_features, d_input_rows, nRules, &r[1]);
}
}
}
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "IOLayers.h"
template <typename T, Int Dimension>
void cpu_InputLayer_updateOutput(Metadata<Dimension> &m,
/*long*/ at::Tensor spatialSize,
/*long*/ at::Tensor input_coords,
/*float*/ at::Tensor input_features,
/*float*/ at::Tensor output_features,
long batchSize, long mode) {
m.inputLayer(spatialSize, input_coords, batchSize, mode);
auto nPlanes = input_features.size(1);
auto &rules = m.inputLayerRuleBook;
auto maxActive = rules[0][1];
auto nRows = rules[0][3];
if (mode == 0) {
output_features.resize_as_(input_features);
output_features.copy_(input_features);
} else {
output_features.resize_({*m.inputNActive, nPlanes});
output_features.zero_();
InputLayer_ForwardPass<T>(input_features.data<T>(),
output_features.data<T>(), nRows,
maxActive, nPlanes, &rules[1][0], mode == 4);
}
}
template <typename T, Int Dimension>
void cpu_InputLayer_updateGradInput(Metadata<Dimension> &m,
/*float*/ at::Tensor d_input_features,
/*float*/ at::Tensor d_output_features) {
auto &rules = m.inputLayerRuleBook;
auto nPlanes = d_output_features.size(1);
auto mode = rules[0][0];
auto maxActive = rules[0][1];
auto nRows = rules[0][3];
if (mode == 0) {
d_input_features.resize_as_(d_output_features);
d_input_features.copy_(d_output_features);
} else {
d_input_features.resize_({rules[0][2], nPlanes});
d_input_features.zero_();
InputLayer_BackwardPass<T>(d_input_features.data<T>(),
d_output_features.data<T>(), nRows,
maxActive, nPlanes, &rules[1][0], mode == 4);
}
}
template <typename T, Int Dimension>
void cpu_OutputLayer_updateOutput(Metadata<Dimension> &m,
/*float*/ at::Tensor input_features,
/*float*/ at::Tensor output_features) {
auto &rules = m.inputLayerRuleBook;
auto nPlanes = input_features.size(1);
auto mode = rules[0][0];
auto maxActive = rules[0][1];
auto nRows = rules[0][3];
if (mode == 0) {
output_features.resize_as_(input_features);
output_features.copy_(input_features);
} else {
output_features.resize_({rules[0][2], nPlanes});
output_features.zero_();
InputLayer_BackwardPass<T>(output_features.data<T>(),
input_features.data<T>(), nRows,
maxActive, nPlanes, &rules[1][0], false);
}
}
template <typename T, Int Dimension>
void cpu_OutputLayer_updateGradInput(Metadata<Dimension> &m,
/*float*/ at::Tensor d_input_features,
/*float*/ at::Tensor d_output_features) {
auto &rules = m.inputLayerRuleBook;
auto nPlanes = d_output_features.size(1);
auto mode = rules[0][0];
auto maxActive = rules[0][1];
auto nRows = rules[0][3];
if (mode == 0) {
d_input_features.resize_as_(d_output_features);
d_input_features.copy_(d_output_features);
} else {
d_input_features.resize_({nRows, nPlanes});
d_input_features.zero_();
InputLayer_ForwardPass<T>(d_output_features.data<T>(),
d_input_features.data<T>(), nRows,
maxActive, nPlanes, &rules[1][0], false);
}
}
template <typename T, Int Dimension>
void cpu_BLInputLayer_updateOutput(Metadata<Dimension> &m,
/*long*/ at::Tensor spatialSize,
/*long*/ at::Tensor input_coords,
/*float*/ at::Tensor input_features,
/*float*/ at::Tensor output_features,
long mode) {
m.blLayer(spatialSize, input_coords, mode);
auto nPlanes = input_features.size(2);
auto &rules = m.blLayerRuleBook;
auto maxActive = rules[0][1];
auto nRows = rules[0][4];
if (mode == 0) {
output_features.resize_as_(input_features);
output_features.copy_(input_features);
output_features.resize_({*m.inputNActive, nPlanes});
} else {
output_features.resize_({*m.inputNActive, nPlanes});
output_features.zero_();
InputLayer_ForwardPass<T>(input_features.data<T>(),
output_features.data<T>(), nRows,
maxActive, nPlanes, &rules[1][0], mode == 4);
}
}
template <typename T, Int Dimension>
void cpu_BLInputLayer_updateGradInput(Metadata<Dimension> &m,
/*float*/ at::Tensor d_input_features,
/*float*/ at::Tensor d_output_features) {
auto &rules = m.blLayerRuleBook;
auto nPlanes = d_output_features.size(1);
auto mode = rules[0][0];
auto maxActive = rules[0][1];
auto nRows = rules[0][4];
if (mode == 0) {
d_input_features.resize_as_(d_output_features);
d_input_features.copy_(d_output_features);
d_input_features.resize_({rules[0][2], rules[0][3], nPlanes});
} else {
d_input_features.resize_({rules[0][2], rules[0][3], nPlanes});
d_input_features.zero_();
InputLayer_BackwardPass<T>(d_input_features.data<T>(),
d_output_features.data<T>(), nRows,
maxActive, nPlanes, &rules[1][0], mode == 4);
}
}
template <typename T, Int Dimension>
void cpu_BLOutputLayer_updateOutput(Metadata<Dimension> &m,
/*float*/ at::Tensor input_features,
/*float*/ at::Tensor output_features) {
auto &rules = m.blLayerRuleBook;
auto nPlanes = input_features.size(1);
auto mode = rules[0][0];
auto maxActive = rules[0][1];
auto nRows = rules[0][4];
if (mode == 0) {
output_features.resize_as_(input_features);
output_features.copy_(input_features);
output_features.resize_({rules[0][2], rules[0][3], nPlanes});
} else {
output_features.resize_({rules[0][2], rules[0][3], nPlanes});
output_features.zero_();
InputLayer_BackwardPass<T>(output_features.data<T>(),
input_features.data<T>(), nRows,
maxActive, nPlanes, &rules[1][0], false);
}
}
template <typename T, Int Dimension>
void cpu_BLOutputLayer_updateGradInput(Metadata<Dimension> &m,
/*float*/ at::Tensor d_input_features,
/*float*/ at::Tensor d_output_features) {
auto &rules = m.blLayerRuleBook;
auto nPlanes = d_output_features.size(2);
auto mode = rules[0][0];
auto maxActive = rules[0][1];
auto nRows = rules[0][4];
if (mode == 0) {
d_input_features.resize_as_(d_output_features);
d_input_features.copy_(d_output_features);
d_input_features.resize_({nRows, nPlanes});
} else {
d_input_features.resize_({nRows, nPlanes});
d_input_features.zero_();
InputLayer_ForwardPass<T>(d_output_features.data<T>(),
d_input_features.data<T>(), nRows,
maxActive, nPlanes, &rules[1][0], false);
}
}
@@ -6,21 +6,21 @@
#ifndef CPU_IOLAYERS_H
#define CPU_IOLAYERS_H
#include "../SparseConvNet.h"
#include <cstring>
// Assume output and d_input_features have been zero-ed
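// Rule layout assumed here: each of the nRows rules occupies maxActive + 1
// ints, a count of active source rows followed by their row indices (the
// same layout the CUDA ActivePooling kernels index as
// rules[row * (maxActive + 1)]).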
template <typename T>
- void InputLayer_ForwardPass(T *input_features, T *output_features, uInt nRows,
- uInt maxActive, uInt nPlanes, uInt *rules,
+ void InputLayer_ForwardPass(T *input_features, T *output_features, Int nRows,
+ Int maxActive, Int nPlanes, Int *rules,
bool average) {
- for (uInt row = 0; row < nRows; row++) {
+ for (Int row = 0; row < nRows; row++) {
auto nActive = rules[0];
T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
- for (uInt i = 1; i <= nActive; ++i) {
+ for (Int i = 1; i <= nActive; ++i) {
auto in_f = input_features + nPlanes * rules[i];
- for (uInt plane = 0; plane < nPlanes; plane++) {
+ for (Int plane = 0; plane < nPlanes; plane++) {
output_features[plane] += multiplier * in_f[plane];
}
}
@@ -30,14 +30,14 @@ void InputLayer_ForwardPass(T *input_features, T *output_features, uInt nRows,
}
template <typename T>
void InputLayer_BackwardPass(T *d_input_features, T *d_output_features,
- uInt nRows, uInt maxActive, uInt nPlanes,
- uInt *rules, bool average) {
- for (uInt row = 0; row < nRows; row++) {
+ Int nRows, Int maxActive, Int nPlanes,
+ Int *rules, bool average) {
+ for (Int row = 0; row < nRows; row++) {
auto nActive = rules[0];
T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
- for (uInt i = 1; i <= nActive; ++i) {
+ for (Int i = 1; i <= nActive; ++i) {
auto d_in_f = d_input_features + nPlanes * rules[i];
- for (uInt plane = 0; plane < nPlanes; plane++)
+ for (Int plane = 0; plane < nPlanes; plane++)
d_in_f[plane] += multiplier * d_output_features[plane];
}
d_output_features += nPlanes;
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
template <typename T>
void cpu_LeakyReLU_updateOutput(/*float*/ at::Tensor input_features,
/*float*/ at::Tensor output_features,
float alpha) {
output_features.resize_as_(input_features);
auto iF = input_features.data<T>();
auto oF = output_features.data<T>();
auto n = input_features.numel();
for (Int i = 0; i < n; i++)
oF[i] = (iF[i] > 0) ? iF[i] : iF[i] * alpha;
}
template <typename T>
void cpu_LeakyReLU_updateGradInput(/*float*/ at::Tensor input_features,
/*float*/ at::Tensor d_input_features,
/*float*/ at::Tensor d_output_features,
float alpha) {
d_input_features.resize_as_(d_output_features);
auto iF = input_features.data<T>();
auto diF = d_input_features.data<T>();
auto doF = d_output_features.data<T>();
auto n = d_input_features.numel();
for (Int i = 0; i < n; i++)
diF[i] = (iF[i] > 0) ? doF[i] : doF[i] * alpha;
}
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "MaxPooling.h"
template <typename T, Int Dimension>
void cpu_MaxPooling_updateOutput(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor poolSize,
/*long*/ at::Tensor poolStride, Metadata<Dimension> &m,
/*float*/ at::Tensor input_features,
/*float*/ at::Tensor output_features, long nFeaturesToDrop) {
Int nPlanes = input_features.size(1) - nFeaturesToDrop;
auto _rules =
m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
Int nActive = m.getNActive(outputSize);
output_features.resize_({nActive, input_features.size(1) - nFeaturesToDrop});
output_features.zero_();
auto iF = input_features.data<T>() + nFeaturesToDrop;
auto oF = output_features.data<T>();
for (auto &r : _rules) {
Int nHot = r.size() / 2;
MaxPooling_ForwardPass<T>(iF, oF, nPlanes, input_features.stride(0),
output_features.stride(0), &r[0], nHot);
}
}
template <typename T, Int Dimension>
void cpu_MaxPooling_updateGradInput(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor poolSize,
/*long*/ at::Tensor poolStride, Metadata<Dimension> &m,
/*float*/ at::Tensor input_features,
/*float*/ at::Tensor d_input_features, /*float*/ at::Tensor output_features,
/*float*/ at::Tensor d_output_features, long nFeaturesToDrop) {
Int nPlanes = input_features.size(1) - nFeaturesToDrop;
auto _rules =
m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
d_input_features.resize_as_(input_features);
d_input_features.zero_();
auto iF = input_features.data<T>();
auto oF = output_features.data<T>();
auto diF = d_input_features.data<T>();
auto doF = d_output_features.data<T>();
for (auto &r : _rules) {
Int nHot = r.size() / 2;
MaxPooling_BackwardPass<T>(iF, diF, oF, doF, nPlanes,
input_features.stride(0),
output_features.stride(0), &r[0], nHot);
}
}
template <typename T, Int Dimension>
void cpu_RandomizedStrideMaxPooling_updateOutput(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor poolSize,
/*long*/ at::Tensor poolStride, Metadata<Dimension> &m,
/*float*/ at::Tensor input_features,
/*float*/ at::Tensor output_features, long nFeaturesToDrop) {
Int nPlanes = input_features.size(1) - nFeaturesToDrop;
auto _rules = m.getRandomizedStrideRuleBook(inputSize, outputSize, poolSize,
poolStride, true);
Int nActive = m.getNActive(outputSize);
output_features.resize_({nActive, input_features.size(1) - nFeaturesToDrop});
output_features.zero_();
auto iF = input_features.data<T>() + nFeaturesToDrop;
auto oF = output_features.data<T>();
for (auto &r : _rules) {
Int nHot = r.size() / 2;
MaxPooling_ForwardPass<T>(iF, oF, nPlanes, input_features.stride(0),
output_features.stride(0), &r[0], nHot);
}
}
template <typename T, Int Dimension>
void cpu_RandomizedStrideMaxPooling_updateGradInput(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor poolSize,
/*long*/ at::Tensor poolStride, Metadata<Dimension> &m,
/*float*/ at::Tensor input_features,
/*float*/ at::Tensor d_input_features, /*float*/ at::Tensor output_features,
/*float*/ at::Tensor d_output_features, long nFeaturesToDrop) {
Int nPlanes = input_features.size(1) - nFeaturesToDrop;
auto _rules = m.getRandomizedStrideRuleBook(inputSize, outputSize, poolSize,
poolStride, true);
d_input_features.resize_as_(input_features);
d_input_features.zero_();
auto iF = input_features.data<T>();
auto oF = output_features.data<T>();
auto diF = d_input_features.data<T>();
auto doF = d_output_features.data<T>();
for (auto &r : _rules) {
Int nHot = r.size() / 2;
MaxPooling_BackwardPass<T>(iF, diF, oF, doF, nPlanes,
input_features.stride(0),
output_features.stride(0), &r[0], nHot);
}
}
@@ -6,16 +6,16 @@
#ifndef CPU_MAXPOOLING_H
#define CPU_MAXPOOLING_H
#include "../SparseConvNet.h"
template <typename T>
void MaxPooling_ForwardPass(T *input_features, T *output_features,
- uInt nPlanes, uInt input_stride,
- uInt output_stride, uInt *rules, uInt nHot) {
- for (uInt outSite = 0; outSite < nHot; outSite++) {
- uInt i = rules[2 * outSite] * input_stride;
- uInt o = rules[2 * outSite + 1] * output_stride;
- for (uInt plane = 0; plane < nPlanes; plane++)
+ Int nPlanes, Int input_stride,
+ Int output_stride, Int *rules, Int nHot) {
+ for (Int outSite = 0; outSite < nHot; outSite++) {
+ Int i = rules[2 * outSite] * input_stride;
+ Int o = rules[2 * outSite + 1] * output_stride;
+ for (Int plane = 0; plane < nPlanes; plane++)
if (output_features[o + plane] < input_features[i + plane])
output_features[o + plane] = input_features[i + plane];
}
@@ -23,12 +23,12 @@ void MaxPooling_ForwardPass(T *input_features, T *output_features,
template <typename T>
void MaxPooling_BackwardPass(T *input_features, T *d_input_features,
T *output_features, T *d_output_features,
- uInt nPlanes, uInt input_stride,
- uInt output_stride, uInt *rules, uInt nHot) {
- for (uInt outSite = 0; outSite < nHot; outSite++) {
- uInt i = rules[2 * outSite] * input_stride;
- uInt o = rules[2 * outSite + 1] * output_stride;
- for (uInt plane = 0; plane < nPlanes; plane++)
+ Int nPlanes, Int input_stride,
+ Int output_stride, Int *rules, Int nHot) {
+ for (Int outSite = 0; outSite < nHot; outSite++) {
+ Int i = rules[2 * outSite] * input_stride;
+ Int o = rules[2 * outSite + 1] * output_stride;
+ for (Int plane = 0; plane < nPlanes; plane++)
if (output_features[o + plane] == input_features[i + plane])
d_input_features[i + plane] += d_output_features[o + plane];
}
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
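// NetworkInNetwork is a 1x1 convolution applied sitewise: a single dense
// matrix multiply over all active rows, so no rulebook is required.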
template <typename T>
double cpu_NetworkInNetwork_updateOutput(/*float*/ at::Tensor input_features,
/*float*/ at::Tensor output_features,
/*float*/ at::Tensor weight,
/*float*/ at::Tensor bias) {
auto nActive = input_features.size(0);
auto input_nPlanes = weight.size(0);
auto output_nPlanes = weight.size(1);
output_features.resize_({nActive, output_nPlanes});
if (bias.numel())
output_features.copy_(bias);
else
output_features.zero_();
output_features.addmm_(input_features, weight); // in-place: bias was pre-loaded above
return nActive * input_nPlanes * output_nPlanes;
}
template <typename T>
void cpu_NetworkInNetwork_updateGradInput(
/*float*/ at::Tensor d_input_features,
/*float*/ at::Tensor d_output_features,
/*float*/ at::Tensor weight) {
d_input_features.resize_({(int)d_output_features.size(0), weight.size(0)});
d_input_features.zero_();
at::mm_out(d_input_features, d_output_features, weight.t());
}
template <typename T>
void cpu_NetworkInNetwork_accGradParameters(
/*float*/ at::Tensor input_features,
/*float*/ at::Tensor d_output_features,
/*float*/ at::Tensor d_weight, /*float*/ at::Tensor d_bias) {
auto nActive = input_features.size(0);
if (nActive and d_bias.numel())
at::sum_out(d_bias, d_output_features, {0}, false);
at::mm_out(d_weight, input_features.t(), d_output_features);
}
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "SparseToDense.h"
template <typename T, Int Dimension>
void cpu_SparseToDense_updateOutput(
/*long*/ at::Tensor inputSize, Metadata<Dimension> &m,
/*float*/ at::Tensor input_features,
/*float*/ at::Tensor output_features, long nPlanes) {
{
std::array<long, Dimension + 2> sz;
sz[0] = m.grids.begin()->second.size(); // batch size
sz[1] = nPlanes;
long *in_sz = inputSize.data<long>();
for (Int i = 0; i < Dimension; ++i)
sz[i + 2] = in_sz[i];
output_features.resize_(sz);
output_features.zero_();
}
if (input_features.ndimension() == 2) {
auto _rules = m.getSparseToDenseRuleBook(inputSize, true);
Int _nPlanes = input_features.size(1);
auto iF = input_features.data<T>();
auto oF = output_features.data<T>();
long spatialVolume = inputSize.prod().data<long>()[0];
for (auto &r : _rules) {
Int nHot = r.size() / 2;
SparseToDense_ForwardPass<T>(iF, oF, _nPlanes, spatialVolume, &r[0],
nHot);
oF += _nPlanes * spatialVolume;
}
}
}
template <typename T, Int Dimension>
void cpu_SparseToDense_updateGradInput(
/*long*/ at::Tensor inputSize, Metadata<Dimension> &m,
/*float*/ at::Tensor input_features,
/*float*/ at::Tensor d_input_features,
/*float*/ at::Tensor d_output_features) {
d_input_features.resize_as_(input_features);
d_input_features.zero_();
if (input_features.ndimension() == 2) {
auto _rules = m.getSparseToDenseRuleBook(inputSize, true);
long spatialVolume = inputSize.prod().data<long>()[0];
Int _nPlanes = d_input_features.size(1);
auto diF = d_input_features.data<T>();
auto doF = d_output_features.data<T>();
for (auto &r : _rules) {
Int nHot = r.size() / 2;
SparseToDense_BackwardPass<T>(diF, doF, _nPlanes, spatialVolume, &r[0],
nHot);
doF += _nPlanes * spatialVolume;
}
}
}
@@ -6,29 +6,29 @@
#ifndef CPU_SPARSETODENSE_H
#define CPU_SPARSETODENSE_H
#include "../SparseConvNet.h"
template <typename T>
void SparseToDense_ForwardPass(T *input_features, T *output_features,
- uInt nPlanes, uInt spatialVolume, uInt *rules,
+ Int nPlanes, Int spatialVolume, Int *rules,
int nHot) {
- for (uInt outSite = 0; outSite < nHot; outSite++) {
+ for (Int outSite = 0; outSite < nHot; outSite++) {
T *i = input_features + rules[2 * outSite] * nPlanes;
T *o = output_features + rules[2 * outSite + 1];
- for (uInt plane = 0; plane < nPlanes; plane++)
+ for (Int plane = 0; plane < nPlanes; plane++)
o[plane * spatialVolume] = i[plane];
}
}
template <typename T>
void SparseToDense_BackwardPass(T *d_input_features, T *d_output_features,
- uInt nPlanes, uInt spatialVolume, uInt *rules,
+ Int nPlanes, Int spatialVolume, Int *rules,
int nHot) {
- for (uInt outSite = 0; outSite < nHot; outSite++) {
+ for (Int outSite = 0; outSite < nHot; outSite++) {
T *d_i = d_input_features + rules[2 * outSite] * nPlanes;
T *d_o = d_output_features + rules[2 * outSite + 1];
- for (uInt plane = 0; plane < nPlanes; plane++)
+ for (Int plane = 0; plane < nPlanes; plane++)
d_i[plane] = d_o[plane * spatialVolume];
}
}
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "UnPooling.h"
template <typename T, Int Dimension>
void cpu_UnPooling_updateOutput(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor poolSize,
/*long*/ at::Tensor poolStride, Metadata<Dimension> &m,
/*float*/ at::Tensor input_features,
/*float*/ at::Tensor output_features, long nFeaturesToDrop) {
Int nPlanes = input_features.size(1) - nFeaturesToDrop;
auto _rules =
m.getRuleBook(outputSize, inputSize, poolSize, poolStride, true);
Int nActive = m.getNActive(outputSize);
output_features.resize_({nActive, input_features.size(1) - nFeaturesToDrop});
output_features.zero_();
auto iF = input_features.data<T>() + nFeaturesToDrop;
auto oF = output_features.data<T>();
for (auto &r : _rules) {
Int nHot = r.size() / 2;
UnPooling_ForwardPass<T>(iF, oF, nPlanes, input_features.size(1),
output_features.size(1), &r[0], nHot);
}
}
template <typename T, Int Dimension>
void cpu_UnPooling_updateGradInput(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor poolSize,
/*long*/ at::Tensor poolStride, Metadata<Dimension> &m,
/*float*/ at::Tensor input_features,
/*float*/ at::Tensor d_input_features,
/*float*/ at::Tensor d_output_features, long nFeaturesToDrop) {
Int nPlanes = input_features.size(1) - nFeaturesToDrop;
auto _rules =
m.getRuleBook(outputSize, inputSize, poolSize, poolStride, true);
d_input_features.resize_as_(input_features);
d_input_features.zero_();
auto diF = d_input_features.data<T>() + nFeaturesToDrop;
auto doF = d_output_features.data<T>();
for (auto &r : _rules) {
Int nHot = r.size() / 2;
UnPooling_BackwardPass<T>(diF, doF, nPlanes, input_features.size(1),
d_output_features.size(1), &r[0], nHot);
}
}
@@ -6,27 +6,27 @@
#ifndef CPU_UNPOOLING_H
#define CPU_UNPOOLING_H
#include "../SparseConvNet.h"
template <typename T>
- void UnPooling_ForwardPass(T *input_features, T *output_features, uInt nPlanes,
- uInt input_stride, uInt output_stride, uInt *rules,
- uInt nHot, uInt filterVolume) {
- for (uInt outSite = 0; outSite < nHot; outSite++) {
- uInt i = rules[2 * outSite + 1] * input_stride;
- uInt o = rules[2 * outSite] * output_stride;
- for (uInt plane = 0; plane < nPlanes; plane++)
+ void UnPooling_ForwardPass(T *input_features, T *output_features, Int nPlanes,
+ Int input_stride, Int output_stride, Int *rules,
+ Int nHot) {
+ for (Int outSite = 0; outSite < nHot; outSite++) {
+ Int i = rules[2 * outSite + 1] * input_stride;
+ Int o = rules[2 * outSite] * output_stride;
+ for (Int plane = 0; plane < nPlanes; plane++)
output_features[o + plane] += input_features[i + plane];
}
}
template <typename T>
void UnPooling_BackwardPass(T *d_input_features, T *d_output_features,
- uInt nPlanes, uInt input_stride, uInt output_stride,
- uInt *rules, uInt nHot, uInt filterVolume) {
- for (uInt outSite = 0; outSite < nHot; outSite++) {
- uInt i = rules[2 * outSite + 1] * input_stride;
- uInt o = rules[2 * outSite] * output_stride;
- for (uInt plane = 0; plane < nPlanes; plane++)
+ Int nPlanes, Int input_stride, Int output_stride,
+ Int *rules, Int nHot) {
+ for (Int outSite = 0; outSite < nHot; outSite++) {
+ Int i = rules[2 * outSite + 1] * input_stride;
+ Int o = rules[2 * outSite] * output_stride;
+ for (Int plane = 0; plane < nPlanes; plane++)
d_input_features[i + plane] += d_output_features[o + plane];
}
}
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "ActivePooling.h"
template <typename T, Int Dimension>
void cuda_ActivePooling_updateOutput(
/*long*/ at::Tensor inputSize, Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features, bool average) {
Int nPlanes = input_features.size(1);
auto _rules = m.getActivePoolingRuleBook(inputSize);
Int batchSize = _rules[1][0];
Int maxActive = _rules[1][1];
output_features.resize_({batchSize, nPlanes});
output_features.zero_();
auto rulesBuffer = at::CUDA(at_kINT).tensor({1 << 22});
Int *rb = rulesBuffer.data<Int>();
Int rowBatchSize = std::min((Int)32768, (1 << 22) / (maxActive + 1));
assert(rowBatchSize > 0);
auto iF = input_features.data<T>();
auto oF = output_features.data<T>();
for (Int o = 0; o < batchSize; o += rowBatchSize) {
Int batchSize_ = std::min(rowBatchSize, (Int(batchSize - o)));
cudaMemcpy(rb, &_rules[0][o * (maxActive + 1)],
sizeof(Int) * (maxActive + 1) * batchSize_,
cudaMemcpyHostToDevice);
ActivePooling_ForwardPass<T>(iF, oF + o * nPlanes, batchSize_, maxActive,
nPlanes, rb, average);
}
}
template <typename T, Int Dimension>
void cuda_ActivePooling_updateGradInput(
/*long*/ at::Tensor inputSize, Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor d_input_features,
/*cuda float*/ at::Tensor d_output_features, bool average) {
Int nPlanes = input_features.size(1);
auto _rules = m.getActivePoolingRuleBook(inputSize);
Int batchSize = _rules[1][0];
Int maxActive = _rules[1][1];
d_input_features.resize_as_(input_features);
d_input_features.zero_();
auto rulesBuffer = at::CUDA(at_kINT).tensor({1 << 22});
Int *rb = rulesBuffer.data<Int>();
Int rowBatchSize = std::min((Int)32768, (1 << 22) / (maxActive + 1));
assert(rowBatchSize > 0);
auto diF = d_input_features.data<T>();
auto doF = d_output_features.data<T>();
for (Int o = 0; o < batchSize; o += rowBatchSize) {
Int batchSize_ = std::min(rowBatchSize, (Int(batchSize - o)));
cudaMemcpy(rb, &_rules[0][o * (maxActive + 1)],
sizeof(Int) * (maxActive + 1) * batchSize_,
cudaMemcpyHostToDevice);
ActivePooling_BackwardPass<T>(diF, doF + o * nPlanes, batchSize_, maxActive,
nPlanes, rb, average);
}
}
@@ -4,54 +4,52 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
- #ifndef GPU_ACTIVEPOOLING_H
- #define GPU_ACTIVEPOOLING_H
+ #ifndef CUDA_ACTIVEPOOLING_H
+ #define CUDA_ACTIVEPOOLING_H
template <typename T>
__global__ void ActivePooling_fp(T *input_features, T *output_features,
- uInt maxActive, uInt nPlanes, uInt *rules,
+ Int maxActive, Int nPlanes, Int *rules,
bool average) {
T *out = &output_features[blockIdx.x * nPlanes];
- uInt *r = &rules[blockIdx.x * (maxActive + 1)];
- uInt nActive = *r++;
+ Int *r = &rules[blockIdx.x * (maxActive + 1)];
+ Int nActive = *r++;
T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
while (nActive-- > 0) {
T *inp = &input_features[(*r++) * nPlanes];
- for (uInt plane = threadIdx.x; plane < nPlanes; plane += 32)
+ for (Int plane = threadIdx.x; plane < nPlanes; plane += 32)
out[plane] += inp[plane] * multiplier;
}
}
template <typename T>
void ActivePooling_ForwardPass(T *input_features, T *output_features,
- uInt batchSize, uInt maxActive, uInt nPlanes,
- uInt *rules, bool average) {
- uInt kernelBlockDim = std::min(nPlanes, (uInt)32);
- ActivePooling_fp<T> << <batchSize, kernelBlockDim, 0,
- THCState_getCurrentStream(state)>>>
- (input_features, output_features, maxActive, nPlanes, rules, average);
+ Int batchSize, Int maxActive, Int nPlanes,
+ Int *rules, bool average) {
+ Int kernelBlockDim = std::min(nPlanes, (Int)32);
+ ActivePooling_fp<T><<<batchSize, kernelBlockDim>>>(
+ input_features, output_features, maxActive, nPlanes, rules, average);
}
template <typename T>
__global__ void ActivePooling_bp(T *d_input_features, T *d_output_features,
- uInt maxActive, uInt nPlanes, uInt *rules,
+ Int maxActive, Int nPlanes, Int *rules,
bool average) {
T *out = &d_output_features[blockIdx.x * nPlanes];
- uInt *r = &rules[blockIdx.x * (maxActive + 1)];
- uInt nActive = *r++;
+ Int *r = &rules[blockIdx.x * (maxActive + 1)];
+ Int nActive = *r++;
T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
while (nActive-- > 0) {
T *inp = &d_input_features[(*r++) * nPlanes];
- for (uInt plane = threadIdx.x; plane < nPlanes; plane += 32)
+ for (Int plane = threadIdx.x; plane < nPlanes; plane += 32)
inp[plane] = out[plane] * multiplier;
}
}
template <typename T>
void ActivePooling_BackwardPass(T *d_input_features, T *d_output_features,
- uInt batchSize, uInt maxActive, uInt nPlanes,
- uInt *rules, bool average) {
- uInt kernelBlockDim = std::min(nPlanes, (uInt)32);
- ActivePooling_bp<T> << <batchSize, kernelBlockDim, 0,
- THCState_getCurrentStream(state)>>>
- (d_input_features, d_output_features, maxActive, nPlanes, rules, average);
+ Int batchSize, Int maxActive, Int nPlanes,
+ Int *rules, bool average) {
+ Int kernelBlockDim = std::min(nPlanes, (Int)32);
+ ActivePooling_bp<T><<<batchSize, kernelBlockDim>>>(
+ d_input_features, d_output_features, maxActive, nPlanes, rules, average);
}
- #endif /* GPU_ActivePOOLING_H */
+ #endif /* CUDA_ActivePOOLING_H */
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "AffineReluTrivialConvolution.h"
template <typename T>
double cuda_AffineReluTrivialConvolution_updateOutput(
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features,
/*cuda float*/ at::Tensor affineWeight,
/*cuda float*/ at::Tensor affineBias,
/*cuda float*/ at::Tensor convWeight) {
output_features.resize_({input_features.size(0), convWeight.size(1)});
dAffineReluTrivialConvolution_forward<T>(
input_features.data<T>(), output_features.data<T>(),
affineWeight.data<T>(), affineBias.data<T>(), convWeight.data<T>(),
convWeight.size(0), input_features.stride(0), convWeight.size(1),
output_features.size(1), input_features.size(0));
return input_features.size(0) * input_features.size(1) *
output_features.size(1);
}
template <typename T>
void cuda_AffineReluTrivialConvolution_backward(
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor d_input_features,
/*cuda float*/ at::Tensor d_output_features,
/*cuda float*/ at::Tensor affineWeight,
/*cuda float*/ at::Tensor d_affineWeight,
/*cuda float*/ at::Tensor affineBias,
/*cuda float*/ at::Tensor d_affineBias,
/*cuda float*/ at::Tensor convWeight,
/*cuda float*/ at::Tensor d_convWeight, bool additiveGrad) {
d_input_features.resize_as_(input_features);
dAffineReluTrivialConvolution_backward_dW<T>(
input_features.data<T>(), d_input_features.data<T>(),
d_output_features.data<T>(), affineWeight.data<T>(),
d_affineWeight.data<T>(), affineBias.data<T>(), d_affineBias.data<T>(),
convWeight.data<T>(), d_convWeight.data<T>(), convWeight.size(0),
input_features.stride(0), convWeight.size(1), d_output_features.stride(0),
input_features.size(0), additiveGrad);
}
@@ -4,18 +4,18 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
- #ifndef GPU_AFFINERELUTRIVIALCONVOLUTION_H
- #define GPU_AFFINERELUTRIVIALCONVOLUTION_H
+ #ifndef CUDA_AFFINERELUTRIVIALCONVOLUTION_H
+ #define CUDA_AFFINERELUTRIVIALCONVOLUTION_H
// check if A+B is faster than just B
// check if loading affineBias into shared memory is faster than loading
// multiple times (if not try 64,16 backwards case)
- template <typename T, uInt K, uInt V>
+ template <typename T, Int K, Int V>
__global__ void dAffineReluTrivialConvolution_forwardA(
T *inFeatures, T *outFeatures, T *affineWeight, T *affineBias,
- T *convWeight, uInt input_nPlanes, uInt input_stride, uInt output_nPlanes,
- uInt output_stride, uInt nActive) {
+ T *convWeight, Int input_nPlanes, Int input_stride, Int output_nPlanes,
+ Int output_stride, Int nActive) {
// nActive must be a multiple of K!!
// Input x Weight -> Output
@@ -24,9 +24,9 @@ __global__ void dAffineReluTrivialConvolution_forwardA(
// nActive x KM -> nActive x KN - parallel over N,nActive - loop over M
- uInt M = input_nPlanes / K;
+ Int M = input_nPlanes / K;
// N = gridDim.y == output_nPlanes/K
- uInt n = blockIdx.y;
+ Int n = blockIdx.y;
outFeatures += n * K;
convWeight += n * K;
@@ -35,7 +35,7 @@ __global__ void dAffineReluTrivialConvolution_forwardA(
__shared__ T AW[K];
__shared__ T AB[K];
__shared__ T CW[K][K];
- const uInt tx = threadIdx.x;
+ const Int tx = threadIdx.x;
int ty[V];
#pragma unroll
for (int v = 0; v < V; v++)
@@ -52,7 +52,7 @@ __global__ void dAffineReluTrivialConvolution_forwardA(
CW[ty[v]][tx] = convWeight[ty[v] * output_nPlanes + tx];
__syncthreads();
- for (uInt s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
+ for (Int s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
// Read input, do affine + relu, set O[]
#pragma unroll
for (int v = 0; v < V; v++) {
@@ -82,20 +82,20 @@ __global__ void dAffineReluTrivialConvolution_forwardA(
inFeatures += K;
}
}
- template <typename T, uInt K, uInt V>
+ template <typename T, Int K, Int V>
__global__ void dAffineReluTrivialConvolution_forwardB(
T *inFeatures, T *outFeatures, T *affineWeight, T *affineBias,
- T *convWeight, uInt input_nPlanes, uInt input_stride, uInt output_nPlanes,
- uInt output_stride, uInt nActive) {
+ T *convWeight, Int input_nPlanes, Int input_stride, Int output_nPlanes,
+ Int output_stride, Int nActive) {
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// K is a multiple of V,
// nActive x KM -> nActive x KN - parallel over N,nActive - loop over M
- uInt M = input_nPlanes / K;
+ Int M = input_nPlanes / K;
// N = gridDim.y == output_nPlanes/K
- uInt n = blockIdx.y;
+ Int n = blockIdx.y;
outFeatures += n * K;
convWeight += n * K;
@@ -104,7 +104,7 @@ __global__ void dAffineReluTrivialConvolution_forwardB(
__shared__ T AW[K];
__shared__ T AB[K];
__shared__ T CW[K][K];
- const uInt tx = threadIdx.x;
+ const Int tx = threadIdx.x;
int ty[V];
#pragma unroll
for (int v = 0; v < V; v++)
@@ -121,7 +121,7 @@ __global__ void dAffineReluTrivialConvolution_forwardB(
CW[ty[v]][tx] = convWeight[ty[v] * output_nPlanes + tx];
__syncthreads();
- for (uInt s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
+ for (Int s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
// Read input, do affine + relu, set O[]
#pragma unroll
for (int v = 0; v < V; v++) {
@@ -158,20 +158,19 @@ __global__ void dAffineReluTrivialConvolution_forwardB(
#define FOO(T, K, V) \
{ \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
- uInt o = (nActive / K) * K; \
+ Int o = (nActive / K) * K; \
if (o > 0) \
- dAffineReluTrivialConvolution_forwardA<T, K, V> << < \
- dim3(std::min(o / K, (uInt)512), output_nPlanes / K), \
- dim3(K, K / V), 0, THCState_getCurrentStream(state)>>> \
- (inFeatures, outFeatures, affineWeight, affineBias, convWeight, \
+ dAffineReluTrivialConvolution_forwardA< \
+ T, K, V><<<dim3(std::min(o / K, (Int)512), output_nPlanes / K), \
+ dim3(K, K / V)>>>( \
+ inFeatures, outFeatures, affineWeight, affineBias, convWeight, \
input_nPlanes, input_stride, output_nPlanes, output_stride, o); \
if (nActive > o) \
dAffineReluTrivialConvolution_forwardB<T, K, V> << < \
dim3(1, output_nPlanes / K), dim3(K, K / V), 0, \
THCState_getCurrentStream(state)>>> \
(inFeatures + o * input_stride, outFeatures + o * output_stride, \
affineWeight, affineBias, convWeight, input_nPlanes, \
input_stride, output_nPlanes, output_stride, nActive - o); \
dAffineReluTrivialConvolution_forwardB< \
T, K, V><<<dim3(1, output_nPlanes / K), dim3(K, K / V)>>>( \
inFeatures + o * input_stride, outFeatures + o * output_stride, \
affineWeight, affineBias, convWeight, input_nPlanes, input_stride, \
output_nPlanes, output_stride, nActive - o); \
return; \
} \
}
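// Worked example of the dispatch above (illustrative values): with K=32 and
// nActive=100, o = (100/32)*32 = 96, so _forwardA covers rows [0,96) using at
// most 512 blocks along x and _forwardB mops up the remaining 4 rows.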
......@@ -179,10 +178,10 @@ __global__ void dAffineReluTrivialConvolution_forwardB(
template <typename T>
void dAffineReluTrivialConvolution_forward(T *inFeatures, T *outFeatures,
T *affineWeight, T *affineBias,
T *convWeight, uInt input_nPlanes,
uInt input_stride,
uInt output_nPlanes,
uInt output_stride, uInt nActive) {
T *convWeight, Int input_nPlanes,
Int input_stride,
Int output_nPlanes,
Int output_stride, Int nActive) {
FOO(T, 64, 16)
FOO(T, 32, 8)
......@@ -193,8 +192,8 @@ void dAffineReluTrivialConvolution_forward(T *inFeatures, T *outFeatures,
template <>
void dAffineReluTrivialConvolution_forward<double>(
double *inFeatures, double *outFeatures, double *affineWeight,
double *affineBias, double *convWeight, uInt input_nPlanes,
uInt input_stride, uInt output_nPlanes, uInt output_stride, uInt nActive) {
double *affineBias, double *convWeight, Int input_nPlanes,
Int input_stride, Int output_nPlanes, Int output_stride, Int nActive) {
FOO(double, 32, 8)
FOO(double, 16, 4)
......@@ -206,15 +205,15 @@ void dAffineReluTrivialConvolution_forward<double>(
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
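// Illustrative numbers (not from the source): with K=32, V=8 and
// input_nPlanes=64, gridDim.y = M = 2; each 32x4 block of threads produces a
// K-wide slice of dInput and accumulates K x K tiles of dConvWeight while
// looping over the N = output_nPlanes/K column blocks.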
template <typename T, uInt K, uInt V>
template <typename T, Int K, Int V>
__global__ void dAffineReluTrivialConvolution_backward_dW_A(
T *inFeatures, T *dInFeatures, T *dOutFeatures, T *affineWeight,
T *dAffineWeight, T *affineBias, T *dAffineBias, T *convWeight,
T *dConvWeight, uInt input_nPlanes, uInt input_stride, uInt output_nPlanes,
uInt output_stride, uInt nActive, bool additiveGrad) {
T *dConvWeight, Int input_nPlanes, Int input_stride, Int output_nPlanes,
Int output_stride, Int nActive, bool additiveGrad) {
// M = gridDim.y == input_nPlanes / K
uInt N = output_nPlanes / K;
uInt m = blockIdx.y;
Int N = output_nPlanes / K;
Int m = blockIdx.y;
inFeatures += m * K;
dInFeatures += m * K;
convWeight += m * K * output_nPlanes;
......@@ -234,7 +233,7 @@ __global__ void dAffineReluTrivialConvolution_backward_dW_A(
__shared__ T AW[K];
__shared__ T AB[K];
__shared__ T CW[K][K];
const uInt tx = threadIdx.x;
const Int tx = threadIdx.x;
int ty[V];
#pragma unroll
for (int v = 0; v < V; v++)
......@@ -253,7 +252,7 @@ __global__ void dAffineReluTrivialConvolution_backward_dW_A(
}
__syncthreads();
for (uInt s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
for (Int s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
#pragma unroll
for (int v = 0; v < V; v++)
dI[v] = 0;
......@@ -303,15 +302,15 @@ __global__ void dAffineReluTrivialConvolution_backward_dW_A(
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template <typename T, uInt K, uInt V>
template <typename T, Int K, Int V>
__global__ void dAffineReluTrivialConvolution_backward_dW_B(
T *inFeatures, T *dInFeatures, T *dOutFeatures, T *affineWeight,
T *dAffineWeight, T *affineBias, T *dAffineBias, T *convWeight,
T *dConvWeight, uInt input_nPlanes, uInt input_stride, uInt output_nPlanes,
uInt output_stride, uInt nActive, bool additiveGrad) {
T *dConvWeight, Int input_nPlanes, Int input_stride, Int output_nPlanes,
Int output_stride, Int nActive, bool additiveGrad) {
// M = gridDim.y == input_nPlanes / K
uInt N = output_nPlanes / K;
uInt m = blockIdx.y;
Int N = output_nPlanes / K;
Int m = blockIdx.y;
inFeatures += m * K;
dInFeatures += m * K;
convWeight += m * K * output_nPlanes;
......@@ -331,7 +330,7 @@ __global__ void dAffineReluTrivialConvolution_backward_dW_B(
__shared__ T AW[K];
__shared__ T AB[K];
__shared__ T CW[K][K];
const uInt tx = threadIdx.x;
const Int tx = threadIdx.x;
int ty[V];
#pragma unroll
for (int v = 0; v < V; v++)
......@@ -350,7 +349,7 @@ __global__ void dAffineReluTrivialConvolution_backward_dW_B(
}
__syncthreads();
for (uInt s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
for (Int s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
#pragma unroll
for (int v = 0; v < V; v++)
dI[v] = 0;
......@@ -406,20 +405,19 @@ __global__ void dAffineReluTrivialConvolution_backward_dW_B(
#define FOO(T, K, V) \
{ \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
uInt o = (nActive / K) * K; \
Int o = (nActive / K) * K; \
if (o > 0) \
dAffineReluTrivialConvolution_backward_dW_A<T, K, V> << < \
dim3(std::min(o / K, (uInt)512), input_nPlanes / K), \
dim3(K, K / V), 0, THCState_getCurrentStream(state)>>> \
(inFeatures, dInFeatures, dOutFeatures, affineWeight, \
dAffineReluTrivialConvolution_backward_dW_A< \
T, K, V><<<dim3(std::min(o / K, (Int)512), input_nPlanes / K), \
dim3(K, K / V)>>>( \
inFeatures, dInFeatures, dOutFeatures, affineWeight, \
dAffineWeight, affineBias, dAffineBias, convWeight, dConvWeight, \
input_nPlanes, input_stride, output_nPlanes, output_stride, o, \
additiveGrad); \
if (nActive > o) \
dAffineReluTrivialConvolution_backward_dW_B<T, K, V> << < \
dim3(1, input_nPlanes / K), dim3(K, K / V), 0, \
THCState_getCurrentStream(state)>>> \
(inFeatures + o * input_stride, dInFeatures + o * input_stride, \
dAffineReluTrivialConvolution_backward_dW_B< \
T, K, V><<<dim3(1, input_nPlanes / K), dim3(K, K / V)>>>( \
inFeatures + o * input_stride, dInFeatures + o * input_stride, \
dOutFeatures + o * output_stride, affineWeight, dAffineWeight, \
affineBias, dAffineBias, convWeight, dConvWeight, input_nPlanes, \
input_stride, output_nPlanes, output_stride, nActive - o, \
......@@ -432,8 +430,8 @@ template <typename T>
void dAffineReluTrivialConvolution_backward_dW(
T *inFeatures, T *dInFeatures, T *dOutFeatures, T *affineWeight,
T *dAffineWeight, T *affineBias, T *dAffineBias, T *convWeight,
T *dConvWeight, uInt input_nPlanes, uInt input_stride, uInt output_nPlanes,
uInt output_stride, uInt nActive, bool additiveGrad) {
T *dConvWeight, Int input_nPlanes, Int input_stride, Int output_nPlanes,
Int output_stride, Int nActive, bool additiveGrad) {
FOO(T, 32, 8)
FOO(T, 16, 4)
FOO(T, 8, 2)
......
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "AveragePooling.h"
#include "RuleBookIterator.h"
template <typename T, Int Dimension>
void cuda_AveragePooling_updateOutput(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor poolSize,
/*long*/ at::Tensor poolStride, Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features, long nFeaturesToDrop) {
Int nPlanes = input_features.size(1) - nFeaturesToDrop;
auto _rules =
m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
Int nActive = m.getNActive(outputSize);
output_features.resize_({nActive, input_features.size(1) - nFeaturesToDrop});
output_features.zero_();
auto iF = input_features.data<T>() + nFeaturesToDrop;
auto oF = output_features.data<T>();
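// Assumed contract of RULEBOOKITERATOR (see RuleBookIterator.h): it expands
// its first argument once per rulebook group, binding rbB to that group's
// device-side (input,output) index pairs and nHotB to the pair count; the
// second (empty) argument runs after each group.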
RULEBOOKITERATOR(cuda_AveragePooling_ForwardPass<T>(
iF, oF, nPlanes, input_features.size(1),
output_features.size(1), rbB, nHotB, _rules.size());
, )
}
template <typename T, Int Dimension>
void cuda_AveragePooling_updateGradInput(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor poolSize,
/*long*/ at::Tensor poolStride, Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor d_input_features,
/*cuda float*/ at::Tensor d_output_features, long nFeaturesToDrop) {
Int nPlanes = input_features.size(1) - nFeaturesToDrop;
auto _rules =
m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
d_input_features.resize_as_(input_features);
d_input_features.zero_();
auto diF = d_input_features.data<T>() + nFeaturesToDrop;
auto doF = d_output_features.data<T>();
RULEBOOKITERATOR(cuda_AveragePooling_BackwardPass<T>(
diF, doF, nPlanes, input_features.size(1),
d_output_features.size(1), rbB, nHotB, _rules.size());
, )
}
......@@ -4,27 +4,27 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef GPU_AVERAGEPOOLING_H
#define GPU_AVERAGEPOOLING_H
#ifndef CUDA_AVERAGEPOOLING_H
#define CUDA_AVERAGEPOOLING_H
// NTX must be >=2 so r is filled properly
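// (a block has NTX*NTY threads and must load all NTY*2 rule entries into r
// in a single pass, so NTX*NTY >= 2*NTY, i.e. NTX >= 2)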
template <typename T, uInt NTX, uInt NTY>
template <typename T, Int NTX, Int NTY>
__global__ void AveragePooling_fp(T *input_features, T *output_features,
uInt nPlanes, uInt input_stride,
uInt output_stride, uInt *rules, uInt nHot,
Int nPlanes, Int input_stride,
Int output_stride, Int *rules, Int nHot,
T alpha) {
__shared__ uInt r[NTY * 2];
for (uInt n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
__shared__ Int r[NTY * 2];
for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
{
uInt i = threadIdx.x + NTX * threadIdx.y;
if (i < NTY * 2 and i < 2 * (n - nHot))
Int i = threadIdx.x + NTX * threadIdx.y;
if (i < NTY * 2 and i < 2 * (nHot - n))
r[i] = rules[2 * n + i];
}
__syncthreads();
if (n + threadIdx.y < nHot) {
uInt i = r[2 * threadIdx.y] * input_stride;
uInt o = r[2 * threadIdx.y + 1] * output_stride;
for (uInt plane = threadIdx.x; plane < nPlanes; plane += NTX)
Int i = r[2 * threadIdx.y] * input_stride;
Int o = r[2 * threadIdx.y + 1] * output_stride;
for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX)
atomicAdd(&output_features[o + plane],
alpha * input_features[i + plane]);
}
......@@ -33,31 +33,31 @@ __global__ void AveragePooling_fp(T *input_features, T *output_features,
}
template <typename T>
void AveragePooling_ForwardPass(cudaStream_t stream, T *input_features,
T *output_features, uInt nPlanes,
uInt input_stride, uInt output_stride,
uInt *rules, uInt nHot, uInt filterVolume) {
AveragePooling_fp<T, 32, 32><<<32, dim3(32, 32), 0, stream>>>(
void cuda_AveragePooling_ForwardPass(T *input_features, T *output_features,
Int nPlanes, Int input_stride,
Int output_stride, Int *rules, Int nHot,
Int filterVolume) {
AveragePooling_fp<T, 32, 32><<<32, dim3(32, 32)>>>(
input_features, output_features, nPlanes, input_stride, output_stride,
rules, nHot, 1.0 / filterVolume);
}
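// Passing alpha = 1/filterVolume makes the atomicAdd accumulation in
// AveragePooling_fp compute sum(active inputs) / filterVolume per output
// site, i.e. the pool average with absent input sites counted as zero.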
template <typename T, uInt NTX, uInt NTY>
template <typename T, Int NTX, Int NTY>
__global__ void AveragePooling_bp(T *d_input_features, T *d_output_features,
uInt nPlanes, uInt input_stride,
uInt output_stride, uInt *rules, uInt nHot,
Int nPlanes, Int input_stride,
Int output_stride, Int *rules, Int nHot,
T alpha) {
__shared__ uInt r[NTY * 2];
for (uInt n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
__shared__ Int r[NTY * 2];
for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
{
uInt i = threadIdx.x + NTX * threadIdx.y;
if (i < NTY * 2 and i < 2 * (n - nHot))
Int i = threadIdx.x + NTX * threadIdx.y;
if (i < NTY * 2 and i < 2 * (nHot - n))
r[i] = rules[2 * n + i];
}
__syncthreads();
if (n + threadIdx.y < nHot) {
uInt i = r[2 * threadIdx.y] * input_stride;
uInt o = r[2 * threadIdx.y + 1] * output_stride;
for (uInt plane = threadIdx.x; plane < nPlanes; plane += NTX)
Int i = r[2 * threadIdx.y] * input_stride;
Int o = r[2 * threadIdx.y + 1] * output_stride;
for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX)
d_input_features[i + plane] += alpha * d_output_features[o + plane];
}
__syncthreads();
......@@ -65,12 +65,12 @@ __global__ void AveragePooling_bp(T *d_input_features, T *d_output_features,
}
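// Note: unlike the forward kernel, AveragePooling_bp updates d_input_features
// with a plain += rather than atomicAdd; this is safe only if each input row
// appears at most once within a rulebook group (assumed from how the rules
// are grouped by filter offset).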
template <typename T>
void AveragePooling_BackwardPass(cudaStream_t stream, T *d_input_features,
T *d_output_features, uInt nPlanes,
uInt input_stride, uInt output_stride,
uInt *rules, uInt nHot, uInt filterVolume) {
AveragePooling_bp<T, 32, 32><<<32, dim3(32, 32), 0, stream>>>(
void cuda_AveragePooling_BackwardPass(T *d_input_features, T *d_output_features,
Int nPlanes, Int input_stride,
Int output_stride, Int *rules,
Int nHot, Int filterVolume) {
AveragePooling_bp<T, 32, 32><<<32, dim3(32, 32)>>>(
d_input_features, d_output_features, nPlanes, input_stride, output_stride,
rules, nHot, 1.0 / filterVolume);
}
#endif /* GPU_AVERAGEPOOLING_H */
#endif /* CUDA_AVERAGEPOOLING_H */