Commit de3743f6 authored by Benjamin Thomas Graham's avatar Benjamin Thomas Graham
Browse files

Factor out CUDA code

parent f0407b36
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CUDA_BATCHWISEMULTIPLICATIVEDROPOUT_H
#define CUDA_BATCHWISEMULTIPLICATIVEDROPOUT_H
// Forward pass of batchwise multiplicative dropout fused with a leaky-ReLU
// style rectifier:
//   out[row, plane] = in[row, plane] * noise[plane] * (in > 0 ? 1 : alpha)
// One noise value per feature plane; each x-tile of NTX planes stages its
// noise values in shared memory (loaded by the threadIdx.y == 0 row).
// Launch assumption: blockDim == (NTX, NTY) -- TODO confirm at the call site.
template <typename T, Int NTX, Int NTY>
__global__ void BatchwiseMultiplicativeDropout_fp(T *input_features,
T *output_features, T *noise,
Int nActive, Int nPlanes,
Int input_stride,
Int output_stride, T alpha) {
// Per-plane noise for the NTX planes handled by this x-tile.
__shared__ T nz[NTX];
for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
plane += gridDim.x * NTX) {
if (threadIdx.y == 0)
nz[threadIdx.x] = noise[plane];
__syncthreads(); // publish nz[] to all rows of the block
for (Int row = threadIdx.y + blockIdx.y * NTY; row < nActive;
row += gridDim.y * NTY) {
Int i = row * input_stride + plane;
Int o = row * output_stride + plane;
output_features[o] = input_features[i] * nz[threadIdx.x] *
((input_features[i] > 0) ? 1 : alpha);
}
__syncthreads(); // protect nz[] before the next tile overwrites it
}
}
// Backward pass of the fused dropout + rectifier above: the upstream
// gradient is scaled by the same factor noise[plane] * (in > 0 ? 1 : alpha)
// that was applied in the forward pass. Shared-memory staging and barrier
// placement mirror the forward kernel; both __syncthreads() are outside any
// divergent branch, so all threads reach them.
template <typename T, Int NTX, Int NTY>
__global__ void
BatchwiseMultiplicativeDropout_bp(T *input_features, T *d_input_features,
T *d_output_features, T *noise, Int nActive,
Int nPlanes, Int input_stride,
Int output_stride, T alpha) {
// Per-plane noise for the NTX planes handled by this x-tile.
__shared__ T nz[NTX];
for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
plane += gridDim.x * NTX) {
if (threadIdx.y == 0)
nz[threadIdx.x] = noise[plane];
__syncthreads(); // publish nz[] to all rows of the block
for (Int row = threadIdx.y + blockIdx.y * NTY; row < nActive;
row += gridDim.y * NTY) {
Int i = row * input_stride + plane;
Int o = row * output_stride + plane;
d_input_features[i] = d_output_features[o] * nz[threadIdx.x] *
((input_features[i] > 0) ? 1 : alpha);
}
__syncthreads(); // protect nz[] before the next tile overwrites it
}
}
#endif /* CUDA_BATCHWISEMULTIPLICATIVEDROPOUT_H */
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
// Apply bias b over the nActiveOut x nPlanes output matrix `of`
// (presumably a broadcast add of b across rows -- confirm in the .cu file).
template <typename T>
void Convolution_fp_bias(T *of, T *b, Int nPlanes, Int nActiveOut);
// Bias-gradient reduction of `matrix` into `target`.
// NOTE(review): nColumns vs nCOLUMNS looks like logical width vs row stride;
// every call site in this file passes the same value for both -- confirm at
// the definition.
template <typename T>
void Convolution_bp_bias(T *matrix, T *target, Int nRows, Int nColumns,
Int nCOLUMNS);
// Rulebook-driven convolution forward driver. The double return value is
// presumably a multiply-accumulate count for FLOP accounting -- TODO confirm.
template <typename T>
double dConvolution_forward2(T *inFeatures, T *outFeatures, T *w,
RuleBook _rules, Int input_nPlanes,
Int input_stride, Int output_nPlanes,
Int output_stride);
// Backward driver accumulating both d(input) and d(weight) in one pass.
template <typename T>
void dConvolution_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
T *w, T *dw, RuleBook _rules, Int input_nPlanes,
Int input_stride, Int output_nPlanes,
Int output_stride);
// Forward sparse convolution. Builds (or fetches) the rulebook, sizes the
// output to {nActiveOut, outPlanes}, seeds it with the bias (or zeros), then
// runs the rulebook-driven GEMM. Returns dConvolution_forward2's result
// (0 when there are no active output sites).
template <typename T, Int Dimension>
double cuda_Convolution_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize,
    /*long*/ at::Tensor filterStride, Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features, /*cuda float*/ at::Tensor weight,
    /*cuda float*/ at::Tensor bias) {
  auto rb =
      m.getRuleBook(inputSize, outputSize, filterSize, filterStride, true);
  const Int nActiveOut = m.getNActive(outputSize);
  if (!nActiveOut)
    return 0; // no active output sites: nothing to compute
  const Int inPlanes = weight.size(1);
  const Int outPlanes = weight.size(2);
  output_features.resize_({nActiveOut, outPlanes});
  // Seed the output with the bias if present, otherwise with zeros.
  if (bias.numel())
    Convolution_fp_bias(output_features.data<T>(), bias.data<T>(), outPlanes,
                        nActiveOut);
  else
    output_features.zero_();
  return dConvolution_forward2<T>(input_features.data<T>(),
                                  output_features.data<T>(), weight.data<T>(),
                                  rb, inPlanes, inPlanes, outPlanes, outPlanes);
}
// Backward sparse convolution: fills d_input_features ({nActiveIn, inPlanes},
// zero-initialised), accumulates d_weight, and reduces the bias gradient
// when d_bias is non-empty. No-op when there are no active output sites.
template <typename T, Int Dimension>
void cuda_Convolution_backward(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize,
    /*long*/ at::Tensor filterStride, Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features,
    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor d_weight,
    /*cuda float*/ at::Tensor d_bias) {
  auto rb =
      m.getRuleBook(inputSize, outputSize, filterSize, filterStride, true);
  const Int nActiveIn = m.getNActive(inputSize);
  const Int nActiveOut = m.getNActive(outputSize);
  if (!nActiveOut)
    return; // nothing active downstream: gradients stay untouched
  const Int inPlanes = weight.size(1);
  const Int outPlanes = weight.size(2);
  d_input_features.resize_({nActiveIn, inPlanes});
  d_input_features.zero_();
  // One pass computes both d(input) and d(weight).
  dConvolution_backward_dW2<T>(
      input_features.data<T>(), d_input_features.data<T>(),
      d_output_features.data<T>(), weight.data<T>(), d_weight.data<T>(), rb,
      inPlanes, inPlanes, outPlanes, outPlanes);
  if (d_bias.numel())
    Convolution_bp_bias(d_output_features.data<T>(), d_bias.data<T>(),
                        outPlanes, outPlanes, nActiveOut);
}
// Forward submanifold convolution: the output keeps the input's active-site
// set, so the rulebook needs only input size and filter size. Returns the
// GEMM driver's result, or 0 when the input has no active sites.
template <typename T, Int Dimension>
double cuda_SubmanifoldConvolution_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor filterSize,
    Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features, /*cuda float*/ at::Tensor weight,
    /*cuda float*/ at::Tensor bias) {
  auto rb = m.getSubmanifoldRuleBook(inputSize, filterSize, true);
  const Int nActive = m.getNActive(inputSize);
  if (!nActive)
    return 0; // empty input: nothing to compute
  const Int inPlanes = weight.size(1);
  const Int outPlanes = weight.size(2);
  output_features.resize_({nActive, outPlanes});
  // Seed the output with the bias if present, otherwise with zeros.
  if (bias.numel())
    Convolution_fp_bias(output_features.data<T>(), bias.data<T>(), outPlanes,
                        nActive);
  else
    output_features.zero_();
  return dConvolution_forward2<T>(input_features.data<T>(),
                                  output_features.data<T>(), weight.data<T>(),
                                  rb, inPlanes, inPlanes, outPlanes, outPlanes);
}
// Backward submanifold convolution: input and output share the same active
// set, so both gradients use nActive rows. No-op when nothing is active.
template <typename T, Int Dimension>
void cuda_SubmanifoldConvolution_backward(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor filterSize,
    Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features,
    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor d_weight,
    /*cuda float*/ at::Tensor d_bias) {
  auto rb = m.getSubmanifoldRuleBook(inputSize, filterSize, true);
  const Int nActive = m.getNActive(inputSize);
  if (!nActive)
    return;
  const Int inPlanes = weight.size(1);
  const Int outPlanes = weight.size(2);
  d_input_features.resize_({nActive, inPlanes});
  d_input_features.zero_();
  // One pass computes both d(input) and d(weight).
  dConvolution_backward_dW2<T>(
      input_features.data<T>(), d_input_features.data<T>(),
      d_output_features.data<T>(), weight.data<T>(), d_weight.data<T>(), rb,
      inPlanes, inPlanes, outPlanes, outPlanes);
  if (d_bias.numel())
    Convolution_bp_bias(d_output_features.data<T>(), d_bias.data<T>(),
                        outPlanes, outPlanes, nActive);
}
// Forward "full" convolution: input and output live in different metadata
// objects (mIn / mOut); the rulebook is built across the pair. Returns the
// GEMM driver's result, or 0 when the output has no active sites.
template <typename T, Int Dimension>
double cuda_FullConvolution_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize,
    /*long*/ at::Tensor filterStride, Metadata<Dimension> &mIn,
    Metadata<Dimension> &mOut,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features, /*cuda float*/ at::Tensor weight,
    /*cuda float*/ at::Tensor bias) {
  auto rb = mIn.getFullConvolutionRuleBook(inputSize, outputSize, filterSize,
                                           filterStride, mOut);
  const Int nActiveOut = mOut.getNActive(outputSize);
  if (!nActiveOut)
    return 0;
  const Int inPlanes = weight.size(1);
  const Int outPlanes = weight.size(2);
  output_features.resize_({nActiveOut, outPlanes});
  // Seed the output with the bias if present, otherwise with zeros.
  if (bias.numel())
    Convolution_fp_bias(output_features.data<T>(), bias.data<T>(), outPlanes,
                        nActiveOut);
  else
    output_features.zero_();
  return dConvolution_forward2<T>(input_features.data<T>(),
                                  output_features.data<T>(), weight.data<T>(),
                                  rb, inPlanes, inPlanes, outPlanes, outPlanes);
}
// Backward "full" convolution: active counts come from the two metadata
// objects (nActiveIn from mIn, nActiveOut from mOut). No-op when the output
// has no active sites.
template <typename T, Int Dimension>
void cuda_FullConvolution_backward(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize,
    /*long*/ at::Tensor filterStride, Metadata<Dimension> &mIn,
    Metadata<Dimension> &mOut,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features,
    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor d_weight,
    /*cuda float*/ at::Tensor d_bias) {
  auto rb = mIn.getFullConvolutionRuleBook(inputSize, outputSize, filterSize,
                                           filterStride, mOut);
  const Int nActiveIn = mIn.getNActive(inputSize);
  const Int nActiveOut = mOut.getNActive(outputSize);
  if (!nActiveOut)
    return;
  const Int inPlanes = weight.size(1);
  const Int outPlanes = weight.size(2);
  d_input_features.resize_({nActiveIn, inPlanes});
  d_input_features.zero_();
  // One pass computes both d(input) and d(weight).
  dConvolution_backward_dW2<T>(
      input_features.data<T>(), d_input_features.data<T>(),
      d_output_features.data<T>(), weight.data<T>(), d_weight.data<T>(), rb,
      inPlanes, inPlanes, outPlanes, outPlanes);
  if (d_bias.numel())
    Convolution_bp_bias(d_output_features.data<T>(), d_bias.data<T>(),
                        outPlanes, outPlanes, nActiveOut);
}
// Forward convolution with randomized strides; identical to the regular
// forward pass except the rulebook comes from getRandomizedStrideRuleBook.
template <typename T, Int Dimension>
double cuda_RandomizedStrideConvolution_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize,
    /*long*/ at::Tensor filterStride, Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features,
    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor bias) {
  auto rb = m.getRandomizedStrideRuleBook(inputSize, outputSize, filterSize,
                                          filterStride, true);
  const Int nActiveOut = m.getNActive(outputSize);
  if (!nActiveOut)
    return 0;
  const Int inPlanes = weight.size(1);
  const Int outPlanes = weight.size(2);
  output_features.resize_({nActiveOut, outPlanes});
  // Seed the output with the bias if present, otherwise with zeros.
  if (bias.numel())
    Convolution_fp_bias(output_features.data<T>(), bias.data<T>(), outPlanes,
                        nActiveOut);
  else
    output_features.zero_();
  return dConvolution_forward2<T>(input_features.data<T>(),
                                  output_features.data<T>(), weight.data<T>(),
                                  rb, inPlanes, inPlanes, outPlanes, outPlanes);
}
// Backward convolution with randomized strides; mirrors the regular backward
// pass with the randomized-stride rulebook.
template <typename T, Int Dimension>
void cuda_RandomizedStrideConvolution_backward(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize,
    /*long*/ at::Tensor filterStride, Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features,
    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor d_weight,
    /*cuda float*/ at::Tensor d_bias) {
  auto rb = m.getRandomizedStrideRuleBook(inputSize, outputSize, filterSize,
                                          filterStride, true);
  const Int nActiveIn = m.getNActive(inputSize);
  const Int nActiveOut = m.getNActive(outputSize);
  if (!nActiveOut)
    return;
  const Int inPlanes = weight.size(1);
  const Int outPlanes = weight.size(2);
  d_input_features.resize_({nActiveIn, inPlanes});
  d_input_features.zero_();
  // One pass computes both d(input) and d(weight).
  dConvolution_backward_dW2<T>(
      input_features.data<T>(), d_input_features.data<T>(),
      d_output_features.data<T>(), weight.data<T>(), d_weight.data<T>(), rb,
      inPlanes, inPlanes, outPlanes, outPlanes);
  if (d_bias.numel())
    Convolution_bp_bias(d_output_features.data<T>(), d_bias.data<T>(),
                        outPlanes, outPlanes, nActiveOut);
}
This diff is collapsed.
This diff is collapsed.
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
// Rulebook-driven deconvolution forward driver (defined in the .cu file).
// The double return value is presumably a multiply-accumulate count for FLOP
// accounting, matching dConvolution_forward2 -- TODO confirm.
template <typename T>
double dDeconvolution_forward2(T *inFeatures, T *outFeatures, T *w,
RuleBook _rules, Int input_nPlanes,
Int input_stride, Int output_nPlanes,
Int output_stride);
// Backward driver accumulating both d(input) and d(weight) in one pass.
template <typename T>
void dDeconvolution_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
T *w, T *dw, RuleBook _rules,
Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride);
// Forward deconvolution. Note the rulebook is requested with outputSize and
// inputSize swapped: deconvolution reuses the convolution rulebook with the
// input/output roles exchanged.
template <typename T, Int Dimension>
double cuda_Deconvolution_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize,
    /*long*/ at::Tensor filterStride, Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features, /*cuda float*/ at::Tensor weight,
    /*cuda float*/ at::Tensor bias) {
  auto rb =
      m.getRuleBook(outputSize, inputSize, filterSize, filterStride, true);
  const Int nActiveOut = m.getNActive(outputSize);
  if (!nActiveOut)
    return 0;
  const Int inPlanes = weight.size(1);
  const Int outPlanes = weight.size(2);
  output_features.resize_({nActiveOut, outPlanes});
  // Seed the output with the bias if present, otherwise with zeros.
  if (bias.numel())
    Convolution_fp_bias(output_features.data<T>(), bias.data<T>(), outPlanes,
                        nActiveOut);
  else
    output_features.zero_();
  return dDeconvolution_forward2<T>(input_features.data<T>(),
                                    output_features.data<T>(), weight.data<T>(),
                                    rb, inPlanes, inPlanes, outPlanes,
                                    outPlanes);
}
// Backward deconvolution; like the forward pass, the rulebook is built with
// outputSize and inputSize swapped relative to the convolution case.
template <typename T, Int Dimension>
void cuda_Deconvolution_backward(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize,
    /*long*/ at::Tensor filterStride, Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features,
    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor d_weight,
    /*cuda float*/ at::Tensor d_bias) {
  auto rb =
      m.getRuleBook(outputSize, inputSize, filterSize, filterStride, true);
  const Int nActiveIn = m.getNActive(inputSize);
  const Int nActiveOut = m.getNActive(outputSize);
  if (!nActiveOut)
    return;
  const Int inPlanes = weight.size(1);
  const Int outPlanes = weight.size(2);
  d_input_features.resize_({nActiveIn, inPlanes});
  d_input_features.zero_();
  // One pass computes both d(input) and d(weight).
  dDeconvolution_backward_dW2<T>(
      input_features.data<T>(), d_input_features.data<T>(),
      d_output_features.data<T>(), weight.data<T>(), d_weight.data<T>(), rb,
      inPlanes, inPlanes, outPlanes, outPlanes);
  if (d_bias.numel())
    Convolution_bp_bias(d_output_features.data<T>(), d_bias.data<T>(),
                        outPlanes, outPlanes, nActiveOut);
}
This diff is collapsed.
This diff is collapsed.
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
// Gather forward: for each of nRows output rows, sum (or average, when
// `average` is true) the input rows listed in its rule. rules_cpu points at
// host memory; rules_gpu is a device buffer the implementation presumably
// stages the rules into -- confirm in the defining .cu file.
template <typename T>
void InputLayer_fp(T *input_features, T *output_features, Int nRows,
Int maxActive, Int nPlanes, Int *rules_cpu, Int *rules_gpu,
bool average);
// Backward counterpart: scatter-add each output row's gradient back to its
// source input rows.
template <typename T>
void InputLayer_bp(T *d_input_features, T *d_output_features, Int nRows,
Int maxActive, Int nPlanes, Int *rules_cpu, Int *rules_gpu,
bool average);
// Input layer forward pass: registers the coordinates with the metadata
// (building m.inputLayerRuleBook), then either copies the features through
// (mode 0) or gathers them into the sparse layout. mode == 4 averages the
// gathered rows instead of summing.
template <typename T, Int Dimension>
void cuda_InputLayer_updateOutput(Metadata<Dimension> &m,
                                  /*long*/ at::Tensor spatialSize,
                                  /*long*/ at::Tensor input_coords,
                                  /*cuda float*/ at::Tensor input_features,
                                  /*cuda float*/ at::Tensor output_features,
                                  long batchSize, long mode) {
  m.inputLayer(spatialSize, input_coords, batchSize, mode);
  auto &rules = m.inputLayerRuleBook;
  if (mode == 0) {
    // Mode 0: identity copy.
    output_features.resize_as_(input_features);
    output_features.copy_(input_features);
    return;
  }
  Int nPlanes = input_features.size(1);
  Int maxActive = rules[0][1];
  Int nRows = rules[0][3];
  output_features.resize_({*m.inputNActive, nPlanes});
  output_features.zero_();
  // Scratch device buffer for the rules.
  auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
  InputLayer_fp<T>(input_features.data<T>(), output_features.data<T>(), nRows,
                   maxActive, nPlanes, &rules[1][0], rulesBuffer.data<Int>(),
                   mode == 4);
}
// Input layer backward pass: scatter-adds the output gradient back to the
// original input rows using the rulebook built in the forward pass.
template <typename T, Int Dimension>
void cuda_InputLayer_updateGradInput(
    Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features) {
  auto &rules = m.inputLayerRuleBook;
  auto mode = rules[0][0];
  if (mode == 0) {
    // The forward pass was a plain copy; so is the backward pass.
    d_input_features.resize_as_(d_output_features);
    d_input_features.copy_(d_output_features);
    return;
  }
  Int nPlanes = d_output_features.size(1);
  Int maxActive = rules[0][1];
  Int nRows = rules[0][3];
  // rules[0][2] is presumably the original number of input rows -- confirm
  // against the rulebook layout.
  d_input_features.resize_({rules[0][2], nPlanes});
  d_input_features.zero_();
  auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
  InputLayer_bp(d_input_features.data<T>(), d_output_features.data<T>(), nRows,
                maxActive, nPlanes, &rules[1][0], rulesBuffer.data<Int>(),
                mode == 4);
}
// Output layer forward pass: the transpose of the input layer. Sparse
// features are scattered back to the original rows, so the forward pass
// reuses the input layer's *backward* kernel, never averaging.
template <typename T, Int Dimension>
void cuda_OutputLayer_updateOutput(Metadata<Dimension> &m,
                                   /*cuda float*/ at::Tensor input_features,
                                   /*cuda float*/ at::Tensor output_features) {
  auto &rules = m.inputLayerRuleBook;
  auto mode = rules[0][0];
  if (mode == 0) {
    output_features.resize_as_(input_features);
    output_features.copy_(input_features);
    return;
  }
  Int nPlanes = input_features.size(1);
  auto maxActive = rules[0][1];
  auto nRows = rules[0][3];
  output_features.resize_({rules[0][2], nPlanes});
  output_features.zero_();
  auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
  // Note the swapped roles: output is the scatter destination.
  InputLayer_bp(output_features.data<T>(), input_features.data<T>(), nRows,
                maxActive, nPlanes, &rules[1][0], rulesBuffer.data<Int>(),
                false);
}
// Output layer backward pass: gathers the dense gradient back into the
// sparse layout using the input layer's *forward* kernel, never averaging.
template <typename T, Int Dimension>
void cuda_OutputLayer_updateGradInput(
    Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features) {
  auto &rules = m.inputLayerRuleBook;
  auto mode = rules[0][0];
  if (mode == 0) {
    d_input_features.resize_as_(d_output_features);
    d_input_features.copy_(d_output_features);
    return;
  }
  Int nPlanes = d_output_features.size(1);
  auto maxActive = rules[0][1];
  auto nRows = rules[0][3];
  d_input_features.resize_({nRows, nPlanes});
  d_input_features.zero_();
  auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
  InputLayer_fp<T>(d_output_features.data<T>(), d_input_features.data<T>(),
                   nRows, maxActive, nPlanes, &rules[1][0],
                   rulesBuffer.data<Int>(), false);
}
// Batch+length ("BL") input layer forward pass: converts dense
// (batch, length, nPlanes) features into the sparse active-site layout.
// NOTE(review): the resize_/zero_ before the mode check is redundant in
// mode 0, where the copy overwrites it; the statement order is preserved
// as-is because resize_/copy_/resize_ sequencing is load-bearing.
template <typename T, Int Dimension>
void cuda_BLInputLayer_updateOutput(Metadata<Dimension> &m,
/*long*/ at::Tensor spatialSize,
/*long*/ at::Tensor input_coords,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features,
long mode) {
// Register the coordinates and build m.blLayerRuleBook.
m.blLayer(spatialSize, input_coords, mode);
Int nPlanes = input_features.size(2);
output_features.resize_({*m.inputNActive, nPlanes});
output_features.zero_();
auto &rules = m.blLayerRuleBook;
Int maxActive = rules[0][1];
Int nRows = rules[0][4];
if (mode == 0) {
// Mode 0: plain copy, reshaped to the sparse layout.
output_features.resize_as_(input_features);
output_features.copy_(input_features);
output_features.resize_({*m.inputNActive, nPlanes});
} else {
// Scratch device buffer for the rules.
auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
auto iF = input_features.data<T>();
auto oF = output_features.data<T>();
Int *rb = rulesBuffer.data<Int>();
// mode == 4 averages the gathered rows instead of summing.
InputLayer_fp<T>(iF, oF, nRows, maxActive, nPlanes, &rules[1][0], rb,
mode == 4);
}
}
// BL input layer backward pass: scatter-adds the sparse gradient back into
// the dense (batch, length, nPlanes) layout described by rules[0][2..3].
template <typename T, Int Dimension>
void cuda_BLInputLayer_updateGradInput(
    Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features) {
  auto &rules = m.blLayerRuleBook;
  Int nPlanes = d_output_features.size(1);
  Int mode = rules[0][0];
  Int maxActive = rules[0][1];
  Int nRows = rules[0][4];
  if (mode == 0) {
    // Plain copy, reshaped back to the dense layout.
    d_input_features.resize_as_(d_output_features);
    d_input_features.copy_(d_output_features);
    d_input_features.resize_({rules[0][2], rules[0][3], nPlanes});
    return;
  }
  d_input_features.resize_({rules[0][2], rules[0][3], nPlanes});
  d_input_features.zero_();
  auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
  InputLayer_bp(d_input_features.data<T>(), d_output_features.data<T>(), nRows,
                maxActive, nPlanes, &rules[1][0], rulesBuffer.data<Int>(),
                mode == 4);
}
// BL output layer forward pass: scatters sparse features back to the dense
// (batch, length, nPlanes) layout via the input layer's backward kernel,
// never averaging.
template <typename T, Int Dimension>
void cuda_BLOutputLayer_updateOutput(
    Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features) {
  auto &rules = m.blLayerRuleBook;
  Int nPlanes = input_features.size(1);
  auto mode = rules[0][0];
  Int maxActive = rules[0][1];
  Int nRows = rules[0][4];
  if (mode == 0) {
    // Plain copy, reshaped to the dense layout.
    output_features.resize_as_(input_features);
    output_features.copy_(input_features);
    output_features.resize_({rules[0][2], rules[0][3], nPlanes});
    return;
  }
  output_features.resize_({rules[0][2], rules[0][3], nPlanes});
  output_features.zero_();
  auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
  // Note the swapped roles: output is the scatter destination.
  InputLayer_bp(output_features.data<T>(), input_features.data<T>(), nRows,
                maxActive, nPlanes, &rules[1][0], rulesBuffer.data<Int>(),
                false);
}
// BL output layer backward pass: gathers the dense gradient back into the
// sparse {nRows, nPlanes} layout via the input layer's forward kernel.
template <typename T, Int Dimension>
void cuda_BLOutputLayer_updateGradInput(
    Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features) {
  auto &rules = m.blLayerRuleBook;
  Int nPlanes = d_output_features.size(2);
  Int mode = rules[0][0];
  Int maxActive = rules[0][1];
  Int nRows = rules[0][4];
  if (mode == 0) {
    d_input_features.resize_as_(d_output_features);
    d_input_features.copy_(d_output_features);
    d_input_features.resize_({nRows, nPlanes});
    return;
  }
  d_input_features.resize_({nRows, nPlanes});
  d_input_features.zero_();
  auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
  InputLayer_fp<T>(d_output_features.data<T>(), d_input_features.data<T>(),
                   nRows, maxActive, nPlanes, &rules[1][0],
                   rulesBuffer.data<Int>(), false);
}
This diff is collapsed.
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CUDA_IOLAYERS_H
#define CUDA_IOLAYERS_H
// Gather forward kernel: one block per output row (grid-stride over rows),
// threads striding over planes. Each row's rule is packed as
// [count, src0, src1, ...] with stride (1 + maxActive); the listed source
// rows are accumulated into the output row, averaged when `average` is set.
// Accumulates with += -- assumes output_features was pre-zeroed (the call
// sites in this file do zero it). No synchronisation needed: each output
// element is only written by one thread.
template <typename T>
__global__ void InputLayer_fp(T *input_features, T *output_features,
Int nRows, Int maxActive, Int nPlanes,
Int *rules, bool average) {
for (int row = blockIdx.x; row < nRows; row += gridDim.x) {
T *out = output_features + row * nPlanes;
Int *r = rules + row * (1 + maxActive);
Int nActive = r[0];
// Averaging divides by the source count, guarding against zero sources.
T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
for (int i = 1; i <= nActive; i++) {
T *inp = input_features + r[i] * nPlanes;
for (Int plane = threadIdx.x; plane < nPlanes; plane += blockDim.x)
out[plane] += multiplier * inp[plane];
}
}
}
// Backward counterpart of InputLayer_fp: scatter-adds each output row's
// gradient back to its source rows. atomicAdd is required because the same
// source row may be listed by several output rows.
template <typename T>
__global__ void InputLayer_bp(T *d_input_features, T *d_output_features,
Int nRows, Int maxActive, Int nPlanes,
Int *rules, bool average) {
for (int row = blockIdx.x; row < nRows; row += gridDim.x) {
T *out = d_output_features + row * nPlanes;
Int *r = rules + row * (1 + maxActive);
Int nActive = r[0];
// Same scaling as the forward pass so the gradient matches.
T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
for (int i = 1; i <= nActive; i++) {
T *inp = d_input_features + r[i] * nPlanes;
for (Int plane = threadIdx.x; plane < nPlanes; plane += blockDim.x)
atomicAdd(&inp[plane], multiplier * out[plane]);
}
}
}
#endif /* CUDA_IOLAYERS_H */
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
// Element-wise leaky ReLU launchers (defined in the corresponding .cu file).
template <typename T>
void LeakyReLU_fp(T *input_features, T *output_features, Int n, T alpha);
// Backward pass. NOTE(review): despite the name, the third parameter is the
// upstream gradient (d_output) at the call sites below -- confirm at the
// definition.
template <typename T>
void LeakyReLU_bp(T *input_features, T *d_input_features, T *output_features,
Int n, T alpha);
// Leaky ReLU forward: resize the output to match the input and apply the
// element-wise kernel launcher over all elements.
template <typename T>
void cuda_LeakyReLU_updateOutput(/*cuda float*/ at::Tensor input_features,
                                 /*cuda float*/ at::Tensor output_features,
                                 T alpha) {
  output_features.resize_as_(input_features);
  LeakyReLU_fp<T>(input_features.data<T>(), output_features.data<T>(),
                  input_features.numel(), alpha);
}
// Leaky ReLU backward: gate the upstream gradient on the sign of the
// forward-pass input.
template <typename T>
void cuda_LeakyReLU_updateGradInput(
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features, T alpha) {
  d_input_features.resize_as_(d_output_features);
  LeakyReLU_bp<T>(input_features.data<T>(), d_input_features.data<T>(),
                  d_output_features.data<T>(), d_input_features.numel(),
                  alpha);
}
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
// NOTE(review): this chunk was scrape-garbled (two diff columns interleaved);
// reconstructed from the new-version column of the diff.
#include "LeakyReLU.h"
// Strided element-wise leaky ReLU forward kernel; the 16 * 1024 stride
// matches the fixed <<<16, 1024>>> launch configuration below.
template <typename T>
__global__ void LeakyReLU_fp_(T *input_features, T *output_features, Int n,
                              T alpha) {
  for (Int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += 16 * 1024)
    output_features[i] = (input_features[i] > 0) ? input_features[i]
                                                 : (input_features[i] * alpha);
}
// Host launcher for the forward kernel.
template <typename T>
void LeakyReLU_fp(T *input_features, T *output_features, Int n, T alpha) {
  LeakyReLU_fp_<T><<<16, 1024>>>(input_features, output_features, n, alpha);
}
// Backward kernel: pass the upstream gradient through where the forward
// input was positive, scale it by alpha elsewhere.
template <typename T>
__global__ void LeakyReLU_bp_(T *input_features, T *d_input_features,
                              T *d_output_features, Int n, T alpha) {
  for (Int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += 16 * 1024)
    d_input_features[i] = (input_features[i] > 0)
                              ? d_output_features[i]
                              : (d_output_features[i] * alpha);
}
// Host launcher for the backward kernel (third parameter is the upstream
// gradient, matching the forward declaration's name).
template <typename T>
void LeakyReLU_bp(T *input_features, T *d_input_features, T *output_features,
                  Int n, T alpha) {
  LeakyReLU_bp_<T><<<16, 1024>>>(input_features, d_input_features,
                                 output_features, n, alpha);
}
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef LEAKYRELU_H
#define LEAKYRELU_H
// Element-wise leaky ReLU: out = in if in > 0, else alpha * in.
// The 16 * 1024 stride matches the fixed <<<16, 1024>>> launch used by the
// callers of this header.
template <typename T>
__global__ void LeakyReLU_fp(T *input_features, T *output_features, Int n,
                             T alpha) {
  for (Int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n;
       idx += 16 * 1024) {
    T v = input_features[idx];
    output_features[idx] = (v > 0) ? v : (v * alpha);
  }
}
// Backward leaky ReLU: pass the upstream gradient through where the forward
// input was positive, scale it by alpha elsewhere.
template <typename T>
__global__ void LeakyReLU_bp(T *input_features, T *d_input_features,
                             T *d_output_features, Int n, T alpha) {
  for (Int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n;
       idx += 16 * 1024) {
    T g = d_output_features[idx];
    d_input_features[idx] = (input_features[idx] > 0) ? g : (g * alpha);
  }
}
#endif
This diff is collapsed.
This diff is collapsed.
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CUDA_MAXPOOLING_H
#define CUDA_MAXPOOLING_H
// NTX must be >=2 so r is filled properly
// Max-pooling forward. `rules` holds nHot (inputRow, outputRow) pairs; each
// grid-stride iteration stages up to NTY pairs in shared memory (loaded
// cooperatively by the first 2*NTY flattened thread ids), then each y-thread
// handles one pair while x-threads stride over planes, folding the input row
// into the output row with a running element-wise max.
// NOTE(review): relies on output_features holding a valid initial value
// (e.g. a very negative fill or prior data) -- confirm at the call sites.
template <typename T, Int NTX, Int NTY>
__global__ void MaxPooling_fp(T *input_features, T *output_features,
Int nPlanes, Int input_stride, Int output_stride,
Int *rules, Int nHot) {
__shared__ Int r[NTY * 2];
for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
{
// Cooperative load of this tile's rule pairs, clamped to nHot - n.
Int i = threadIdx.x + NTX * threadIdx.y;
if (i < NTY * 2 and i < 2 * (nHot - n))
r[i] = rules[2 * n + i];
}
__syncthreads(); // publish r[] to the whole block
if (n + threadIdx.y < nHot) {
Int i = r[2 * threadIdx.y] * input_stride;
Int o = r[2 * threadIdx.y + 1] * output_stride;
for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX) {
T inp = input_features[i + plane];
if (output_features[o + plane] < inp)
output_features[o + plane] = inp;
}
}
__syncthreads(); // protect r[] before the next tile overwrites it
}
}
// Host launcher: fixed grid of 32 blocks of 32x32 threads; the kernel's
// grid-stride loops cover any nHot and nPlanes.
template <typename T>
void cuda_MaxPooling_ForwardPass(T *input_features, T *output_features,
Int nPlanes, Int input_stride,
Int output_stride, Int *rules, Int nHot) {
MaxPooling_fp<T, 32, 32><<<32, dim3(32, 32)>>>(
input_features, output_features, nPlanes, input_stride, output_stride,
rules, nHot);
}
// Max-pooling backward: for each (inputRow, outputRow) rule pair, route the
// output gradient back to the input elements that equal the pooled output
// (i.e. the argmax winners). Rule staging mirrors MaxPooling_fp.
// NOTE(review): the += on d_input_features is not atomic; this presumably
// assumes non-overlapping pooling regions (each input row appears in at most
// one rule) -- confirm against rulebook construction.
template <typename T, Int NTX, Int NTY>
__global__ void MaxPooling_bp(T *input_features, T *d_input_features,
T *output_features, T *d_output_features,
Int nPlanes, Int input_stride, Int output_stride,
Int *rules, Int nHot) {
__shared__ Int r[NTY * 2];
for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
{
// Cooperative load of this tile's rule pairs, clamped to nHot - n.
Int i = threadIdx.x + NTX * threadIdx.y;
if (i < NTY * 2 and i < 2 * (nHot - n))
r[i] = rules[2 * n + i];
}
__syncthreads(); // publish r[] to the whole block
if (n + threadIdx.y < nHot) {
Int i = r[2 * threadIdx.y] * input_stride;
Int o = r[2 * threadIdx.y + 1] * output_stride;
for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX)
if (output_features[o + plane] == input_features[i + plane])
d_input_features[i + plane] += d_output_features[o + plane];
}
__syncthreads(); // protect r[] before the next tile overwrites it
}
}
// Host launcher for the backward kernel; same fixed 32 x (32,32) launch as
// the forward pass.
template <typename T>
void cuda_MaxPooling_BackwardPass(T *input_features, T *d_input_features,
T *output_features, T *d_output_features,
Int nPlanes, Int input_stride,
Int output_stride, Int *rules, Int nHot) {
MaxPooling_bp<T, 32, 32><<<32, dim3(32, 32)>>>(
input_features, d_input_features, output_features, d_output_features,
nPlanes, input_stride, output_stride, rules, nHot);
}
#endif /* CUDA_MAXPOOLING_H */
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment