Commit de3743f6 authored by Benjamin Thomas Graham's avatar Benjamin Thomas Graham
Browse files

Factor out CUDA code

parent f0407b36
SparseConvNetTorch/build/ SparseConvNetTorch/build/
*.t7 *.pth
t7/
*.o *.o
*.a *.a
*.so *.so
...@@ -11,3 +10,5 @@ pickle ...@@ -11,3 +10,5 @@ pickle
PyTorch/sparseconvnet.egg-info/ PyTorch/sparseconvnet.egg-info/
PyTorch/sparseconvnet/SCN/__init__.py PyTorch/sparseconvnet/SCN/__init__.py
sparseconvnet.egg-info sparseconvnet.egg-info
*.zip
*.rar
#!/bin/bash #!/bin/bash
rm -rf build/ sparseconvnet.egg-info sparseconvnet_SCN*.so rm -rf build/ dist/ sparseconvnet.egg-info sparseconvnet_SCN*.so
python setup.py install python setup.py install
...@@ -47,7 +47,7 @@ p['initial_lr'] = 1e-1 ...@@ -47,7 +47,7 @@ p['initial_lr'] = 1e-1
p['lr_decay'] = 4e-2 p['lr_decay'] = 4e-2
p['weight_decay'] = 1e-4 p['weight_decay'] = 1e-4
p['momentum'] = 0.9 p['momentum'] = 0.9
p['check_point'] = True p['check_point'] = False
p['use_cuda'] = torch.cuda.is_available() p['use_cuda'] = torch.cuda.is_available()
dtype = 'torch.cuda.FloatTensor' if p['use_cuda'] else 'torch.FloatTensor' dtype = 'torch.cuda.FloatTensor' if p['use_cuda'] else 'torch.FloatTensor'
dtypei = 'torch.cuda.LongTensor' if p['use_cuda'] else 'torch.LongTensor' dtypei = 'torch.cuda.LongTensor' if p['use_cuda'] else 'torch.LongTensor'
......
...@@ -47,7 +47,7 @@ p['initial_lr'] = 1e-1 ...@@ -47,7 +47,7 @@ p['initial_lr'] = 1e-1
p['lr_decay'] = 4e-2 p['lr_decay'] = 4e-2
p['weight_decay'] = 1e-4 p['weight_decay'] = 1e-4
p['momentum'] = 0.9 p['momentum'] = 0.9
p['check_point'] = True p['check_point'] = False
p['use_cuda'] = torch.cuda.is_available() p['use_cuda'] = torch.cuda.is_available()
dtype = 'torch.cuda.FloatTensor' if p['use_cuda'] else 'torch.FloatTensor' dtype = 'torch.cuda.FloatTensor' if p['use_cuda'] else 'torch.FloatTensor'
dtypei = 'torch.cuda.LongTensor' if p['use_cuda'] else 'torch.LongTensor' dtypei = 'torch.cuda.LongTensor' if p['use_cuda'] else 'torch.LongTensor'
......
...@@ -4,8 +4,7 @@ ...@@ -4,8 +4,7 @@
# This source code is licensed under the license found in the # This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree. # LICENSE file in the root directory of this source tree.
import torch import torch, torch.utils.data
import torchnet
import sparseconvnet as scn import sparseconvnet as scn
import pickle import pickle
import math import math
......
...@@ -4,6 +4,7 @@ ...@@ -4,6 +4,7 @@
# This source code is licensed under the license found in the # This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree. # LICENSE file in the root directory of this source tree.
#!/bin/bash #!/bin/bash
set -e
wget https://archive.ics.uci.edu/ml/machine-learning-databases/00208/Online%20Handwritten%20Assamese%20Characters%20Dataset.rar wget https://archive.ics.uci.edu/ml/machine-learning-databases/00208/Online%20Handwritten%20Assamese%20Characters%20Dataset.rar
unrar e -cl -y "Online Handwritten Assamese Characters Dataset.rar" unrar e -cl -y "Online Handwritten Assamese Characters Dataset.rar"
mkdir tmp mkdir tmp
......
...@@ -24,12 +24,13 @@ setup( ...@@ -24,12 +24,13 @@ setup(
packages=['sparseconvnet','sparseconvnet.SCN'], packages=['sparseconvnet','sparseconvnet.SCN'],
ext_modules=[ ext_modules=[
CUDAExtension('sparseconvnet_SCN', CUDAExtension('sparseconvnet_SCN',
['sparseconvnet/SCN/pybind_cuda.cpp', 'sparseconvnet/SCN/instantiate_cpu.cpp', 'sparseconvnet/SCN/instantiate_cuda.cu'], [
'sparseconvnet/SCN/cuda.cu', 'sparseconvnet/SCN/sparseconvnet_cuda.cpp', 'sparseconvnet/SCN/pybind.cpp'],
include_dirs=[conda_include_dir, this_dir+'/sparseconvnet/SCN/'], include_dirs=[conda_include_dir, this_dir+'/sparseconvnet/SCN/'],
extra_compile_args=extra) extra_compile_args=extra)
if torch.cuda.is_available() else if torch.cuda.is_available() else
CppExtension('sparseconvnet_SCN', CppExtension('sparseconvnet_SCN',
['sparseconvnet/SCN/pybind_cpu.cpp', 'sparseconvnet/SCN/instantiate_cpu.cpp'], ['sparseconvnet/SCN/pybind.cpp', 'sparseconvnet/SCN/sparseconvnet_cpu.cpp'],
include_dirs=[conda_include_dir, this_dir+'/sparseconvnet/SCN/'], include_dirs=[conda_include_dir, this_dir+'/sparseconvnet/SCN/'],
extra_compile_args=extra['cxx'])], extra_compile_args=extra['cxx'])],
cmdclass={'build_ext': BuildExtension}, cmdclass={'build_ext': BuildExtension},
......
...@@ -4,7 +4,39 @@ ...@@ -4,7 +4,39 @@
// This source code is licensed under the license found in the // This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree. // LICENSE file in the root directory of this source tree.
#include "ActivePooling.h" // Assume output_features and d_input_features have been zero-ed
template <typename T>
void ActivePooling_ForwardPass(T *input_features, T *output_features,
Int batchSize, Int maxActive, Int nPlanes,
RuleBook &rules, bool average) {
for (Int outSite = 0; outSite < batchSize; outSite++) {
T *out = &output_features[outSite * nPlanes];
Int *r = &rules[0][outSite * (maxActive + 1)];
Int nActive = *r++;
T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
while (nActive-- > 0) {
T *inp = &input_features[(*r++) * nPlanes];
for (Int plane = 0; plane < nPlanes; plane++)
out[plane] += inp[plane] * multiplier;
}
}
}
template <typename T>
void ActivePooling_BackwardPass(T *d_input_features, T *d_output_features,
Int batchSize, Int maxActive, Int nPlanes,
RuleBook &rules, bool average) {
for (Int outSite = 0; outSite < batchSize; outSite++) {
T *out = &d_output_features[outSite * nPlanes];
Int *r = &rules[0][outSite * (maxActive + 1)];
Int nActive = *r++;
T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
while (nActive-- > 0) {
T *inp = &d_input_features[(*r++) * nPlanes];
for (Int plane = 0; plane < nPlanes; plane++)
inp[plane] = out[plane] * multiplier;
}
}
}
template <typename T, Int Dimension> template <typename T, Int Dimension>
void cpu_ActivePooling_updateOutput( void cpu_ActivePooling_updateOutput(
......
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CPU_ACTIVEPOOLING_H
#define CPU_ACTIVEPOOLING_H
// Assume output_features and d_input_features have been zero-ed
template <typename T>
void ActivePooling_ForwardPass(T *input_features, T *output_features,
Int batchSize, Int maxActive, Int nPlanes,
RuleBook &rules, bool average) {
for (Int outSite = 0; outSite < batchSize; outSite++) {
T *out = &output_features[outSite * nPlanes];
Int *r = &rules[0][outSite * (maxActive + 1)];
Int nActive = *r++;
T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
while (nActive-- > 0) {
T *inp = &input_features[(*r++) * nPlanes];
for (Int plane = 0; plane < nPlanes; plane++)
out[plane] += inp[plane] * multiplier;
}
}
}
template <typename T>
void ActivePooling_BackwardPass(T *d_input_features, T *d_output_features,
Int batchSize, Int maxActive, Int nPlanes,
RuleBook &rules, bool average) {
for (Int outSite = 0; outSite < batchSize; outSite++) {
T *out = &d_output_features[outSite * nPlanes];
Int *r = &rules[0][outSite * (maxActive + 1)];
Int nActive = *r++;
T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
while (nActive-- > 0) {
T *inp = &d_input_features[(*r++) * nPlanes];
for (Int plane = 0; plane < nPlanes; plane++)
inp[plane] = out[plane] * multiplier;
}
}
}
#endif /* CPU_ACTIVEPOOLING_H */
...@@ -4,7 +4,68 @@ ...@@ -4,7 +4,68 @@
// This source code is licensed under the license found in the // This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree. // LICENSE file in the root directory of this source tree.
#include "AffineReluTrivialConvolution.h" #include <cstring>
template <typename T>
void AffineReluTrivialConvolution_ForwardPass(
T *input_features, Int input_nPlanes, Int input_stride, T *output_features,
Int output_nPlanes, Int output_stride, T *affineWeight, T *affineBias,
T *convWeight, Int nActive) {
for (Int row = 0; row < nActive; row++) {
for (Int column = 0; column < output_nPlanes; column++) {
T sum = 0;
for (Int j = 0; j < input_nPlanes; j++) {
T i = input_features[row * input_stride + j] * affineWeight[j] +
affineBias[j];
i = (i > 0) ? i : 0;
sum += i * convWeight[j * output_nPlanes + column];
}
output_features[row * output_stride + column] = sum;
}
}
}
template <typename T>
void AffineReluTrivialConvolution_BackwardPass(
T *input_features, T *d_input_features, Int input_nPlanes, Int input_stride,
T *d_output_features, Int output_nPlanes, Int output_stride,
T *affineWeight, T *dAffineWeight, T *affineBias, T *dAffineBias,
T *convWeight, T *dConvWeight, Int nActive, bool additiveGrad) {
for (Int row = 0; row < input_nPlanes; row++) {
for (Int column = 0; column < output_nPlanes; column++) {
T sum = 0;
for (Int j = 0; j < nActive; j++) {
T i = input_features[j * input_stride + row] * affineWeight[row] +
affineBias[row];
i = (i > 0) ? i : 0;
sum += i * d_output_features[j * output_stride + column];
}
dConvWeight[row * output_nPlanes + column] += sum;
}
}
for (Int row = 0; row < nActive; row++) {
for (Int column = 0; column < input_nPlanes; column++) {
T sum = 0;
for (Int j = 0; j < output_nPlanes; j++) {
sum += d_output_features[row * output_stride + j] *
convWeight[column * output_nPlanes + j];
}
T i = input_features[row * input_stride + column] * affineWeight[column] +
affineBias[column];
if (i <= 0) // d_ReLU
sum = 0;
dAffineWeight[column] += sum * i;
dAffineBias[column] += sum;
sum *= affineWeight[column];
if (additiveGrad)
d_input_features[row * input_stride + column] += sum;
else
d_input_features[row * input_stride + column] = sum;
}
}
}
template <typename T> template <typename T>
double cpu_AffineReluTrivialConvolution_updateOutput( double cpu_AffineReluTrivialConvolution_updateOutput(
......
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CPU_AffineReluTrivialConvolution_H
#define CPU_AffineReluTrivialConvolution_H
#include <cstring>
// Fused per-plane affine transform + ReLU + 1x1 ("trivial") convolution:
//   output[row][col] = sum_j relu(input[row][j]*aw[j] + ab[j]) * convW[j][col]
// Strides allow input/output to be column subsets of wider matrices.
template <typename T>
void AffineReluTrivialConvolution_ForwardPass(
    T *input_features, Int input_nPlanes, Int input_stride, T *output_features,
    Int output_nPlanes, Int output_stride, T *affineWeight, T *affineBias,
    T *convWeight, Int nActive) {
  for (Int row = 0; row < nActive; row++) {
    T *in_row = input_features + row * input_stride;
    T *out_row = output_features + row * output_stride;
    for (Int column = 0; column < output_nPlanes; column++) {
      T acc = 0;
      for (Int j = 0; j < input_nPlanes; j++) {
        // Affine pre-activation, then ReLU.
        T pre = in_row[j] * affineWeight[j] + affineBias[j];
        T post = (pre > 0) ? pre : 0;
        acc += post * convWeight[j * output_nPlanes + column];
      }
      out_row[column] = acc;
    }
  }
}
// Backward pass of the fused affine+ReLU+1x1 convolution.
// Accumulates (+=) into dConvWeight / dAffineWeight / dAffineBias;
// d_input_features is either assigned or accumulated per additiveGrad.
template <typename T>
void AffineReluTrivialConvolution_BackwardPass(
    T *input_features, T *d_input_features, Int input_nPlanes, Int input_stride,
    T *d_output_features, Int output_nPlanes, Int output_stride,
    T *affineWeight, T *dAffineWeight, T *affineBias, T *dAffineBias,
    T *convWeight, T *dConvWeight, Int nActive, bool additiveGrad) {
  // Pass 1: dConvWeight = relu(affine(input))^T . d_output
  // (the affine+ReLU activations are recomputed rather than stored).
  for (Int row = 0; row < input_nPlanes; row++) {
    for (Int column = 0; column < output_nPlanes; column++) {
      T sum = 0;
      for (Int j = 0; j < nActive; j++) {
        T i = input_features[j * input_stride + row] * affineWeight[row] +
              affineBias[row];
        i = (i > 0) ? i : 0;
        sum += i * d_output_features[j * output_stride + column];
      }
      dConvWeight[row * output_nPlanes + column] += sum;
    }
  }
  // Pass 2: backprop d_output through convWeight, gate through the ReLU,
  // then split into affine-parameter and input gradients.
  for (Int row = 0; row < nActive; row++) {
    for (Int column = 0; column < input_nPlanes; column++) {
      T sum = 0;
      for (Int j = 0; j < output_nPlanes; j++) {
        sum += d_output_features[row * output_stride + j] *
               convWeight[column * output_nPlanes + j];
      }
      // Recompute the affine pre-activation for the ReLU gate.
      T i = input_features[row * input_stride + column] * affineWeight[column] +
            affineBias[column];
      if (i <= 0) // d_ReLU
        sum = 0;
      // NOTE(review): this uses the affine output `i` (= x*aw+ab), not the
      // raw input x, as the weight-gradient factor — verify this matches the
      // CUDA implementation's parameterisation before changing anything.
      dAffineWeight[column] += sum * i;
      dAffineBias[column] += sum;
      sum *= affineWeight[column];
      if (additiveGrad)
        d_input_features[row * input_stride + column] += sum;
      else
        d_input_features[row * input_stride + column] = sum;
    }
  }
}
#endif /* CPU_AffineReluTrivialConvolution_H */
...@@ -4,7 +4,31 @@ ...@@ -4,7 +4,31 @@
// This source code is licensed under the license found in the // This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree. // LICENSE file in the root directory of this source tree.
#include "AveragePooling.h" template <typename T>
void AveragePooling_ForwardPass(T *input_features, T *output_features,
Int nPlanes, Int input_stride,
Int output_stride, Int *rules, Int nHot,
Int filterVolume) {
for (Int outSite = 0; outSite < nHot; outSite++) {
Int i = rules[2 * outSite] * input_stride;
Int o = rules[2 * outSite + 1] * output_stride;
for (Int plane = 0; plane < nPlanes; plane++)
output_features[o + plane] += input_features[i + plane] / filterVolume;
}
}
template <typename T>
void AveragePooling_BackwardPass(T *d_input_features, T *d_output_features,
Int nPlanes, Int input_stride,
Int output_stride, Int *rules, Int nHot,
Int filterVolume) {
for (Int outSite = 0; outSite < nHot; outSite++) {
Int i = rules[2 * outSite] * input_stride;
Int o = rules[2 * outSite + 1] * output_stride;
for (Int plane = 0; plane < nPlanes; plane++)
d_input_features[i + plane] +=
d_output_features[o + plane] / filterVolume;
}
}
template <typename T, Int Dimension> template <typename T, Int Dimension>
void cpu_AveragePooling_updateOutput( void cpu_AveragePooling_updateOutput(
......
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CPU_AVERAGEPOOLING_H
#define CPU_AVERAGEPOOLING_H
// Average pooling over a rulebook of (input_row, output_row) pairs.
// Each contribution is pre-scaled by 1/filterVolume so the accumulated
// output is the mean over the (fixed-size) pooling window.
template <typename T>
void AveragePooling_ForwardPass(T *input_features, T *output_features,
                                Int nPlanes, Int input_stride,
                                Int output_stride, Int *rules, Int nHot,
                                Int filterVolume) {
  for (Int outSite = 0; outSite < nHot; outSite++) {
    T *src = input_features + rules[2 * outSite] * input_stride;
    T *dst = output_features + rules[2 * outSite + 1] * output_stride;
    for (Int plane = 0; plane < nPlanes; plane++)
      dst[plane] += src[plane] / filterVolume;
  }
}
// Backward pass of average pooling: scatter each output gradient back to
// its input site, scaled by 1/filterVolume. Accumulates, since one input
// site may feed several outputs.
template <typename T>
void AveragePooling_BackwardPass(T *d_input_features, T *d_output_features,
                                 Int nPlanes, Int input_stride,
                                 Int output_stride, Int *rules, Int nHot,
                                 Int filterVolume) {
  for (Int outSite = 0; outSite < nHot; outSite++) {
    T *grad_in = d_input_features + rules[2 * outSite] * input_stride;
    T *grad_out = d_output_features + rules[2 * outSite + 1] * output_stride;
    for (Int plane = 0; plane < nPlanes; plane++)
      grad_in[plane] += grad_out[plane] / filterVolume;
  }
}
#endif /* CPU_AVERAGEPOOLING_H */
...@@ -4,46 +4,125 @@ ...@@ -4,46 +4,125 @@
// This source code is licensed under the license found in the // This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree. // LICENSE file in the root directory of this source tree.
#include "BatchNormalization.h" #include <vector>
// in/output_stride is normally the same as nPlanes; allow other values to act
// on a subset of columns, i.e. an inplace DenseNet blocks
template <typename T> template <typename T>
void cpu_BatchNormalization_updateOutput( void BatchNormalization_ForwardPass(T *input_features, T *output_features,
/*float*/ at::Tensor input_features, /*float*/ at::Tensor output_features, Int nPlanes, Int input_stride,
/*float*/ at::Tensor saveMean, Int output_stride, Int nActive, T *saveMean,
/*float*/ at::Tensor saveInvStd, /*float*/ at::Tensor runningMean, T *saveInvStd, T *runningMean,
/*float*/ at::Tensor runningVar, T *runningVar, T *weight, T *bias, T eps,
/*float*/ at::Tensor weight, /*float*/ at::Tensor bias, T eps, T momentum, T momentum, bool train, T leakiness) {
bool train, T leakiness) { if (train) {
output_features.resize_as_(input_features); std::memset(saveMean, 0, nPlanes * sizeof(T));
if (input_features.ndimension() == 2) { std::memset(saveInvStd, 0, nPlanes * sizeof(T));
auto nActive = input_features.size(0); for (Int row = 0, ci = 0; row < nActive;
auto nPlanes = input_features.size(1); row++, ci += input_stride - nPlanes) {
auto input_stride = input_features.stride(0); for (Int plane = 0; plane < nPlanes; plane++, ci++) {
auto output_stride = output_features.stride(0); saveMean[plane] += input_features[ci];
BatchNormalization_ForwardPass<T>( }
input_features.data<T>(), output_features.data<T>(), nPlanes, }
input_stride, output_stride, nActive, saveMean.data<T>(), for (Int plane = 0; plane < nPlanes; plane++) {
saveInvStd.data<T>(), runningMean.data<T>(), runningVar.data<T>(), saveMean[plane] /= nActive;
OptionalTensorData<T>(weight), OptionalTensorData<T>(bias), eps, runningMean[plane] =
momentum, train, leakiness); momentum * runningMean[plane] + (1 - momentum) * saveMean[plane];
}
for (Int row = 0, ci = 0; row < nActive;
row++, ci += input_stride - nPlanes) {
for (Int plane = 0; plane < nPlanes; plane++, ci++) {
saveInvStd[plane] +=
(input_features[ci] - saveMean[plane]) *
(input_features[ci] - saveMean[plane]); // accumulate sum-squares
// before inverse square
// rooting
}
}
for (Int plane = 0; plane < nPlanes; plane++) {
runningVar[plane] = momentum * runningVar[plane] +
(1 - momentum) * saveInvStd[plane] / (nActive - 1);
saveInvStd[plane] = powf(saveInvStd[plane] / nActive + eps, -0.5);
}
} else {
for (Int plane = 0; plane < nPlanes; plane++) {
saveMean[plane] = runningMean[plane];
saveInvStd[plane] = powf(runningVar[plane] + eps, -0.5);
}
}
std::vector<T> w(nPlanes);
std::vector<T> b(nPlanes);
for (Int plane = 0; plane < nPlanes; plane++) {
w[plane] = saveInvStd[plane] * (weight ? weight[plane] : 1);
b[plane] = -saveMean[plane] * w[plane] + (bias ? bias[plane] : 0);
}
for (Int row = 0, ci = 0, co = 0; row < nActive;
row++, ci += input_stride - nPlanes, co += output_stride - nPlanes) {
for (Int plane = 0; plane < nPlanes; plane++, ci++, co++) {
T out = input_features[ci] * w[plane] + b[plane];
out = (out > 0) ? out : (out * leakiness);
output_features[co] = out;
}
}
}
template <typename T>
void BatchNormalization_BackwardPass(T *input_features, T *d_input_features,
T *output_features, T *d_output_features,
Int nPlanes, Int input_stride,
Int output_stride, Int nActive,
T *saveMean, T *saveInvStd, T *runningMean,
T *runningVar, T *weight, T *bias,
T *d_weight, T *d_bias, T leakiness) {
std::vector<T> gradMean(nPlanes);
std::vector<T> dotp(nPlanes);
std::vector<T> k(nPlanes);
for (Int row = 0, ci = 0, co = 0; row < nActive;
row++, ci += input_stride - nPlanes, co += output_stride - nPlanes) {
for (Int plane = 0; plane < nPlanes; plane++, ci++, co++) {
T d = d_output_features[co];
d = (output_features[co] > 0) ? d : (d * leakiness);
d_output_features[co] = d;
gradMean[plane] += d;
dotp[plane] += (input_features[ci] - saveMean[plane]) * d;
}
}
for (Int plane = 0; plane < nPlanes; plane++) {
if (d_bias)
d_bias[plane] = gradMean[plane]; // sum of grads, really, until ...
gradMean[plane] /= nActive; // ...now
k[plane] = dotp[plane] * saveInvStd[plane] * saveInvStd[plane] / nActive;
}
for (Int row = 0, ci = 0, co = 0; row < nActive;
row++, ci += input_stride - nPlanes, co += output_stride - nPlanes) {
for (Int plane = 0; plane < nPlanes; plane++, ci++, co++) {
d_input_features[ci] =
(d_output_features[co] - gradMean[plane] -
(input_features[ci] - saveMean[plane]) * k[plane]) *
saveInvStd[plane] * (weight ? weight[plane] : 1);
}
} }
if (d_weight)
for (Int plane = 0; plane < nPlanes; plane++) {
d_weight[plane] = dotp[plane] * saveInvStd[plane];
}
} }
template <typename T> template <typename T>
void cpu_BatchNormalizationInTensor_updateOutput( void cpu_BatchNormalization_updateOutput(
/*float*/ at::Tensor input_features, /*float*/ at::Tensor output_features, /*float*/ at::Tensor input_features, /*float*/ at::Tensor output_features,
/*float*/ at::Tensor saveMean, /*float*/ at::Tensor saveMean,
/*float*/ at::Tensor saveInvStd, /*float*/ at::Tensor runningMean, /*float*/ at::Tensor saveInvStd, /*float*/ at::Tensor runningMean,
/*float*/ at::Tensor runningVar, /*float*/ at::Tensor runningVar,
/*float*/ at::Tensor weight, /*float*/ at::Tensor bias, T eps, T momentum, /*float*/ at::Tensor weight, /*float*/ at::Tensor bias, T eps, T momentum,
bool train, T leakiness) { bool train, T leakiness) {
output_features.resize_as_(input_features);
if (input_features.ndimension() == 2) { if (input_features.ndimension() == 2) {
auto nActive = input_features.size(0); auto nActive = input_features.size(0);
auto nPlanes = input_features.size(1); auto nPlanes = input_features.size(1);
auto input_stride = input_features.stride(0); auto input_stride = input_features.stride(0);
auto output_stride = output_features.stride(0); auto output_stride = output_features.stride(0);
BatchNormalization_ForwardPass<T>( BatchNormalization_ForwardPass<T>(
input_features.data<T>(), output_features.data<T>(), nPlanes, input_features.data<T>(), output_features.data<T>(), nPlanes,
input_stride, output_stride, nActive, saveMean.data<T>(), input_stride, output_stride, nActive, saveMean.data<T>(),
......
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CPU_BATCHNORMALIZATION_H
#define CPU_BATCHNORMALIZATION_H
#include <cmath>
#include <cstring>
#include <vector>
// in/output_stride is normally the same as nPlanes; allow other values to act
// on a subset of columns, i.e. an inplace DenseNet blocks
// Batch normalisation forward pass fused with an affine transform and a
// leaky ReLU. In training mode also updates runningMean/runningVar and
// stores the batch statistics in saveMean/saveInvStd for the backward pass.
// weight and bias may be null (treated as 1 and 0 respectively).
template <typename T>
void BatchNormalization_ForwardPass(T *input_features, T *output_features,
                                    Int nPlanes, Int input_stride,
                                    Int output_stride, Int nActive,
                                    T *saveMean, T *saveInvStd, T *runningMean,
                                    T *runningVar, T *weight, T *bias, T eps,
                                    T momentum, bool train, T leakiness) {
  if (train) {
    std::memset(saveMean, 0, nPlanes * sizeof(T));
    std::memset(saveInvStd, 0, nPlanes * sizeof(T));
    // Per-plane input sums; ci jumps input_stride per row but only the
    // first nPlanes columns of each row are touched.
    for (Int row = 0, ci = 0; row < nActive;
         row++, ci += input_stride - nPlanes) {
      for (Int plane = 0; plane < nPlanes; plane++, ci++) {
        saveMean[plane] += input_features[ci];
      }
    }
    for (Int plane = 0; plane < nPlanes; plane++) {
      saveMean[plane] /= nActive;
      // NOTE(review): momentum weights the OLD running value here
      // (running = momentum*running + (1-momentum)*batch), the reverse of
      // PyTorch's BatchNorm convention — confirm callers pass it that way.
      runningMean[plane] =
          momentum * runningMean[plane] + (1 - momentum) * saveMean[plane];
    }
    for (Int row = 0, ci = 0; row < nActive;
         row++, ci += input_stride - nPlanes) {
      for (Int plane = 0; plane < nPlanes; plane++, ci++) {
        saveInvStd[plane] +=
            (input_features[ci] - saveMean[plane]) *
            (input_features[ci] - saveMean[plane]); // accumulate sum-squares
                                                    // before inverse square
                                                    // rooting
      }
    }
    for (Int plane = 0; plane < nPlanes; plane++) {
      // Running estimate uses the unbiased variance (nActive - 1); the
      // normalisation below uses the biased batch variance (nActive).
      runningVar[plane] = momentum * runningVar[plane] +
                          (1 - momentum) * saveInvStd[plane] / (nActive - 1);
      // NOTE(review): powf computes in float even when T is double.
      saveInvStd[plane] = powf(saveInvStd[plane] / nActive + eps, -0.5);
    }
  } else {
    // Inference: normalise with the stored running statistics.
    for (Int plane = 0; plane < nPlanes; plane++) {
      saveMean[plane] = runningMean[plane];
      saveInvStd[plane] = powf(runningVar[plane] + eps, -0.5);
    }
  }
  // Fold normalisation + learned affine into a single y = x*w + b per plane,
  // then apply the leaky ReLU.
  std::vector<T> w(nPlanes);
  std::vector<T> b(nPlanes);
  for (Int plane = 0; plane < nPlanes; plane++) {
    w[plane] = saveInvStd[plane] * (weight ? weight[plane] : 1);
    b[plane] = -saveMean[plane] * w[plane] + (bias ? bias[plane] : 0);
  }
  for (Int row = 0, ci = 0, co = 0; row < nActive;
       row++, ci += input_stride - nPlanes, co += output_stride - nPlanes) {
    for (Int plane = 0; plane < nPlanes; plane++, ci++, co++) {
      T out = input_features[ci] * w[plane] + b[plane];
      out = (out > 0) ? out : (out * leakiness);
      output_features[co] = out;
    }
  }
}
// Batch normalisation backward pass (training-mode statistics).
// Uses the saveMean/saveInvStd recorded by the forward pass. d_weight and
// d_bias may be null; weight may be null (treated as 1). WARNING: rewrites
// d_output_features in place with the leaky-ReLU-backpropagated gradients.
template <typename T>
void BatchNormalization_BackwardPass(T *input_features, T *d_input_features,
                                     T *output_features, T *d_output_features,
                                     Int nPlanes, Int input_stride,
                                     Int output_stride, Int nActive,
                                     T *saveMean, T *saveInvStd, T *runningMean,
                                     T *runningVar, T *weight, T *bias,
                                     T *d_weight, T *d_bias, T leakiness) {
  // Zero-initialised per-plane accumulators.
  std::vector<T> gradMean(nPlanes);
  std::vector<T> dotp(nPlanes);
  std::vector<T> k(nPlanes);
  // Sweep 1: backprop through the leaky ReLU (gated on the forward OUTPUT
  // sign) and accumulate per-plane gradient sums and the
  // centred-input/gradient dot products.
  for (Int row = 0, ci = 0, co = 0; row < nActive;
       row++, ci += input_stride - nPlanes, co += output_stride - nPlanes) {
    for (Int plane = 0; plane < nPlanes; plane++, ci++, co++) {
      T d = d_output_features[co];
      d = (output_features[co] > 0) ? d : (d * leakiness);
      d_output_features[co] = d;
      gradMean[plane] += d;
      dotp[plane] += (input_features[ci] - saveMean[plane]) * d;
    }
  }
  for (Int plane = 0; plane < nPlanes; plane++) {
    if (d_bias)
      d_bias[plane] = gradMean[plane]; // sum of grads, really, until ...
    gradMean[plane] /= nActive;        // ...now
    // k = dotp * invstd^2 / n, the coefficient of the variance term below.
    k[plane] = dotp[plane] * saveInvStd[plane] * saveInvStd[plane] / nActive;
  }
  // Sweep 2: standard BN input gradient
  //   dx = (dy - mean(dy) - (x - mean) * k) * invstd * weight
  for (Int row = 0, ci = 0, co = 0; row < nActive;
       row++, ci += input_stride - nPlanes, co += output_stride - nPlanes) {
    for (Int plane = 0; plane < nPlanes; plane++, ci++, co++) {
      d_input_features[ci] =
          (d_output_features[co] - gradMean[plane] -
           (input_features[ci] - saveMean[plane]) * k[plane]) *
          saveInvStd[plane] * (weight ? weight[plane] : 1);
    }
  }
  // Scale gradient: dot product of gradients with the normalised input.
  if (d_weight)
    for (Int plane = 0; plane < nPlanes; plane++) {
      d_weight[plane] = dotp[plane] * saveInvStd[plane];
    }
}
#endif /* CPU_BATCHNORMALIZATION_H */
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
template <typename T> template <typename T>
void cpu_BatchwiseMultiplicativeDropout_updateOutput( void cpu_BatchwiseMultiplicativeDropout_updateOutput(
/*float*/ at::Tensor input_features, /*float*/ at::Tensor output_features, /*float*/ at::Tensor input_features, /*float*/ at::Tensor output_features,
/*float*/ at::Tensor noise, float alpha) { /*float*/ at::Tensor noise, T alpha) {
output_features.resize_as_(input_features); output_features.resize_as_(input_features);
auto nActive = input_features.size(0); auto nActive = input_features.size(0);
auto nPlanes = input_features.size(1); auto nPlanes = input_features.size(1);
...@@ -23,7 +23,7 @@ template <typename T> ...@@ -23,7 +23,7 @@ template <typename T>
void cpu_BatchwiseMultiplicativeDropout_updateGradInput( void cpu_BatchwiseMultiplicativeDropout_updateGradInput(
/*float*/ at::Tensor input_features, /*float*/ at::Tensor d_input_features, /*float*/ at::Tensor input_features, /*float*/ at::Tensor d_input_features,
/*float*/ at::Tensor d_output_features, /*float*/ at::Tensor noise, /*float*/ at::Tensor d_output_features, /*float*/ at::Tensor noise,
float alpha) { T alpha) {
d_input_features.resize_as_(d_output_features); d_input_features.resize_as_(d_output_features);
auto nActive = input_features.size(0); auto nActive = input_features.size(0);
auto nPlanes = input_features.size(1); auto nPlanes = input_features.size(1);
......
...@@ -11,7 +11,9 @@ void rule_index_select(at::Tensor target, at::Tensor src, Int nRules, ...@@ -11,7 +11,9 @@ void rule_index_select(at::Tensor target, at::Tensor src, Int nRules,
auto t_ptr = target.data<T>(); auto t_ptr = target.data<T>();
auto s_ptr = src.data<T>(); auto s_ptr = src.data<T>();
auto n = target.size(1); auto n = target.size(1);
for (int i = 0; i < nRules; ++i) Int i;
#pragma omp parallel for private(i)
for (i = 0; i < nRules; ++i)
std::memcpy(t_ptr + i * n, s_ptr + rules[2 * i] * n, sizeof(T) * n); std::memcpy(t_ptr + i * n, s_ptr + rules[2 * i] * n, sizeof(T) * n);
} }
template <typename T> template <typename T>
...@@ -20,7 +22,9 @@ void rule_index_add_(at::Tensor target, at::Tensor src, Int nRules, ...@@ -20,7 +22,9 @@ void rule_index_add_(at::Tensor target, at::Tensor src, Int nRules,
auto t_ptr = target.data<T>(); auto t_ptr = target.data<T>();
auto s_ptr = src.data<T>(); auto s_ptr = src.data<T>();
auto n = target.size(1); auto n = target.size(1);
for (int i = 0; i < nRules; ++i) { Int i;
#pragma omp parallel for private(i)
for (i = 0; i < nRules; ++i) {
auto t = t_ptr + rules[2 * i] * n; auto t = t_ptr + rules[2 * i] * n;
auto s = s_ptr + i * n; auto s = s_ptr + i * n;
for (int j = 0; j < n; ++j) for (int j = 0; j < n; ++j)
......
...@@ -4,7 +4,43 @@ ...@@ -4,7 +4,43 @@
// This source code is licensed under the license found in the // This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree. // LICENSE file in the root directory of this source tree.
#include "IOLayers.h" #include <cstring>
// Assume output and d_input_features have been zero-ed
template <typename T>
void InputLayer_ForwardPass(T *input_features, T *output_features, Int nRows,
Int maxActive, Int nPlanes, Int *rules,
bool average) {
for (Int row = 0; row < nRows; row++) {
auto nActive = rules[0];
T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
for (Int i = 1; i <= nActive; ++i) {
auto in_f = input_features + nPlanes * rules[i];
for (Int plane = 0; plane < nPlanes; plane++) {
output_features[plane] += multiplier * in_f[plane];
}
}
output_features += nPlanes;
rules += 1 + maxActive;
}
}
template <typename T>
void InputLayer_BackwardPass(T *d_input_features, T *d_output_features,
Int nRows, Int maxActive, Int nPlanes, Int *rules,
bool average) {
for (Int row = 0; row < nRows; row++) {
auto nActive = rules[0];
T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
for (Int i = 1; i <= nActive; ++i) {
auto d_in_f = d_input_features + nPlanes * rules[i];
for (Int plane = 0; plane < nPlanes; plane++)
d_in_f[plane] += multiplier * d_output_features[plane];
}
d_output_features += nPlanes;
rules += 1 + maxActive;
}
}
template <typename T, Int Dimension> template <typename T, Int Dimension>
void cpu_InputLayer_updateOutput(Metadata<Dimension> &m, void cpu_InputLayer_updateOutput(Metadata<Dimension> &m,
...@@ -26,8 +62,8 @@ void cpu_InputLayer_updateOutput(Metadata<Dimension> &m, ...@@ -26,8 +62,8 @@ void cpu_InputLayer_updateOutput(Metadata<Dimension> &m,
output_features.resize_({*m.inputNActive, nPlanes}); output_features.resize_({*m.inputNActive, nPlanes});
output_features.zero_(); output_features.zero_();
InputLayer_ForwardPass<T>(input_features.data<T>(), InputLayer_ForwardPass<T>(input_features.data<T>(),
output_features.data<T>(), nRows, output_features.data<T>(), nRows, maxActive,
maxActive, nPlanes, &rules[1][0], mode == 4); nPlanes, &rules[1][0], mode == 4);
} }
} }
template <typename T, Int Dimension> template <typename T, Int Dimension>
...@@ -47,8 +83,8 @@ void cpu_InputLayer_updateGradInput(Metadata<Dimension> &m, ...@@ -47,8 +83,8 @@ void cpu_InputLayer_updateGradInput(Metadata<Dimension> &m,
d_input_features.resize_({rules[0][2], nPlanes}); d_input_features.resize_({rules[0][2], nPlanes});
d_input_features.zero_(); d_input_features.zero_();
InputLayer_BackwardPass<T>(d_input_features.data<T>(), InputLayer_BackwardPass<T>(d_input_features.data<T>(),
d_output_features.data<T>(), nRows, d_output_features.data<T>(), nRows, maxActive,
maxActive, nPlanes, &rules[1][0], mode == 4); nPlanes, &rules[1][0], mode == 4);
} }
} }
...@@ -69,8 +105,8 @@ void cpu_OutputLayer_updateOutput(Metadata<Dimension> &m, ...@@ -69,8 +105,8 @@ void cpu_OutputLayer_updateOutput(Metadata<Dimension> &m,
output_features.resize_({rules[0][2], nPlanes}); output_features.resize_({rules[0][2], nPlanes});
output_features.zero_(); output_features.zero_();
InputLayer_BackwardPass<T>(output_features.data<T>(), InputLayer_BackwardPass<T>(output_features.data<T>(),
input_features.data<T>(), nRows, input_features.data<T>(), nRows, maxActive,
maxActive, nPlanes, &rules[1][0], false); nPlanes, &rules[1][0], false);
} }
} }
template <typename T, Int Dimension> template <typename T, Int Dimension>
...@@ -90,8 +126,8 @@ void cpu_OutputLayer_updateGradInput(Metadata<Dimension> &m, ...@@ -90,8 +126,8 @@ void cpu_OutputLayer_updateGradInput(Metadata<Dimension> &m,
d_input_features.resize_({nRows, nPlanes}); d_input_features.resize_({nRows, nPlanes});
d_input_features.zero_(); d_input_features.zero_();
InputLayer_ForwardPass<T>(d_output_features.data<T>(), InputLayer_ForwardPass<T>(d_output_features.data<T>(),
d_input_features.data<T>(), nRows, d_input_features.data<T>(), nRows, maxActive,
maxActive, nPlanes, &rules[1][0], false); nPlanes, &rules[1][0], false);
} }
} }
...@@ -116,8 +152,8 @@ void cpu_BLInputLayer_updateOutput(Metadata<Dimension> &m, ...@@ -116,8 +152,8 @@ void cpu_BLInputLayer_updateOutput(Metadata<Dimension> &m,
output_features.resize_({*m.inputNActive, nPlanes}); output_features.resize_({*m.inputNActive, nPlanes});
output_features.zero_(); output_features.zero_();
InputLayer_ForwardPass<T>(input_features.data<T>(), InputLayer_ForwardPass<T>(input_features.data<T>(),
output_features.data<T>(), nRows, output_features.data<T>(), nRows, maxActive,
maxActive, nPlanes, &rules[1][0], mode == 4); nPlanes, &rules[1][0], mode == 4);
} }
} }
template <typename T, Int Dimension> template <typename T, Int Dimension>
...@@ -139,8 +175,8 @@ void cpu_BLInputLayer_updateGradInput(Metadata<Dimension> &m, ...@@ -139,8 +175,8 @@ void cpu_BLInputLayer_updateGradInput(Metadata<Dimension> &m,
d_input_features.resize_({rules[0][2], rules[0][3], nPlanes}); d_input_features.resize_({rules[0][2], rules[0][3], nPlanes});
d_input_features.zero_(); d_input_features.zero_();
InputLayer_BackwardPass<T>(d_input_features.data<T>(), InputLayer_BackwardPass<T>(d_input_features.data<T>(),
d_output_features.data<T>(), nRows, d_output_features.data<T>(), nRows, maxActive,
maxActive, nPlanes, &rules[1][0], mode == 4); nPlanes, &rules[1][0], mode == 4);
} }
} }
...@@ -162,8 +198,8 @@ void cpu_BLOutputLayer_updateOutput(Metadata<Dimension> &m, ...@@ -162,8 +198,8 @@ void cpu_BLOutputLayer_updateOutput(Metadata<Dimension> &m,
output_features.resize_({rules[0][2], rules[0][3], nPlanes}); output_features.resize_({rules[0][2], rules[0][3], nPlanes});
output_features.zero_(); output_features.zero_();
InputLayer_BackwardPass<T>(output_features.data<T>(), InputLayer_BackwardPass<T>(output_features.data<T>(),
input_features.data<T>(), nRows, input_features.data<T>(), nRows, maxActive,
maxActive, nPlanes, &rules[1][0], false); nPlanes, &rules[1][0], false);
} }
} }
template <typename T, Int Dimension> template <typename T, Int Dimension>
...@@ -184,7 +220,7 @@ void cpu_BLOutputLayer_updateGradInput(Metadata<Dimension> &m, ...@@ -184,7 +220,7 @@ void cpu_BLOutputLayer_updateGradInput(Metadata<Dimension> &m,
d_input_features.resize_({nRows, nPlanes}); d_input_features.resize_({nRows, nPlanes});
d_input_features.zero_(); d_input_features.zero_();
InputLayer_ForwardPass<T>(d_output_features.data<T>(), InputLayer_ForwardPass<T>(d_output_features.data<T>(),
d_input_features.data<T>(), nRows, d_input_features.data<T>(), nRows, maxActive,
maxActive, nPlanes, &rules[1][0], false); nPlanes, &rules[1][0], false);
} }
} }
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CPU_IOLAYERS_H
#define CPU_IOLAYERS_H
#include <cstring>
// Assume output and d_input_features have been zero-ed
template <typename T>
void InputLayer_ForwardPass(T *input_features, T *output_features, Int nRows,
Int maxActive, Int nPlanes, Int *rules,
bool average) {
for (Int row = 0; row < nRows; row++) {
auto nActive = rules[0];
T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
for (Int i = 1; i <= nActive; ++i) {
auto in_f = input_features + nPlanes * rules[i];
for (Int plane = 0; plane < nPlanes; plane++) {
output_features[plane] += multiplier * in_f[plane];
}
}
output_features += nPlanes;
rules += 1 + maxActive;
}
}
template <typename T>
void InputLayer_BackwardPass(T *d_input_features, T *d_output_features,
Int nRows, Int maxActive, Int nPlanes,
Int *rules, bool average) {
for (Int row = 0; row < nRows; row++) {
auto nActive = rules[0];
T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
for (Int i = 1; i <= nActive; ++i) {
auto d_in_f = d_input_features + nPlanes * rules[i];
for (Int plane = 0; plane < nPlanes; plane++)
d_in_f[plane] += multiplier * d_output_features[plane];
}
d_output_features += nPlanes;
rules += 1 + maxActive;
}
}
#endif /* CPU_IOLAYERS_H */
...@@ -6,8 +6,7 @@ ...@@ -6,8 +6,7 @@
template <typename T> template <typename T>
void cpu_LeakyReLU_updateOutput(/*float*/ at::Tensor input_features, void cpu_LeakyReLU_updateOutput(/*float*/ at::Tensor input_features,
/*float*/ at::Tensor output_features, /*float*/ at::Tensor output_features, T alpha) {
float alpha) {
output_features.resize_as_(input_features); output_features.resize_as_(input_features);
auto iF = input_features.data<T>(); auto iF = input_features.data<T>();
auto oF = output_features.data<T>(); auto oF = output_features.data<T>();
...@@ -20,7 +19,7 @@ template <typename T> ...@@ -20,7 +19,7 @@ template <typename T>
void cpu_LeakyReLU_updateGradInput(/*float*/ at::Tensor input_features, void cpu_LeakyReLU_updateGradInput(/*float*/ at::Tensor input_features,
/*float*/ at::Tensor d_input_features, /*float*/ at::Tensor d_input_features,
/*float*/ at::Tensor d_output_features, /*float*/ at::Tensor d_output_features,
float alpha) { T alpha) {
d_input_features.resize_as_(d_output_features); d_input_features.resize_as_(d_output_features);
auto iF = input_features.data<T>(); auto iF = input_features.data<T>();
auto diF = d_input_features.data<T>(); auto diF = d_input_features.data<T>();
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment