Commit 5f0860fc authored by Benjamin Thomas Graham's avatar Benjamin Thomas Graham
Browse files

DenseToSparse, tidying

parent 6de372c3
...@@ -35,6 +35,7 @@ if torch.cuda.is_available(): ...@@ -35,6 +35,7 @@ if torch.cuda.is_available():
'sparseconvnet/SCN/header_cpu.h', 'sparseconvnet/SCN/header_cpu.h',
'sparseconvnet/SCN/header_gpu.h'], 'sparseconvnet/SCN/header_gpu.h'],
sources=[], sources=[],
include_dirs=[os.path.expandvars('$CUDA_HOME') + '/include'],
extra_objects=[ extra_objects=[
this_dir + this_dir +
'/sparseconvnet/SCN/init.cu.o'], '/sparseconvnet/SCN/init.cu.o'],
......
...@@ -14,18 +14,20 @@ extern "C" void scn_R_(BatchNormalization_updateOutput)( ...@@ -14,18 +14,20 @@ extern "C" void scn_R_(BatchNormalization_updateOutput)(
THTensor *saveInvStd, THTensor *runningMean, THTensor *runningVar, THTensor *saveInvStd, THTensor *runningMean, THTensor *runningVar,
THTensor *weight, THTensor *bias, real eps, real momentum, bool train, THTensor *weight, THTensor *bias, real eps, real momentum, bool train,
real leakiness) { real leakiness) {
THTensor_(resizeAs)(output_features, input_features); THTensor_(resizeAs)(output_features, input_features);
if (input_features->nDimension == 2) {
auto nActive = input_features->size[0]; auto nActive = input_features->size[0];
auto nPlanes = input_features->size[1]; auto nPlanes = input_features->size[1];
auto input_stride = input_features->stride[0]; auto input_stride = input_features->stride[0];
auto output_stride = output_features->stride[0]; auto output_stride = output_features->stride[0];
BatchNormalization_ForwardPass<real>( BatchNormalization_ForwardPass<real>(
THTensor_(data)(input_features), THTensor_(data)(output_features), THTensor_(data)(input_features), THTensor_(data)(output_features),
nPlanes, input_stride, output_stride, nActive, THTensor_(data)(saveMean), nPlanes, input_stride, output_stride, nActive,
THTensor_(data)(saveInvStd), THTensor_(data)(runningMean), THTensor_(data)(saveMean), THTensor_(data)(saveInvStd),
THTensor_(data)(runningVar), THOptionalTensorData(weight), THTensor_(data)(runningMean), THTensor_(data)(runningVar),
THOptionalTensorData(bias), eps, momentum, train, leakiness); THOptionalTensorData(weight), THOptionalTensorData(bias), eps, momentum,
train, leakiness);
}
} }
extern "C" void scn_R_(BatchNormalizationInTensor_updateOutput)( extern "C" void scn_R_(BatchNormalizationInTensor_updateOutput)(
...@@ -34,6 +36,7 @@ extern "C" void scn_R_(BatchNormalizationInTensor_updateOutput)( ...@@ -34,6 +36,7 @@ extern "C" void scn_R_(BatchNormalizationInTensor_updateOutput)(
THTensor *weight, THTensor *bias, real eps, real momentum, bool train, THTensor *weight, THTensor *bias, real eps, real momentum, bool train,
real leakiness) { real leakiness) {
if (input_features->nDimension == 2) {
auto nActive = input_features->size[0]; auto nActive = input_features->size[0];
auto nPlanes = input_features->size[1]; auto nPlanes = input_features->size[1];
auto input_stride = input_features->stride[0]; auto input_stride = input_features->stride[0];
...@@ -41,10 +44,12 @@ extern "C" void scn_R_(BatchNormalizationInTensor_updateOutput)( ...@@ -41,10 +44,12 @@ extern "C" void scn_R_(BatchNormalizationInTensor_updateOutput)(
BatchNormalization_ForwardPass<real>( BatchNormalization_ForwardPass<real>(
THTensor_(data)(input_features), THTensor_(data)(output_features), THTensor_(data)(input_features), THTensor_(data)(output_features),
nPlanes, input_stride, output_stride, nActive, THTensor_(data)(saveMean), nPlanes, input_stride, output_stride, nActive,
THTensor_(data)(saveInvStd), THTensor_(data)(runningMean), THTensor_(data)(saveMean), THTensor_(data)(saveInvStd),
THTensor_(data)(runningVar), THOptionalTensorData(weight), THTensor_(data)(runningMean), THTensor_(data)(runningVar),
THOptionalTensorData(bias), eps, momentum, train, leakiness); THOptionalTensorData(weight), THOptionalTensorData(bias), eps, momentum,
train, leakiness);
}
} }
extern "C" void scn_R_(BatchNormalization_backward)( extern "C" void scn_R_(BatchNormalization_backward)(
...@@ -55,6 +60,7 @@ extern "C" void scn_R_(BatchNormalization_backward)( ...@@ -55,6 +60,7 @@ extern "C" void scn_R_(BatchNormalization_backward)(
real leakiness) { real leakiness) {
THTensor_(resizeAs)(d_input_features, input_features); THTensor_(resizeAs)(d_input_features, input_features);
if (input_features->nDimension == 2) {
auto nActive = input_features->size[0]; auto nActive = input_features->size[0];
auto nPlanes = input_features->size[1]; auto nPlanes = input_features->size[1];
auto input_stride = input_features->stride[0]; auto input_stride = input_features->stride[0];
...@@ -62,10 +68,12 @@ extern "C" void scn_R_(BatchNormalization_backward)( ...@@ -62,10 +68,12 @@ extern "C" void scn_R_(BatchNormalization_backward)(
BatchNormalization_BackwardPass<real>( BatchNormalization_BackwardPass<real>(
THTensor_(data)(input_features), THTensor_(data)(d_input_features), THTensor_(data)(input_features), THTensor_(data)(d_input_features),
THTensor_(data)(output_features), THTensor_(data)(d_output_features), THTensor_(data)(output_features), THTensor_(data)(d_output_features),
nPlanes, input_stride, output_stride, nActive, THTensor_(data)(saveMean), nPlanes, input_stride, output_stride, nActive,
THTensor_(data)(saveInvStd), THTensor_(data)(runningMean), THTensor_(data)(saveMean), THTensor_(data)(saveInvStd),
THTensor_(data)(runningVar), THOptionalTensorData(weight), THTensor_(data)(runningMean), THTensor_(data)(runningVar),
THOptionalTensorData(bias), THOptionalTensorData(d_weight), THOptionalTensorData(weight), THOptionalTensorData(bias),
THOptionalTensorData(d_bias), leakiness); THOptionalTensorData(d_weight), THOptionalTensorData(d_bias),
leakiness);
}
} }
#endif #endif
...@@ -23,6 +23,8 @@ extern "C" double scn_DR_(Convolution_updateOutput)( ...@@ -23,6 +23,8 @@ extern "C" double scn_DR_(Convolution_updateOutput)(
if (not bias) if (not bias)
THTensor_(zero)(output_features); THTensor_(zero)(output_features);
double flops = 0;
if (nActive) {
auto iF = THTensor_(data)(input_features); auto iF = THTensor_(data)(input_features);
auto oF = THTensor_(data)(output_features); auto oF = THTensor_(data)(output_features);
auto ip = input_features->size[1]; auto ip = input_features->size[1];
...@@ -31,9 +33,9 @@ extern "C" double scn_DR_(Convolution_updateOutput)( ...@@ -31,9 +33,9 @@ extern "C" double scn_DR_(Convolution_updateOutput)(
auto b = THOptionalTensorData(bias); auto b = THOptionalTensorData(bias);
Convolution_ForwardPass(iF, ip, ip, oF, op, op, w, b, _rules, nActive, Convolution_ForwardPass(iF, ip, ip, oF, op, op, w, b, _rules, nActive,
THBlas_(gemm)); THBlas_(gemm));
double flops = 0;
for (auto &r : _rules) for (auto &r : _rules)
flops += r.size() / 2 * ip * op; flops += r.size() / 2 * ip * op;
}
return flops; return flops;
} }
...@@ -51,6 +53,7 @@ extern "C" void scn_DR_(Convolution_backward)( ...@@ -51,6 +53,7 @@ extern "C" void scn_DR_(Convolution_backward)(
THTensor_(resizeAs)(d_input_features, input_features); THTensor_(resizeAs)(d_input_features, input_features);
THTensor_(zero)(d_input_features); THTensor_(zero)(d_input_features);
if (nActive) {
auto iF = THTensor_(data)(input_features); auto iF = THTensor_(data)(input_features);
auto diF = THTensor_(data)(d_input_features); auto diF = THTensor_(data)(d_input_features);
auto doF = THTensor_(data)(d_output_features); auto doF = THTensor_(data)(d_output_features);
...@@ -62,6 +65,7 @@ extern "C" void scn_DR_(Convolution_backward)( ...@@ -62,6 +65,7 @@ extern "C" void scn_DR_(Convolution_backward)(
Convolution_BackwardPass(iF, diF, ip, ip, doF, op, op, w, dw, db, _rules, Convolution_BackwardPass(iF, diF, ip, ip, doF, op, op, w, dw, db, _rules,
nActive, THBlas_(gemm)); nActive, THBlas_(gemm));
}
} }
extern "C" double scn_DR_(ValidConvolution_updateOutput)( extern "C" double scn_DR_(ValidConvolution_updateOutput)(
...@@ -71,11 +75,13 @@ extern "C" double scn_DR_(ValidConvolution_updateOutput)( ...@@ -71,11 +75,13 @@ extern "C" double scn_DR_(ValidConvolution_updateOutput)(
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m) SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto _rules = _m.getValidRuleBook(inputSize, filterSize, true); auto _rules = _m.getValidRuleBook(inputSize, filterSize, true);
uInt nActive = input_features->size[0]; uInt nActive = _m.getNActive(inputSize);
THTensor_(resize2d)(output_features, nActive, weight->size[1]); THTensor_(resize2d)(output_features, nActive, weight->size[1]);
if (not bias) if (not bias)
THTensor_(zero)(output_features); THTensor_(zero)(output_features);
double flops = 0;
if (nActive) {
auto iF = THTensor_(data)(input_features); auto iF = THTensor_(data)(input_features);
auto oF = THTensor_(data)(output_features); auto oF = THTensor_(data)(output_features);
auto ip = input_features->size[1]; auto ip = input_features->size[1];
...@@ -85,10 +91,9 @@ extern "C" double scn_DR_(ValidConvolution_updateOutput)( ...@@ -85,10 +91,9 @@ extern "C" double scn_DR_(ValidConvolution_updateOutput)(
Convolution_ForwardPass(iF, ip, ip, oF, op, op, w, b, _rules, nActive, Convolution_ForwardPass(iF, ip, ip, oF, op, op, w, b, _rules, nActive,
THBlas_(gemm)); THBlas_(gemm));
double flops = 0;
for (auto &r : _rules) for (auto &r : _rules)
flops += r.size() / 2 * ip * op; flops += r.size() / 2 * ip * op;
}
return flops; return flops;
} }
...@@ -100,10 +105,11 @@ extern "C" void scn_DR_(ValidConvolution_backward)( ...@@ -100,10 +105,11 @@ extern "C" void scn_DR_(ValidConvolution_backward)(
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m) SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto _rules = _m.getValidRuleBook(inputSize, filterSize, true); auto _rules = _m.getValidRuleBook(inputSize, filterSize, true);
uInt nActive = input_features->size[0]; uInt nActive = _m.getNActive(inputSize);
THTensor_(resizeAs)(d_input_features, input_features); THTensor_(resizeAs)(d_input_features, input_features);
THTensor_(zero)(d_input_features); THTensor_(zero)(d_input_features);
if (nActive) {
auto iF = THTensor_(data)(input_features); auto iF = THTensor_(data)(input_features);
auto diF = THTensor_(data)(d_input_features); auto diF = THTensor_(data)(d_input_features);
auto doF = THTensor_(data)(d_output_features); auto doF = THTensor_(data)(d_output_features);
...@@ -115,6 +121,6 @@ extern "C" void scn_DR_(ValidConvolution_backward)( ...@@ -115,6 +121,6 @@ extern "C" void scn_DR_(ValidConvolution_backward)(
Convolution_BackwardPass(iF, diF, ip, ip, doF, op, op, w, dw, db, _rules, Convolution_BackwardPass(iF, diF, ip, ip, doF, op, op, w, dw, db, _rules,
nActive, THBlas_(gemm)); nActive, THBlas_(gemm));
}
} }
#endif #endif
...@@ -9,34 +9,32 @@ ...@@ -9,34 +9,32 @@
#else #else
#include "SparseToDense.h" #include "SparseToDense.h"
extern "C" void scn_DR_(SparseToDense_updateOutput)(THLongTensor *inputSize, extern "C" void scn_DR_(SparseToDense_updateOutput)(
void **m, THLongTensor *inputSize, void **m, THTensor *input_features,
THTensor *input_features, THTensor *output_features, void *rulesBuffer, long nPlanes) {
THTensor *output_features,
void *rulesBuffer) { SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m) { {
long sz[Dimension + 2]; long sz[Dimension + 2];
sz[0] = _m.inputSGs->size(); sz[0] = _m.grids.begin()->second.size();
sz[1] = input_features->size[1]; sz[1] = nPlanes; // input_features->size[1];
for (int i = 0; i < Dimension; i++) { std::memcpy(sz + 2, THLongTensor_data(inputSize), sizeof(long) * Dimension);
auto x = THLongTensor_data(inputSize)[i];
sz[i + 2] = x;
}
THTensor_(resizeNd)(output_features, Dimension + 2, sz, NULL); THTensor_(resizeNd)(output_features, Dimension + 2, sz, NULL);
THTensor_(zero)(output_features); THTensor_(zero)(output_features);
} }
if (input_features->nDimension == 2) {
auto _rules = _m.getSparseToDenseRuleBook(inputSize, true); auto _rules = _m.getSparseToDenseRuleBook(inputSize, true);
auto spatialVolume = _rules.size();
uInt nPlanes = input_features->size[1]; uInt nPlanes = input_features->size[1];
auto iF = THTensor_(data)(input_features); auto iF = THTensor_(data)(input_features);
auto oF = THTensor_(data)(output_features); auto oF = THTensor_(data)(output_features);
long spatialVolume = THLongTensor_prodall(inputSize);
for (auto &r : _rules) { for (auto &r : _rules) {
uInt nHot = r.size() / 2; uInt nHot = r.size() / 2;
SparseToDense_ForwardPass<real>(iF, oF, nPlanes, spatialVolume, &r[0], SparseToDense_ForwardPass<real>(iF, oF, nPlanes, spatialVolume, &r[0],
nHot); nHot);
oF++; oF += nPlanes * spatialVolume;
}
} }
} }
extern "C" void scn_DR_(SparseToDense_updateGradInput)( extern "C" void scn_DR_(SparseToDense_updateGradInput)(
...@@ -44,12 +42,12 @@ extern "C" void scn_DR_(SparseToDense_updateGradInput)( ...@@ -44,12 +42,12 @@ extern "C" void scn_DR_(SparseToDense_updateGradInput)(
THTensor *d_input_features, THTensor *d_output_features, THTensor *d_input_features, THTensor *d_output_features,
void *rulesBuffer) { void *rulesBuffer) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
THTensor_(resizeAs)(d_input_features, input_features); THTensor_(resizeAs)(d_input_features, input_features);
THTensor_(zero)(d_input_features); THTensor_(zero)(d_input_features);
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto _rules = _m.getSparseToDenseRuleBook(inputSize, true); auto _rules = _m.getSparseToDenseRuleBook(inputSize, true);
auto spatialVolume = _rules.size(); if (input_features->nDimension == 2) {
long spatialVolume = THLongTensor_prodall(inputSize);
uInt nPlanes = d_input_features->size[1]; uInt nPlanes = d_input_features->size[1];
auto diF = THTensor_(data)(d_input_features); auto diF = THTensor_(data)(d_input_features);
auto doF = THTensor_(data)(d_output_features); auto doF = THTensor_(data)(d_output_features);
...@@ -58,7 +56,8 @@ extern "C" void scn_DR_(SparseToDense_updateGradInput)( ...@@ -58,7 +56,8 @@ extern "C" void scn_DR_(SparseToDense_updateGradInput)(
uInt nHot = r.size() / 2; uInt nHot = r.size() / 2;
SparseToDense_BackwardPass<real>(diF, doF, nPlanes, spatialVolume, &r[0], SparseToDense_BackwardPass<real>(diF, doF, nPlanes, spatialVolume, &r[0],
nHot); nHot);
doF++; doF += nPlanes * spatialVolume;
}
} }
} }
#endif #endif
...@@ -10,27 +10,26 @@ ...@@ -10,27 +10,26 @@
template <typename T> template <typename T>
void SparseToDense_ForwardPass(T *input_features, T *output_features, void SparseToDense_ForwardPass(T *input_features, T *output_features,
uInt nPlanes, uInt spatialVolume, uInt nPlanes, uInt spatialVolume, uInt *rules,
uInt* rules, int nHot) { int nHot) {
for (uInt outSite = 0; outSite < nHot; outSite++) { for (uInt outSite = 0; outSite < nHot; outSite++) {
T *i = &input_features[rules[2 * outSite] * nPlanes]; T *i = input_features + rules[2 * outSite] * nPlanes;
uInt sample = rules[2 * outSite + 1]; T *o = output_features + rules[2 * outSite + 1];
for (uInt plane = 0; plane < nPlanes; plane++) for (uInt plane = 0; plane < nPlanes; plane++)
output_features[(sample*nPlanes+plane)*spatialVolume]=i[plane]; o[plane * spatialVolume] = i[plane];
} }
} }
template <typename T> template <typename T>
void SparseToDense_BackwardPass(T *d_input_features, T *d_output_features, void SparseToDense_BackwardPass(T *d_input_features, T *d_output_features,
uInt nPlanes, uInt spatialVolume, uInt nPlanes, uInt spatialVolume, uInt *rules,
uInt* rules, int nHot) { int nHot) {
for (uInt outSite = 0; outSite < nHot; outSite++) { for (uInt outSite = 0; outSite < nHot; outSite++) {
T *di = &d_input_features[rules[2 * outSite] * nPlanes]; T *d_i = d_input_features + rules[2 * outSite] * nPlanes;
uInt sample = rules[2 * outSite + 1]; auto d_o = d_output_features + rules[2 * outSite + 1];
for (uInt plane = 0; plane < nPlanes; plane++) for (uInt plane = 0; plane < nPlanes; plane++)
di[plane]=d_output_features[(sample*nPlanes+plane)*spatialVolume]; d_i[plane] = d_o[plane * spatialVolume];
}
} }
}
#endif /* CPU_SPARSETODENSE_H */ #endif /* CPU_SPARSETODENSE_H */
...@@ -10,7 +10,6 @@ ...@@ -10,7 +10,6 @@
#include "AffineReluTrivialConvolution.h" #include "AffineReluTrivialConvolution.h"
#include <algorithm> #include <algorithm>
#include <iostream>
extern "C" void scn_R_(AffineReluTrivialConvolution_updateOutput)( extern "C" void scn_R_(AffineReluTrivialConvolution_updateOutput)(
THCTensor *input_features, THCTensor *output_features, THCTensor *input_features, THCTensor *output_features,
......
...@@ -155,6 +155,27 @@ __global__ void dAffineReluTrivialConvolution_forwardB( ...@@ -155,6 +155,27 @@ __global__ void dAffineReluTrivialConvolution_forwardB(
} }
} }
#define FOO(T, K, V) \
{ \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
uInt o = (nActive / K) * K; \
if (o > 0) \
dAffineReluTrivialConvolution_forwardA<T, K, V> << < \
dim3(std::min(o / K, (uInt)512), output_nPlanes / K), \
dim3(K, K / V), 0, THCState_getCurrentStream(state)>>> \
(inFeatures, outFeatures, affineWeight, affineBias, convWeight, \
input_nPlanes, input_stride, output_nPlanes, output_stride, o); \
if (nActive > o) \
dAffineReluTrivialConvolution_forwardB<T, K, V> << < \
dim3(1, output_nPlanes / K), dim3(K, K / V), 0, \
THCState_getCurrentStream(state)>>> \
(inFeatures + o * input_stride, outFeatures + o * output_stride, \
affineWeight, affineBias, convWeight, input_nPlanes, \
input_stride, output_nPlanes, output_stride, nActive - o); \
return; \
} \
}
template <typename T> template <typename T>
void dAffineReluTrivialConvolution_forward(T *inFeatures, T *outFeatures, void dAffineReluTrivialConvolution_forward(T *inFeatures, T *outFeatures,
T *affineWeight, T *affineBias, T *affineWeight, T *affineBias,
...@@ -162,92 +183,25 @@ void dAffineReluTrivialConvolution_forward(T *inFeatures, T *outFeatures, ...@@ -162,92 +183,25 @@ void dAffineReluTrivialConvolution_forward(T *inFeatures, T *outFeatures,
uInt input_stride, uInt input_stride,
uInt output_nPlanes, uInt output_nPlanes,
uInt output_stride, uInt nActive) { uInt output_stride, uInt nActive) {
{
const uInt K = 64; FOO(T, 64, 16)
const uInt V = 16; FOO(T, 32, 8)
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { FOO(T, 16, 4)
uInt o = (nActive / K) * K; FOO(T, 8, 2)
if (o > 0)
dAffineReluTrivialConvolution_forwardA<
T, K, V><<<dim3(std::min(o / K, (uInt)512), output_nPlanes / K),
dim3(K, K / V), 0, THCState_getCurrentStream(state)>>>(
inFeatures, outFeatures, affineWeight, affineBias, convWeight,
input_nPlanes, input_stride, output_nPlanes, output_stride, o);
if (nActive > o)
dAffineReluTrivialConvolution_forwardB<
T, K, V><<<dim3(1, output_nPlanes / K), dim3(K, K / V), 0,
THCState_getCurrentStream(state)>>>(
inFeatures + o * input_stride, outFeatures + o * output_stride,
affineWeight, affineBias, convWeight, input_nPlanes, input_stride,
output_nPlanes, output_stride, nActive - o);
return;
}
}
{
const uInt K = 32;
const uInt V = 4;
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {
uInt o = (nActive / K) * K;
if (o > 0)
dAffineReluTrivialConvolution_forwardA<
T, K, V><<<dim3(std::min(o / K, (uInt)512), output_nPlanes / K),
dim3(K, K / V), 0, THCState_getCurrentStream(state)>>>(
inFeatures, outFeatures, affineWeight, affineBias, convWeight,
input_nPlanes, input_stride, output_nPlanes, output_stride, o);
if (nActive > o)
dAffineReluTrivialConvolution_forwardB<
T, K, V><<<dim3(1, output_nPlanes / K), dim3(K, K / V), 0,
THCState_getCurrentStream(state)>>>(
inFeatures + o * input_stride, outFeatures + o * output_stride,
affineWeight, affineBias, convWeight, input_nPlanes, input_stride,
output_nPlanes, output_stride, nActive - o);
return;
}
}
{
const uInt K = 16;
const uInt V = 4;
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {
uInt o = (nActive / K) * K;
if (o > 0)
dAffineReluTrivialConvolution_forwardA<
T, K, V><<<dim3(std::min(o / K, (uInt)512), output_nPlanes / K),
dim3(K, K / V), 0, THCState_getCurrentStream(state)>>>(
inFeatures, outFeatures, affineWeight, affineBias, convWeight,
input_nPlanes, input_stride, output_nPlanes, output_stride, o);
if (nActive > o)
dAffineReluTrivialConvolution_forwardB<
T, K, V><<<dim3(1, output_nPlanes / K), dim3(K, K / V), 0,
THCState_getCurrentStream(state)>>>(
inFeatures + o * input_stride, outFeatures + o * output_stride,
affineWeight, affineBias, convWeight, input_nPlanes, input_stride,
output_nPlanes, output_stride, nActive - o);
return;
}
}
{
const uInt K = 8;
const uInt V = 2;
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {
uInt o = (nActive / K) * K;
if (o > 0)
dAffineReluTrivialConvolution_forwardA<
T, K, V><<<dim3(std::min(o / K, (uInt)512), output_nPlanes / K),
dim3(K, K / V), 0, THCState_getCurrentStream(state)>>>(
inFeatures, outFeatures, affineWeight, affineBias, convWeight,
input_nPlanes, input_stride, output_nPlanes, output_stride, o);
if (nActive > o)
dAffineReluTrivialConvolution_forwardB<
T, K, V><<<dim3(1, output_nPlanes / K), dim3(K, K / V), 0,
THCState_getCurrentStream(state)>>>(
inFeatures + o * input_stride, outFeatures + o * output_stride,
affineWeight, affineBias, convWeight, input_nPlanes, input_stride,
output_nPlanes, output_stride, nActive - o);
return;
}
}
assert(false); assert(false);
} }
template <>
void dAffineReluTrivialConvolution_forward<double>(
double *inFeatures, double *outFeatures, double *affineWeight,
double *affineBias, double *convWeight, uInt input_nPlanes,
uInt input_stride, uInt output_nPlanes, uInt output_stride, uInt nActive) {
FOO(double, 32, 8)
FOO(double, 16, 4)
FOO(double, 8, 2)
assert(false);
}
#undef FOO
// dOutput x W^T -> dInput and // dOutput x W^T -> dInput and
// Input^T x dOutput -> dW // Input^T x dOutput -> dW
...@@ -449,84 +403,41 @@ __global__ void dAffineReluTrivialConvolution_backward_dW_B( ...@@ -449,84 +403,41 @@ __global__ void dAffineReluTrivialConvolution_backward_dW_B(
atomicAdd(&dAffineBias[tx], dAB); atomicAdd(&dAffineBias[tx], dAB);
} }
#define FOO(T, K, V) \
{ \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
uInt o = (nActive / K) * K; \
if (o > 0) \
dAffineReluTrivialConvolution_backward_dW_A<T, K, V> << < \
dim3(std::min(o / K, (uInt)512), input_nPlanes / K), \
dim3(K, K / V), 0, THCState_getCurrentStream(state)>>> \
(inFeatures, dInFeatures, dOutFeatures, affineWeight, \
dAffineWeight, affineBias, dAffineBias, convWeight, dConvWeight, \
input_nPlanes, input_stride, output_nPlanes, output_stride, o, \
additiveGrad); \
if (nActive > o) \
dAffineReluTrivialConvolution_backward_dW_B<T, K, V> << < \
dim3(1, input_nPlanes / K), dim3(K, K / V), 0, \
THCState_getCurrentStream(state)>>> \
(inFeatures + o * input_stride, dInFeatures + o * input_stride, \
dOutFeatures + o * output_stride, affineWeight, dAffineWeight, \
affineBias, dAffineBias, convWeight, dConvWeight, input_nPlanes, \
input_stride, output_nPlanes, output_stride, nActive - o, \
additiveGrad); \
return; \
} \
}
template <typename T> template <typename T>
void dAffineReluTrivialConvolution_backward_dW( void dAffineReluTrivialConvolution_backward_dW(
T *inFeatures, T *dInFeatures, T *dOutFeatures, T *affineWeight, T *inFeatures, T *dInFeatures, T *dOutFeatures, T *affineWeight,
T *dAffineWeight, T *affineBias, T *dAffineBias, T *convWeight, T *dAffineWeight, T *affineBias, T *dAffineBias, T *convWeight,
T *dConvWeight, uInt input_nPlanes, uInt input_stride, uInt output_nPlanes, T *dConvWeight, uInt input_nPlanes, uInt input_stride, uInt output_nPlanes,
uInt output_stride, uInt nActive, bool additiveGrad) { uInt output_stride, uInt nActive, bool additiveGrad) {
{ FOO(T, 32, 8)
const uInt K = 32; FOO(T, 16, 4)
const uInt V = 8; FOO(T, 8, 2)
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {
uInt o = (nActive / K) * K;
if (o > 0)
dAffineReluTrivialConvolution_backward_dW_A<
T, K, V><<<dim3(std::min(o / K, (uInt)512), input_nPlanes / K),
dim3(K, K / V), 0, THCState_getCurrentStream(state)>>>(
inFeatures, dInFeatures, dOutFeatures, affineWeight, dAffineWeight,
affineBias, dAffineBias, convWeight, dConvWeight, input_nPlanes,
input_stride, output_nPlanes, output_stride, o, additiveGrad);
if (nActive > o)
dAffineReluTrivialConvolution_backward_dW_B<
T, K, V><<<dim3(1, input_nPlanes / K), dim3(K, K / V), 0,
THCState_getCurrentStream(state)>>>(
inFeatures + o * input_stride, dInFeatures + o * input_stride,
dOutFeatures + o * output_stride, affineWeight, dAffineWeight,
affineBias, dAffineBias, convWeight, dConvWeight, input_nPlanes,
input_stride, output_nPlanes, output_stride, nActive - o,
additiveGrad);
return;
}
}
{
const uInt K = 16;
const uInt V = 4;
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {
uInt o = (nActive / K) * K;
if (o > 0)
dAffineReluTrivialConvolution_backward_dW_A<
T, K, V><<<dim3(std::min(o / K, (uInt)512), input_nPlanes / K),
dim3(K, K / V), 0, THCState_getCurrentStream(state)>>>(
inFeatures, dInFeatures, dOutFeatures, affineWeight, dAffineWeight,
affineBias, dAffineBias, convWeight, dConvWeight, input_nPlanes,
input_stride, output_nPlanes, output_stride, o, additiveGrad);
if (nActive > o)
dAffineReluTrivialConvolution_backward_dW_B<
T, K, V><<<dim3(1, input_nPlanes / K), dim3(K, K / V), 0,
THCState_getCurrentStream(state)>>>(
inFeatures + o * input_stride, dInFeatures + o * input_stride,
dOutFeatures + o * output_stride, affineWeight, dAffineWeight,
affineBias, dAffineBias, convWeight, dConvWeight, input_nPlanes,
input_stride, output_nPlanes, output_stride, nActive - o,
additiveGrad);
return;
}
}
{
const uInt K = 8;
const uInt V = 2;
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {
uInt o = (nActive / K) * K;
if (o > 0)
dAffineReluTrivialConvolution_backward_dW_A<
T, K, V><<<dim3(std::min(o / K, (uInt)512), input_nPlanes / K),
dim3(K, K / V), 0, THCState_getCurrentStream(state)>>>(
inFeatures, dInFeatures, dOutFeatures, affineWeight, dAffineWeight,
affineBias, dAffineBias, convWeight, dConvWeight, input_nPlanes,
input_stride, output_nPlanes, output_stride, o, additiveGrad);
if (nActive > o)
dAffineReluTrivialConvolution_backward_dW_B<
T, K, V><<<dim3(1, input_nPlanes / K), dim3(K, K / V), 0,
THCState_getCurrentStream(state)>>>(
inFeatures + o * input_stride, dInFeatures + o * input_stride,
dOutFeatures + o * output_stride, affineWeight, dAffineWeight,
affineBias, dAffineBias, convWeight, dConvWeight, input_nPlanes,
input_stride, output_nPlanes, output_stride, nActive - o,
additiveGrad);
return;
}
}
} }
#undef FOO
#endif #endif
...@@ -30,13 +30,14 @@ extern "C" void scn_R_(BatchNormalization_updateOutput)( ...@@ -30,13 +30,14 @@ extern "C" void scn_R_(BatchNormalization_updateOutput)(
real leakiness) { real leakiness) {
THCTensor_(resizeAs)(state, output_features, input_features); THCTensor_(resizeAs)(state, output_features, input_features);
if (input_features->nDimension == 2) {
auto nActive = input_features->size[0]; auto nActive = input_features->size[0];
auto nPlanes = input_features->size[1]; auto nPlanes = input_features->size[1];
auto input_stride = input_features->stride[0]; auto input_stride = input_features->stride[0];
auto output_stride = output_features->stride[0]; auto output_stride = output_features->stride[0];
BN_F_MACRO(16) BN_F_MACRO(16)
else BN_F_MACRO(12) else BN_F_MACRO(8) else BN_F_MACRO(4) else BN_F_MACRO(1) else BN_F_MACRO(12) else BN_F_MACRO(8) else BN_F_MACRO(4) else BN_F_MACRO(1)
}
} }
extern "C" void scn_R_(BatchNormalizationInTensor_updateOutput)( extern "C" void scn_R_(BatchNormalizationInTensor_updateOutput)(
...@@ -44,14 +45,14 @@ extern "C" void scn_R_(BatchNormalizationInTensor_updateOutput)( ...@@ -44,14 +45,14 @@ extern "C" void scn_R_(BatchNormalizationInTensor_updateOutput)(
THCTensor *saveInvStd, THCTensor *runningMean, THCTensor *runningVar, THCTensor *saveInvStd, THCTensor *runningMean, THCTensor *runningVar,
THCTensor *weight, THCTensor *bias, real eps, real momentum, bool train, THCTensor *weight, THCTensor *bias, real eps, real momentum, bool train,
real leakiness) { real leakiness) {
if (input_features->nDimension == 2) {
auto nActive = input_features->size[0]; auto nActive = input_features->size[0];
auto nPlanes = input_features->size[1]; auto nPlanes = input_features->size[1];
auto input_stride = input_features->stride[0]; auto input_stride = input_features->stride[0];
auto output_stride = output_features->stride[0]; auto output_stride = output_features->stride[0];
BN_F_MACRO(16) BN_F_MACRO(16)
else BN_F_MACRO(12) else BN_F_MACRO(8) else BN_F_MACRO(4) else BN_F_MACRO(1) else BN_F_MACRO(12) else BN_F_MACRO(8) else BN_F_MACRO(4) else BN_F_MACRO(1)
}
} }
#undef BN_F_MACRO #undef BN_F_MACRO
...@@ -81,12 +82,13 @@ extern "C" void scn_R_(BatchNormalization_backward)( ...@@ -81,12 +82,13 @@ extern "C" void scn_R_(BatchNormalization_backward)(
THCTensor *d_weight, THCTensor *d_bias, real leakiness) { THCTensor *d_weight, THCTensor *d_bias, real leakiness) {
THCTensor_(resizeAs)(state, d_input_features, d_output_features); THCTensor_(resizeAs)(state, d_input_features, d_output_features);
if (input_features->nDimension == 2) {
auto nActive = input_features->size[0]; auto nActive = input_features->size[0];
auto nPlanes = input_features->size[1]; auto nPlanes = input_features->size[1];
auto input_stride = input_features->stride[0]; auto input_stride = input_features->stride[0];
auto output_stride = output_features->stride[0]; auto output_stride = output_features->stride[0];
BN_B_MACRO(16) BN_B_MACRO(16)
else BN_B_MACRO(12) else BN_B_MACRO(8) else BN_B_MACRO(4) else BN_B_MACRO(1) else BN_B_MACRO(12) else BN_B_MACRO(8) else BN_B_MACRO(4) else BN_B_MACRO(1)
}
} }
#endif #endif
...@@ -25,12 +25,13 @@ extern "C" double scn_DR_(Convolution_updateOutput)( ...@@ -25,12 +25,13 @@ extern "C" double scn_DR_(Convolution_updateOutput)(
if (not bias) if (not bias)
THCTensor_(zero)(state, output_features); THCTensor_(zero)(state, output_features);
double flops = 0;
if (nActive) {
auto iF = THCTensor_(data)(state, input_features); auto iF = THCTensor_(data)(state, input_features);
auto oF = THCTensor_(data)(state, output_features); auto oF = THCTensor_(data)(state, output_features);
auto ip = input_features->size[1]; auto ip = input_features->size[1];
auto op = output_features->size[1]; auto op = output_features->size[1];
auto w = THCTensor_(data)(state, weight); auto w = THCTensor_(data)(state, weight);
double flops = 0;
if (bias) { if (bias) {
auto b = THCTensor_(data)(state, bias); auto b = THCTensor_(data)(state, bias);
...@@ -47,6 +48,7 @@ extern "C" double scn_DR_(Convolution_updateOutput)( ...@@ -47,6 +48,7 @@ extern "C" double scn_DR_(Convolution_updateOutput)(
dConvolution_forward2<real>(iF, oF, w, rbB, nHotB, ip, ip, op, op, dConvolution_forward2<real>(iF, oF, w, rbB, nHotB, ip, ip, op, op,
THCState_getCurrentStream(state)); THCState_getCurrentStream(state));
, w += c; flops += nHotB * c;) , w += c; flops += nHotB * c;)
}
return flops; return flops;
} }
...@@ -63,6 +65,7 @@ extern "C" void scn_DR_(Convolution_backward)( ...@@ -63,6 +65,7 @@ extern "C" void scn_DR_(Convolution_backward)(
THCTensor_(resizeAs)(state, d_input_features, input_features); THCTensor_(resizeAs)(state, d_input_features, input_features);
THCTensor_(zero)(state, d_input_features); THCTensor_(zero)(state, d_input_features);
if (nActive) {
auto iF = THCTensor_(data)(state, input_features); auto iF = THCTensor_(data)(state, input_features);
auto diF = THCTensor_(data)(state, d_input_features); auto diF = THCTensor_(data)(state, d_input_features);
auto doF = THCTensor_(data)(state, d_output_features); auto doF = THCTensor_(data)(state, d_output_features);
...@@ -71,9 +74,9 @@ extern "C" void scn_DR_(Convolution_backward)( ...@@ -71,9 +74,9 @@ extern "C" void scn_DR_(Convolution_backward)(
auto w = THCTensor_(data)(state, weight); auto w = THCTensor_(data)(state, weight);
auto dw = THCTensor_(data)(state, d_weight); auto dw = THCTensor_(data)(state, d_weight);
uInt c = ip * op; uInt c = ip * op;
RULEBOOKITERATOR( RULEBOOKITERATOR(dConvolution_backward_dW2<real>(
dConvolution_backward_dW2<real>(iF, diF, doF, w, dw, rbB, nHotB, ip, ip, iF, diF, doF, w, dw, rbB, nHotB, ip, ip, op, op,
op, op, THCState_getCurrentStream(state)); THCState_getCurrentStream(state));
, w += c; dw += c;) , w += c; dw += c;)
if (d_bias) { if (d_bias) {
...@@ -81,6 +84,7 @@ extern "C" void scn_DR_(Convolution_backward)( ...@@ -81,6 +84,7 @@ extern "C" void scn_DR_(Convolution_backward)(
Convolution_bp_bias(doF, db, op, op, nActive, Convolution_bp_bias(doF, db, op, op, nActive,
THCState_getCurrentStream(state)); THCState_getCurrentStream(state));
} }
}
} }
extern "C" double scn_DR_(ValidConvolution_updateOutput)( extern "C" double scn_DR_(ValidConvolution_updateOutput)(
...@@ -89,17 +93,18 @@ extern "C" double scn_DR_(ValidConvolution_updateOutput)( ...@@ -89,17 +93,18 @@ extern "C" double scn_DR_(ValidConvolution_updateOutput)(
THCTensor *bias, long filterVolume, THCITensor *rulesBuffer) { THCTensor *bias, long filterVolume, THCITensor *rulesBuffer) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m) SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto _rules = _m.getValidRuleBook(inputSize, filterSize, true); auto _rules = _m.getValidRuleBook(inputSize, filterSize, true);
uInt nActive = input_features->size[0]; uInt nActive = _m.getNActive(inputSize);
THCTensor_(resize2d)(state, output_features, nActive, weight->size[1]); THCTensor_(resize2d)(state, output_features, nActive, weight->size[1]);
if (not bias) if (not bias)
THCTensor_(zero)(state, output_features); THCTensor_(zero)(state, output_features);
double flops = 0;
if (nActive) {
auto iF = THCTensor_(data)(state, input_features); auto iF = THCTensor_(data)(state, input_features);
auto oF = THCTensor_(data)(state, output_features); auto oF = THCTensor_(data)(state, output_features);
auto ip = input_features->size[1]; auto ip = input_features->size[1];
auto op = output_features->size[1]; auto op = output_features->size[1];
auto w = THCTensor_(data)(state, weight); auto w = THCTensor_(data)(state, weight);
double flops = 0;
if (bias) { if (bias) {
auto b = THCTensor_(data)(state, bias); auto b = THCTensor_(data)(state, bias);
...@@ -116,6 +121,7 @@ extern "C" double scn_DR_(ValidConvolution_updateOutput)( ...@@ -116,6 +121,7 @@ extern "C" double scn_DR_(ValidConvolution_updateOutput)(
dConvolution_forward2<real>(iF, oF, w, rbB, nHotB, ip, ip, op, op, dConvolution_forward2<real>(iF, oF, w, rbB, nHotB, ip, ip, op, op,
THCState_getCurrentStream(state)); THCState_getCurrentStream(state));
, w += c; flops += nHotB * c;) , w += c; flops += nHotB * c;)
}
return flops; return flops;
} }
...@@ -126,10 +132,11 @@ extern "C" void scn_DR_(ValidConvolution_backward)( ...@@ -126,10 +132,11 @@ extern "C" void scn_DR_(ValidConvolution_backward)(
THCTensor *d_bias, long filterVolume, THCITensor *rulesBuffer) { THCTensor *d_bias, long filterVolume, THCITensor *rulesBuffer) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m) SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto _rules = _m.getValidRuleBook(inputSize, filterSize, true); auto _rules = _m.getValidRuleBook(inputSize, filterSize, true);
uInt nActive = input_features->size[0]; uInt nActive = _m.getNActive(inputSize);
THCTensor_(resizeAs)(state, d_input_features, input_features); THCTensor_(resizeAs)(state, d_input_features, input_features);
THCTensor_(zero)(state, d_input_features); THCTensor_(zero)(state, d_input_features);
if (nActive) {
auto iF = THCTensor_(data)(state, input_features); auto iF = THCTensor_(data)(state, input_features);
auto diF = THCTensor_(data)(state, d_input_features); auto diF = THCTensor_(data)(state, d_input_features);
auto doF = THCTensor_(data)(state, d_output_features); auto doF = THCTensor_(data)(state, d_output_features);
...@@ -138,9 +145,9 @@ extern "C" void scn_DR_(ValidConvolution_backward)( ...@@ -138,9 +145,9 @@ extern "C" void scn_DR_(ValidConvolution_backward)(
auto w = THCTensor_(data)(state, weight); auto w = THCTensor_(data)(state, weight);
auto dw = THCTensor_(data)(state, d_weight); auto dw = THCTensor_(data)(state, d_weight);
uInt c = ip * op; uInt c = ip * op;
RULEBOOKITERATOR( RULEBOOKITERATOR(dConvolution_backward_dW2<real>(
dConvolution_backward_dW2<real>(iF, diF, doF, w, dw, rbB, nHotB, ip, ip, iF, diF, doF, w, dw, rbB, nHotB, ip, ip, op, op,
op, op, THCState_getCurrentStream(state)); THCState_getCurrentStream(state));
, w += c; dw += c;) , w += c; dw += c;)
if (d_bias) { if (d_bias) {
...@@ -148,6 +155,7 @@ extern "C" void scn_DR_(ValidConvolution_backward)( ...@@ -148,6 +155,7 @@ extern "C" void scn_DR_(ValidConvolution_backward)(
Convolution_bp_bias(doF, db, op, op, nActive, Convolution_bp_bias(doF, db, op, op, nActive,
THCState_getCurrentStream(state)); THCState_getCurrentStream(state));
} }
}
} }
#endif #endif
...@@ -184,7 +184,7 @@ dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, uInt *rules, ...@@ -184,7 +184,7 @@ dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, uInt *rules,
} }
} }
#define FOO(K, V) \ #define FOO(T, K, V) \
{ \ { \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \ if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
uInt o = (nHot / K) * K; \ uInt o = (nHot / K) * K; \
...@@ -208,10 +208,21 @@ void dConvolution_forward(T *inFeatures, T *outFeatures, T *w, uInt *rules, ...@@ -208,10 +208,21 @@ void dConvolution_forward(T *inFeatures, T *outFeatures, T *w, uInt *rules,
uInt nHot, uInt input_nPlanes, uInt input_stride, uInt nHot, uInt input_nPlanes, uInt input_stride,
uInt output_nPlanes, uInt output_stride, uInt output_nPlanes, uInt output_stride,
cudaStream_t stream) { cudaStream_t stream) {
FOO(64, 16) FOO(T, 64, 16)
FOO(32, 8) FOO(T, 32, 8)
FOO(16, 4) FOO(T, 16, 4)
FOO(8, 2) FOO(T, 8, 2)
assert(false);
}
template <>
void dConvolution_forward<double>(double *inFeatures, double *outFeatures,
double *w, uInt *rules, uInt nHot,
uInt input_nPlanes, uInt input_stride,
uInt output_nPlanes, uInt output_stride,
cudaStream_t stream) {
FOO(double, 32, 8)
FOO(double, 16, 4)
FOO(double, 8, 2)
assert(false); assert(false);
} }
#undef FOO #undef FOO
...@@ -378,7 +389,7 @@ dConvolution_KMxKN_backward_dW_B(T *inFeatures, T *dInFeatures, T *dOutFeatures, ...@@ -378,7 +389,7 @@ dConvolution_KMxKN_backward_dW_B(T *inFeatures, T *dInFeatures, T *dOutFeatures,
} }
} }
#define FOO(K, V) \ #define FOO(T, K, V) \
{ \ { \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \ if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
uInt o = (nHot / K) * K; \ uInt o = (nHot / K) * K; \
...@@ -404,9 +415,9 @@ void dConvolution_backward_dW(T *inFeatures, T *dInFeatures, T *dOutFeatures, ...@@ -404,9 +415,9 @@ void dConvolution_backward_dW(T *inFeatures, T *dInFeatures, T *dOutFeatures,
uInt input_nPlanes, uInt input_stride, uInt input_nPlanes, uInt input_stride,
uInt output_nPlanes, uInt output_stride, uInt output_nPlanes, uInt output_stride,
cudaStream_t stream) { cudaStream_t stream) {
FOO(32, 8) FOO(T, 32, 8)
FOO(16, 4) FOO(T, 16, 4)
FOO(8, 2) FOO(T, 8, 2)
assert(false); assert(false);
} }
#undef FOO #undef FOO
......
...@@ -153,7 +153,7 @@ dDeconvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, uInt *rules, ...@@ -153,7 +153,7 @@ dDeconvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, uInt *rules,
} }
} }
#define FOO(K, V) \ #define FOO(T, K, V) \
{ \ { \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \ if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
uInt o = (nHot / K) * K; \ uInt o = (nHot / K) * K; \
...@@ -177,10 +177,21 @@ void dDeconvolution_forward(T *inFeatures, T *outFeatures, T *w, uInt *rules, ...@@ -177,10 +177,21 @@ void dDeconvolution_forward(T *inFeatures, T *outFeatures, T *w, uInt *rules,
uInt nHot, uInt input_nPlanes, uInt input_stride, uInt nHot, uInt input_nPlanes, uInt input_stride,
uInt output_nPlanes, uInt output_stride, uInt output_nPlanes, uInt output_stride,
cudaStream_t stream) { cudaStream_t stream) {
FOO(64, 16) FOO(T, 64, 16)
FOO(32, 8) FOO(T, 32, 8)
FOO(16, 4) FOO(T, 16, 4)
FOO(8, 2) FOO(T, 8, 2)
assert(false);
}
template <>
void dDeconvolution_forward<double>(double *inFeatures, double *outFeatures,
double *w, uInt *rules, uInt nHot,
uInt input_nPlanes, uInt input_stride,
uInt output_nPlanes, uInt output_stride,
cudaStream_t stream) {
FOO(double, 32, 8)
FOO(double, 16, 4)
FOO(double, 8, 2)
assert(false); assert(false);
} }
#undef FOO #undef FOO
...@@ -345,7 +356,7 @@ __global__ void dDeconvolution_KMxKN_backward_dW_B( ...@@ -345,7 +356,7 @@ __global__ void dDeconvolution_KMxKN_backward_dW_B(
} }
} }
#define FOO(K, V) \ #define FOO(T, K, V) \
{ \ { \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \ if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
uInt o = (nHot / K) * K; \ uInt o = (nHot / K) * K; \
...@@ -371,9 +382,9 @@ void dDeconvolution_backward_dW(T *inFeatures, T *dInFeatures, T *dOutFeatures, ...@@ -371,9 +382,9 @@ void dDeconvolution_backward_dW(T *inFeatures, T *dInFeatures, T *dOutFeatures,
uInt input_nPlanes, uInt input_stride, uInt input_nPlanes, uInt input_stride,
uInt output_nPlanes, uInt output_stride, uInt output_nPlanes, uInt output_stride,
cudaStream_t stream) { cudaStream_t stream) {
FOO(32, 8) FOO(T, 32, 8)
FOO(16, 4) FOO(T, 16, 4)
FOO(8, 2) FOO(T, 8, 2)
assert(false); assert(false);
} }
#undef FOO #undef FOO
......
...@@ -9,50 +9,54 @@ ...@@ -9,50 +9,54 @@
#else #else
#include "SparseToDense.h" #include "SparseToDense.h"
extern "C" void scn_DR_(SparseToDense_updateOutput)(THLongTensor *inputSize, extern "C" void scn_DR_(SparseToDense_updateOutput)(
void **m, THLongTensor *inputSize, void **m, THCTensor *input_features,
THCTensor *input_features, THCTensor *output_features, THCITensor *rulesBuffer, long nPlanes) {
THCTensor *output_features,
THCITensor *rulesBuffer) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m) { SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
long spatialVolume = 1;
{
long sz[Dimension + 2]; long sz[Dimension + 2];
sz[0] = _m.inputSGs->size(); sz[0] = _m.grids.begin()->second.size();
sz[1] = input_features->size[1]; sz[1] = nPlanes; // input_features->size[1];
for (int i = 0; i < Dimension; i++) { for (int i = 0; i < Dimension; i++) {
auto x = THLongTensor_data(inputSize)[i]; auto x = THLongTensor_data(inputSize)[i];
sz[i + 2] = x; sz[i + 2] = x;
spatialVolume *= x;
} }
THCTensor_(resizeNd)(state, output_features, Dimension + 2, sz, NULL); THCTensor_(resizeNd)(state, output_features, Dimension + 2, sz, NULL);
THCTensor_(zero)(state, output_features); THCTensor_(zero)(state, output_features);
} }
if (input_features->nDimension == 2) {
auto _rules = _m.getSparseToDenseRuleBook(inputSize, true); auto _rules = _m.getSparseToDenseRuleBook(inputSize, true);
auto spatialVolume = _rules.size();
uInt nPlanes = input_features->size[1]; uInt nPlanes = input_features->size[1];
auto iF = THCTensor_(data)(state, input_features); auto iF = THCTensor_(data)(state, input_features);
auto oF = THCTensor_(data)(state, output_features); auto oF = THCTensor_(data)(state, output_features);
RULEBOOKITERATOR( RULEBOOKITERATOR(
SparseToDense_ForwardPass<real>(THCState_getCurrentStream(state), iF, oF, SparseToDense_ForwardPass<real>(THCState_getCurrentStream(state), iF,
nPlanes, spatialVolume, rbB, nHotB); oF, nPlanes, spatialVolume, rbB, nHotB);
, oF++;) // todo check ++ or +=spatialVolume????zzz , oF += nPlanes * spatialVolume;)
}
} }
extern "C" void scn_DR_(SparseToDense_updateGradInput)( extern "C" void scn_DR_(SparseToDense_updateGradInput)(
THLongTensor *inputSize, void **m, THCTensor *input_features, THLongTensor *inputSize, void **m, THCTensor *input_features,
THCTensor *d_input_features, THCTensor *d_output_features, THCTensor *d_input_features, THCTensor *d_output_features,
THCITensor *rulesBuffer) { THCITensor *rulesBuffer) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
THCTensor_(resizeAs)(state, d_input_features, input_features); THCTensor_(resizeAs)(state, d_input_features, input_features);
THCTensor_(zero)(state, d_input_features); THCTensor_(zero)(state, d_input_features);
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m) if (input_features->nDimension == 2) {
auto _rules = _m.getSparseToDenseRuleBook(inputSize, true); auto _rules = _m.getSparseToDenseRuleBook(inputSize, true);
auto spatialVolume = _rules.size(); long spatialVolume = THLongTensor_prodall(inputSize);
uInt nPlanes = d_input_features->size[1]; uInt nPlanes = d_input_features->size[1];
auto diF = THCTensor_(data)(state, d_input_features); auto diF = THCTensor_(data)(state, d_input_features);
auto doF = THCTensor_(data)(state, d_output_features); auto doF = THCTensor_(data)(state, d_output_features);
RULEBOOKITERATOR( RULEBOOKITERATOR(SparseToDense_BackwardPass<real>(
SparseToDense_BackwardPass<real>(THCState_getCurrentStream(state), diF, THCState_getCurrentStream(state), diF, doF, nPlanes,
doF, nPlanes, spatialVolume, rbB, nHotB); spatialVolume, rbB, nHotB);
, doF++;) , doF += nPlanes * spatialVolume;)
}
} }
#endif #endif
...@@ -12,7 +12,8 @@ ...@@ -12,7 +12,8 @@
// NTX must be >=2 so r is filled properly // NTX must be >=2 so r is filled properly
template <typename T, uInt NTX, uInt NTY> template <typename T, uInt NTX, uInt NTY>
__global__ void SparseToDense_fp(T *input_features, T *output_features, __global__ void SparseToDense_fp(T *input_features, T *output_features,
uInt nPlanes, uInt spatialVolume, uInt *rules, uInt nHot) { uInt nPlanes, uInt spatialVolume, uInt *rules,
uInt nHot) {
__shared__ uInt r[NTY * 2]; __shared__ uInt r[NTY * 2];
for (uInt n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) { for (uInt n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
{ {
...@@ -22,10 +23,10 @@ __global__ void SparseToDense_fp(T *input_features, T *output_features, ...@@ -22,10 +23,10 @@ __global__ void SparseToDense_fp(T *input_features, T *output_features,
} }
__syncthreads(); __syncthreads();
if (n + threadIdx.y < nHot) { if (n + threadIdx.y < nHot) {
T *i = &input_features[r[2 * threadIdx.y] * nPlanes]; T *i = input_features + r[2 * threadIdx.y] * nPlanes;
T *o = &output_features[r[2*threadIdx.y+1]*spatialVolume*nPlanes]; T *o = output_features + r[2 * threadIdx.y + 1];
for (uInt plane = threadIdx.x; plane < nPlanes; plane += NTX) for (uInt plane = threadIdx.x; plane < nPlanes; plane += NTX)
o[plane*spatialVolume]=i[plane]; o[plane * spatialVolume] = i[plane];
} }
__syncthreads(); __syncthreads();
} }
...@@ -34,15 +35,15 @@ __global__ void SparseToDense_fp(T *input_features, T *output_features, ...@@ -34,15 +35,15 @@ __global__ void SparseToDense_fp(T *input_features, T *output_features,
template <typename T> template <typename T>
void SparseToDense_ForwardPass(cudaStream_t stream, T *input_features, void SparseToDense_ForwardPass(cudaStream_t stream, T *input_features,
T *output_features, uInt nPlanes, T *output_features, uInt nPlanes,
uInt spatialVolume, uInt spatialVolume, uInt *rules, uInt nHot) {
uInt *rules, uInt nHot) { SparseToDense_fp<T, 32, 32> << <32, dim3(32, 32), 0, stream>>>
SparseToDense_fp<T, 32, 32><<<32, dim3(32, 32), 0, stream>>>( (input_features, output_features, nPlanes, spatialVolume, rules, nHot);
input_features, output_features, nPlanes, spatialVolume, rules, nHot);
} }
// NTX must be >=2 so r is filled properly // NTX must be >=2 so r is filled properly
template <typename T, uInt NTX, uInt NTY> template <typename T, uInt NTX, uInt NTY>
__global__ void SparseToDense_bp(T *d_input_features, T *d_output_features, __global__ void SparseToDense_bp(T *d_input_features, T *d_output_features,
uInt nPlanes, uInt spatialVolume, uInt *rules, uInt nHot) { uInt nPlanes, uInt spatialVolume, uInt *rules,
uInt nHot) {
__shared__ uInt r[NTY * 2]; __shared__ uInt r[NTY * 2];
for (uInt n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) { for (uInt n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
{ {
...@@ -52,10 +53,10 @@ __global__ void SparseToDense_bp(T *d_input_features, T *d_output_features, ...@@ -52,10 +53,10 @@ __global__ void SparseToDense_bp(T *d_input_features, T *d_output_features,
} }
__syncthreads(); __syncthreads();
if (n + threadIdx.y < nHot) { if (n + threadIdx.y < nHot) {
T *i = &d_input_features[r[2 * threadIdx.y] * nPlanes]; T *d_i = d_input_features + r[2 * threadIdx.y] * nPlanes;
T *o = &d_output_features[r[2*threadIdx.y+1]*spatialVolume*nPlanes]; T *d_o = d_output_features + r[2 * threadIdx.y + 1];
for (uInt plane = threadIdx.x; plane < nPlanes; plane += NTX) for (uInt plane = threadIdx.x; plane < nPlanes; plane += NTX)
i[plane]=o[plane*spatialVolume]; d_i[plane] = d_o[plane * spatialVolume];
} }
__syncthreads(); __syncthreads();
} }
...@@ -64,9 +65,9 @@ __global__ void SparseToDense_bp(T *d_input_features, T *d_output_features, ...@@ -64,9 +65,9 @@ __global__ void SparseToDense_bp(T *d_input_features, T *d_output_features,
template <typename T> template <typename T>
void SparseToDense_BackwardPass(cudaStream_t stream, T *d_input_features, void SparseToDense_BackwardPass(cudaStream_t stream, T *d_input_features,
T *d_output_features, uInt nPlanes, T *d_output_features, uInt nPlanes,
uInt spatialVolume, uInt spatialVolume, uInt *rules, uInt nHot) {
uInt *rules, uInt nHot) { SparseToDense_bp<T, 32, 32> << <32, dim3(32, 32), 0, stream>>>
SparseToDense_bp<T, 32, 32><<<32, dim3(32, 32), 0, stream>>>( (d_input_features, d_output_features, nPlanes, spatialVolume, rules,
d_input_features, d_output_features, nPlanes, spatialVolume, rules, nHot); nHot);
} }
#endif /* GPU_SPARSETODENSE_H */ #endif /* GPU_SPARSETODENSE_H */
...@@ -27,4 +27,22 @@ ...@@ -27,4 +27,22 @@
#undef TH_REAL_IS_FLOAT #undef TH_REAL_IS_FLOAT
#undef THBLAS_GEMM #undef THBLAS_GEMM
// double
// #define real double
// #define accreal double
// #define Real Double
// #define CReal CudaDouble
// #define TH_REAL_IS_DOUBLE
// #define THBLAS_GEMM THCudaBlas_Dgemm
// #line 1 TH_GENERIC_FILE
// #include TH_GENERIC_FILE
// #undef accreal
// #undef real
// #undef Real
// #undef CReal
// #undef TH_REAL_IS_DOUBLE
// #undef THBLAS_GEMM
#undef TH_GENERIC_FILE #undef TH_GENERIC_FILE
...@@ -103,20 +103,26 @@ uInt Convolution_InputSgsToRulesAndOutputSgs_OMP( ...@@ -103,20 +103,26 @@ uInt Convolution_InputSgsToRulesAndOutputSgs_OMP(
return output_nActive; return output_nActive;
} }
// for each site in filterVolume, list of (inputFeatureNumber,batchIdx) pairs // for each active site, list of (inputFeatureNumber,batchIdx, spatialOffset)
// triples
template <uInt dimension> template <uInt dimension>
void SparseToDense_InputSgsToRulesAndOutputSgs( void SparseToDense_InputSgsToRulesAndOutputSgs(
SparseGrids<dimension> &input_SGs, RuleBook &rules, long *spatialSize) { SparseGrids<dimension> &input_SGs, RuleBook &rules, long *spatialSize) {
uInt batchSize = input_SGs.size(); uInt batchSize = input_SGs.size();
SparseGrids<dimension> output_SGs(batchSize);
std::vector<long> ones(dimension, 1);
rules.clear(); rules.clear();
for (uInt i = 0; i < batchSize; i++) { rules.resize(batchSize);
auto &iSG = input_SGs[i]; Point<dimension> lb, ub;
auto &oSG = output_SGs[i]; for (int i = 0; i < dimension; ++i) {
oSG.ctr = i; // batchIdx lb[i] = 0;
Convolution_InputSgToRulesAndOutputSg<dimension>( ub[i] = spatialSize[i] - 1;
iSG, oSG, rules, spatialSize, &ones[0], spatialSize, &ones[0]); }
auto region = RectangularRegion<dimension>(lb, ub);
for (uInt batchIdx = 0; batchIdx < batchSize; batchIdx++) {
auto &iSG = input_SGs[batchIdx];
for (auto const &inIter : iSG.mp) {
rules[batchIdx].push_back(inIter.second + iSG.ctr);
rules[batchIdx].push_back(region.offset(inIter.first));
}
} }
} }
...@@ -124,33 +130,21 @@ template <uInt dimension> ...@@ -124,33 +130,21 @@ template <uInt dimension>
void SparseToDense_InputSgsToRulesAndOutputSgs_OMP( void SparseToDense_InputSgsToRulesAndOutputSgs_OMP(
SparseGrids<dimension> &input_SGs, RuleBook &rules, long *spatialSize) { SparseGrids<dimension> &input_SGs, RuleBook &rules, long *spatialSize) {
uInt batchSize = input_SGs.size(); uInt batchSize = input_SGs.size();
SparseGrids<dimension> output_SGs(batchSize);
std::vector<long> ones(dimension, 1);
rules.clear(); rules.clear();
rules.resize(volume<dimension>(spatialSize)); rules.resize(batchSize);
std::vector<RuleBook> rbs(batchSize); Point<dimension> lb, ub;
{ for (int i = 0; i < dimension; ++i) {
uInt i; lb[i] = 0;
#pragma omp parallel for private(i) ub[i] = spatialSize[i] - 1;
for (i = 0; i < batchSize; i++) { }
output_SGs[i].ctr = i; // batchIdx auto region = RectangularRegion<dimension>(lb, ub);
Convolution_InputSgToRulesAndOutputSg<dimension>( uInt batchIdx;
input_SGs[i], output_SGs[i], rbs[i], spatialSize, &ones[0], #pragma omp parallel for private(batchIdx)
spatialSize, &ones[0]); for (batchIdx = 0; batchIdx < batchSize; batchIdx++) {
} auto &iSG = input_SGs[batchIdx];
} for (auto const &inIter : iSG.mp) {
{ rules[batchIdx].push_back(inIter.second + iSG.ctr);
uInt i; rules[batchIdx].push_back(region.offset(inIter.first));
#pragma omp parallel for private(i)
for (i = 0; i < rules.size(); i++) {
auto &R = rules[i];
for (uInt j = 0; j < batchSize; j++) {
auto &r = rbs[j][i];
for (uInt k = 0; k < r.size();) {
R.push_back(r[k++]);
R.push_back(r[k++]);
}
}
} }
} }
} }
......
...@@ -125,16 +125,15 @@ extern "C" void scn_D_(getSpatialLocations)(void **m, THLongTensor *spatialSize, ...@@ -125,16 +125,15 @@ extern "C" void scn_D_(getSpatialLocations)(void **m, THLongTensor *spatialSize,
} }
extern "C" void extern "C" void
scn_D_(createMetadataForDenseToSparse)(void **m, THLongTensor *spatialSize_, scn_D_(createMetadataForDenseToSparse)(void **m, THLongTensor *spatialSize_,
THLongTensor *pad_, THLongTensor *nz_, THLongTensor *nz_, long batchSize) {
long batchSize) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m) SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
_m.clear();
_m.setInputSpatialSize(spatialSize_); _m.setInputSpatialSize(spatialSize_);
_m.inputSGs->resize(batchSize); _m.inputSGs->resize(batchSize);
auto &nActive = *_m.inputNActive; auto &nActive = *_m.inputNActive;
nActive = nz_->size[0]; nActive = nz_->size[0];
auto nz = THLongTensor_data(nz_); auto nz = THLongTensor_data(nz_);
auto pad = THLongTensor_data(pad_);
auto spatialSize = THLongTensor_data(spatialSize_); auto spatialSize = THLongTensor_data(spatialSize_);
std::vector<uInt> br(batchSize + 1); std::vector<uInt> br(batchSize + 1);
...@@ -157,8 +156,7 @@ scn_D_(createMetadataForDenseToSparse)(void **m, THLongTensor *spatialSize_, ...@@ -157,8 +156,7 @@ scn_D_(createMetadataForDenseToSparse)(void **m, THLongTensor *spatialSize_,
for (uInt i = br[b]; i < br[b + 1]; i++) { for (uInt i = br[b]; i < br[b + 1]; i++) {
Point<Dimension> x; Point<Dimension> x;
for (uInt j = 0; j < Dimension; j++) { for (uInt j = 0; j < Dimension; j++) {
x[j] = nz[i * (Dimension + 1) + j + 1] + x[j] = nz[i * (Dimension + 1) + j + 1]; // 0-indexed
pad[b * Dimension + j]; // 0-indexed
} }
sg.mp[x] = i; sg.mp[x] = i;
} }
...@@ -281,6 +279,7 @@ extern "C" void scn_D_(generateRuleBooks2s2)(void **m) { ...@@ -281,6 +279,7 @@ extern "C" void scn_D_(generateRuleBooks2s2)(void **m) {
p2[i] = p3[i] = inS[i] = outS[i]; p2[i] = p3[i] = inS[i] = outS[i];
} }
} }
extern "C" void scn_D_(freeMetadata)(void **m) { extern "C" void scn_D_(freeMetadata)(void **m) {
SCN_DELETE(Metadata<Dimension>, m) SCN_DELETE(Metadata<Dimension>, m)
} }
......
...@@ -11,7 +11,6 @@ ...@@ -11,7 +11,6 @@
#include "ActivePoolingRules.h" #include "ActivePoolingRules.h"
#include "ConvolutionRules.h" #include "ConvolutionRules.h"
#include "ValidConvolutionRules.h" #include "ValidConvolutionRules.h"
#include <iostream>
#include <tuple> #include <tuple>
#include <unordered_map> #include <unordered_map>
...@@ -40,6 +39,18 @@ public: ...@@ -40,6 +39,18 @@ public:
uInt *inputNActive; uInt *inputNActive;
Metadata() {} Metadata() {}
void clear() {
nActive.clear();
grids.clear();
activePoolingRuleBooks.clear();
validRuleBooks.clear();
ruleBooks.clear();
sparseToDenseRuleBooks.clear();
inputSGs = nullptr;
inputSG = nullptr;
inputNActive = nullptr;
}
void setInputSpatialSize(THLongTensor *spatialSize) { void setInputSpatialSize(THLongTensor *spatialSize) {
inputSpatialSize = LongTensorToPoint<dimension>(spatialSize); inputSpatialSize = LongTensorToPoint<dimension>(spatialSize);
inputSGs = &grids[inputSpatialSize]; inputSGs = &grids[inputSpatialSize];
......
...@@ -6,8 +6,6 @@ ...@@ -6,8 +6,6 @@
#ifndef VALIDCONVOLUTIONRULES_H #ifndef VALIDCONVOLUTIONRULES_H
#define VALIDCONVOLUTIONRULES_H #define VALIDCONVOLUTIONRULES_H
#include<iostream>
// Full input region for an output point // Full input region for an output point
template <uInt dimension> template <uInt dimension>
...@@ -26,8 +24,8 @@ InputRegionCalculator_Valid(const Point<dimension> &output, long *size) { ...@@ -26,8 +24,8 @@ InputRegionCalculator_Valid(const Point<dimension> &output, long *size) {
// rules is used to carry out the "lowering" whilst carrying out the convolution // rules is used to carry out the "lowering" whilst carrying out the convolution
template <uInt dimension> template <uInt dimension>
double ValidConvolution_SgToRules(SparseGrid<dimension> &grid, double ValidConvolution_SgToRules(SparseGrid<dimension> &grid, RuleBook &rules,
RuleBook &rules, long *size) { long *size) {
uInt sd = volume<dimension>(size); uInt sd = volume<dimension>(size);
double countActiveInputs = 0; double countActiveInputs = 0;
for (auto const &outputIter : grid.mp) { for (auto const &outputIter : grid.mp) {
...@@ -48,8 +46,8 @@ double ValidConvolution_SgToRules(SparseGrid<dimension> &grid, ...@@ -48,8 +46,8 @@ double ValidConvolution_SgToRules(SparseGrid<dimension> &grid,
} }
template <uInt dimension> template <uInt dimension>
uInt ValidConvolution_SgsToRules(SparseGrids<dimension> &SGs, uInt ValidConvolution_SgsToRules(SparseGrids<dimension> &SGs, RuleBook &rules,
RuleBook &rules, long *size) { long *size) {
uInt sd = volume<dimension>(size); uInt sd = volume<dimension>(size);
uInt countActiveInputs = 0; uInt countActiveInputs = 0;
rules.clear(); rules.clear();
......
...@@ -8,13 +8,12 @@ long scn_readPtr(void **ptr); ...@@ -8,13 +8,12 @@ long scn_readPtr(void **ptr);
void scn_writePtr(long p, void **ptr); void scn_writePtr(long p, void **ptr);
double scn_ruleBookBits(void); double scn_ruleBookBits(void);
void scn_2_drawCurve(void **m, THFloatTensor *features, THFloatTensor *stroke); void scn_2_drawCurve(void **m, THFloatTensor *features, THFloatTensor *stroke);
double scn_1_addSampleFromThresholdedTensor( double scn_1_addSampleFromThresholdedTensor(
void **m, THFloatTensor *features_, THFloatTensor *tensor_, void **m, THFloatTensor *features_, THFloatTensor *tensor_,
THLongTensor *offset_, THLongTensor *spatialSize_, float threshold); THLongTensor *offset_, THLongTensor *spatialSize_, float threshold);
void scn_1_batchAddSample(void **m); void scn_1_batchAddSample(void **m);
void scn_1_createMetadataForDenseToSparse( void scn_1_createMetadataForDenseToSparse(
void **m, THLongTensor *spatialSize_, THLongTensor *pad, THLongTensor *nz, void **m, THLongTensor *spatialSize_, THLongTensor *nz, long batchSize);
long batchSize);
void scn_1_freeMetadata(void **metadata); void scn_1_freeMetadata(void **metadata);
void scn_1_generateRuleBooks3s2(void **m); void scn_1_generateRuleBooks3s2(void **m);
void scn_1_generateRuleBooks2s2(void **m); void scn_1_generateRuleBooks2s2(void **m);
...@@ -25,13 +24,13 @@ void scn_1_setInputSpatialLocations(void **m, THFloatTensor *features, ...@@ -25,13 +24,13 @@ void scn_1_setInputSpatialLocations(void **m, THFloatTensor *features,
THLongTensor *locations, THFloatTensor *vecs, _Bool overwrite); THLongTensor *locations, THFloatTensor *vecs, _Bool overwrite);
void scn_1_getSpatialLocations(void **m, THLongTensor *spatialSize, void scn_1_getSpatialLocations(void **m, THLongTensor *spatialSize,
THLongTensor *locations); THLongTensor *locations);
double scn_2_addSampleFromThresholdedTensor(
double scn_2_addSampleFromThresholdedTensor(
void **m, THFloatTensor *features_, THFloatTensor *tensor_, void **m, THFloatTensor *features_, THFloatTensor *tensor_,
THLongTensor *offset_, THLongTensor *spatialSize_, float threshold); THLongTensor *offset_, THLongTensor *spatialSize_, float threshold);
void scn_2_batchAddSample(void **m); void scn_2_batchAddSample(void **m);
void scn_2_createMetadataForDenseToSparse( void scn_2_createMetadataForDenseToSparse(
void **m, THLongTensor *spatialSize_, THLongTensor *pad, THLongTensor *nz, void **m, THLongTensor *spatialSize_, THLongTensor *nz, long batchSize);
long batchSize);
void scn_2_freeMetadata(void **metadata); void scn_2_freeMetadata(void **metadata);
void scn_2_generateRuleBooks3s2(void **m); void scn_2_generateRuleBooks3s2(void **m);
void scn_2_generateRuleBooks2s2(void **m); void scn_2_generateRuleBooks2s2(void **m);
...@@ -42,13 +41,13 @@ void scn_2_setInputSpatialLocations(void **m, THFloatTensor *features, ...@@ -42,13 +41,13 @@ void scn_2_setInputSpatialLocations(void **m, THFloatTensor *features,
THLongTensor *locations, THFloatTensor *vecs, _Bool overwrite); THLongTensor *locations, THFloatTensor *vecs, _Bool overwrite);
void scn_2_getSpatialLocations(void **m, THLongTensor *spatialSize, void scn_2_getSpatialLocations(void **m, THLongTensor *spatialSize,
THLongTensor *locations); THLongTensor *locations);
double scn_3_addSampleFromThresholdedTensor(
double scn_3_addSampleFromThresholdedTensor(
void **m, THFloatTensor *features_, THFloatTensor *tensor_, void **m, THFloatTensor *features_, THFloatTensor *tensor_,
THLongTensor *offset_, THLongTensor *spatialSize_, float threshold); THLongTensor *offset_, THLongTensor *spatialSize_, float threshold);
void scn_3_batchAddSample(void **m); void scn_3_batchAddSample(void **m);
void scn_3_createMetadataForDenseToSparse( void scn_3_createMetadataForDenseToSparse(
void **m, THLongTensor *spatialSize_, THLongTensor *pad, THLongTensor *nz, void **m, THLongTensor *spatialSize_, THLongTensor *nz, long batchSize);
long batchSize);
void scn_3_freeMetadata(void **metadata); void scn_3_freeMetadata(void **metadata);
void scn_3_generateRuleBooks3s2(void **m); void scn_3_generateRuleBooks3s2(void **m);
void scn_3_generateRuleBooks2s2(void **m); void scn_3_generateRuleBooks2s2(void **m);
...@@ -59,13 +58,13 @@ void scn_3_setInputSpatialLocations(void **m, THFloatTensor *features, ...@@ -59,13 +58,13 @@ void scn_3_setInputSpatialLocations(void **m, THFloatTensor *features,
THLongTensor *locations, THFloatTensor *vecs, _Bool overwrite); THLongTensor *locations, THFloatTensor *vecs, _Bool overwrite);
void scn_3_getSpatialLocations(void **m, THLongTensor *spatialSize, void scn_3_getSpatialLocations(void **m, THLongTensor *spatialSize,
THLongTensor *locations); THLongTensor *locations);
double scn_4_addSampleFromThresholdedTensor(
double scn_4_addSampleFromThresholdedTensor(
void **m, THFloatTensor *features_, THFloatTensor *tensor_, void **m, THFloatTensor *features_, THFloatTensor *tensor_,
THLongTensor *offset_, THLongTensor *spatialSize_, float threshold); THLongTensor *offset_, THLongTensor *spatialSize_, float threshold);
void scn_4_batchAddSample(void **m); void scn_4_batchAddSample(void **m);
void scn_4_createMetadataForDenseToSparse( void scn_4_createMetadataForDenseToSparse(
void **m, THLongTensor *spatialSize_, THLongTensor *pad, THLongTensor *nz, void **m, THLongTensor *spatialSize_, THLongTensor *nz, long batchSize);
long batchSize);
void scn_4_freeMetadata(void **metadata); void scn_4_freeMetadata(void **metadata);
void scn_4_generateRuleBooks3s2(void **m); void scn_4_generateRuleBooks3s2(void **m);
void scn_4_generateRuleBooks2s2(void **m); void scn_4_generateRuleBooks2s2(void **m);
...@@ -76,13 +75,13 @@ void scn_4_setInputSpatialLocations(void **m, THFloatTensor *features, ...@@ -76,13 +75,13 @@ void scn_4_setInputSpatialLocations(void **m, THFloatTensor *features,
THLongTensor *locations, THFloatTensor *vecs, _Bool overwrite); THLongTensor *locations, THFloatTensor *vecs, _Bool overwrite);
void scn_4_getSpatialLocations(void **m, THLongTensor *spatialSize, void scn_4_getSpatialLocations(void **m, THLongTensor *spatialSize,
THLongTensor *locations); THLongTensor *locations);
double scn_5_addSampleFromThresholdedTensor(
double scn_5_addSampleFromThresholdedTensor(
void **m, THFloatTensor *features_, THFloatTensor *tensor_, void **m, THFloatTensor *features_, THFloatTensor *tensor_,
THLongTensor *offset_, THLongTensor *spatialSize_, float threshold); THLongTensor *offset_, THLongTensor *spatialSize_, float threshold);
void scn_5_batchAddSample(void **m); void scn_5_batchAddSample(void **m);
void scn_5_createMetadataForDenseToSparse( void scn_5_createMetadataForDenseToSparse(
void **m, THLongTensor *spatialSize_, THLongTensor *pad, THLongTensor *nz, void **m, THLongTensor *spatialSize_, THLongTensor *nz, long batchSize);
long batchSize);
void scn_5_freeMetadata(void **metadata); void scn_5_freeMetadata(void **metadata);
void scn_5_generateRuleBooks3s2(void **m); void scn_5_generateRuleBooks3s2(void **m);
void scn_5_generateRuleBooks2s2(void **m); void scn_5_generateRuleBooks2s2(void **m);
...@@ -93,13 +92,13 @@ void scn_5_setInputSpatialLocations(void **m, THFloatTensor *features, ...@@ -93,13 +92,13 @@ void scn_5_setInputSpatialLocations(void **m, THFloatTensor *features,
THLongTensor *locations, THFloatTensor *vecs, _Bool overwrite); THLongTensor *locations, THFloatTensor *vecs, _Bool overwrite);
void scn_5_getSpatialLocations(void **m, THLongTensor *spatialSize, void scn_5_getSpatialLocations(void **m, THLongTensor *spatialSize,
THLongTensor *locations); THLongTensor *locations);
double scn_6_addSampleFromThresholdedTensor(
double scn_6_addSampleFromThresholdedTensor(
void **m, THFloatTensor *features_, THFloatTensor *tensor_, void **m, THFloatTensor *features_, THFloatTensor *tensor_,
THLongTensor *offset_, THLongTensor *spatialSize_, float threshold); THLongTensor *offset_, THLongTensor *spatialSize_, float threshold);
void scn_6_batchAddSample(void **m); void scn_6_batchAddSample(void **m);
void scn_6_createMetadataForDenseToSparse( void scn_6_createMetadataForDenseToSparse(
void **m, THLongTensor *spatialSize_, THLongTensor *pad, THLongTensor *nz, void **m, THLongTensor *spatialSize_, THLongTensor *nz, long batchSize);
long batchSize);
void scn_6_freeMetadata(void **metadata); void scn_6_freeMetadata(void **metadata);
void scn_6_generateRuleBooks3s2(void **m); void scn_6_generateRuleBooks3s2(void **m);
void scn_6_generateRuleBooks2s2(void **m); void scn_6_generateRuleBooks2s2(void **m);
...@@ -110,13 +109,13 @@ void scn_6_setInputSpatialLocations(void **m, THFloatTensor *features, ...@@ -110,13 +109,13 @@ void scn_6_setInputSpatialLocations(void **m, THFloatTensor *features,
THLongTensor *locations, THFloatTensor *vecs, _Bool overwrite); THLongTensor *locations, THFloatTensor *vecs, _Bool overwrite);
void scn_6_getSpatialLocations(void **m, THLongTensor *spatialSize, void scn_6_getSpatialLocations(void **m, THLongTensor *spatialSize,
THLongTensor *locations); THLongTensor *locations);
double scn_7_addSampleFromThresholdedTensor(
double scn_7_addSampleFromThresholdedTensor(
void **m, THFloatTensor *features_, THFloatTensor *tensor_, void **m, THFloatTensor *features_, THFloatTensor *tensor_,
THLongTensor *offset_, THLongTensor *spatialSize_, float threshold); THLongTensor *offset_, THLongTensor *spatialSize_, float threshold);
void scn_7_batchAddSample(void **m); void scn_7_batchAddSample(void **m);
void scn_7_createMetadataForDenseToSparse( void scn_7_createMetadataForDenseToSparse(
void **m, THLongTensor *spatialSize_, THLongTensor *pad, THLongTensor *nz, void **m, THLongTensor *spatialSize_, THLongTensor *nz, long batchSize);
long batchSize);
void scn_7_freeMetadata(void **metadata); void scn_7_freeMetadata(void **metadata);
void scn_7_generateRuleBooks3s2(void **m); void scn_7_generateRuleBooks3s2(void **m);
void scn_7_generateRuleBooks2s2(void **m); void scn_7_generateRuleBooks2s2(void **m);
...@@ -127,13 +126,13 @@ void scn_7_setInputSpatialLocations(void **m, THFloatTensor *features, ...@@ -127,13 +126,13 @@ void scn_7_setInputSpatialLocations(void **m, THFloatTensor *features,
THLongTensor *locations, THFloatTensor *vecs, _Bool overwrite); THLongTensor *locations, THFloatTensor *vecs, _Bool overwrite);
void scn_7_getSpatialLocations(void **m, THLongTensor *spatialSize, void scn_7_getSpatialLocations(void **m, THLongTensor *spatialSize,
THLongTensor *locations); THLongTensor *locations);
double scn_8_addSampleFromThresholdedTensor(
double scn_8_addSampleFromThresholdedTensor(
void **m, THFloatTensor *features_, THFloatTensor *tensor_, void **m, THFloatTensor *features_, THFloatTensor *tensor_,
THLongTensor *offset_, THLongTensor *spatialSize_, float threshold); THLongTensor *offset_, THLongTensor *spatialSize_, float threshold);
void scn_8_batchAddSample(void **m); void scn_8_batchAddSample(void **m);
void scn_8_createMetadataForDenseToSparse( void scn_8_createMetadataForDenseToSparse(
void **m, THLongTensor *spatialSize_, THLongTensor *pad, THLongTensor *nz, void **m, THLongTensor *spatialSize_, THLongTensor *nz, long batchSize);
long batchSize);
void scn_8_freeMetadata(void **metadata); void scn_8_freeMetadata(void **metadata);
void scn_8_generateRuleBooks3s2(void **m); void scn_8_generateRuleBooks3s2(void **m);
void scn_8_generateRuleBooks2s2(void **m); void scn_8_generateRuleBooks2s2(void **m);
...@@ -144,13 +143,13 @@ void scn_8_setInputSpatialLocations(void **m, THFloatTensor *features, ...@@ -144,13 +143,13 @@ void scn_8_setInputSpatialLocations(void **m, THFloatTensor *features,
THLongTensor *locations, THFloatTensor *vecs, _Bool overwrite); THLongTensor *locations, THFloatTensor *vecs, _Bool overwrite);
void scn_8_getSpatialLocations(void **m, THLongTensor *spatialSize, void scn_8_getSpatialLocations(void **m, THLongTensor *spatialSize,
THLongTensor *locations); THLongTensor *locations);
double scn_9_addSampleFromThresholdedTensor(
double scn_9_addSampleFromThresholdedTensor(
void **m, THFloatTensor *features_, THFloatTensor *tensor_, void **m, THFloatTensor *features_, THFloatTensor *tensor_,
THLongTensor *offset_, THLongTensor *spatialSize_, float threshold); THLongTensor *offset_, THLongTensor *spatialSize_, float threshold);
void scn_9_batchAddSample(void **m); void scn_9_batchAddSample(void **m);
void scn_9_createMetadataForDenseToSparse( void scn_9_createMetadataForDenseToSparse(
void **m, THLongTensor *spatialSize_, THLongTensor *pad, THLongTensor *nz, void **m, THLongTensor *spatialSize_, THLongTensor *nz, long batchSize);
long batchSize);
void scn_9_freeMetadata(void **metadata); void scn_9_freeMetadata(void **metadata);
void scn_9_generateRuleBooks3s2(void **m); void scn_9_generateRuleBooks3s2(void **m);
void scn_9_generateRuleBooks2s2(void **m); void scn_9_generateRuleBooks2s2(void **m);
...@@ -161,13 +160,13 @@ void scn_9_setInputSpatialLocations(void **m, THFloatTensor *features, ...@@ -161,13 +160,13 @@ void scn_9_setInputSpatialLocations(void **m, THFloatTensor *features,
THLongTensor *locations, THFloatTensor *vecs, _Bool overwrite); THLongTensor *locations, THFloatTensor *vecs, _Bool overwrite);
void scn_9_getSpatialLocations(void **m, THLongTensor *spatialSize, void scn_9_getSpatialLocations(void **m, THLongTensor *spatialSize,
THLongTensor *locations); THLongTensor *locations);
double scn_10_addSampleFromThresholdedTensor(
double scn_10_addSampleFromThresholdedTensor(
void **m, THFloatTensor *features_, THFloatTensor *tensor_, void **m, THFloatTensor *features_, THFloatTensor *tensor_,
THLongTensor *offset_, THLongTensor *spatialSize_, float threshold); THLongTensor *offset_, THLongTensor *spatialSize_, float threshold);
void scn_10_batchAddSample(void **m); void scn_10_batchAddSample(void **m);
void scn_10_createMetadataForDenseToSparse( void scn_10_createMetadataForDenseToSparse(
void **m, THLongTensor *spatialSize_, THLongTensor *pad, THLongTensor *nz, void **m, THLongTensor *spatialSize_, THLongTensor *nz, long batchSize);
long batchSize);
void scn_10_freeMetadata(void **metadata); void scn_10_freeMetadata(void **metadata);
void scn_10_generateRuleBooks3s2(void **m); void scn_10_generateRuleBooks3s2(void **m);
void scn_10_generateRuleBooks2s2(void **m); void scn_10_generateRuleBooks2s2(void **m);
...@@ -178,7 +177,8 @@ void scn_10_setInputSpatialLocations(void **m, THFloatTensor *features, ...@@ -178,7 +177,8 @@ void scn_10_setInputSpatialLocations(void **m, THFloatTensor *features,
THLongTensor *locations, THFloatTensor *vecs, _Bool overwrite); THLongTensor *locations, THFloatTensor *vecs, _Bool overwrite);
void scn_10_getSpatialLocations(void **m, THLongTensor *spatialSize, void scn_10_getSpatialLocations(void **m, THLongTensor *spatialSize,
THLongTensor *locations); THLongTensor *locations);
void scn_cpu_float_AffineReluTrivialConvolution_updateOutput(
void scn_cpu_float_AffineReluTrivialConvolution_updateOutput(
THFloatTensor *input_features, THFloatTensor *output_features, THFloatTensor *input_features, THFloatTensor *output_features,
THFloatTensor *affineWeight, THFloatTensor *affineBias, THFloatTensor *convWeight); THFloatTensor *affineWeight, THFloatTensor *affineBias, THFloatTensor *convWeight);
void scn_cpu_float_AffineReluTrivialConvolution_backward( void scn_cpu_float_AffineReluTrivialConvolution_backward(
...@@ -352,7 +352,7 @@ void scn_cpu_float1MaxPooling_updateGradInput( ...@@ -352,7 +352,7 @@ void scn_cpu_float1MaxPooling_updateGradInput(
// SparseToDense // SparseToDense
void scn_cpu_float1SparseToDense_updateOutput( void scn_cpu_float1SparseToDense_updateOutput(
THLongTensor *inputSize, void **m, THFloatTensor *input_features, THLongTensor *inputSize, void **m, THFloatTensor *input_features,
THFloatTensor *output_features, void *rulesBuffer); THFloatTensor *output_features, void *rulesBuffer, long nPlanes);
void scn_cpu_float1SparseToDense_updateGradInput( void scn_cpu_float1SparseToDense_updateGradInput(
THLongTensor *inputSize, void **m, THFloatTensor *input_features, THLongTensor *inputSize, void **m, THFloatTensor *input_features,
THFloatTensor *d_input_features, THFloatTensor *d_output_features, THFloatTensor *d_input_features, THFloatTensor *d_output_features,
...@@ -429,7 +429,7 @@ void scn_cpu_float2MaxPooling_updateGradInput( ...@@ -429,7 +429,7 @@ void scn_cpu_float2MaxPooling_updateGradInput(
// SparseToDense // SparseToDense
void scn_cpu_float2SparseToDense_updateOutput( void scn_cpu_float2SparseToDense_updateOutput(
THLongTensor *inputSize, void **m, THFloatTensor *input_features, THLongTensor *inputSize, void **m, THFloatTensor *input_features,
THFloatTensor *output_features, void *rulesBuffer); THFloatTensor *output_features, void *rulesBuffer, long nPlanes);
void scn_cpu_float2SparseToDense_updateGradInput( void scn_cpu_float2SparseToDense_updateGradInput(
THLongTensor *inputSize, void **m, THFloatTensor *input_features, THLongTensor *inputSize, void **m, THFloatTensor *input_features,
THFloatTensor *d_input_features, THFloatTensor *d_output_features, THFloatTensor *d_input_features, THFloatTensor *d_output_features,
...@@ -506,7 +506,7 @@ void scn_cpu_float3MaxPooling_updateGradInput( ...@@ -506,7 +506,7 @@ void scn_cpu_float3MaxPooling_updateGradInput(
// SparseToDense // SparseToDense
void scn_cpu_float3SparseToDense_updateOutput( void scn_cpu_float3SparseToDense_updateOutput(
THLongTensor *inputSize, void **m, THFloatTensor *input_features, THLongTensor *inputSize, void **m, THFloatTensor *input_features,
THFloatTensor *output_features, void *rulesBuffer); THFloatTensor *output_features, void *rulesBuffer, long nPlanes);
void scn_cpu_float3SparseToDense_updateGradInput( void scn_cpu_float3SparseToDense_updateGradInput(
THLongTensor *inputSize, void **m, THFloatTensor *input_features, THLongTensor *inputSize, void **m, THFloatTensor *input_features,
THFloatTensor *d_input_features, THFloatTensor *d_output_features, THFloatTensor *d_input_features, THFloatTensor *d_output_features,
...@@ -583,7 +583,7 @@ void scn_cpu_float4MaxPooling_updateGradInput( ...@@ -583,7 +583,7 @@ void scn_cpu_float4MaxPooling_updateGradInput(
// SparseToDense // SparseToDense
void scn_cpu_float4SparseToDense_updateOutput( void scn_cpu_float4SparseToDense_updateOutput(
THLongTensor *inputSize, void **m, THFloatTensor *input_features, THLongTensor *inputSize, void **m, THFloatTensor *input_features,
THFloatTensor *output_features, void *rulesBuffer); THFloatTensor *output_features, void *rulesBuffer, long nPlanes);
void scn_cpu_float4SparseToDense_updateGradInput( void scn_cpu_float4SparseToDense_updateGradInput(
THLongTensor *inputSize, void **m, THFloatTensor *input_features, THLongTensor *inputSize, void **m, THFloatTensor *input_features,
THFloatTensor *d_input_features, THFloatTensor *d_output_features, THFloatTensor *d_input_features, THFloatTensor *d_output_features,
...@@ -660,7 +660,7 @@ void scn_cpu_float5MaxPooling_updateGradInput( ...@@ -660,7 +660,7 @@ void scn_cpu_float5MaxPooling_updateGradInput(
// SparseToDense // SparseToDense
void scn_cpu_float5SparseToDense_updateOutput( void scn_cpu_float5SparseToDense_updateOutput(
THLongTensor *inputSize, void **m, THFloatTensor *input_features, THLongTensor *inputSize, void **m, THFloatTensor *input_features,
THFloatTensor *output_features, void *rulesBuffer); THFloatTensor *output_features, void *rulesBuffer, long nPlanes);
void scn_cpu_float5SparseToDense_updateGradInput( void scn_cpu_float5SparseToDense_updateGradInput(
THLongTensor *inputSize, void **m, THFloatTensor *input_features, THLongTensor *inputSize, void **m, THFloatTensor *input_features,
THFloatTensor *d_input_features, THFloatTensor *d_output_features, THFloatTensor *d_input_features, THFloatTensor *d_output_features,
...@@ -737,7 +737,7 @@ void scn_cpu_float6MaxPooling_updateGradInput( ...@@ -737,7 +737,7 @@ void scn_cpu_float6MaxPooling_updateGradInput(
// SparseToDense // SparseToDense
void scn_cpu_float6SparseToDense_updateOutput( void scn_cpu_float6SparseToDense_updateOutput(
THLongTensor *inputSize, void **m, THFloatTensor *input_features, THLongTensor *inputSize, void **m, THFloatTensor *input_features,
THFloatTensor *output_features, void *rulesBuffer); THFloatTensor *output_features, void *rulesBuffer, long nPlanes);
void scn_cpu_float6SparseToDense_updateGradInput( void scn_cpu_float6SparseToDense_updateGradInput(
THLongTensor *inputSize, void **m, THFloatTensor *input_features, THLongTensor *inputSize, void **m, THFloatTensor *input_features,
THFloatTensor *d_input_features, THFloatTensor *d_output_features, THFloatTensor *d_input_features, THFloatTensor *d_output_features,
...@@ -814,7 +814,7 @@ void scn_cpu_float7MaxPooling_updateGradInput( ...@@ -814,7 +814,7 @@ void scn_cpu_float7MaxPooling_updateGradInput(
// SparseToDense // SparseToDense
void scn_cpu_float7SparseToDense_updateOutput( void scn_cpu_float7SparseToDense_updateOutput(
THLongTensor *inputSize, void **m, THFloatTensor *input_features, THLongTensor *inputSize, void **m, THFloatTensor *input_features,
THFloatTensor *output_features, void *rulesBuffer); THFloatTensor *output_features, void *rulesBuffer, long nPlanes);
void scn_cpu_float7SparseToDense_updateGradInput( void scn_cpu_float7SparseToDense_updateGradInput(
THLongTensor *inputSize, void **m, THFloatTensor *input_features, THLongTensor *inputSize, void **m, THFloatTensor *input_features,
THFloatTensor *d_input_features, THFloatTensor *d_output_features, THFloatTensor *d_input_features, THFloatTensor *d_output_features,
...@@ -891,7 +891,7 @@ void scn_cpu_float8MaxPooling_updateGradInput( ...@@ -891,7 +891,7 @@ void scn_cpu_float8MaxPooling_updateGradInput(
// SparseToDense // SparseToDense
void scn_cpu_float8SparseToDense_updateOutput( void scn_cpu_float8SparseToDense_updateOutput(
THLongTensor *inputSize, void **m, THFloatTensor *input_features, THLongTensor *inputSize, void **m, THFloatTensor *input_features,
THFloatTensor *output_features, void *rulesBuffer); THFloatTensor *output_features, void *rulesBuffer, long nPlanes);
void scn_cpu_float8SparseToDense_updateGradInput( void scn_cpu_float8SparseToDense_updateGradInput(
THLongTensor *inputSize, void **m, THFloatTensor *input_features, THLongTensor *inputSize, void **m, THFloatTensor *input_features,
THFloatTensor *d_input_features, THFloatTensor *d_output_features, THFloatTensor *d_input_features, THFloatTensor *d_output_features,
...@@ -968,7 +968,7 @@ void scn_cpu_float9MaxPooling_updateGradInput( ...@@ -968,7 +968,7 @@ void scn_cpu_float9MaxPooling_updateGradInput(
// SparseToDense // SparseToDense
void scn_cpu_float9SparseToDense_updateOutput( void scn_cpu_float9SparseToDense_updateOutput(
THLongTensor *inputSize, void **m, THFloatTensor *input_features, THLongTensor *inputSize, void **m, THFloatTensor *input_features,
THFloatTensor *output_features, void *rulesBuffer); THFloatTensor *output_features, void *rulesBuffer, long nPlanes);
void scn_cpu_float9SparseToDense_updateGradInput( void scn_cpu_float9SparseToDense_updateGradInput(
THLongTensor *inputSize, void **m, THFloatTensor *input_features, THLongTensor *inputSize, void **m, THFloatTensor *input_features,
THFloatTensor *d_input_features, THFloatTensor *d_output_features, THFloatTensor *d_input_features, THFloatTensor *d_output_features,
...@@ -1045,7 +1045,8 @@ void scn_cpu_float10MaxPooling_updateGradInput( ...@@ -1045,7 +1045,8 @@ void scn_cpu_float10MaxPooling_updateGradInput(
// SparseToDense // SparseToDense
void scn_cpu_float10SparseToDense_updateOutput( void scn_cpu_float10SparseToDense_updateOutput(
THLongTensor *inputSize, void **m, THFloatTensor *input_features, THLongTensor *inputSize, void **m, THFloatTensor *input_features,
THFloatTensor *output_features, void *rulesBuffer); THFloatTensor *output_features, void *rulesBuffer, long nPlanes);
void scn_cpu_float10SparseToDense_updateGradInput( void scn_cpu_float10SparseToDense_updateGradInput(
THLongTensor *inputSize, void **m, THFloatTensor *input_features, THLongTensor *inputSize, void **m, THFloatTensor *input_features,
THFloatTensor *d_input_features, THFloatTensor *d_output_features, THFloatTensor *d_input_features, THFloatTensor *d_output_features,
...@@ -1122,7 +1123,7 @@ void scn_cpu_double1MaxPooling_updateGradInput( ...@@ -1122,7 +1123,7 @@ void scn_cpu_double1MaxPooling_updateGradInput(
// SparseToDense // SparseToDense
void scn_cpu_double1SparseToDense_updateOutput( void scn_cpu_double1SparseToDense_updateOutput(
THLongTensor *inputSize, void **m, THDoubleTensor *input_features, THLongTensor *inputSize, void **m, THDoubleTensor *input_features,
THDoubleTensor *output_features, void *rulesBuffer); THDoubleTensor *output_features, void *rulesBuffer, long nPlanes);
void scn_cpu_double1SparseToDense_updateGradInput( void scn_cpu_double1SparseToDense_updateGradInput(
THLongTensor *inputSize, void **m, THDoubleTensor *input_features, THLongTensor *inputSize, void **m, THDoubleTensor *input_features,
THDoubleTensor *d_input_features, THDoubleTensor *d_output_features, THDoubleTensor *d_input_features, THDoubleTensor *d_output_features,
...@@ -1199,7 +1200,7 @@ void scn_cpu_double2MaxPooling_updateGradInput( ...@@ -1199,7 +1200,7 @@ void scn_cpu_double2MaxPooling_updateGradInput(
// SparseToDense // SparseToDense
void scn_cpu_double2SparseToDense_updateOutput( void scn_cpu_double2SparseToDense_updateOutput(
THLongTensor *inputSize, void **m, THDoubleTensor *input_features, THLongTensor *inputSize, void **m, THDoubleTensor *input_features,
THDoubleTensor *output_features, void *rulesBuffer); THDoubleTensor *output_features, void *rulesBuffer, long nPlanes);
void scn_cpu_double2SparseToDense_updateGradInput( void scn_cpu_double2SparseToDense_updateGradInput(
THLongTensor *inputSize, void **m, THDoubleTensor *input_features, THLongTensor *inputSize, void **m, THDoubleTensor *input_features,
THDoubleTensor *d_input_features, THDoubleTensor *d_output_features, THDoubleTensor *d_input_features, THDoubleTensor *d_output_features,
...@@ -1276,7 +1277,7 @@ void scn_cpu_double3MaxPooling_updateGradInput( ...@@ -1276,7 +1277,7 @@ void scn_cpu_double3MaxPooling_updateGradInput(
// SparseToDense // SparseToDense
void scn_cpu_double3SparseToDense_updateOutput( void scn_cpu_double3SparseToDense_updateOutput(
THLongTensor *inputSize, void **m, THDoubleTensor *input_features, THLongTensor *inputSize, void **m, THDoubleTensor *input_features,
THDoubleTensor *output_features, void *rulesBuffer); THDoubleTensor *output_features, void *rulesBuffer, long nPlanes);
void scn_cpu_double3SparseToDense_updateGradInput( void scn_cpu_double3SparseToDense_updateGradInput(
THLongTensor *inputSize, void **m, THDoubleTensor *input_features, THLongTensor *inputSize, void **m, THDoubleTensor *input_features,
THDoubleTensor *d_input_features, THDoubleTensor *d_output_features, THDoubleTensor *d_input_features, THDoubleTensor *d_output_features,
...@@ -1353,7 +1354,7 @@ void scn_cpu_double4MaxPooling_updateGradInput( ...@@ -1353,7 +1354,7 @@ void scn_cpu_double4MaxPooling_updateGradInput(
// SparseToDense // SparseToDense
void scn_cpu_double4SparseToDense_updateOutput( void scn_cpu_double4SparseToDense_updateOutput(
THLongTensor *inputSize, void **m, THDoubleTensor *input_features, THLongTensor *inputSize, void **m, THDoubleTensor *input_features,
THDoubleTensor *output_features, void *rulesBuffer); THDoubleTensor *output_features, void *rulesBuffer, long nPlanes);
void scn_cpu_double4SparseToDense_updateGradInput( void scn_cpu_double4SparseToDense_updateGradInput(
THLongTensor *inputSize, void **m, THDoubleTensor *input_features, THLongTensor *inputSize, void **m, THDoubleTensor *input_features,
THDoubleTensor *d_input_features, THDoubleTensor *d_output_features, THDoubleTensor *d_input_features, THDoubleTensor *d_output_features,
...@@ -1430,7 +1431,7 @@ void scn_cpu_double5MaxPooling_updateGradInput( ...@@ -1430,7 +1431,7 @@ void scn_cpu_double5MaxPooling_updateGradInput(
// SparseToDense // SparseToDense
void scn_cpu_double5SparseToDense_updateOutput( void scn_cpu_double5SparseToDense_updateOutput(
THLongTensor *inputSize, void **m, THDoubleTensor *input_features, THLongTensor *inputSize, void **m, THDoubleTensor *input_features,
THDoubleTensor *output_features, void *rulesBuffer); THDoubleTensor *output_features, void *rulesBuffer, long nPlanes);
void scn_cpu_double5SparseToDense_updateGradInput( void scn_cpu_double5SparseToDense_updateGradInput(
THLongTensor *inputSize, void **m, THDoubleTensor *input_features, THLongTensor *inputSize, void **m, THDoubleTensor *input_features,
THDoubleTensor *d_input_features, THDoubleTensor *d_output_features, THDoubleTensor *d_input_features, THDoubleTensor *d_output_features,
...@@ -1507,7 +1508,7 @@ void scn_cpu_double6MaxPooling_updateGradInput( ...@@ -1507,7 +1508,7 @@ void scn_cpu_double6MaxPooling_updateGradInput(
// SparseToDense // SparseToDense
void scn_cpu_double6SparseToDense_updateOutput( void scn_cpu_double6SparseToDense_updateOutput(
THLongTensor *inputSize, void **m, THDoubleTensor *input_features, THLongTensor *inputSize, void **m, THDoubleTensor *input_features,
THDoubleTensor *output_features, void *rulesBuffer); THDoubleTensor *output_features, void *rulesBuffer, long nPlanes);
void scn_cpu_double6SparseToDense_updateGradInput( void scn_cpu_double6SparseToDense_updateGradInput(
THLongTensor *inputSize, void **m, THDoubleTensor *input_features, THLongTensor *inputSize, void **m, THDoubleTensor *input_features,
THDoubleTensor *d_input_features, THDoubleTensor *d_output_features, THDoubleTensor *d_input_features, THDoubleTensor *d_output_features,
...@@ -1584,7 +1585,7 @@ void scn_cpu_double7MaxPooling_updateGradInput( ...@@ -1584,7 +1585,7 @@ void scn_cpu_double7MaxPooling_updateGradInput(
// SparseToDense // SparseToDense
void scn_cpu_double7SparseToDense_updateOutput( void scn_cpu_double7SparseToDense_updateOutput(
THLongTensor *inputSize, void **m, THDoubleTensor *input_features, THLongTensor *inputSize, void **m, THDoubleTensor *input_features,
THDoubleTensor *output_features, void *rulesBuffer); THDoubleTensor *output_features, void *rulesBuffer, long nPlanes);
void scn_cpu_double7SparseToDense_updateGradInput( void scn_cpu_double7SparseToDense_updateGradInput(
THLongTensor *inputSize, void **m, THDoubleTensor *input_features, THLongTensor *inputSize, void **m, THDoubleTensor *input_features,
THDoubleTensor *d_input_features, THDoubleTensor *d_output_features, THDoubleTensor *d_input_features, THDoubleTensor *d_output_features,
...@@ -1661,7 +1662,7 @@ void scn_cpu_double8MaxPooling_updateGradInput( ...@@ -1661,7 +1662,7 @@ void scn_cpu_double8MaxPooling_updateGradInput(
// SparseToDense // SparseToDense
void scn_cpu_double8SparseToDense_updateOutput( void scn_cpu_double8SparseToDense_updateOutput(
THLongTensor *inputSize, void **m, THDoubleTensor *input_features, THLongTensor *inputSize, void **m, THDoubleTensor *input_features,
THDoubleTensor *output_features, void *rulesBuffer); THDoubleTensor *output_features, void *rulesBuffer, long nPlanes);
void scn_cpu_double8SparseToDense_updateGradInput( void scn_cpu_double8SparseToDense_updateGradInput(
THLongTensor *inputSize, void **m, THDoubleTensor *input_features, THLongTensor *inputSize, void **m, THDoubleTensor *input_features,
THDoubleTensor *d_input_features, THDoubleTensor *d_output_features, THDoubleTensor *d_input_features, THDoubleTensor *d_output_features,
...@@ -1738,7 +1739,7 @@ void scn_cpu_double9MaxPooling_updateGradInput( ...@@ -1738,7 +1739,7 @@ void scn_cpu_double9MaxPooling_updateGradInput(
// SparseToDense // SparseToDense
void scn_cpu_double9SparseToDense_updateOutput( void scn_cpu_double9SparseToDense_updateOutput(
THLongTensor *inputSize, void **m, THDoubleTensor *input_features, THLongTensor *inputSize, void **m, THDoubleTensor *input_features,
THDoubleTensor *output_features, void *rulesBuffer); THDoubleTensor *output_features, void *rulesBuffer, long nPlanes);
void scn_cpu_double9SparseToDense_updateGradInput( void scn_cpu_double9SparseToDense_updateGradInput(
THLongTensor *inputSize, void **m, THDoubleTensor *input_features, THLongTensor *inputSize, void **m, THDoubleTensor *input_features,
THDoubleTensor *d_input_features, THDoubleTensor *d_output_features, THDoubleTensor *d_input_features, THDoubleTensor *d_output_features,
...@@ -1815,7 +1816,7 @@ void scn_cpu_double10MaxPooling_updateGradInput( ...@@ -1815,7 +1816,7 @@ void scn_cpu_double10MaxPooling_updateGradInput(
// SparseToDense // SparseToDense
void scn_cpu_double10SparseToDense_updateOutput( void scn_cpu_double10SparseToDense_updateOutput(
THLongTensor *inputSize, void **m, THDoubleTensor *input_features, THLongTensor *inputSize, void **m, THDoubleTensor *input_features,
THDoubleTensor *output_features, void *rulesBuffer); THDoubleTensor *output_features, void *rulesBuffer, long nPlanes);
void scn_cpu_double10SparseToDense_updateGradInput( void scn_cpu_double10SparseToDense_updateGradInput(
THLongTensor *inputSize, void **m, THDoubleTensor *input_features, THLongTensor *inputSize, void **m, THDoubleTensor *input_features,
THDoubleTensor *d_input_features, THDoubleTensor *d_output_features, THDoubleTensor *d_input_features, THDoubleTensor *d_output_features,
......
...@@ -122,7 +122,8 @@ void scn_gpu_float1MaxPooling_updateGradInput( ...@@ -122,7 +122,8 @@ void scn_gpu_float1MaxPooling_updateGradInput(
// SparseToDense // SparseToDense
void scn_gpu_float1SparseToDense_updateOutput( void scn_gpu_float1SparseToDense_updateOutput(
THLongTensor *inputSize, void **m, THCudaTensor *input_features, THLongTensor *inputSize, void **m, THCudaTensor *input_features,
THCudaTensor *output_features, THCudaIntTensor *rulesBuffer); THCudaTensor *output_features, THCudaIntTensor *rulesBuffer,
long nPlanes);
void scn_gpu_float1SparseToDense_updateGradInput( void scn_gpu_float1SparseToDense_updateGradInput(
THLongTensor *inputSize, void **m, THCudaTensor *input_features, THLongTensor *inputSize, void **m, THCudaTensor *input_features,
THCudaTensor *d_input_features, THCudaTensor *d_output_features, THCudaTensor *d_input_features, THCudaTensor *d_output_features,
...@@ -199,7 +200,7 @@ void scn_gpu_float2MaxPooling_updateGradInput( ...@@ -199,7 +200,7 @@ void scn_gpu_float2MaxPooling_updateGradInput(
// SparseToDense // SparseToDense
void scn_gpu_float2SparseToDense_updateOutput( void scn_gpu_float2SparseToDense_updateOutput(
THLongTensor *inputSize, void **m, THCudaTensor *input_features, THLongTensor *inputSize, void **m, THCudaTensor *input_features,
THCudaTensor *output_features, THCudaIntTensor *rulesBuffer); THCudaTensor *output_features, THCudaIntTensor *rulesBuffer, long nPlanes);
void scn_gpu_float2SparseToDense_updateGradInput( void scn_gpu_float2SparseToDense_updateGradInput(
THLongTensor *inputSize, void **m, THCudaTensor *input_features, THLongTensor *inputSize, void **m, THCudaTensor *input_features,
THCudaTensor *d_input_features, THCudaTensor *d_output_features, THCudaTensor *d_input_features, THCudaTensor *d_output_features,
...@@ -276,7 +277,7 @@ void scn_gpu_float3MaxPooling_updateGradInput( ...@@ -276,7 +277,7 @@ void scn_gpu_float3MaxPooling_updateGradInput(
// SparseToDense // SparseToDense
void scn_gpu_float3SparseToDense_updateOutput( void scn_gpu_float3SparseToDense_updateOutput(
THLongTensor *inputSize, void **m, THCudaTensor *input_features, THLongTensor *inputSize, void **m, THCudaTensor *input_features,
THCudaTensor *output_features, THCudaIntTensor *rulesBuffer); THCudaTensor *output_features, THCudaIntTensor *rulesBuffer, long nPlanes);
void scn_gpu_float3SparseToDense_updateGradInput( void scn_gpu_float3SparseToDense_updateGradInput(
THLongTensor *inputSize, void **m, THCudaTensor *input_features, THLongTensor *inputSize, void **m, THCudaTensor *input_features,
THCudaTensor *d_input_features, THCudaTensor *d_output_features, THCudaTensor *d_input_features, THCudaTensor *d_output_features,
...@@ -353,7 +354,7 @@ void scn_gpu_float4MaxPooling_updateGradInput( ...@@ -353,7 +354,7 @@ void scn_gpu_float4MaxPooling_updateGradInput(
// SparseToDense // SparseToDense
void scn_gpu_float4SparseToDense_updateOutput( void scn_gpu_float4SparseToDense_updateOutput(
THLongTensor *inputSize, void **m, THCudaTensor *input_features, THLongTensor *inputSize, void **m, THCudaTensor *input_features,
THCudaTensor *output_features, THCudaIntTensor *rulesBuffer); THCudaTensor *output_features, THCudaIntTensor *rulesBuffer, long nPlanes);
void scn_gpu_float4SparseToDense_updateGradInput( void scn_gpu_float4SparseToDense_updateGradInput(
THLongTensor *inputSize, void **m, THCudaTensor *input_features, THLongTensor *inputSize, void **m, THCudaTensor *input_features,
THCudaTensor *d_input_features, THCudaTensor *d_output_features, THCudaTensor *d_input_features, THCudaTensor *d_output_features,
...@@ -430,7 +431,7 @@ void scn_gpu_float5MaxPooling_updateGradInput( ...@@ -430,7 +431,7 @@ void scn_gpu_float5MaxPooling_updateGradInput(
// SparseToDense // SparseToDense
void scn_gpu_float5SparseToDense_updateOutput( void scn_gpu_float5SparseToDense_updateOutput(
THLongTensor *inputSize, void **m, THCudaTensor *input_features, THLongTensor *inputSize, void **m, THCudaTensor *input_features,
THCudaTensor *output_features, THCudaIntTensor *rulesBuffer); THCudaTensor *output_features, THCudaIntTensor *rulesBuffer, long nPlanes);
void scn_gpu_float5SparseToDense_updateGradInput( void scn_gpu_float5SparseToDense_updateGradInput(
THLongTensor *inputSize, void **m, THCudaTensor *input_features, THLongTensor *inputSize, void **m, THCudaTensor *input_features,
THCudaTensor *d_input_features, THCudaTensor *d_output_features, THCudaTensor *d_input_features, THCudaTensor *d_output_features,
...@@ -507,7 +508,7 @@ void scn_gpu_float6MaxPooling_updateGradInput( ...@@ -507,7 +508,7 @@ void scn_gpu_float6MaxPooling_updateGradInput(
// SparseToDense // SparseToDense
void scn_gpu_float6SparseToDense_updateOutput( void scn_gpu_float6SparseToDense_updateOutput(
THLongTensor *inputSize, void **m, THCudaTensor *input_features, THLongTensor *inputSize, void **m, THCudaTensor *input_features,
THCudaTensor *output_features, THCudaIntTensor *rulesBuffer); THCudaTensor *output_features, THCudaIntTensor *rulesBuffer, long nPlanes);
void scn_gpu_float6SparseToDense_updateGradInput( void scn_gpu_float6SparseToDense_updateGradInput(
THLongTensor *inputSize, void **m, THCudaTensor *input_features, THLongTensor *inputSize, void **m, THCudaTensor *input_features,
THCudaTensor *d_input_features, THCudaTensor *d_output_features, THCudaTensor *d_input_features, THCudaTensor *d_output_features,
...@@ -584,7 +585,7 @@ void scn_gpu_float7MaxPooling_updateGradInput( ...@@ -584,7 +585,7 @@ void scn_gpu_float7MaxPooling_updateGradInput(
// SparseToDense // SparseToDense
void scn_gpu_float7SparseToDense_updateOutput( void scn_gpu_float7SparseToDense_updateOutput(
THLongTensor *inputSize, void **m, THCudaTensor *input_features, THLongTensor *inputSize, void **m, THCudaTensor *input_features,
THCudaTensor *output_features, THCudaIntTensor *rulesBuffer); THCudaTensor *output_features, THCudaIntTensor *rulesBuffer, long nPlanes);
void scn_gpu_float7SparseToDense_updateGradInput( void scn_gpu_float7SparseToDense_updateGradInput(
THLongTensor *inputSize, void **m, THCudaTensor *input_features, THLongTensor *inputSize, void **m, THCudaTensor *input_features,
THCudaTensor *d_input_features, THCudaTensor *d_output_features, THCudaTensor *d_input_features, THCudaTensor *d_output_features,
...@@ -661,7 +662,7 @@ void scn_gpu_float8MaxPooling_updateGradInput( ...@@ -661,7 +662,7 @@ void scn_gpu_float8MaxPooling_updateGradInput(
// SparseToDense // SparseToDense
void scn_gpu_float8SparseToDense_updateOutput( void scn_gpu_float8SparseToDense_updateOutput(
THLongTensor *inputSize, void **m, THCudaTensor *input_features, THLongTensor *inputSize, void **m, THCudaTensor *input_features,
THCudaTensor *output_features, THCudaIntTensor *rulesBuffer); THCudaTensor *output_features, THCudaIntTensor *rulesBuffer, long nPlanes);
void scn_gpu_float8SparseToDense_updateGradInput( void scn_gpu_float8SparseToDense_updateGradInput(
THLongTensor *inputSize, void **m, THCudaTensor *input_features, THLongTensor *inputSize, void **m, THCudaTensor *input_features,
THCudaTensor *d_input_features, THCudaTensor *d_output_features, THCudaTensor *d_input_features, THCudaTensor *d_output_features,
...@@ -738,7 +739,7 @@ void scn_gpu_float9MaxPooling_updateGradInput( ...@@ -738,7 +739,7 @@ void scn_gpu_float9MaxPooling_updateGradInput(
// SparseToDense // SparseToDense
void scn_gpu_float9SparseToDense_updateOutput( void scn_gpu_float9SparseToDense_updateOutput(
THLongTensor *inputSize, void **m, THCudaTensor *input_features, THLongTensor *inputSize, void **m, THCudaTensor *input_features,
THCudaTensor *output_features, THCudaIntTensor *rulesBuffer); THCudaTensor *output_features, THCudaIntTensor *rulesBuffer, long nPlanes);
void scn_gpu_float9SparseToDense_updateGradInput( void scn_gpu_float9SparseToDense_updateGradInput(
THLongTensor *inputSize, void **m, THCudaTensor *input_features, THLongTensor *inputSize, void **m, THCudaTensor *input_features,
THCudaTensor *d_input_features, THCudaTensor *d_output_features, THCudaTensor *d_input_features, THCudaTensor *d_output_features,
...@@ -815,7 +816,7 @@ void scn_gpu_float10MaxPooling_updateGradInput( ...@@ -815,7 +816,7 @@ void scn_gpu_float10MaxPooling_updateGradInput(
// SparseToDense // SparseToDense
void scn_gpu_float10SparseToDense_updateOutput( void scn_gpu_float10SparseToDense_updateOutput(
THLongTensor *inputSize, void **m, THCudaTensor *input_features, THLongTensor *inputSize, void **m, THCudaTensor *input_features,
THCudaTensor *output_features, THCudaIntTensor *rulesBuffer); THCudaTensor *output_features, THCudaIntTensor *rulesBuffer, long nPlanes);
void scn_gpu_float10SparseToDense_updateGradInput( void scn_gpu_float10SparseToDense_updateGradInput(
THLongTensor *inputSize, void **m, THCudaTensor *input_features, THLongTensor *inputSize, void **m, THCudaTensor *input_features,
THCudaTensor *d_input_features, THCudaTensor *d_output_features, THCudaTensor *d_input_features, THCudaTensor *d_output_features,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment