Commit 1df7b845 authored by Benjamin Thomas Graham's avatar Benjamin Thomas Graham
Browse files

3d segmantation

parent f2e3800b
......@@ -16,7 +16,7 @@ extern "C" double scn_DR_(Convolution_updateOutput)(
THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *filterSize,
THLongTensor *filterStride, void **m, THCTensor *input_features,
THCTensor *output_features, THCTensor *weight, THCTensor *bias,
long filterVolume, THCITensor *rulesBuffer) {
long filterVolume) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto _rules =
_m.getRuleBook(inputSize, outputSize, filterSize, filterStride, true);
......@@ -38,9 +38,9 @@ extern "C" double scn_DR_(Convolution_updateOutput)(
for (uInt i = 0; i < op; i += 32) {
uInt blockDim = min(32L, op - i);
uInt gridDim = min(4096, nActive);
Convolution_fp_bias
<< <gridDim, blockDim, 0, THCState_getCurrentStream(state)>>>
(oF + i, b + i, op, op, nActive);
Convolution_fp_bias<<<gridDim, blockDim, 0,
THCState_getCurrentStream(state)>>>(
oF + i, b + i, op, op, nActive);
}
}
uInt c = ip * op;
......@@ -57,7 +57,7 @@ extern "C" void scn_DR_(Convolution_backward)(
THLongTensor *filterStride, void **m, THCTensor *input_features,
THCTensor *d_input_features, THCTensor *d_output_features,
THCTensor *weight, THCTensor *d_weight, THCTensor *d_bias,
long filterVolume, THCITensor *rulesBuffer) {
long filterVolume) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto _rules =
_m.getRuleBook(inputSize, outputSize, filterSize, filterStride, true);
......@@ -90,7 +90,7 @@ extern "C" void scn_DR_(Convolution_backward)(
extern "C" double scn_DR_(SubmanifoldConvolution_updateOutput)(
THLongTensor *inputSize, THLongTensor *filterSize, void **m,
THCTensor *input_features, THCTensor *output_features, THCTensor *weight,
THCTensor *bias, long filterVolume, THCITensor *rulesBuffer) {
THCTensor *bias, long filterVolume) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto _rules = _m.getSubmanifoldRuleBook(inputSize, filterSize, true);
uInt nActive = _m.getNActive(inputSize);
......@@ -111,9 +111,9 @@ extern "C" double scn_DR_(SubmanifoldConvolution_updateOutput)(
for (uInt i = 0; i < op; i += 32) {
uInt blockDim = min(32L, op - i);
uInt gridDim = min(4096, nActive);
Convolution_fp_bias
<< <gridDim, blockDim, 0, THCState_getCurrentStream(state)>>>
(oF + i, b + i, op, op, nActive);
Convolution_fp_bias<<<gridDim, blockDim, 0,
THCState_getCurrentStream(state)>>>(
oF + i, b + i, op, op, nActive);
}
}
uInt c = ip * op;
......@@ -129,7 +129,7 @@ extern "C" void scn_DR_(SubmanifoldConvolution_backward)(
THLongTensor *inputSize, THLongTensor *filterSize, void **m,
THCTensor *input_features, THCTensor *d_input_features,
THCTensor *d_output_features, THCTensor *weight, THCTensor *d_weight,
THCTensor *d_bias, long filterVolume, THCITensor *rulesBuffer) {
THCTensor *d_bias, long filterVolume) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto _rules = _m.getSubmanifoldRuleBook(inputSize, filterSize, true);
uInt nActive = _m.getNActive(inputSize);
......@@ -158,4 +158,156 @@ extern "C" void scn_DR_(SubmanifoldConvolution_backward)(
}
}
extern "C" double scn_DR_(FullConvolution_updateOutput)(
THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *filterSize,
THLongTensor *filterStride, void **mIn, void **mOut,
THCTensor *input_features, THCTensor *output_features, THCTensor *weight,
THCTensor *bias, long filterVolume, THCITensor *rulesBuffer) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, mIn)
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, mOut)
auto _rules = _mIn.getFullConvolutionRuleBook(
inputSize, outputSize, filterSize, filterStride, _mOut);
uInt nActive = _mOut.getNActive(outputSize);
THCTensor_(resize2d)(state, output_features, nActive, weight->size[1]);
if (not bias)
THCTensor_(zero)(state, output_features);
double flops = 0;
if (nActive) {
auto iF = THCTensor_(data)(state, input_features);
auto oF = THCTensor_(data)(state, output_features);
auto ip = input_features->size[1];
auto op = output_features->size[1];
auto w = THCTensor_(data)(state, weight);
if (bias) {
auto b = THCTensor_(data)(state, bias);
for (uInt i = 0; i < op; i += 32) {
uInt blockDim = min(32L, op - i);
uInt gridDim = min(4096, nActive);
Convolution_fp_bias<<<gridDim, blockDim, 0,
THCState_getCurrentStream(state)>>>(
oF + i, b + i, op, op, nActive);
}
}
uInt c = ip * op;
RULEBOOKITERATOR(
dConvolution_forward2<real>(iF, oF, w, rbB, nHotB, ip, ip, op, op,
THCState_getCurrentStream(state));
, w += c; flops += nHotB * c;)
}
return flops;
}
extern "C" void scn_DR_(FullConvolution_backward)(
THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *filterSize,
THLongTensor *filterStride, void **mIn, void **mOut,
THCTensor *input_features, THCTensor *d_input_features,
THCTensor *d_output_features, THCTensor *weight, THCTensor *d_weight,
THCTensor *d_bias, long filterVolume) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, mIn)
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, mOut)
auto _rules = _mIn.getFullConvolutionRuleBook(
inputSize, outputSize, filterSize, filterStride, _mOut);
uInt nActive = _mOut.getNActive(outputSize);
THCTensor_(resizeAs)(state, d_input_features, input_features);
THCTensor_(zero)(state, d_input_features);
if (nActive) {
auto iF = THCTensor_(data)(state, input_features);
auto diF = THCTensor_(data)(state, d_input_features);
auto doF = THCTensor_(data)(state, d_output_features);
auto ip = input_features->size[1];
auto op = d_output_features->size[1];
auto w = THCTensor_(data)(state, weight);
auto dw = THCTensor_(data)(state, d_weight);
uInt c = ip * op;
RULEBOOKITERATOR(dConvolution_backward_dW2<real>(
iF, diF, doF, w, dw, rbB, nHotB, ip, ip, op, op,
THCState_getCurrentStream(state));
, w += c; dw += c;)
if (d_bias) {
auto db = THCTensor_(data)(state, d_bias);
Convolution_bp_bias(doF, db, op, op, nActive,
THCState_getCurrentStream(state));
}
}
}
extern "C" double scn_DR_(RandomizedStrideConvolution_updateOutput)(
THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *filterSize,
THLongTensor *filterStride,
void **m, THCTensor *input_features, THCTensor *output_features,
THCTensor *weight, THCTensor *bias, long filterVolume,
THCITensor *rulesBuffer) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto _rules =
_m.getRandomizedStrideRuleBook(inputSize, outputSize, filterSize, filterStride, true);
uInt nActive = _m.getNActive(outputSize);
THCTensor_(resize2d)(state, output_features, nActive, weight->size[1]);
if (not bias)
THCTensor_(zero)(state, output_features);
double flops = 0;
if (nActive) {
auto iF = THCTensor_(data)(state, input_features);
auto oF = THCTensor_(data)(state, output_features);
auto ip = input_features->size[1];
auto op = output_features->size[1];
auto w = THCTensor_(data)(state, weight);
if (bias) {
auto b = THCTensor_(data)(state, bias);
for (uInt i = 0; i < op; i += 32) {
uInt blockDim = min(32L, op - i);
uInt gridDim = min(4096, nActive);
Convolution_fp_bias<<<gridDim, blockDim, 0,
THCState_getCurrentStream(state)>>>(
oF + i, b + i, op, op, nActive);
}
}
uInt c = ip * op;
RULEBOOKITERATOR(
dConvolution_forward2<real>(iF, oF, w, rbB, nHotB, ip, ip, op, op,
THCState_getCurrentStream(state));
, w += c; flops += nHotB * c;)
}
return flops;
}
extern "C" void scn_DR_(RandomizedStrideConvolution_backward)(
THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *filterSize,
THLongTensor *filterStride,
void **m, THCTensor *input_features, THCTensor *d_input_features,
THCTensor *d_output_features, THCTensor *weight, THCTensor *d_weight,
THCTensor *d_bias, long filterVolume) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto _rules =
_m.getRandomizedStrideRuleBook(inputSize, outputSize, filterSize, filterStride, true);
uInt nActive = _m.getNActive(outputSize);
THCTensor_(resizeAs)(state, d_input_features, input_features);
THCTensor_(zero)(state, d_input_features);
if (nActive) {
auto iF = THCTensor_(data)(state, input_features);
auto diF = THCTensor_(data)(state, d_input_features);
auto doF = THCTensor_(data)(state, d_output_features);
auto ip = input_features->size[1];
auto op = d_output_features->size[1];
auto w = THCTensor_(data)(state, weight);
auto dw = THCTensor_(data)(state, d_weight);
uInt c = ip * op;
RULEBOOKITERATOR(dConvolution_backward_dW2<real>(
iF, diF, doF, w, dw, rbB, nHotB, ip, ip, op, op,
THCState_getCurrentStream(state));
, w += c; dw += c;)
if (d_bias) {
auto db = THCTensor_(data)(state, d_bias);
Convolution_bp_bias(doF, db, op, op, nActive,
THCState_getCurrentStream(state));
}
}
}
#endif
......@@ -16,7 +16,7 @@ extern "C" double scn_DR_(Deconvolution_updateOutput)(
THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *filterSize,
THLongTensor *filterStride, void **m, THCTensor *input_features,
THCTensor *output_features, THCTensor *weight, THCTensor *bias,
long filterVolume, THCITensor *rulesBuffer) {
long filterVolume) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto _rules =
_m.getRuleBook(outputSize, inputSize, filterSize, filterStride, true);
......@@ -55,7 +55,7 @@ extern "C" void scn_DR_(Deconvolution_backward)(
THLongTensor *filterStride, void **m, THCTensor *input_features,
THCTensor *d_input_features, THCTensor *d_output_features,
THCTensor *weight, THCTensor *d_weight, THCTensor *d_bias,
long filterVolume, THCITensor *rulesBuffer) {
long filterVolume) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto _rules =
_m.getRuleBook(outputSize, inputSize, filterSize, filterStride, true);
......
......@@ -5,61 +5,128 @@
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE_
#define TH_GENERIC_FILE_ "generic/GPU/InputLayer.cu"
#define TH_GENERIC_FILE_ "generic/GPU/IOLayers.cu"
#else
#include "InputLayer.h"
#include "IOLayers.h"
extern "C" void scn_DR_(InputLayer_updateOutput)(
void **m, THLongTensor *spatialSize, THLongTensor *input_coords,
THCTensor *input_features, THCTensor *output_features, long batchSize,
long mode, THCITensor *rulesBuffer) {
long mode) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
_m.inputLayer(spatialSize, input_coords, batchSize, mode);
uInt nPlanes = input_features->size[1];
THCTensor_(resize2d)(state, output_features, *_m.inputNActive, nPlanes);
THCTensor_(zero)(state, output_features);
auto &rules = _m.inputLayerRuleBook;
uInt maxActive = rules[0][1];
uInt nRows = rules[0][3];
THCITensor_resize1d(state, rulesBuffer, sizeof(uInt) * rules[1].size());
if (mode == 0) {
THCTensor_(resizeAs)(state, output_features, input_features);
THCTensor_(copy)(state, output_features, input_features);
} else {
THCTensor_(resize2d)(state, output_features, *_m.inputNActive, nPlanes);
THCTensor_(zero)(state, output_features);
auto rulesBuffer = THCITensor_(new)(state);
THCITensor_(resize1d)(state, rulesBuffer, sizeof(uInt) * rules[1].size());
auto iF = THCTensor_(data)(state, input_features);
auto oF = THCTensor_(data)(state, output_features);
auto rb = (uInt*) THCITensor_data(state, rulesBuffer);
auto rb = (uInt *)THCITensor_(data)(state, rulesBuffer);
cudaMemcpy(rb, &rules[1][0], sizeof(uInt) * rules[1].size(),
cudaMemcpyHostToDevice);
InputLayer_fp<real><<<std::min(nRows, 32768U), std::min(nPlanes, 32U), 0,
THCState_getCurrentStream(state)>>>(
iF, oF, nRows, maxActive, nPlanes, rb, mode == 4);
THCITensor_(free)(state, rulesBuffer);
}
}
extern "C" void
scn_DR_(InputLayer_updateGradInput)(void **m, THCTensor *d_input_features,
THCTensor *d_output_features,
THCITensor *rulesBuffer) {
THCTensor *d_output_features) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto &rules = _m.inputLayerRuleBook;
uInt nPlanes = d_output_features->size[1];
THCTensor_(resize2d)(state, d_input_features, rules[0][2], nPlanes);
THCTensor_(zero)(state, d_input_features);
auto mode = rules[0][0];
uInt maxActive = rules[0][1];
uInt nRows = rules[0][3];
THCITensor_resize1d(state, rulesBuffer, sizeof(uInt) * rules[1].size());
if (mode == 0) {
THCTensor_(resizeAs)(state, d_input_features, d_output_features);
THCTensor_(copy)(state, d_input_features, d_output_features);
} else {
THCTensor_(resize2d)(state, d_input_features, rules[0][2], nPlanes);
THCTensor_(zero)(state, d_input_features);
auto rulesBuffer = THCITensor_(new)(state);
THCITensor_(resize1d)(state, rulesBuffer, sizeof(uInt) * rules[1].size());
auto diF = THCTensor_(data)(state, d_input_features);
auto doF = THCTensor_(data)(state, d_output_features);
auto rb = (uInt*)THCITensor_data(state, rulesBuffer);
auto rb = (uInt *)THCITensor_(data)(state, rulesBuffer);
cudaMemcpy(rb, &rules[1][0], sizeof(uInt) * rules[1].size(),
cudaMemcpyHostToDevice);
InputLayer_bp<real><<<std::min(nRows, 32768U), std::min(nPlanes, 32U), 0,
THCState_getCurrentStream(state)>>>(
diF, doF, nRows, maxActive, nPlanes, rb, mode == 4);
THCITensor_(free)(state, rulesBuffer);
}
}
extern "C" void scn_DR_(OutputLayer_updateOutput)(void **m,
THCTensor *input_features,
THCTensor *output_features) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto &rules = _m.inputLayerRuleBook;
uInt nPlanes = input_features->size[1];
auto mode = rules[0][0];
auto maxActive = rules[0][1];
auto nRows = rules[0][3];
if (mode == 0) {
THCTensor_(resizeAs)(state, output_features, input_features);
THCTensor_(copy)(state, output_features, input_features);
} else {
THCTensor_(resize2d)(state, output_features, rules[0][2], nPlanes);
THCTensor_(zero)(state, output_features);
auto rulesBuffer = THCITensor_(new)(state);
THCITensor_(resize1d)(state, rulesBuffer, sizeof(uInt) * rules[1].size());
auto iF = THCTensor_(data)(state, input_features);
auto oF = THCTensor_(data)(state, output_features);
auto rb = (uInt *)THCITensor_(data)(state, rulesBuffer);
cudaMemcpy(rb, &rules[1][0], sizeof(uInt) * rules[1].size(),
cudaMemcpyHostToDevice);
InputLayer_bp<real><<<std::min(nRows, 32768U), std::min(nPlanes, 32U), 0,
THCState_getCurrentStream(state)>>>(
oF, iF, nRows, maxActive, nPlanes, rb, false);
THCITensor_(free)(state, rulesBuffer);
}
}
extern "C" void
scn_DR_(OutputLayer_updateGradInput)(void **m, THCTensor *d_input_features,
THCTensor *d_output_features) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto &rules = _m.inputLayerRuleBook;
uInt nPlanes = d_output_features->size[1];
auto mode = rules[0][0];
auto maxActive = rules[0][1];
auto nRows = rules[0][3];
if (mode == 0) {
THCTensor_(resizeAs)(state, d_input_features, d_output_features);
THCTensor_(copy)(state, d_input_features, d_output_features);
} else {
THCTensor_(resize2d)(state, d_input_features, nRows, nPlanes);
THCTensor_(zero)(state, d_input_features);
auto rulesBuffer = THCITensor_(new)(state);
THCITensor_(resize1d)(state, rulesBuffer, sizeof(uInt) * rules[1].size());
auto diF = THCTensor_(data)(state, d_input_features);
auto doF = THCTensor_(data)(state, d_output_features);
auto rb = (uInt *)THCITensor_(data)(state, rulesBuffer);
cudaMemcpy(rb, &rules[1][0], sizeof(uInt) * rules[1].size(),
cudaMemcpyHostToDevice);
InputLayer_fp<real><<<std::min(nRows, 32768U), std::min(nPlanes, 32U), 0,
THCState_getCurrentStream(state)>>>(
doF, diF, nRows, maxActive, nPlanes, rb, false);
THCITensor_(free)(state, rulesBuffer);
}
}
extern "C" void scn_DR_(BLInputLayer_updateOutput)(
void **m, THLongTensor *spatialSize, THLongTensor *input_coords,
THCTensor *input_features, THCTensor *output_features, long mode,
THCITensor *rulesBuffer) {
THCTensor *input_features, THCTensor *output_features, long mode) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
_m.blLayer(spatialSize, input_coords, mode);
uInt nPlanes = input_features->size[2];
......@@ -74,21 +141,22 @@ extern "C" void scn_DR_(BLInputLayer_updateOutput)(
THCTensor_(copy)(state, output_features, input_features);
THCTensor_(resize2d)(state, output_features, *_m.inputNActive, nPlanes);
} else {
THCITensor_resize1d(state, rulesBuffer, sizeof(uInt) * rules[1].size());
auto rulesBuffer = THCITensor_(new)(state);
THCITensor_(resize1d)(state, rulesBuffer, sizeof(uInt) * rules[1].size());
auto iF = THCTensor_(data)(state, input_features);
auto oF = THCTensor_(data)(state, output_features);
auto rb = (uInt*) THCITensor_data(state, rulesBuffer);
auto rb = (uInt *)THCITensor_(data)(state, rulesBuffer);
cudaMemcpy(rb, &rules[1][0], sizeof(uInt) * rules[1].size(),
cudaMemcpyHostToDevice);
InputLayer_fp<real><<<std::min(nRows, 32768U), std::min(nPlanes, 32U), 0,
THCState_getCurrentStream(state)>>>(
iF, oF, nRows, maxActive, nPlanes, rb, mode == 4);
THCITensor_(free)(state, rulesBuffer);
}
}
extern "C" void
scn_DR_(BLInputLayer_updateGradInput)(void **m, THCTensor *d_input_features,
THCTensor *d_output_features,
THCITensor *rulesBuffer) {
THCTensor *d_output_features) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto &rules = _m.blLayerRuleBook;
uInt nPlanes = d_output_features->size[1];
......@@ -99,76 +167,84 @@ scn_DR_(BLInputLayer_updateGradInput)(void **m, THCTensor *d_input_features,
if (mode == 0) {
THCTensor_(resizeAs)(state, d_input_features, d_output_features);
THCTensor_(copy)(state, d_input_features, d_output_features);
THCTensor_(resize3d)(state, d_input_features, rules[0][2], rules[0][3], nPlanes);
THCTensor_(resize3d)(state, d_input_features, rules[0][2], rules[0][3],
nPlanes);
} else {
THCTensor_(resize3d)(state, d_input_features, rules[0][2], rules[0][3], nPlanes);
THCTensor_(resize3d)(state, d_input_features, rules[0][2], rules[0][3],
nPlanes);
THCTensor_(zero)(state, d_input_features);
THCITensor_resize1d(state, rulesBuffer, sizeof(uInt) * rules[1].size());
auto rulesBuffer = THCITensor_(new)(state);
THCITensor_(resize1d)(state, rulesBuffer, sizeof(uInt) * rules[1].size());
auto diF = THCTensor_(data)(state, d_input_features);
auto doF = THCTensor_(data)(state, d_output_features);
auto rb = (uInt*)THCITensor_data(state, rulesBuffer);
auto rb = (uInt *)THCITensor_(data)(state, rulesBuffer);
cudaMemcpy(rb, &rules[1][0], sizeof(uInt) * rules[1].size(),
cudaMemcpyHostToDevice);
InputLayer_bp<real><<<std::min(nRows, 32768U), std::min(nPlanes, 32U), 0,
THCState_getCurrentStream(state)>>>(
diF, doF, nRows, maxActive, nPlanes, rb, mode == 4);
THCITensor_(free)(state, rulesBuffer);
}
}
extern "C" void scn_DR_(BLOutputLayer_updateOutput)(
void **m,
THCTensor *input_features, THCTensor *output_features,
THCITensor *rulesBuffer) {
extern "C" void scn_DR_(BLOutputLayer_updateOutput)(void **m,
THCTensor *input_features,
THCTensor *output_features) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto &rules = _m.blLayerRuleBook;
uInt nPlanes = input_features->size[1];
auto mode = rules[0][0];
uInt maxActive = rules[0][1];
uInt nRows = rules[0][4];
if (mode==0) {
if (mode == 0) {
THCTensor_(resizeAs)(state, output_features, input_features);
THCTensor_(copy)(state, output_features, input_features);
THCTensor_(resize3d)(state, output_features, rules[0][2], rules[0][3], nPlanes);
THCTensor_(resize3d)(state, output_features, rules[0][2], rules[0][3],
nPlanes);
} else {
THCTensor_(resize3d)(state, output_features, rules[0][2], rules[0][3], nPlanes);
THCTensor_(resize3d)(state, output_features, rules[0][2], rules[0][3],
nPlanes);
THCTensor_(zero)(state, output_features);
THCITensor_resize1d(state, rulesBuffer, sizeof(uInt) * rules[1].size());
auto rulesBuffer = THCITensor_(new)(state);
THCITensor_(resize1d)(state, rulesBuffer, sizeof(uInt) * rules[1].size());
auto iF = THCTensor_(data)(state, input_features);
auto oF = THCTensor_(data)(state, output_features);
auto rb = (uInt*) THCITensor_data(state, rulesBuffer);
auto rb = (uInt *)THCITensor_(data)(state, rulesBuffer);
cudaMemcpy(rb, &rules[1][0], sizeof(uInt) * rules[1].size(),
cudaMemcpyHostToDevice);
InputLayer_bp<real><<<std::min(nRows, 32768U), std::min(nPlanes, 32U), 0,
THCState_getCurrentStream(state)>>>(
oF, iF, nRows, maxActive, nPlanes, rb, false);
THCITensor_(free)(state, rulesBuffer);
}
}
extern "C" void
scn_DR_(BLOutputLayer_updateGradInput)(void **m, THCTensor *d_input_features,
THCTensor *d_output_features,
THCITensor *rulesBuffer) {
THCTensor *d_output_features) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto &rules = _m.blLayerRuleBook;
uInt nPlanes = d_output_features->size[2];
uInt mode = rules[0][0];
uInt maxActive = rules[0][1];
uInt nRows = rules[0][4];
if (mode==0) {
if (mode == 0) {
THCTensor_(resizeAs)(state, d_input_features, d_output_features);
THCTensor_(copy)(state, d_input_features, d_output_features);
THCTensor_(resize2d)(state, d_input_features, nRows, nPlanes);
} else {
THCTensor_(resize2d)(state, d_input_features, nRows, nPlanes);
THCTensor_(zero)(state, d_input_features);
THCITensor_resize1d(state, rulesBuffer, sizeof(uInt) * rules[1].size());
auto rulesBuffer = THCITensor_(new)(state);
THCITensor_(resize1d)(state, rulesBuffer, sizeof(uInt) * rules[1].size());
auto diF = THCTensor_(data)(state, d_input_features);
auto doF = THCTensor_(data)(state, d_output_features);
auto rb = (uInt*)THCITensor_data(state, rulesBuffer);
auto rb = (uInt *)THCITensor_(data)(state, rulesBuffer);
cudaMemcpy(rb, &rules[1][0], sizeof(uInt) * rules[1].size(),
cudaMemcpyHostToDevice);
InputLayer_fp<real><<<std::min(nRows, 32768U), std::min(nPlanes, 32U), 0,
THCState_getCurrentStream(state)>>>(
doF, diF, nRows, maxActive, nPlanes, rb, false);
THCITensor_(free)(state, rulesBuffer);
}
}
#endif
......@@ -4,8 +4,8 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef GPU_INPUTLAYER_H
#define GPU_INPUTLAYER_H
#ifndef GPU_IOLAYERS_H
#define GPU_IOLAYERS_H
template <typename T>
__global__ void InputLayer_fp(T *input_features, T *output_features,
......@@ -40,4 +40,4 @@ __global__ void InputLayer_bp(T *d_input_features, T *d_output_features,
}
}
}
#endif /* GPU_INPUTLAYER_H */
#endif /* GPU_IOLAYERS_H */
......@@ -13,7 +13,7 @@
extern "C" void scn_DR_(MaxPooling_updateOutput)(
THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *poolSize,
THLongTensor *poolStride, void **m, THCTensor *input_features,
THCTensor *output_features, long nFeaturesToDrop, THCITensor *rulesBuffer) {
THCTensor *output_features, long nFeaturesToDrop) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
uInt nPlanes = input_features->size[1] - nFeaturesToDrop;
......@@ -35,8 +35,7 @@ extern "C" void scn_DR_(MaxPooling_updateGradInput)(
THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *poolSize,
THLongTensor *poolStride, void **m, THCTensor *input_features,
THCTensor *d_input_features, THCTensor *output_features,
THCTensor *d_output_features, long nFeaturesToDrop,
THCITensor *rulesBuffer) {
THCTensor *d_output_features, long nFeaturesToDrop) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
uInt nPlanes = input_features->size[1] - nFeaturesToDrop;
......@@ -56,4 +55,49 @@ extern "C" void scn_DR_(MaxPooling_updateGradInput)(
d_output_features->size[1], rbB, nHotB);
, )
}
extern "C" void scn_DR_(RandomizedStrideMaxPooling_updateOutput)(
THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *poolSize,
THLongTensor *poolStride, void **m, THCTensor *input_features,
THCTensor *output_features, long nFeaturesToDrop) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
uInt nPlanes = input_features->size[1] - nFeaturesToDrop;
auto _rules =
_m.getRandomizedStrideRuleBook(inputSize, outputSize, poolSize, poolStride, true);
uInt nActive = _m.getNActive(outputSize);
THCTensor_(resize2d)(state, output_features, nActive, nPlanes);
THCTensor_(zero)(state, output_features);
auto iF = THCTensor_(data)(state, input_features) + nFeaturesToDrop;
auto oF = THCTensor_(data)(state, output_features);
RULEBOOKITERATOR(
MaxPooling_ForwardPass<real>(THCState_getCurrentStream(state), iF, oF,
nPlanes, input_features->size[1],
output_features->size[1], rbB, nHotB);
, )
}
extern "C" void scn_DR_(RandomizedStrideMaxPooling_updateGradInput)(
THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *poolSize,
THLongTensor *poolStride, void **m, THCTensor *input_features,
THCTensor *d_input_features, THCTensor *output_features,
THCTensor *d_output_features, long nFeaturesToDrop) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
uInt nPlanes = input_features->size[1] - nFeaturesToDrop;
auto _rules =
_m.getRandomizedStrideRuleBook(inputSize, outputSize, poolSize, poolStride, true);
uInt nActive = _m.getNActive(outputSize);
THCTensor_(resizeAs)(state, d_input_features, input_features);
THCTensor_(zero)(state, d_input_features);
auto iF = THCTensor_(data)(state, input_features);
auto oF = THCTensor_(data)(state, output_features);
auto diF = THCTensor_(data)(state, d_input_features);
auto doF = THCTensor_(data)(state, d_output_features);
RULEBOOKITERATOR(
MaxPooling_BackwardPass<real>(THCState_getCurrentStream(state), iF, diF,
oF, doF, nPlanes, input_features->size[1],
d_output_features->size[1], rbB, nHotB);
, )
}
#endif
......@@ -14,9 +14,10 @@
#define RULEBOOKITERATOR(X, Y) \
uInt ms = ruleBookMaxSize(_rules); \
if (THCITensor_nElement(state, rulesBuffer) < ms) \
THCITensor_resize1d(state, rulesBuffer, ms); \
uInt *rbB = (uInt *)THCITensor_data(state, rulesBuffer); \
auto rulesBuffer = THCITensor_(new)(state); \
if (THCITensor_(nElement)(state, rulesBuffer) < ms) \
THCITensor_(resize1d)(state, rulesBuffer, ms); \
uInt *rbB = (uInt *)THCITensor_(data)(state, rulesBuffer); \
for (int k = 0; k < _rules.size(); ++k) { \
auto &r = _rules[k]; \
uInt nHotB = r.size() / 2; \
......@@ -28,6 +29,7 @@
X \
} \
Y \
}
} \
THCITensor_(free)(state, rulesBuffer);
#endif /* GPU_RULEBOOKITERATOR_H */
......@@ -11,7 +11,7 @@
extern "C" void scn_DR_(SparseToDense_updateOutput)(
THLongTensor *inputSize, void **m, THCTensor *input_features,
THCTensor *output_features, THCITensor *rulesBuffer, long nPlanes) {
THCTensor *output_features, long nPlanes) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
{
......@@ -36,8 +36,7 @@ extern "C" void scn_DR_(SparseToDense_updateOutput)(
}
extern "C" void scn_DR_(SparseToDense_updateGradInput)(
THLongTensor *inputSize, void **m, THCTensor *input_features,
THCTensor *d_input_features, THCTensor *d_output_features,
THCITensor *rulesBuffer) {
THCTensor *d_input_features, THCTensor *d_output_features) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
THCTensor_(resizeAs)(state, d_input_features, input_features);
......
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE_
#define TH_GENERIC_FILE_ "generic/GPU/UnPooling.cu"
#else
#include "RuleBookIterator.h"
#include "UnPooling.h"
extern "C" void scn_DR_(UnPooling_updateOutput)(
THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *poolSize,
THLongTensor *poolStride, void **m, THCTensor *input_features,
THCTensor *output_features, long nFeaturesToDrop) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
uInt nPlanes = input_features->size[1] - nFeaturesToDrop;
auto _rules =
_m.getRuleBook(outputSize, inputSize, poolSize, poolStride, true);
uInt nActive = _m.getNActive(outputSize);
THCTensor_(resize2d)(state, output_features, nActive,
input_features->size[1] - nFeaturesToDrop);
THCTensor_(zero)(state, output_features);
auto iF = THCTensor_(data)(state, input_features) + nFeaturesToDrop;
auto oF = THCTensor_(data)(state, output_features);
RULEBOOKITERATOR(UnPooling_ForwardPass<real>(
THCState_getCurrentStream(state), iF, oF, nPlanes,
input_features->size[1], output_features->size[1], rbB,
nHotB, _rules.size());
, )
}
extern "C" void scn_DR_(UnPooling_updateGradInput)(
THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *poolSize,
THLongTensor *poolStride, void **m, THCTensor *input_features,
THCTensor *d_input_features, THCTensor *d_output_features,
long nFeaturesToDrop) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
uInt nPlanes = input_features->size[1] - nFeaturesToDrop;
auto _rules =
_m.getRuleBook(outputSize, inputSize, poolSize, poolStride, true);
uInt nActive = _m.getNActive(outputSize);
THCTensor_(resizeAs)(state, d_input_features, input_features);
THCTensor_(zero)(state, d_input_features);
auto diF = THCTensor_(data)(state, d_input_features) + nFeaturesToDrop;
auto doF = THCTensor_(data)(state, d_output_features);
RULEBOOKITERATOR(UnPooling_BackwardPass<real>(
THCState_getCurrentStream(state), diF, doF, nPlanes,
input_features->size[1], d_output_features->size[1], rbB,
nHotB, _rules.size());
, )
}
#endif
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef GPU_UNPOOLING_H
#define GPU_UNPOOLING_H
// NTX must be >=2 so r is filled properly
template <typename T, uInt NTX, uInt NTY>
__global__ void UnPooling_fp(T *input_features, T *output_features,
uInt nPlanes, uInt input_stride,
uInt output_stride, uInt *rules, uInt nHot) {
__shared__ uInt r[NTY * 2];
for (uInt n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
{
uInt i = threadIdx.x + NTX * threadIdx.y;
if (i < NTY * 2 and i < 2 * (n - nHot))
r[i] = rules[2 * n + i];
}
__syncthreads();
if (n + threadIdx.y < nHot) {
uInt i = r[2 * threadIdx.y + 1] * input_stride;
uInt o = r[2 * threadIdx.y] * output_stride;
for (uInt plane = threadIdx.x; plane < nPlanes; plane += NTX)
output_features[o + plane]+=input_features[i + plane];
}
__syncthreads();
}
}
template <typename T>
void UnPooling_ForwardPass(cudaStream_t stream, T *input_features,
T *output_features, uInt nPlanes,
uInt input_stride, uInt output_stride,
uInt *rules, uInt nHot, uInt filterVolume) {
UnPooling_fp<T, 32, 32><<<32, dim3(32, 32), 0, stream>>>(
input_features, output_features, nPlanes, input_stride, output_stride,
rules, nHot);
}
template <typename T, uInt NTX, uInt NTY>
__global__ void UnPooling_bp(T *d_input_features, T *d_output_features,
uInt nPlanes, uInt input_stride,
uInt output_stride, uInt *rules, uInt nHot) {
__shared__ uInt r[NTY * 2];
for (uInt n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
{
uInt i = threadIdx.x + NTX * threadIdx.y;
if (i < NTY * 2 and i < 2 * (n - nHot))
r[i] = rules[2 * n + i];
}
__syncthreads();
if (n + threadIdx.y < nHot) {
uInt i = r[2 * threadIdx.y + 1] * input_stride;
uInt o = r[2 * threadIdx.y] * output_stride;
for (uInt plane = threadIdx.x; plane < nPlanes; plane += NTX)
d_input_features[i + plane] += d_output_features[o + plane];
}
__syncthreads();
}
}
template <typename T>
void UnPooling_BackwardPass(cudaStream_t stream, T *d_input_features,
T *d_output_features, uInt nPlanes,
uInt input_stride, uInt output_stride,
uInt *rules, uInt nHot, uInt filterVolume) {
UnPooling_bp<T, 32, 32><<<32, dim3(32, 32), 0, stream>>>(
d_input_features, d_output_features, nPlanes, input_stride, output_stride,
rules, nHot);
}
#endif /* GPU_UNPOOLING_H */
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment