Commit d27491d4 authored by Benjamin Thomas Graham's avatar Benjamin Thomas Graham
Browse files

group convolutions

parent 8422a6f5
...@@ -24,9 +24,9 @@ Higher dimensional input is more likely to be sparse because of the 'curse of di ...@@ -24,9 +24,9 @@ Higher dimensional input is more likely to be sparse because of the 'curse of di
Dimension|Name in 'torch.nn'|Use cases Dimension|Name in 'torch.nn'|Use cases
:--:|:--:|:--: :--:|:--:|:--:
1|TemporalConvolution| Text, audio 1|Conv1d| Text, audio
2|SpatialConvolution|Lines in 2D space, e.g. handwriting 2|Conv2d|Lines in 2D space, e.g. handwriting
3|VolumetricConvolution|Lines and surfaces in 3D space or (2+1)D space-time 3|Conv3d|Lines and surfaces in 3D space or (2+1)D space-time
4| - |Lines, etc, in (3+1)D space-time 4| - |Lines, etc, in (3+1)D space-time
We use the term 'submanifold' to refer to input data that is sparse because it has a lower effective dimension than the space in which it lives, for example a one-dimensional curve in 2+ dimensional space, or a two-dimensional surface in 3+ dimensional space. We use the term 'submanifold' to refer to input data that is sparse because it has a lower effective dimension than the space in which it lives, for example a one-dimensional curve in 2+ dimensional space, or a two-dimensional surface in 3+ dimensional space.
...@@ -137,7 +137,7 @@ apt-get install unrar ...@@ -137,7 +137,7 @@ apt-get install unrar
``` ```
## License ## License
SparseConvNet is Attribution-NonCommercial 4.0 International licensed, as found in the LICENSE file. SparseConvNet is BSD licensed, as found in the LICENSE file.
## Links ## Links
1. [ICDAR 2013 Chinese Handwriting Recognition Competition 2013](http://www.nlpr.ia.ac.cn/events/CHRcompetition2013/competition/Home.html) First place in task 3, with test error of 2.61%. Human performance on the test set was 4.81%. [Report](http://www.nlpr.ia.ac.cn/events/CHRcompetition2013/competition/ICDAR%202013%20CHR%20competition.pdf) 1. [ICDAR 2013 Chinese Handwriting Recognition Competition 2013](http://www.nlpr.ia.ac.cn/events/CHRcompetition2013/competition/Home.html) First place in task 3, with test error of 2.61%. Human performance on the test set was 4.81%. [Report](http://www.nlpr.ia.ac.cn/events/CHRcompetition2013/competition/ICDAR%202013%20CHR%20competition.pdf)
......
This diff is collapsed.
...@@ -15,29 +15,26 @@ double cpu_Deconvolution_updateOutput( ...@@ -15,29 +15,26 @@ double cpu_Deconvolution_updateOutput(
auto _rules = auto _rules =
m.getRuleBook(outputSize, inputSize, filterSize, filterStride, true); m.getRuleBook(outputSize, inputSize, filterSize, filterStride, true);
Int nActive = m.getNActive(outputSize); Int nActive = m.getNActive(outputSize);
output_features.resize_({nActive, weight.size(2)}); output_features.resize_({nActive, weight.size(1) * weight.size(3)});
if (bias.numel() and nActive) if (bias.numel() and nActive)
output_features.copy_(bias); output_features.copy_(bias);
else else
output_features.zero_(); output_features.zero_();
double flops = 0; double flops = 0;
auto ip = weight.size(1); auto groups = weight.size(1);
auto op = weight.size(2); auto ip = weight.size(2);
for (Int i = 0; i < (Int)_rules.size(); i++) { auto op = weight.size(3);
for (Int i = 0; i < (Int)_rules.size(); ++i) {
auto r = _rules[i]; auto r = _rules[i];
int nRules = r.size() / 2; Int nRules = r.size() / 2;
if (nRules) { if (nRules) {
flops += nRules * ip * op; flops += nRules * ip * op * groups;
// auto rt = torch::CPU(at_kINT).tensorFromBlob(&r[0], {nRules, 2});
// auto input_rows = input_features.index_select(0, rt.select(1, 1));
// auto w = weight.select(0, i);
// auto output_rows = at::mm(input_rows, w);
// output_features.index_add_(0, rt.select(1, 0), output_rows);
auto input_rows = rule_index_select<T>(input_features, nRules, &r[1]);
auto w = weight.select(0, i); auto w = weight.select(0, i);
auto output_rows = at::mm(input_rows, w); auto input_rows =
rule_index_add_<T>(output_features, output_rows, nRules, &r[0]); rule_index_select<T>(input_features, nRules, &r[1], groups);
auto output_rows = at::matmul(input_rows, w);
rule_index_add_<T>(output_features, output_rows, nRules, &r[0], groups);
} }
} }
return flops; return flops;
...@@ -59,26 +56,22 @@ void cpu_Deconvolution_backward( ...@@ -59,26 +56,22 @@ void cpu_Deconvolution_backward(
d_input_features.resize_as_(input_features); d_input_features.resize_as_(input_features);
d_input_features.zero_(); d_input_features.zero_();
auto groups = weight.size(1);
if (nActive and d_bias.numel()) if (nActive and d_bias.numel())
at::sum_out(d_bias, d_output_features, {0}, false); at::sum_out(d_bias, d_output_features, {0}, false);
for (Int i = 0; i < (Int)_rules.size(); i++) { for (Int i = 0; i < (Int)_rules.size(); ++i) {
auto r = _rules[i]; auto r = _rules[i];
int nRules = r.size() / 2; Int nRules = r.size() / 2;
if (nRules) { if (nRules) {
auto w = weight.select(0, i); auto w = weight.select(0, i);
auto dw = d_weight.select(0, i); auto dw = d_weight.select(0, i);
// auto rt = torch::CPU(at_kINT).tensorFromBlob(&r[0], {nRules, 2}); auto input_rows =
// auto input_rows = input_features.index_select(0, rt.select(1, 1)); rule_index_select<T>(input_features, nRules, &r[1], groups);
// auto d_output_rows = d_output_features.index_select(0, rt.select(1, auto d_output_rows =
// 0)); rule_index_select<T>(d_output_features, nRules, &r[0], groups);
// at::mm_out(dw, input_rows.t(), d_output_rows); at::matmul_out(dw, input_rows.transpose(1, 2), d_output_rows);
// auto d_input_rows = at::mm(d_output_rows, w.t()); auto d_input_rows = at::matmul(d_output_rows, w.transpose(1, 2));
// d_input_features.index_add_(0, rt.select(1, 1), d_input_rows); rule_index_add_<T>(d_input_features, d_input_rows, nRules, &r[1], groups);
auto input_rows = rule_index_select<T>(input_features, nRules, &r[1]);
auto d_output_rows = rule_index_select<T>(d_output_features, nRules, &r[0]);
at::mm_out(dw, input_rows.t(), d_output_rows);
auto d_input_rows = at::mm(d_output_rows, w.t());
rule_index_add_<T>(d_input_features, d_input_rows, nRules, &r[1]);
} }
} }
} }
...@@ -12,13 +12,13 @@ template <typename T> ...@@ -12,13 +12,13 @@ template <typename T>
double dConvolution_forward2(T *inFeatures, T *outFeatures, T *w, double dConvolution_forward2(T *inFeatures, T *outFeatures, T *w,
RuleBook _rules, Int input_nPlanes, RuleBook _rules, Int input_nPlanes,
Int input_stride, Int output_nPlanes, Int input_stride, Int output_nPlanes,
Int output_stride); Int output_stride, Int nGroups);
template <typename T> template <typename T>
void dConvolution_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures, void dConvolution_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
T *w, T *dw, RuleBook _rules, Int input_nPlanes, T *w, T *dw, RuleBook _rules, Int input_nPlanes,
Int input_stride, Int output_nPlanes, Int input_stride, Int output_nPlanes,
Int output_stride); Int output_stride, Int nGroups);
template <typename T, Int Dimension> template <typename T, Int Dimension>
double cuda_Convolution_updateOutput( double cuda_Convolution_updateOutput(
...@@ -32,9 +32,10 @@ double cuda_Convolution_updateOutput( ...@@ -32,9 +32,10 @@ double cuda_Convolution_updateOutput(
auto _rules = auto _rules =
m.getRuleBook(inputSize, outputSize, filterSize, filterStride, true); m.getRuleBook(inputSize, outputSize, filterSize, filterStride, true);
Int nActiveOut = m.getNActive(outputSize); Int nActiveOut = m.getNActive(outputSize);
Int ip = weight.size(1); Int nGroups = weight.size(1);
Int op = weight.size(2); Int ip = weight.size(2);
output_features.resize_({nActiveOut, op}); Int op = weight.size(3);
output_features.resize_({nActiveOut, op * nGroups});
if (nActiveOut) { if (nActiveOut) {
auto iF = input_features.data<T>(); auto iF = input_features.data<T>();
...@@ -46,7 +47,8 @@ double cuda_Convolution_updateOutput( ...@@ -46,7 +47,8 @@ double cuda_Convolution_updateOutput(
else else
output_features.zero_(); output_features.zero_();
return dConvolution_forward2<T>(iF, oF, w, _rules, ip, ip, op, op); return dConvolution_forward2<T>(iF, oF, w, _rules, ip, ip * nGroups, op,
op * nGroups, nGroups);
} else { } else {
return 0; return 0;
} }
...@@ -67,9 +69,10 @@ void cuda_Convolution_backward( ...@@ -67,9 +69,10 @@ void cuda_Convolution_backward(
m.getRuleBook(inputSize, outputSize, filterSize, filterStride, true); m.getRuleBook(inputSize, outputSize, filterSize, filterStride, true);
Int nActiveIn = m.getNActive(inputSize); Int nActiveIn = m.getNActive(inputSize);
Int nActiveOut = m.getNActive(outputSize); Int nActiveOut = m.getNActive(outputSize);
Int ip = weight.size(1); Int nGroups = weight.size(1);
Int op = weight.size(2); Int ip = weight.size(2);
d_input_features.resize_({nActiveIn, ip}); Int op = weight.size(3);
d_input_features.resize_({nActiveIn, ip * nGroups});
d_input_features.zero_(); d_input_features.zero_();
if (nActiveOut) { if (nActiveOut) {
...@@ -79,7 +82,8 @@ void cuda_Convolution_backward( ...@@ -79,7 +82,8 @@ void cuda_Convolution_backward(
auto w = weight.data<T>(); auto w = weight.data<T>();
auto dw = d_weight.data<T>(); auto dw = d_weight.data<T>();
dConvolution_backward_dW2<T>(iF, diF, doF, w, dw, _rules, ip, ip, op, op); dConvolution_backward_dW2<T>(iF, diF, doF, w, dw, _rules, ip, ip * nGroups,
op, op * nGroups, nGroups);
if (d_bias.numel()) { if (d_bias.numel()) {
auto db = d_bias.data<T>(); auto db = d_bias.data<T>();
...@@ -98,9 +102,10 @@ double cuda_SubmanifoldConvolution_updateOutput( ...@@ -98,9 +102,10 @@ double cuda_SubmanifoldConvolution_updateOutput(
auto _rules = m.getSubmanifoldRuleBook(inputSize, filterSize, true); auto _rules = m.getSubmanifoldRuleBook(inputSize, filterSize, true);
Int nActive = m.getNActive(inputSize); Int nActive = m.getNActive(inputSize);
Int ip = weight.size(1); Int nGroups = weight.size(1);
Int op = weight.size(2); Int ip = weight.size(2);
output_features.resize_({nActive, op}); Int op = weight.size(3);
output_features.resize_({nActive, op * nGroups});
if (nActive) { if (nActive) {
auto iF = input_features.data<T>(); auto iF = input_features.data<T>();
...@@ -112,7 +117,8 @@ double cuda_SubmanifoldConvolution_updateOutput( ...@@ -112,7 +117,8 @@ double cuda_SubmanifoldConvolution_updateOutput(
else else
output_features.zero_(); output_features.zero_();
return dConvolution_forward2<T>(iF, oF, w, _rules, ip, ip, op, op); return dConvolution_forward2<T>(iF, oF, w, _rules, ip, ip * nGroups, op,
op * nGroups, nGroups);
} else { } else {
return 0; return 0;
} }
...@@ -130,9 +136,10 @@ void cuda_SubmanifoldConvolution_backward( ...@@ -130,9 +136,10 @@ void cuda_SubmanifoldConvolution_backward(
auto _rules = m.getSubmanifoldRuleBook(inputSize, filterSize, true); auto _rules = m.getSubmanifoldRuleBook(inputSize, filterSize, true);
Int nActive = m.getNActive(inputSize); Int nActive = m.getNActive(inputSize);
Int ip = weight.size(1); Int nGroups = weight.size(1);
Int op = weight.size(2); Int ip = weight.size(2);
d_input_features.resize_({nActive, ip}); Int op = weight.size(3);
d_input_features.resize_({nActive, ip * nGroups});
d_input_features.zero_(); d_input_features.zero_();
if (nActive) { if (nActive) {
...@@ -142,7 +149,8 @@ void cuda_SubmanifoldConvolution_backward( ...@@ -142,7 +149,8 @@ void cuda_SubmanifoldConvolution_backward(
auto w = weight.data<T>(); auto w = weight.data<T>();
auto dw = d_weight.data<T>(); auto dw = d_weight.data<T>();
dConvolution_backward_dW2<T>(iF, diF, doF, w, dw, _rules, ip, ip, op, op); dConvolution_backward_dW2<T>(iF, diF, doF, w, dw, _rules, ip, ip * nGroups,
op, op * nGroups, nGroups);
if (d_bias.numel()) { if (d_bias.numel()) {
auto db = d_bias.data<T>(); auto db = d_bias.data<T>();
...@@ -160,9 +168,10 @@ double cuda_PermutohedralSubmanifoldConvolution_updateOutput( ...@@ -160,9 +168,10 @@ double cuda_PermutohedralSubmanifoldConvolution_updateOutput(
auto _rules = m.getPermutohedralSubmanifoldRuleBook(inputSize, true); auto _rules = m.getPermutohedralSubmanifoldRuleBook(inputSize, true);
Int nActive = m.getNActive(inputSize); Int nActive = m.getNActive(inputSize);
Int ip = weight.size(1); Int nGroups = weight.size(1);
Int op = weight.size(2); Int ip = weight.size(2);
output_features.resize_({nActive, op}); Int op = weight.size(3);
output_features.resize_({nActive, op * nGroups});
if (nActive) { if (nActive) {
auto iF = input_features.data<T>(); auto iF = input_features.data<T>();
...@@ -174,7 +183,8 @@ double cuda_PermutohedralSubmanifoldConvolution_updateOutput( ...@@ -174,7 +183,8 @@ double cuda_PermutohedralSubmanifoldConvolution_updateOutput(
else else
output_features.zero_(); output_features.zero_();
return dConvolution_forward2<T>(iF, oF, w, _rules, ip, ip, op, op); return dConvolution_forward2<T>(iF, oF, w, _rules, ip, ip * nGroups, op,
op * nGroups, nGroups);
} else { } else {
return 0; return 0;
} }
...@@ -191,9 +201,10 @@ void cuda_PermutohedralSubmanifoldConvolution_backward( ...@@ -191,9 +201,10 @@ void cuda_PermutohedralSubmanifoldConvolution_backward(
auto _rules = m.getPermutohedralSubmanifoldRuleBook(inputSize, true); auto _rules = m.getPermutohedralSubmanifoldRuleBook(inputSize, true);
Int nActive = m.getNActive(inputSize); Int nActive = m.getNActive(inputSize);
Int ip = weight.size(1); Int nGroups = weight.size(1);
Int op = weight.size(2); Int ip = weight.size(2);
d_input_features.resize_({nActive, ip}); Int op = weight.size(3);
d_input_features.resize_({nActive, ip * nGroups});
d_input_features.zero_(); d_input_features.zero_();
if (nActive) { if (nActive) {
...@@ -203,7 +214,8 @@ void cuda_PermutohedralSubmanifoldConvolution_backward( ...@@ -203,7 +214,8 @@ void cuda_PermutohedralSubmanifoldConvolution_backward(
auto w = weight.data<T>(); auto w = weight.data<T>();
auto dw = d_weight.data<T>(); auto dw = d_weight.data<T>();
dConvolution_backward_dW2<T>(iF, diF, doF, w, dw, _rules, ip, ip, op, op); dConvolution_backward_dW2<T>(iF, diF, doF, w, dw, _rules, ip, ip * nGroups,
op, op * nGroups, nGroups);
if (d_bias.numel()) { if (d_bias.numel()) {
auto db = d_bias.data<T>(); auto db = d_bias.data<T>();
...@@ -225,9 +237,10 @@ double cuda_FullConvolution_updateOutput( ...@@ -225,9 +237,10 @@ double cuda_FullConvolution_updateOutput(
auto _rules = mIn.getFullConvolutionRuleBook(inputSize, outputSize, auto _rules = mIn.getFullConvolutionRuleBook(inputSize, outputSize,
filterSize, filterStride, mOut); filterSize, filterStride, mOut);
Int nActiveOut = mOut.getNActive(outputSize); Int nActiveOut = mOut.getNActive(outputSize);
Int ip = weight.size(1); Int nGroups = weight.size(1);
Int op = weight.size(2); Int ip = weight.size(2);
output_features.resize_({nActiveOut, op}); Int op = weight.size(3);
output_features.resize_({nActiveOut, op * nGroups});
if (nActiveOut) { if (nActiveOut) {
auto iF = input_features.data<T>(); auto iF = input_features.data<T>();
...@@ -239,7 +252,8 @@ double cuda_FullConvolution_updateOutput( ...@@ -239,7 +252,8 @@ double cuda_FullConvolution_updateOutput(
else else
output_features.zero_(); output_features.zero_();
return dConvolution_forward2<T>(iF, oF, w, _rules, ip, ip, op, op); return dConvolution_forward2<T>(iF, oF, w, _rules, ip, ip * nGroups, op,
op * nGroups, nGroups);
} else { } else {
return 0; return 0;
} }
...@@ -261,9 +275,10 @@ void cuda_FullConvolution_backward( ...@@ -261,9 +275,10 @@ void cuda_FullConvolution_backward(
filterSize, filterStride, mOut); filterSize, filterStride, mOut);
Int nActiveIn = mIn.getNActive(inputSize); Int nActiveIn = mIn.getNActive(inputSize);
Int nActiveOut = mOut.getNActive(outputSize); Int nActiveOut = mOut.getNActive(outputSize);
Int ip = weight.size(1); Int nGroups = weight.size(1);
Int op = weight.size(2); Int ip = weight.size(2);
d_input_features.resize_({nActiveIn, ip}); Int op = weight.size(3);
d_input_features.resize_({nActiveIn, ip * nGroups});
d_input_features.zero_(); d_input_features.zero_();
if (nActiveOut) { if (nActiveOut) {
...@@ -273,7 +288,8 @@ void cuda_FullConvolution_backward( ...@@ -273,7 +288,8 @@ void cuda_FullConvolution_backward(
auto w = weight.data<T>(); auto w = weight.data<T>();
auto dw = d_weight.data<T>(); auto dw = d_weight.data<T>();
dConvolution_backward_dW2<T>(iF, diF, doF, w, dw, _rules, ip, ip, op, op); dConvolution_backward_dW2<T>(iF, diF, doF, w, dw, _rules, ip, ip * nGroups,
op, op * nGroups, nGroups);
if (d_bias.numel()) { if (d_bias.numel()) {
auto db = d_bias.data<T>(); auto db = d_bias.data<T>();
...@@ -293,9 +309,10 @@ double cuda_RandomizedStrideConvolution_updateOutput( ...@@ -293,9 +309,10 @@ double cuda_RandomizedStrideConvolution_updateOutput(
auto _rules = m.getRandomizedStrideRuleBook(inputSize, outputSize, filterSize, auto _rules = m.getRandomizedStrideRuleBook(inputSize, outputSize, filterSize,
filterStride, true); filterStride, true);
Int nActiveOut = m.getNActive(outputSize); Int nActiveOut = m.getNActive(outputSize);
Int ip = weight.size(1); Int nGroups = weight.size(1);
Int op = weight.size(2); Int ip = weight.size(2);
output_features.resize_({nActiveOut, op}); Int op = weight.size(3);
output_features.resize_({nActiveOut, op * nGroups});
if (nActiveOut) { if (nActiveOut) {
auto iF = input_features.data<T>(); auto iF = input_features.data<T>();
...@@ -307,7 +324,8 @@ double cuda_RandomizedStrideConvolution_updateOutput( ...@@ -307,7 +324,8 @@ double cuda_RandomizedStrideConvolution_updateOutput(
else else
output_features.zero_(); output_features.zero_();
return dConvolution_forward2<T>(iF, oF, w, _rules, ip, ip, op, op); return dConvolution_forward2<T>(iF, oF, w, _rules, ip, ip * nGroups, op,
op * nGroups, nGroups);
} else { } else {
return 0; return 0;
} }
...@@ -328,9 +346,10 @@ void cuda_RandomizedStrideConvolution_backward( ...@@ -328,9 +346,10 @@ void cuda_RandomizedStrideConvolution_backward(
filterStride, true); filterStride, true);
Int nActiveIn = m.getNActive(inputSize); Int nActiveIn = m.getNActive(inputSize);
Int nActiveOut = m.getNActive(outputSize); Int nActiveOut = m.getNActive(outputSize);
Int ip = weight.size(1); Int nGroups = weight.size(1);
Int op = weight.size(2); Int ip = weight.size(2);
d_input_features.resize_({nActiveIn, ip}); Int op = weight.size(3);
d_input_features.resize_({nActiveIn, ip * nGroups});
d_input_features.zero_(); d_input_features.zero_();
if (nActiveOut) { if (nActiveOut) {
...@@ -340,7 +359,8 @@ void cuda_RandomizedStrideConvolution_backward( ...@@ -340,7 +359,8 @@ void cuda_RandomizedStrideConvolution_backward(
auto w = weight.data<T>(); auto w = weight.data<T>();
auto dw = d_weight.data<T>(); auto dw = d_weight.data<T>();
dConvolution_backward_dW2<T>(iF, diF, doF, w, dw, _rules, ip, ip, op, op); dConvolution_backward_dW2<T>(iF, diF, doF, w, dw, _rules, ip, ip * nGroups,
op, op * nGroups, nGroups);
if (d_bias.numel()) { if (d_bias.numel()) {
auto db = d_bias.data<T>(); auto db = d_bias.data<T>();
......
...@@ -49,6 +49,11 @@ void Convolution_bp_bias(T *d_oF, T *d_b, Int nPlanes, Int nActive) { ...@@ -49,6 +49,11 @@ void Convolution_bp_bias(T *d_oF, T *d_b, Int nPlanes, Int nActive) {
} }
} }
// .._nPlanes == planes per nGroup
// weight = nGroups x input_nPlanes x output_nPlanes
// = nGroups x M*K x N*K
template <typename T, Int K, Int V> template <typename T, Int K, Int V>
__global__ void __global__ void
dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules, dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules,
...@@ -57,7 +62,7 @@ dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules, ...@@ -57,7 +62,7 @@ dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules,
// nHot must be a multiple of K!! // nHot must be a multiple of K!!
// Input x Weight -> Output // Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks // blockDim=(K,K/V,1), gridDim=(nBlocks,N,nGroups) Volkov-blocks
// K is a multiple of V, // K is a multiple of V,
// nHot x KM -> nHot x KN - parallel over N,nHot - loop over M // nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
...@@ -65,8 +70,10 @@ dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules, ...@@ -65,8 +70,10 @@ dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int M = input_nPlanes / K; Int M = input_nPlanes / K;
// N = gridDim.y == output_nPlanes/K // N = gridDim.y == output_nPlanes/K
Int n = blockIdx.y; Int n = blockIdx.y;
outFeatures += n * K; Int g = blockIdx.z;
w += n * K; inFeatures += g * input_nPlanes;
outFeatures += n * K + g * output_nPlanes;
w += n * K + g * input_nPlanes * output_nPlanes;
TACC O[V]; TACC O[V];
__shared__ T W[K][K]; __shared__ T W[K][K];
...@@ -125,7 +132,7 @@ dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules, ...@@ -125,7 +132,7 @@ dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride, Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) { Int output_nPlanes, Int output_stride) {
// Input x Weight -> Output // Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks // blockDim=(K,K/V,1), gridDim=(nBlocks,N,nGroups) Volkov-blocks
// K is a multiple of V, // K is a multiple of V,
// nHot x KM -> nHot x KN - parallel over N,nHot - loop over M // nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
...@@ -133,8 +140,10 @@ dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules, ...@@ -133,8 +140,10 @@ dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int M = input_nPlanes / K; Int M = input_nPlanes / K;
// N = gridDim.y == output_nPlanes/K // N = gridDim.y == output_nPlanes/K
Int n = blockIdx.y; Int n = blockIdx.y;
outFeatures += n * K; Int g = blockIdx.z;
w += n * K; inFeatures += g * input_nPlanes;
outFeatures += n * K + g * output_nPlanes;
w += n * K + g * input_nPlanes * output_nPlanes;
TACC O[V]; TACC O[V];
__shared__ T W[K][K]; __shared__ T W[K][K];
...@@ -199,13 +208,13 @@ dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules, ...@@ -199,13 +208,13 @@ dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int o = (nHot / K) * K; \ Int o = (nHot / K) * K; \
if (o >= K) \ if (o >= K) \
dConvolution_KMxKN_forwardA< \ dConvolution_KMxKN_forwardA< \
T, K, V><<<dim3(std::min(o / K, (Int)512), output_nPlanes / K), \ T, K, V><<<dim3(std::min(o / K, (Int)512), output_nPlanes / K, nGroups), \
dim3(K, K / V)>>>(inFeatures, outFeatures, w, rules, o, \ dim3(K, K / V)>>>(inFeatures, outFeatures, w, rules, o, \
input_nPlanes, input_stride, \ input_nPlanes, input_stride, \
output_nPlanes, output_stride); \ output_nPlanes, output_stride); \
if (nHot > o) \ if (nHot > o) \
dConvolution_KMxKN_forwardB< \ dConvolution_KMxKN_forwardB< \
T, K, V><<<dim3(1, output_nPlanes / K), dim3(K, K / V)>>>( \ T, K, V><<<dim3(1, output_nPlanes / K, nGroups), dim3(K, K / V )>>>( \
inFeatures, outFeatures, w, rules + 2 * o, nHot - o, \ inFeatures, outFeatures, w, rules + 2 * o, nHot - o, \
input_nPlanes, input_stride, output_nPlanes, output_stride); \ input_nPlanes, input_stride, output_nPlanes, output_stride); \
return; \ return; \
...@@ -215,7 +224,7 @@ dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules, ...@@ -215,7 +224,7 @@ dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules,
template <typename T> template <typename T>
void dConvolution_forward(T *inFeatures, T *outFeatures, T *w, Int *rules, void dConvolution_forward(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride, Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) { Int output_nPlanes, Int output_stride, Int nGroups) {
FOO(T, 64, 16) FOO(T, 64, 16)
FOO(T, 32, 8) FOO(T, 32, 8)
FOO(T, 16, 4) FOO(T, 16, 4)
...@@ -226,7 +235,7 @@ template <> ...@@ -226,7 +235,7 @@ template <>
void dConvolution_forward<double>(double *inFeatures, double *outFeatures, void dConvolution_forward<double>(double *inFeatures, double *outFeatures,
double *w, Int *rules, Int nHot, double *w, Int *rules, Int nHot,
Int input_nPlanes, Int input_stride, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) { Int output_nPlanes, Int output_stride, Int nGroups) {
FOO(double, 32, 8) FOO(double, 32, 8)
FOO(double, 16, 4) FOO(double, 16, 4)
FOO(double, 8, 2) FOO(double, 8, 2)
...@@ -236,7 +245,7 @@ void dConvolution_forward<double>(double *inFeatures, double *outFeatures, ...@@ -236,7 +245,7 @@ void dConvolution_forward<double>(double *inFeatures, double *outFeatures,
// dOutput x W^T -> dInput and // dOutput x W^T -> dInput and
// Input^T x dOutput -> dW // Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1) // blockDim=(K,K/V,1), gridDim=(nBlocks,M,nGroups)
template <typename T, Int K, Int V> template <typename T, Int K, Int V>
__global__ void __global__ void
dConvolution_KMxKN_backward_dW_A(T *inFeatures, T *dInFeatures, T *dOutFeatures, dConvolution_KMxKN_backward_dW_A(T *inFeatures, T *dInFeatures, T *dOutFeatures,
...@@ -246,10 +255,12 @@ dConvolution_KMxKN_backward_dW_A(T *inFeatures, T *dInFeatures, T *dOutFeatures, ...@@ -246,10 +255,12 @@ dConvolution_KMxKN_backward_dW_A(T *inFeatures, T *dInFeatures, T *dOutFeatures,
// M = gridDim.y == input_nPlanes / K // M = gridDim.y == input_nPlanes / K
Int N = output_nPlanes / K; Int N = output_nPlanes / K;
Int m = blockIdx.y; Int m = blockIdx.y;
inFeatures += m * K; Int g = blockIdx.z;
dInFeatures += m * K; inFeatures += m * K + g * input_nPlanes;
w += m * K * output_nPlanes; dInFeatures += m * K + g * input_nPlanes;
dw += m * K * output_nPlanes; dOutFeatures += g * output_nPlanes;
w += m * K * output_nPlanes+ g * input_nPlanes * output_nPlanes;
dw += m * K * output_nPlanes+ g * input_nPlanes * output_nPlanes;
TACC dI[V]; TACC dI[V];
TACC dW[V]; TACC dW[V];
...@@ -313,7 +324,7 @@ dConvolution_KMxKN_backward_dW_A(T *inFeatures, T *dInFeatures, T *dOutFeatures, ...@@ -313,7 +324,7 @@ dConvolution_KMxKN_backward_dW_A(T *inFeatures, T *dInFeatures, T *dOutFeatures,
// dOutput x W^T -> dInput and // dOutput x W^T -> dInput and
// Input^T x dOutput -> dW // Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1) // blockDim=(K,K/V,1), gridDim=(nBlocks,M,nGroups)
template <typename T, Int K, Int V> template <typename T, Int K, Int V>
__global__ void __global__ void
dConvolution_KMxKN_backward_dW_B(T *inFeatures, T *dInFeatures, T *dOutFeatures, dConvolution_KMxKN_backward_dW_B(T *inFeatures, T *dInFeatures, T *dOutFeatures,
...@@ -323,10 +334,12 @@ dConvolution_KMxKN_backward_dW_B(T *inFeatures, T *dInFeatures, T *dOutFeatures, ...@@ -323,10 +334,12 @@ dConvolution_KMxKN_backward_dW_B(T *inFeatures, T *dInFeatures, T *dOutFeatures,
// M = gridDim.y == input_nPlanes / K // M = gridDim.y == input_nPlanes / K
Int N = output_nPlanes / K; Int N = output_nPlanes / K;
Int m = blockIdx.y; Int m = blockIdx.y;
inFeatures += m * K; Int g = blockIdx.z;
dInFeatures += m * K; inFeatures += m * K + g * input_nPlanes;
w += m * K * output_nPlanes; dInFeatures += m * K + g * input_nPlanes;
dw += m * K * output_nPlanes; dOutFeatures += g * output_nPlanes;
w += m * K * output_nPlanes+ g * input_nPlanes * output_nPlanes;
dw += m * K * output_nPlanes+ g * input_nPlanes * output_nPlanes;
TACC dI[V]; TACC dI[V];
TACC dW[V]; TACC dW[V];
...@@ -402,13 +415,13 @@ dConvolution_KMxKN_backward_dW_B(T *inFeatures, T *dInFeatures, T *dOutFeatures, ...@@ -402,13 +415,13 @@ dConvolution_KMxKN_backward_dW_B(T *inFeatures, T *dInFeatures, T *dOutFeatures,
Int o = (nHot / K) * K; \ Int o = (nHot / K) * K; \
if (o >= K) \ if (o >= K) \
dConvolution_KMxKN_backward_dW_A< \ dConvolution_KMxKN_backward_dW_A< \
T, K, V><<<dim3(std::min(o / K, (Int)512), input_nPlanes / K), \ T, K, V><<<dim3(std::min(o / K, (Int)512), input_nPlanes / K, nGroups), \
dim3(K, K / V)>>>( \ dim3(K, K / V)>>>( \
inFeatures, dInFeatures, dOutFeatures, w, dw, rules, o, \ inFeatures, dInFeatures, dOutFeatures, w, dw, rules, o, \
input_nPlanes, input_stride, output_nPlanes, output_stride); \ input_nPlanes, input_stride, output_nPlanes, output_stride); \
if (nHot > o) \ if (nHot > o) \
dConvolution_KMxKN_backward_dW_B< \ dConvolution_KMxKN_backward_dW_B< \
T, K, V><<<dim3(1, input_nPlanes / K), dim3(K, K / V)>>>( \ T, K, V><<<dim3(1, input_nPlanes / K, nGroups), dim3(K, K / V)>>>( \
inFeatures, dInFeatures, dOutFeatures, w, dw, rules + 2 * o, \ inFeatures, dInFeatures, dOutFeatures, w, dw, rules + 2 * o, \
nHot - o, input_nPlanes, input_stride, output_nPlanes, \ nHot - o, input_nPlanes, input_stride, output_nPlanes, \
output_stride); \ output_stride); \
...@@ -420,7 +433,7 @@ template <typename T> ...@@ -420,7 +433,7 @@ template <typename T>
void dConvolution_backward_dW(T *inFeatures, T *dInFeatures, T *dOutFeatures, void dConvolution_backward_dW(T *inFeatures, T *dInFeatures, T *dOutFeatures,
T *w, T *dw, Int *rules, Int nHot, T *w, T *dw, Int *rules, Int nHot,
Int input_nPlanes, Int input_stride, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) { Int output_nPlanes, Int output_stride, Int nGroups) {
FOO(T, 32, 8) FOO(T, 32, 8)
FOO(T, 16, 4) FOO(T, 16, 4)
FOO(T, 8, 2) FOO(T, 8, 2)
...@@ -434,7 +447,7 @@ dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules, ...@@ -434,7 +447,7 @@ dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride, Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) { Int output_nPlanes, Int output_stride) {
// Input x Weight -> Output // Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks // blockDim=(K,K/V,1), gridDim=(nBlocks,N,nGroups) Volkov-blocks
// K is a multiple of V, // K is a multiple of V,
// nHot x input_nplanes<=KM -> nHot x output_nPlanes<=KN // nHot x input_nplanes<=KM -> nHot x output_nPlanes<=KN
...@@ -443,8 +456,10 @@ dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules, ...@@ -443,8 +456,10 @@ dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int M = (input_nPlanes + K - 1) / K; Int M = (input_nPlanes + K - 1) / K;
// N = gridDim.y ~ output_nPlanes/K // N = gridDim.y ~ output_nPlanes/K
Int n = blockIdx.y; Int n = blockIdx.y;
outFeatures += n * K; Int g = blockIdx.z;
w += n * K; inFeatures += g * input_nPlanes;
outFeatures += n * K + g * output_nPlanes;
w += n * K + g * input_nPlanes * output_nPlanes;
Int KO = min(K, output_nPlanes - K * n); Int KO = min(K, output_nPlanes - K * n);
TACC O[V]; TACC O[V];
...@@ -507,7 +522,7 @@ dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules, ...@@ -507,7 +522,7 @@ dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
// dOutput x W^T -> dInput and // dOutput x W^T -> dInput and
// Input^T x dOutput -> dW // Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1) // blockDim=(K,K/V,1), gridDim=(nBlocks,M,nGroups)
template <typename T, Int K, Int V> template <typename T, Int K, Int V>
__global__ void __global__ void
dConvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures, dConvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
...@@ -517,10 +532,12 @@ dConvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures, ...@@ -517,10 +532,12 @@ dConvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
// M = gridDim.y == input_nPlanes / K // M = gridDim.y == input_nPlanes / K
Int N = (output_nPlanes + K - 1) / K; Int N = (output_nPlanes + K - 1) / K;
Int m = blockIdx.y; Int m = blockIdx.y;
inFeatures += m * K; Int g = blockIdx.z;
dInFeatures += m * K; inFeatures += m * K + g * input_nPlanes;
w += m * K * output_nPlanes; dInFeatures += m * K + g * input_nPlanes;
dw += m * K * output_nPlanes; dOutFeatures += g * output_nPlanes;
w += m * K * output_nPlanes+ g * input_nPlanes * output_nPlanes;
dw += m * K * output_nPlanes+ g * input_nPlanes * output_nPlanes;
Int KI = min(K, input_nPlanes - K * m); Int KI = min(K, input_nPlanes - K * m);
TACC dI[V]; TACC dI[V];
...@@ -602,8 +619,8 @@ template <typename T> ...@@ -602,8 +619,8 @@ template <typename T>
double dConvolution_forward2(T *inFeatures, T *outFeatures, T *w, double dConvolution_forward2(T *inFeatures, T *outFeatures, T *w,
RuleBook _rules, Int input_nPlanes, RuleBook _rules, Int input_nPlanes,
Int input_stride, Int output_nPlanes, Int input_stride, Int output_nPlanes,
Int output_stride) { Int output_stride, Int nGroups) {
Int c = input_nPlanes * output_nPlanes; Int c = input_nPlanes * output_nPlanes * nGroups;
double flops = 0; double flops = 0;
if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) { if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) {
const int K = 16; const int K = 16;
...@@ -611,14 +628,14 @@ double dConvolution_forward2(T *inFeatures, T *outFeatures, T *w, ...@@ -611,14 +628,14 @@ double dConvolution_forward2(T *inFeatures, T *outFeatures, T *w,
RULEBOOKITERATOR( RULEBOOKITERATOR(
(dConvolution_KMxKN_forward2< (dConvolution_KMxKN_forward2<
T, K, T, K,
V><<<dim3(128, (output_nPlanes + K - 1) / K), dim3(K, K / V)>>>( V><<<dim3(128, (output_nPlanes + K - 1) / K, nGroups), dim3(K, K / V)>>>(
inFeatures, outFeatures, w, rbB, nHotB, input_nPlanes, input_stride, inFeatures, outFeatures, w, rbB, nHotB, input_nPlanes, input_stride,
output_nPlanes, output_stride)); output_nPlanes, output_stride));
, w += c; flops += nHotB * c;) , w += c; flops += nHotB * c;)
} else { } else {
RULEBOOKITERATOR(dConvolution_forward(inFeatures, outFeatures, w, rbB, RULEBOOKITERATOR(dConvolution_forward(inFeatures, outFeatures, w, rbB,
nHotB, input_nPlanes, input_stride, nHotB, input_nPlanes, input_stride,
output_nPlanes, output_stride); output_nPlanes, output_stride, nGroups);
, w += c; flops += nHotB * c;) , w += c; flops += nHotB * c;)
} }
return flops; return flops;
...@@ -628,15 +645,15 @@ template <typename T> ...@@ -628,15 +645,15 @@ template <typename T>
void dConvolution_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures, void dConvolution_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
T *w, T *dw, RuleBook _rules, Int input_nPlanes, T *w, T *dw, RuleBook _rules, Int input_nPlanes,
Int input_stride, Int output_nPlanes, Int input_stride, Int output_nPlanes,
Int output_stride) { Int output_stride, Int nGroups) {
Int c = input_nPlanes * output_nPlanes; Int c = input_nPlanes * output_nPlanes * nGroups;
if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) { if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) {
const int K = 16; const int K = 16;
const int V = 4; const int V = 4;
RULEBOOKITERATOR( RULEBOOKITERATOR(
(dConvolution_KMxKN_backward_dW2< (dConvolution_KMxKN_backward_dW2<
T, K, T, K,
V><<<dim3(128, (input_nPlanes + K - 1) / K), dim3(K, K / V)>>>( V><<<dim3(128, (input_nPlanes + K - 1) / K, nGroups), dim3(K, K / V)>>>(
inFeatures, dInFeatures, dOutFeatures, w, dw, rbB, nHotB, inFeatures, dInFeatures, dOutFeatures, w, dw, rbB, nHotB,
input_nPlanes, input_stride, output_nPlanes, output_stride)); input_nPlanes, input_stride, output_nPlanes, output_stride));
, w += c; dw += c;) , w += c; dw += c;)
...@@ -644,7 +661,7 @@ void dConvolution_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures, ...@@ -644,7 +661,7 @@ void dConvolution_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
RULEBOOKITERATOR(dConvolution_backward_dW(inFeatures, dInFeatures, RULEBOOKITERATOR(dConvolution_backward_dW(inFeatures, dInFeatures,
dOutFeatures, w, dw, rbB, nHotB, dOutFeatures, w, dw, rbB, nHotB,
input_nPlanes, input_stride, input_nPlanes, input_stride,
output_nPlanes, output_stride); output_nPlanes, output_stride, nGroups);
, w += c; dw += c;) , w += c; dw += c;)
} }
} }
......
...@@ -8,13 +8,14 @@ template <typename T> ...@@ -8,13 +8,14 @@ template <typename T>
double dDeconvolution_forward2(T *inFeatures, T *outFeatures, T *w, double dDeconvolution_forward2(T *inFeatures, T *outFeatures, T *w,
RuleBook _rules, Int input_nPlanes, RuleBook _rules, Int input_nPlanes,
Int input_stride, Int output_nPlanes, Int input_stride, Int output_nPlanes,
Int output_stride); Int output_stride, Int nGroups);
template <typename T> template <typename T>
void dDeconvolution_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures, void dDeconvolution_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
T *w, T *dw, RuleBook _rules, T *w, T *dw, RuleBook _rules,
Int input_nPlanes, Int input_stride, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride); Int output_nPlanes, Int output_stride,
Int nGroups);
template <typename T, Int Dimension> template <typename T, Int Dimension>
double cuda_Deconvolution_updateOutput( double cuda_Deconvolution_updateOutput(
...@@ -28,9 +29,10 @@ double cuda_Deconvolution_updateOutput( ...@@ -28,9 +29,10 @@ double cuda_Deconvolution_updateOutput(
auto _rules = auto _rules =
m.getRuleBook(outputSize, inputSize, filterSize, filterStride, true); m.getRuleBook(outputSize, inputSize, filterSize, filterStride, true);
Int nActiveOut = m.getNActive(outputSize); Int nActiveOut = m.getNActive(outputSize);
Int ip = weight.size(1); Int nGroups = weight.size(1);
Int op = weight.size(2); Int ip = weight.size(2);
output_features.resize_({nActiveOut, op}); Int op = weight.size(3);
output_features.resize_({nActiveOut, op * nGroups});
if (nActiveOut) { if (nActiveOut) {
auto iF = input_features.data<T>(); auto iF = input_features.data<T>();
...@@ -42,7 +44,8 @@ double cuda_Deconvolution_updateOutput( ...@@ -42,7 +44,8 @@ double cuda_Deconvolution_updateOutput(
else else
output_features.zero_(); output_features.zero_();
return dDeconvolution_forward2<T>(iF, oF, w, _rules, ip, ip, op, op); return dDeconvolution_forward2<T>(iF, oF, w, _rules, ip, ip * nGroups, op,
op * nGroups, nGroups);
} else { } else {
return 0; return 0;
} }
...@@ -63,9 +66,10 @@ void cuda_Deconvolution_backward( ...@@ -63,9 +66,10 @@ void cuda_Deconvolution_backward(
m.getRuleBook(outputSize, inputSize, filterSize, filterStride, true); m.getRuleBook(outputSize, inputSize, filterSize, filterStride, true);
Int nActiveIn = m.getNActive(inputSize); Int nActiveIn = m.getNActive(inputSize);
Int nActiveOut = m.getNActive(outputSize); Int nActiveOut = m.getNActive(outputSize);
Int ip = weight.size(1); Int nGroups = weight.size(1);
Int op = weight.size(2); Int ip = weight.size(2);
d_input_features.resize_({nActiveIn, ip}); Int op = weight.size(3);
d_input_features.resize_({nActiveIn, ip * nGroups});
d_input_features.zero_(); d_input_features.zero_();
if (nActiveOut) { if (nActiveOut) {
...@@ -75,7 +79,8 @@ void cuda_Deconvolution_backward( ...@@ -75,7 +79,8 @@ void cuda_Deconvolution_backward(
auto w = weight.data<T>(); auto w = weight.data<T>();
auto dw = d_weight.data<T>(); auto dw = d_weight.data<T>();
dDeconvolution_backward_dW2<T>(iF, diF, doF, w, dw, _rules, ip, ip, op, op); dDeconvolution_backward_dW2<T>(iF, diF, doF, w, dw, _rules, ip,
ip * nGroups, op, op * nGroups, nGroups);
if (d_bias.numel()) { if (d_bias.numel()) {
auto db = d_bias.data<T>(); auto db = d_bias.data<T>();
Convolution_bp_bias(doF, db, op, nActiveOut); Convolution_bp_bias(doF, db, op, nActiveOut);
......
...@@ -14,7 +14,7 @@ dDeconvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules, ...@@ -14,7 +14,7 @@ dDeconvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules,
// nHot must be a multiple of K!! // nHot must be a multiple of K!!
// Input x Weight -> Output // Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks // blockDim=(K,K/V,1), gridDim=(nBlocks,N,nGroups) Volkov-blocks
// K is a multiple of V, // K is a multiple of V,
// nHot x KM -> nHot x KN - parallel over N,nHot - loop over M // nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
...@@ -22,8 +22,10 @@ dDeconvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules, ...@@ -22,8 +22,10 @@ dDeconvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int M = input_nPlanes / K; Int M = input_nPlanes / K;
// N = gridDim.y == output_nPlanes/K // N = gridDim.y == output_nPlanes/K
Int n = blockIdx.y; Int n = blockIdx.y;
outFeatures += n * K; Int g = blockIdx.z;
w += n * K; inFeatures += g * input_nPlanes;
outFeatures += n * K + g * output_nPlanes;
w += n * K + g * input_nPlanes * output_nPlanes;
TACC O[V]; TACC O[V];
__shared__ T W[K][K]; __shared__ T W[K][K];
...@@ -82,7 +84,7 @@ dDeconvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules, ...@@ -82,7 +84,7 @@ dDeconvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride, Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) { Int output_nPlanes, Int output_stride) {
// Input x Weight -> Output // Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks // blockDim=(K,K/V,1), gridDim=(nBlocks,N,nGroups) Volkov-blocks
// K is a multiple of V, // K is a multiple of V,
// nHot x KM -> nHot x KN - parallel over N,nHot - loop over M // nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
...@@ -90,8 +92,10 @@ dDeconvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules, ...@@ -90,8 +92,10 @@ dDeconvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int M = input_nPlanes / K; Int M = input_nPlanes / K;
// N = gridDim.y == output_nPlanes/K // N = gridDim.y == output_nPlanes/K
Int n = blockIdx.y; Int n = blockIdx.y;
outFeatures += n * K; Int g = blockIdx.z;
w += n * K; inFeatures += g * input_nPlanes;
outFeatures += n * K + g * output_nPlanes;
w += n * K + g * input_nPlanes * output_nPlanes;
TACC O[V]; TACC O[V];
__shared__ T W[K][K]; __shared__ T W[K][K];
...@@ -156,13 +160,13 @@ dDeconvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules, ...@@ -156,13 +160,13 @@ dDeconvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int o = (nHot / K) * K; \ Int o = (nHot / K) * K; \
if (o >= K) \ if (o >= K) \
dDeconvolution_KMxKN_forwardA< \ dDeconvolution_KMxKN_forwardA< \
T, K, V><<<dim3(std::min(o / K, (Int)512), output_nPlanes / K), \ T, K, V><<<dim3(std::min(o / K, (Int)512), output_nPlanes / K, nGroups), \
dim3(K, K / V)>>>(inFeatures, outFeatures, w, rules, o, \ dim3(K, K / V)>>>(inFeatures, outFeatures, w, rules, o, \
input_nPlanes, input_stride, \ input_nPlanes, input_stride, \
output_nPlanes, output_stride); \ output_nPlanes, output_stride); \
if (nHot > o) \ if (nHot > o) \
dDeconvolution_KMxKN_forwardB< \ dDeconvolution_KMxKN_forwardB< \
T, K, V><<<dim3(1, output_nPlanes / K), dim3(K, K / V)>>>( \ T, K, V><<<dim3(1, output_nPlanes / K, nGroups), dim3(K, K / V)>>>( \
inFeatures, outFeatures, w, rules + 2 * o, nHot - o, \ inFeatures, outFeatures, w, rules + 2 * o, nHot - o, \
input_nPlanes, input_stride, output_nPlanes, output_stride); \ input_nPlanes, input_stride, output_nPlanes, output_stride); \
return; \ return; \
...@@ -172,7 +176,7 @@ dDeconvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules, ...@@ -172,7 +176,7 @@ dDeconvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules,
template <typename T> template <typename T>
void dDeconvolution_forward(T *inFeatures, T *outFeatures, T *w, Int *rules, void dDeconvolution_forward(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride, Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) { Int output_nPlanes, Int output_stride, Int nGroups) {
FOO(T, 64, 16) FOO(T, 64, 16)
FOO(T, 32, 8) FOO(T, 32, 8)
FOO(T, 16, 4) FOO(T, 16, 4)
...@@ -183,7 +187,7 @@ template <> ...@@ -183,7 +187,7 @@ template <>
void dDeconvolution_forward<double>(double *inFeatures, double *outFeatures, void dDeconvolution_forward<double>(double *inFeatures, double *outFeatures,
double *w, Int *rules, Int nHot, double *w, Int *rules, Int nHot,
Int input_nPlanes, Int input_stride, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) { Int output_nPlanes, Int output_stride, Int nGroups) {
FOO(double, 32, 8) FOO(double, 32, 8)
FOO(double, 16, 4) FOO(double, 16, 4)
FOO(double, 8, 2) FOO(double, 8, 2)
...@@ -193,7 +197,7 @@ void dDeconvolution_forward<double>(double *inFeatures, double *outFeatures, ...@@ -193,7 +197,7 @@ void dDeconvolution_forward<double>(double *inFeatures, double *outFeatures,
// dOutput x W^T -> dInput and // dOutput x W^T -> dInput and
// Input^T x dOutput -> dW // Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1) // blockDim=(K,K/V,1), gridDim=(nBlocks,M,nGroups)
template <typename T, Int K, Int V> template <typename T, Int K, Int V>
__global__ void dDeconvolution_KMxKN_backward_dW_A( __global__ void dDeconvolution_KMxKN_backward_dW_A(
T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw, Int *rules, T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw, Int *rules,
...@@ -202,10 +206,12 @@ __global__ void dDeconvolution_KMxKN_backward_dW_A( ...@@ -202,10 +206,12 @@ __global__ void dDeconvolution_KMxKN_backward_dW_A(
// M = gridDim.y == input_nPlanes / K // M = gridDim.y == input_nPlanes / K
Int N = output_nPlanes / K; Int N = output_nPlanes / K;
Int m = blockIdx.y; Int m = blockIdx.y;
inFeatures += m * K; Int g = blockIdx.z;
dInFeatures += m * K; inFeatures += m * K + g * input_nPlanes;
w += m * K * output_nPlanes; dInFeatures += m * K + g * input_nPlanes;
dw += m * K * output_nPlanes; dOutFeatures += g * output_nPlanes;
w += m * K * output_nPlanes+ g * input_nPlanes * output_nPlanes;
dw += m * K * output_nPlanes+ g * input_nPlanes * output_nPlanes;
TACC dI[V]; TACC dI[V];
TACC dW[V]; TACC dW[V];
...@@ -269,7 +275,7 @@ __global__ void dDeconvolution_KMxKN_backward_dW_A( ...@@ -269,7 +275,7 @@ __global__ void dDeconvolution_KMxKN_backward_dW_A(
// dOutput x W^T -> dInput and // dOutput x W^T -> dInput and
// Input^T x dOutput -> dW // Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1) // blockDim=(K,K/V,1), gridDim=(nBlocks,M,nGroups)
template <typename T, Int K, Int V> template <typename T, Int K, Int V>
__global__ void dDeconvolution_KMxKN_backward_dW_B( __global__ void dDeconvolution_KMxKN_backward_dW_B(
T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw, Int *rules, T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw, Int *rules,
...@@ -278,10 +284,12 @@ __global__ void dDeconvolution_KMxKN_backward_dW_B( ...@@ -278,10 +284,12 @@ __global__ void dDeconvolution_KMxKN_backward_dW_B(
// M = gridDim.y == input_nPlanes / K // M = gridDim.y == input_nPlanes / K
Int N = output_nPlanes / K; Int N = output_nPlanes / K;
Int m = blockIdx.y; Int m = blockIdx.y;
inFeatures += m * K; Int g = blockIdx.z;
dInFeatures += m * K; inFeatures += m * K + g * input_nPlanes;
w += m * K * output_nPlanes; dInFeatures += m * K + g * input_nPlanes;
dw += m * K * output_nPlanes; dOutFeatures += g * output_nPlanes;
w += m * K * output_nPlanes+ g * input_nPlanes * output_nPlanes;
dw += m * K * output_nPlanes+ g * input_nPlanes * output_nPlanes;
TACC dI[V]; TACC dI[V];
TACC dW[V]; TACC dW[V];
...@@ -357,13 +365,13 @@ __global__ void dDeconvolution_KMxKN_backward_dW_B( ...@@ -357,13 +365,13 @@ __global__ void dDeconvolution_KMxKN_backward_dW_B(
Int o = (nHot / K) * K; \ Int o = (nHot / K) * K; \
if (o >= K) \ if (o >= K) \
dDeconvolution_KMxKN_backward_dW_A< \ dDeconvolution_KMxKN_backward_dW_A< \
T, K, V><<<dim3(std::min(o / K, (Int)512), input_nPlanes / K), \ T, K, V><<<dim3(std::min(o / K, (Int)512), input_nPlanes / K, nGroups), \
dim3(K, K / V)>>>( \ dim3(K, K / V)>>>( \
inFeatures, dInFeatures, dOutFeatures, w, dw, rules, o, \ inFeatures, dInFeatures, dOutFeatures, w, dw, rules, o, \
input_nPlanes, input_stride, output_nPlanes, output_stride); \ input_nPlanes, input_stride, output_nPlanes, output_stride); \
if (nHot > o) \ if (nHot > o) \
dDeconvolution_KMxKN_backward_dW_B< \ dDeconvolution_KMxKN_backward_dW_B< \
T, K, V><<<dim3(1, input_nPlanes / K), dim3(K, K / V)>>>( \ T, K, V><<<dim3(1, input_nPlanes / K, nGroups), dim3(K, K / V)>>>( \
inFeatures, dInFeatures, dOutFeatures, w, dw, rules + 2 * o, \ inFeatures, dInFeatures, dOutFeatures, w, dw, rules + 2 * o, \
nHot - o, input_nPlanes, input_stride, output_nPlanes, \ nHot - o, input_nPlanes, input_stride, output_nPlanes, \
output_stride); \ output_stride); \
...@@ -375,7 +383,7 @@ template <typename T> ...@@ -375,7 +383,7 @@ template <typename T>
void dDeconvolution_backward_dW(T *inFeatures, T *dInFeatures, T *dOutFeatures, void dDeconvolution_backward_dW(T *inFeatures, T *dInFeatures, T *dOutFeatures,
T *w, T *dw, Int *rules, Int nHot, T *w, T *dw, Int *rules, Int nHot,
Int input_nPlanes, Int input_stride, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) { Int output_nPlanes, Int output_stride, Int nGroups) {
FOO(T, 32, 8) FOO(T, 32, 8)
FOO(T, 16, 4) FOO(T, 16, 4)
FOO(T, 8, 2) FOO(T, 8, 2)
...@@ -389,7 +397,7 @@ dDeconvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules, ...@@ -389,7 +397,7 @@ dDeconvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride, Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) { Int output_nPlanes, Int output_stride) {
// Input x Weight -> Output // Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks // blockDim=(K,K/V,1), gridDim=(nBlocks,N,nGroups) Volkov-blocks
// K is a multiple of V, // K is a multiple of V,
// nHot x input_nplanes<=KM -> nHot x output_nPlanes<=KN // nHot x input_nplanes<=KM -> nHot x output_nPlanes<=KN
...@@ -398,8 +406,10 @@ dDeconvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules, ...@@ -398,8 +406,10 @@ dDeconvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int M = (input_nPlanes + K - 1) / K; Int M = (input_nPlanes + K - 1) / K;
// N = gridDim.y ~ output_nPlanes/K // N = gridDim.y ~ output_nPlanes/K
Int n = blockIdx.y; Int n = blockIdx.y;
outFeatures += n * K; Int g = blockIdx.z;
w += n * K; inFeatures += g * input_nPlanes;
outFeatures += n * K + g * output_nPlanes;
w += n * K + g * input_nPlanes * output_nPlanes;
Int KO = min(K, output_nPlanes - K * n); Int KO = min(K, output_nPlanes - K * n);
TACC O[V]; TACC O[V];
...@@ -462,7 +472,7 @@ dDeconvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules, ...@@ -462,7 +472,7 @@ dDeconvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
// dOutput x W^T -> dInput and // dOutput x W^T -> dInput and
// Input^T x dOutput -> dW // Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1) // blockDim=(K,K/V,1), gridDim=(nBlocks,M,nGroups)
template <typename T, Int K, Int V> template <typename T, Int K, Int V>
__global__ void __global__ void
dDeconvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures, dDeconvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures,
...@@ -472,10 +482,12 @@ dDeconvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures, ...@@ -472,10 +482,12 @@ dDeconvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures,
// M = gridDim.y == input_nPlanes / K // M = gridDim.y == input_nPlanes / K
Int N = (output_nPlanes + K - 1) / K; Int N = (output_nPlanes + K - 1) / K;
Int m = blockIdx.y; Int m = blockIdx.y;
inFeatures += m * K; Int g = blockIdx.z;
dInFeatures += m * K; inFeatures += m * K + g * input_nPlanes;
w += m * K * output_nPlanes; dInFeatures += m * K + g * input_nPlanes;
dw += m * K * output_nPlanes; dOutFeatures += g * output_nPlanes;
w += m * K * output_nPlanes+ g * input_nPlanes * output_nPlanes;
dw += m * K * output_nPlanes+ g * input_nPlanes * output_nPlanes;
Int KI = min(K, input_nPlanes - K * m); Int KI = min(K, input_nPlanes - K * m);
TACC dI[V]; TACC dI[V];
...@@ -557,8 +569,8 @@ template <typename T> ...@@ -557,8 +569,8 @@ template <typename T>
double dDeconvolution_forward2(T *inFeatures, T *outFeatures, T *w, double dDeconvolution_forward2(T *inFeatures, T *outFeatures, T *w,
RuleBook _rules, Int input_nPlanes, RuleBook _rules, Int input_nPlanes,
Int input_stride, Int output_nPlanes, Int input_stride, Int output_nPlanes,
Int output_stride) { Int output_stride, Int nGroups) {
Int c = input_nPlanes * output_nPlanes; Int c = input_nPlanes * output_nPlanes * nGroups;
double flops = 0; double flops = 0;
if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) { if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) {
const int K = 16; const int K = 16;
...@@ -566,14 +578,14 @@ double dDeconvolution_forward2(T *inFeatures, T *outFeatures, T *w, ...@@ -566,14 +578,14 @@ double dDeconvolution_forward2(T *inFeatures, T *outFeatures, T *w,
RULEBOOKITERATOR( RULEBOOKITERATOR(
(dDeconvolution_KMxKN_forward2< (dDeconvolution_KMxKN_forward2<
T, K, T, K,
V><<<dim3(128, (output_nPlanes + K - 1) / K), dim3(K, K / V)>>>( V><<<dim3(128, (output_nPlanes + K - 1) / K, nGroups), dim3(K, K / V)>>>(
inFeatures, outFeatures, w, rbB, nHotB, input_nPlanes, input_stride, inFeatures, outFeatures, w, rbB, nHotB, input_nPlanes, input_stride,
output_nPlanes, output_stride)); output_nPlanes, output_stride));
, w += c; flops += nHotB * c;) , w += c; flops += nHotB * c;)
} else { } else {
RULEBOOKITERATOR(dDeconvolution_forward(inFeatures, outFeatures, w, rbB, RULEBOOKITERATOR(dDeconvolution_forward(inFeatures, outFeatures, w, rbB,
nHotB, input_nPlanes, input_stride, nHotB, input_nPlanes, input_stride,
output_nPlanes, output_stride); output_nPlanes, output_stride, nGroups);
, w += c; flops += nHotB * c;) , w += c; flops += nHotB * c;)
} }
return flops; return flops;
...@@ -583,15 +595,15 @@ template <typename T> ...@@ -583,15 +595,15 @@ template <typename T>
void dDeconvolution_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures, void dDeconvolution_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
T *w, T *dw, RuleBook _rules, T *w, T *dw, RuleBook _rules,
Int input_nPlanes, Int input_stride, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) { Int output_nPlanes, Int output_stride, Int nGroups) {
Int c = input_nPlanes * output_nPlanes; Int c = input_nPlanes * output_nPlanes * nGroups;
if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) { if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) {
const int K = 16; const int K = 16;
const int V = 4; const int V = 4;
RULEBOOKITERATOR( RULEBOOKITERATOR(
(dDeconvolution_KMxKN_backward_dW2< (dDeconvolution_KMxKN_backward_dW2<
T, K, T, K,
V><<<dim3(128, (input_nPlanes + K - 1) / K), dim3(K, K / V)>>>( V><<<dim3(128, (input_nPlanes + K - 1) / K, nGroups), dim3(K, K / V)>>>(
inFeatures, dInFeatures, dOutFeatures, w, dw, rbB, nHotB, inFeatures, dInFeatures, dOutFeatures, w, dw, rbB, nHotB,
input_nPlanes, input_stride, output_nPlanes, output_stride)); input_nPlanes, input_stride, output_nPlanes, output_stride));
, w += c; dw += c;) , w += c; dw += c;)
...@@ -599,9 +611,8 @@ void dDeconvolution_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures, ...@@ -599,9 +611,8 @@ void dDeconvolution_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
RULEBOOKITERATOR(dDeconvolution_backward_dW(inFeatures, dInFeatures, RULEBOOKITERATOR(dDeconvolution_backward_dW(inFeatures, dInFeatures,
dOutFeatures, w, dw, rbB, nHotB, dOutFeatures, w, dw, rbB, nHotB,
input_nPlanes, input_stride, input_nPlanes, input_stride,
output_nPlanes, output_stride); output_nPlanes, output_stride, nGroups);
, w += c; dw += c;) , w += c; dw += c;)
} }
} }
#undef TACC #undef TACC
\ No newline at end of file
...@@ -55,21 +55,21 @@ template void Convolution_bp_bias<float>(float *d_oF, float *d_b, ...@@ -55,21 +55,21 @@ template void Convolution_bp_bias<float>(float *d_oF, float *d_b,
Int nPlanes, Int nActive); Int nPlanes, Int nActive);
template double dConvolution_forward2<float>( template double dConvolution_forward2<float>(
float *inFeatures, float *outFeatures, float *w, RuleBook _rules, float *inFeatures, float *outFeatures, float *w, RuleBook _rules,
Int input_nPlanes, Int input_stride, Int output_nPlanes, Int output_stride); Int input_nPlanes, Int input_stride, Int output_nPlanes, Int output_stride, Int nGroups);
template void dConvolution_backward_dW2<float>( template void dConvolution_backward_dW2<float>(
float *inFeatures, float *dInFeatures, float *dOutFeatures, float *w, float *inFeatures, float *dInFeatures, float *dOutFeatures, float *w,
float *dw, RuleBook _rules, Int input_nPlanes, Int input_stride, float *dw, RuleBook _rules, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride); Int output_nPlanes, Int output_stride, Int nGroups);
template double dDeconvolution_forward2<float>( template double dDeconvolution_forward2<float>(
float *inFeatures, float *outFeatures, float *w, RuleBook _rules, float *inFeatures, float *outFeatures, float *w, RuleBook _rules,
Int input_nPlanes, Int input_stride, Int output_nPlanes, Int output_stride); Int input_nPlanes, Int input_stride, Int output_nPlanes, Int output_stride, Int nGroups);
template void dDeconvolution_backward_dW2<float>( template void dDeconvolution_backward_dW2<float>(
float *inFeatures, float *dInFeatures, float *dOutFeatures, float *w, float *inFeatures, float *dInFeatures, float *dOutFeatures, float *w,
float *dw, RuleBook _rules, Int input_nPlanes, Int input_stride, float *dw, RuleBook _rules, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride); Int output_nPlanes, Int output_stride, Int nGroups);
template void InputLayer_fp<float>(float *input_features, template void InputLayer_fp<float>(float *input_features,
float *output_features, Int nRows, float *output_features, Int nRows,
......
...@@ -25,10 +25,10 @@ from .networkInNetwork import NetworkInNetwork ...@@ -25,10 +25,10 @@ from .networkInNetwork import NetworkInNetwork
from .permutohedralSubmanifoldConvolution import PermutohedralSubmanifoldConvolution, permutohedral_basis from .permutohedralSubmanifoldConvolution import PermutohedralSubmanifoldConvolution, permutohedral_basis
from .randomizedStrideConvolution import RandomizedStrideConvolution from .randomizedStrideConvolution import RandomizedStrideConvolution
from .randomizedStrideMaxPooling import RandomizedStrideMaxPooling from .randomizedStrideMaxPooling import RandomizedStrideMaxPooling
from .sequential import Sequential from .sequential import Sequential, CheckpointedSequential
from .sparseConvNetTensor import SparseConvNetTensor from .sparseConvNetTensor import SparseConvNetTensor
from .sparseToDense import SparseToDense from .sparseToDense import SparseToDense
from .sparsify import Sparsify from .sparsify import Sparsify, SparsifyFCS
from .spectral_norm import spectral_norm from .spectral_norm import spectral_norm
from .submanifoldConvolution import SubmanifoldConvolution, ValidConvolution from .submanifoldConvolution import SubmanifoldConvolution, ValidConvolution
from .tables import * from .tables import *
......
...@@ -41,7 +41,9 @@ class BatchNormalization(Module): ...@@ -41,7 +41,9 @@ class BatchNormalization(Module):
self.bias = Parameter(torch.Tensor(nPlanes).fill_(0)) self.bias = Parameter(torch.Tensor(nPlanes).fill_(0))
def forward(self, input): def forward(self, input):
assert input.features.nelement() == 0 or input.features.size(1) == self.nPlanes, (self.nPlanes, input.features.shape) if input.features.nelement() == 0:
return input
assert input.features.size(1) == self.nPlanes, (self.nPlanes, input.features.shape)
output = SparseConvNetTensor() output = SparseConvNetTensor()
output.metadata = input.metadata output.metadata = input.metadata
output.spatial_size = input.spatial_size output.spatial_size = input.spatial_size
......
...@@ -11,17 +11,18 @@ from .utils import * ...@@ -11,17 +11,18 @@ from .utils import *
from .sparseConvNetTensor import SparseConvNetTensor from .sparseConvNetTensor import SparseConvNetTensor
class Convolution(Module): class Convolution(Module):
def __init__(self, dimension, nIn, nOut, filter_size, filter_stride, bias): def __init__(self, dimension, nIn, nOut, filter_size, filter_stride, bias, groups=1):
Module.__init__(self) Module.__init__(self)
self.dimension = dimension self.dimension = dimension
self.groups = groups
self.nIn = nIn self.nIn = nIn
self.nOut = nOut self.nOut = nOut
self.filter_size = toLongTensor(dimension, filter_size) self.filter_size = toLongTensor(dimension, filter_size)
self.filter_volume = self.filter_size.prod().item() self.filter_volume = self.filter_size.prod().item()
self.filter_stride = toLongTensor(dimension, filter_stride) self.filter_stride = toLongTensor(dimension, filter_stride)
std = (2.0 / nIn / self.filter_volume)**0.5 std = (2.0 * groups / nIn / self.filter_volume)**0.5
self.weight = Parameter(torch.Tensor( self.weight = Parameter(torch.Tensor(
self.filter_volume, nIn, nOut).normal_( self.filter_volume, groups, nIn//groups, nOut//groups).normal_(
0, 0,
std)) std))
if bias: if bias:
......
...@@ -11,17 +11,18 @@ from .utils import * ...@@ -11,17 +11,18 @@ from .utils import *
from .sparseConvNetTensor import SparseConvNetTensor from .sparseConvNetTensor import SparseConvNetTensor
class Deconvolution(Module): class Deconvolution(Module):
def __init__(self, dimension, nIn, nOut, filter_size, filter_stride, bias): def __init__(self, dimension, nIn, nOut, filter_size, filter_stride, bias, groups=1):
Module.__init__(self) Module.__init__(self)
self.dimension = dimension self.dimension = dimension
self.groups = groups
self.nIn = nIn self.nIn = nIn
self.nOut = nOut self.nOut = nOut
self.filter_size = toLongTensor(dimension, filter_size) self.filter_size = toLongTensor(dimension, filter_size)
self.filter_volume = self.filter_size.prod().item() self.filter_volume = self.filter_size.prod().item()
self.filter_stride = toLongTensor(dimension, filter_stride) self.filter_stride = toLongTensor(dimension, filter_stride)
std = (2.0 / nIn / self.filter_volume)**0.5 std = (2.0 * groups / nIn / self.filter_volume)**0.5
self.weight = Parameter(torch.Tensor( self.weight = Parameter(torch.Tensor(
self.filter_volume, nIn, nOut).normal_( self.filter_volume, groups, nIn//groups, nOut//groups).normal_(
0, 0,
std)) std))
if bias: if bias:
......
...@@ -12,17 +12,18 @@ from .sparseConvNetTensor import SparseConvNetTensor ...@@ -12,17 +12,18 @@ from .sparseConvNetTensor import SparseConvNetTensor
from .metadata import Metadata from .metadata import Metadata
class FullConvolution(Module): class FullConvolution(Module):
def __init__(self, dimension, nIn, nOut, filter_size, filter_stride, bias): def __init__(self, dimension, nIn, nOut, filter_size, filter_stride, bias, groups=1):
Module.__init__(self) Module.__init__(self)
self.dimension = dimension self.dimension = dimension
self.groups = groups
self.nIn = nIn self.nIn = nIn
self.nOut = nOut self.nOut = nOut
self.filter_size = toLongTensor(dimension, filter_size) self.filter_size = toLongTensor(dimension, filter_size)
self.filter_volume = self.filter_size.prod().item() self.filter_volume = self.filter_size.prod().item()
self.filter_stride = toLongTensor(dimension, filter_stride) self.filter_stride = toLongTensor(dimension, filter_stride)
std = (2.0 / nIn / self.filter_volume)**0.5 std = (2.0 * groups / nIn / self.filter_volume)**0.5
self.weight = Parameter(torch.Tensor( self.weight = Parameter(torch.Tensor(
self.filter_volume, nIn, nOut).normal_( self.filter_volume, groups, nIn//groups, nOut//groups).normal_(
0, 0,
std)) std))
if bias: if bias:
...@@ -68,16 +69,16 @@ class FullConvolution(Module): ...@@ -68,16 +69,16 @@ class FullConvolution(Module):
def __repr__(self): def __repr__(self):
s = 'FullConvolution ' + str(self.nIn) + '->' + str(self.nOut) + ' C' s = 'FullConvolution ' + str(self.nIn) + '->' + str(self.nOut) + ' C'
if self.filter_size.max() == self.filter_size.min() and\ if self.filter_size.max().item() == self.filter_size.min().item() and\
self.filter_stride.max() == self.filter_stride.min(): self.filter_stride.max().item() == self.filter_stride.min().item():
s = s + str(self.filter_size[0]) + '/' + str(self.filter_stride[0]) s = s + str(self.filter_size[0].item()) + '/' + str(self.filter_stride[0].item())
else: else:
s = s + '(' + str(self.filter_size[0]) s = s + '(' + str(self.filter_size[0].item())
for i in self.filter_size[1:]: for i in self.filter_size[1:]:
s = s + ',' + str(i) s = s + ',' + str(i.item())
s = s + ')/(' + str(self.filter_stride[0]) s = s + ')/(' + str(self.filter_stride[0].item())
for i in self.filter_stride[1:]: for i in self.filter_stride[1:]:
s = s + ',' + str(i) s = s + ',' + str(i.item())
s = s + ')' s = s + ')'
return s return s
......
...@@ -310,3 +310,42 @@ def FullyConvolutionalNet(dimension, reps, nPlanes, residual_blocks=False, downs ...@@ -310,3 +310,42 @@ def FullyConvolutionalNet(dimension, reps, nPlanes, residual_blocks=False, downs
return m return m
m = U(nPlanes) m = U(nPlanes)
return m return m
def FullConvolutionalNetIntegratedLinear(dimension, reps, nPlanes, nClasses=-1, residual=False, downsample=[2,2], leakiness=0):
    """Build a U-Net style sparse network with a linear classifier integrated
    at every scale.

    At each level a NetworkInNetwork maps the level's features to ``nClasses``
    scores; scores from coarser levels are un-pooled back up and summed with
    scn.AddTable, so the network output is the accumulated softmax input.
    Only the coarsest level's classifier carries a bias, giving a single set
    of biases overall.

    Args:
        dimension: spatial dimensionality of the sparse tensors.
        reps: number of convolution blocks per level.
            NOTE(review): the default ``nClasses = reps[0]`` requires ``reps``
            to be indexable, while ``range(reps)`` requires an int — the
            default only works for a list-like ``reps``; confirm intended type.
        nPlanes: list of feature-plane counts, one entry per level.
        nClasses: number of output classes (default: ``reps[0]``).
        residual: use ResNet-style residual blocks instead of VGG-style.
        downsample: ``[filter_size, filter_stride]`` for the strided
            convolutions / un-pooling.
        leakiness: LeakyReLU negative slope used throughout.

    Returns:
        an ``scn.Sequential`` implementing the network.
    """
    if nClasses == -1:
        nClasses = reps[0]

    def add_blocks(m, planes):
        # `reps` convolution blocks at a fixed number of feature planes.
        for _ in range(reps):
            if residual:  # ResNet style blocks (fixed: was undefined name `residual_blocks`)
                m.add(scn.ConcatTable()
                      .add(scn.Identity())
                      .add(scn.Sequential()
                           .add(scn.BatchNormLeakyReLU(planes, leakiness=leakiness))
                           .add(scn.SubmanifoldConvolution(dimension, planes, planes, 3, False))
                           .add(scn.BatchNormLeakyReLU(planes, leakiness=leakiness))
                           .add(scn.SubmanifoldConvolution(dimension, planes, planes, 3, False)))
                      ).add(scn.AddTable())
            else:  # VGG style blocks
                m.add(scn.BatchNormLeakyReLU(planes, leakiness=leakiness)
                      ).add(scn.SubmanifoldConvolution(dimension, planes, planes, 3, False))

    def add_classifier(m, planes, bias):
        # Linear map to nClasses; scores are accumulated across scales, so
        # only one level (the coarsest) gets a set of biases.
        m.add(scn.BatchNormLeakyReLU(planes, leakiness=leakiness))
        m.add(scn.NetworkInNetwork(planes, nClasses, bias))

    def build(planes_list):
        m = scn.Sequential()
        add_blocks(m, planes_list[0])
        if len(planes_list) == 1:
            add_classifier(m, planes_list[0], True)
        else:
            # Branch a: classify at this scale (no bias).
            a = scn.Sequential()
            add_classifier(a, planes_list[0], False)  # fixed: was passed the whole list
            # Branch b: downsample, recurse, un-pool back to this scale.
            b = scn.Sequential(
                scn.BatchNormLeakyReLU(planes_list[0], leakiness=leakiness),  # fixed: was passed the whole list
                scn.Convolution(dimension, planes_list[0], planes_list[1],
                                downsample[0], downsample[1], False),
                build(planes_list[1:]),  # fixed: recursive call had wrong arity
                scn.UnPooling(dimension, downsample[0], downsample[1]))
            m.add(scn.ConcatTable().add(a).add(b))  # fixed: ConcatTable was unqualified
            m.add(scn.AddTable())
        return m  # fixed: inner builder previously returned None
    return build(nPlanes)  # fixed: previously `baz(depth, nPlanes)` with `depth` undefined
...@@ -22,17 +22,18 @@ class RandomizedStrideConvolution(Module): ...@@ -22,17 +22,18 @@ class RandomizedStrideConvolution(Module):
sparseconvnet.Deconvolution module in an UNet style network, to restore sparseconvnet.Deconvolution module in an UNet style network, to restore
the input sparsity pattern. the input sparsity pattern.
""" """
def __init__(self, dimension, nIn, nOut, filter_size, filter_stride, bias): def __init__(self, dimension, nIn, nOut, filter_size, filter_stride, bias, groups=1):
Module.__init__(self) Module.__init__(self)
self.dimension = dimension self.dimension = dimension
self.groups = groups
self.nIn = nIn self.nIn = nIn
self.nOut = nOut self.nOut = nOut
self.filter_size = toLongTensor(dimension, filter_size) self.filter_size = toLongTensor(dimension, filter_size)
self.filter_volume = self.filter_size.prod().item() self.filter_volume = self.filter_size.prod().item()
self.filter_stride = toLongTensor(dimension, filter_stride) self.filter_stride = toLongTensor(dimension, filter_stride)
std = (2.0 / nIn / self.filter_volume)**0.5 std = (2.0 * groups / nIn / self.filter_volume)**0.5
self.weight = Parameter(torch.Tensor( self.weight = Parameter(torch.Tensor(
self.filter_volume, nIn, nOut).normal_( self.filter_volume, groups, nIn//groups, nOut//groups).normal_(
0, 0,
std)) std))
if bias: if bias:
......
...@@ -4,7 +4,7 @@ ...@@ -4,7 +4,7 @@
# This source code is licensed under the BSD-style license found in the # This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree. # LICENSE file in the root directory of this source tree.
import torch import torch, torch.utils.checkpoint
class Sequential(torch.nn.Sequential): class Sequential(torch.nn.Sequential):
def input_spatial_size(self, out_size): def input_spatial_size(self, out_size):
...@@ -16,6 +16,15 @@ class Sequential(torch.nn.Sequential): ...@@ -16,6 +16,15 @@ class Sequential(torch.nn.Sequential):
self._modules[str(len(self._modules))] = module self._modules[str(len(self._modules))] = module
return self return self
def insert(self, index, module):
    """Insert ``module`` at position ``index``, shifting later modules up.

    Modules are stored in ``self._modules`` under stringified integer keys;
    entries at positions >= ``index`` are moved one slot towards the end.
    Returns ``self`` so calls can be chained (fixed: consistent with
    ``add``/``append``, which return ``self``).
    """
    # Shift [index, end) up by one, iterating from the back so entries are
    # copied before being overwritten.
    for i in range(len(self._modules), index, -1):
        self._modules[str(i)] = self._modules[str(i - 1)]
    self._modules[str(index)] = module
    return self
def append(self, module):
    """Append ``module`` at the end of the sequence; returns ``self``."""
    # Modules live in self._modules keyed by stringified integers, so the
    # next free key is simply the current length.
    position = len(self._modules)
    self._modules[str(position)] = module
    return self
def reweight(self, input): def reweight(self, input):
for module in self._modules.values(): for module in self._modules.values():
if isinstance(module, Sequential): if isinstance(module, Sequential):
...@@ -44,3 +53,9 @@ class Sequential(torch.nn.Sequential): ...@@ -44,3 +53,9 @@ class Sequential(torch.nn.Sequential):
else: else:
input = module(input) input = module(input)
return input return input
class CheckpointedSequential(Sequential):
    # A Sequential whose forward pass is gradient-checkpointed: intermediate
    # activations inside the wrapped modules are not stored and are recomputed
    # during backward, trading compute for memory.
    # NOTE(review): relies on the default behaviour of
    # torch.utils.checkpoint.checkpoint; if the input does not require grad,
    # the checkpointed segment receives no gradients — confirm callers.
    def forward(self, x):
        # Closure so that `self` is captured and only the input `x` passes
        # through the checkpoint machinery.
        def run(x):
            return Sequential.forward(self,x)
        return torch.utils.checkpoint.checkpoint(run, x)
...@@ -80,13 +80,16 @@ class Sparsify(Module): ...@@ -80,13 +80,16 @@ class Sparsify(Module):
self.net = Sequential(NetworkInNetwork(nIn,1,True),Sigmoid()) self.net = Sequential(NetworkInNetwork(nIn,1,True),Sigmoid())
else: else:
self.net = NetworkInNetwork(nIn,1,True) self.net = NetworkInNetwork(nIn,1,True)
self.threshold=0.5 if activation else 0
def forward(self,input): def forward(self,input):
if input.features.numel(): if input.features.numel():
output = SparseConvNetTensor() output = SparseConvNetTensor()
output.spatial_size = input.spatial_size output.spatial_size = input.spatial_size
output.metadata = Metadata(self.dimension) output.metadata = Metadata(self.dimension)
output.mask = self.net(input).features.view(-1) output.mask = self.net(input).features.view(-1)
active = output.mask>(0.5 if self.activation else 0) if self.threshold<0:
print(output.mask.mean(),output.mask.std())
active = output.mask>self.threshold
output.features=input.features[active] output.features=input.features[active]
active=active.cpu() active=active.cpu()
input.metadata.sparsifyMetadata( input.metadata.sparsifyMetadata(
......
...@@ -4,8 +4,6 @@ ...@@ -4,8 +4,6 @@
# This source code is licensed under the BSD-style license found in the # This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree. # LICENSE file in the root directory of this source tree.
# 'SubmanifoldConvolution == SubmanifoldConvolution'
import sparseconvnet import sparseconvnet
import sparseconvnet.SCN import sparseconvnet.SCN
from torch.autograd import Function from torch.autograd import Function
...@@ -14,16 +12,17 @@ from .utils import * ...@@ -14,16 +12,17 @@ from .utils import *
from .sparseConvNetTensor import SparseConvNetTensor from .sparseConvNetTensor import SparseConvNetTensor
class SubmanifoldConvolution(Module): class SubmanifoldConvolution(Module):
def __init__(self, dimension, nIn, nOut, filter_size, bias): def __init__(self, dimension, nIn, nOut, filter_size, bias, groups=1):
Module.__init__(self) Module.__init__(self)
self.dimension = dimension self.dimension = dimension
self.groups = groups
self.nIn = nIn self.nIn = nIn
self.nOut = nOut self.nOut = nOut
self.filter_size = toLongTensor(dimension, filter_size) self.filter_size = toLongTensor(dimension, filter_size)
self.filter_volume = self.filter_size.prod().item() self.filter_volume = self.filter_size.prod().item()
std = (2.0 / nIn / self.filter_volume)**0.5 std = (2.0 * groups / nIn / self.filter_volume)**0.5
self.weight = Parameter(torch.Tensor( self.weight = Parameter(torch.Tensor(
self.filter_volume, nIn, nOut self.filter_volume, groups, nIn//groups, nOut//groups
).normal_(0, std)) ).normal_(0, std))
if bias: if bias:
self.bias = Parameter(torch.Tensor(nOut).zero_()) self.bias = Parameter(torch.Tensor(nOut).zero_())
......
...@@ -132,7 +132,7 @@ def prepare_BLInput(l,f): ...@@ -132,7 +132,7 @@ def prepare_BLInput(l,f):
for i, (ll, ff) in enumerate(zip(l,f)): for i, (ll, ff) in enumerate(zip(l,f)):
L[i,:ll.size(0),:].copy_(ll) L[i,:ll.size(0),:].copy_(ll)
F[i,:ff.size(0),:].copy_(ff) F[i,:ff.size(0),:].copy_(ff)
return (L,F) return [L,F]
def checkpoint_restore(model,exp_name,name2,use_cuda=True,epoch=0): def checkpoint_restore(model,exp_name,name2,use_cuda=True,epoch=0):
if use_cuda: if use_cuda:
...@@ -155,8 +155,10 @@ def checkpoint_restore(model,exp_name,name2,use_cuda=True,epoch=0): ...@@ -155,8 +155,10 @@ def checkpoint_restore(model,exp_name,name2,use_cuda=True,epoch=0):
def is_power2(num): def is_power2(num):
return num != 0 and ((num & (num - 1)) == 0) return num != 0 and ((num & (num - 1)) == 0)
def has_only_one_nonzero_digit(num): #https://oeis.org/A037124 def has_only_one_nonzero_digit(num): #https://oeis.org/A037124
return num != 0 and (num/10**math.floor(math.log(num,10))).is_integer() return num != 0 and (num/10**math.floor(math.log(num,10))).is_integer()
def checkpoint_save(model,exp_name,name2,epoch, use_cuda=True): def checkpoint_save(model,exp_name,name2,epoch, use_cuda=True):
f=exp_name+'-%09d-'%epoch+name2+'.pth' f=exp_name+'-%09d-'%epoch+name2+'.pth'
model.cpu() model.cpu()
...@@ -170,24 +172,128 @@ def checkpoint_save(model,exp_name,name2,epoch, use_cuda=True): ...@@ -170,24 +172,128 @@ def checkpoint_save(model,exp_name,name2,epoch, use_cuda=True):
if not is_power2(epoch): if not is_power2(epoch):
os.remove(f) os.remove(f)
def random_rotation(dimension=3): def random_rotation(dimension=3,allow_mirror=False):
return torch.qr(torch.randn(dimension,dimension))[0] r=torch.qr(torch.randn(dimension,dimension))[0]
f=torch.randint(2,(3,))
class LayerNormLeakyReLU(torch.nn.Module): if f.sum()%2==0 and not allow_mirror:
def __init__(self,num_features,leakiness): f=1-f
torch.nn.Module.__init__(self) return r*(2*f-1).float()
self.leakiness=leakiness
self.in1d=torch.nn.LayerNorm(num_features) def squareroot_rotation(a):
def forward(self,x): import scipy.spatial
if x.features.numel(): b=scipy.spatial.transform.Slerp(
x.features=self.in1d(x.features) [0,1],
x.features=torch.nn.functional.leaky_relu(x.features,self.leakiness,inplace=True) scipy.spatial.transform.Rotation.from_dcm(torch.stack([torch.eye(3),a])))([0.5]).as_dcm()
return x return torch.from_numpy(b).float()[0]
def voxelize_pointcloud(xyz,rgb): def voxelize_pointcloud(xyz,rgb,average=True,accumulate=False):
xyz,inv,counts=np.unique(xyz.long().numpy(),axis=0,return_inverse=True,return_counts=True) if xyz.numel()==0:
return xyz, rgb
if average or accumulate:
xyz,inv,counts=np.unique(xyz.numpy(),axis=0,return_inverse=True,return_counts=True)
xyz=torch.from_numpy(xyz) xyz=torch.from_numpy(xyz)
inv=torch.from_numpy(inv) inv=torch.from_numpy(inv)
rgb_out=torch.zeros(xyz.size(0),rgb.size(1),dtype=torch.float32) rgb_out=torch.zeros(xyz.size(0),rgb.size(1),dtype=torch.float32)
rgb_out.index_add_(0,inv,rgb) rgb_out.index_add_(0,inv,rgb)
return xyz, rgb_out/torch.from_numpy(counts[:,None]).float() if average:
rgb=rgb_out/torch.from_numpy(counts[:,None]).float()
return xyz, rgb
else:
xyz,idxs=np.unique(xyz,axis=0,return_index=True)
xyz=torch.from_numpy(xyz)
rgb=rgb[idxs]
return xyz, rgb
class checkpointFunction(torch.autograd.Function):
    # Custom autograd Function implementing gradient checkpointing for sparse
    # tensors: forward runs `run_function` without building a graph; backward
    # re-runs it with grad enabled and back-propagates through the recomputed
    # graph. The metadata object is not a tensor, so it is stashed on `ctx`
    # directly (save_for_backward accepts tensors only).
    # NOTE(review): assumes `scn` (sparseconvnet) is in scope in this module
    # — confirm against the file's imports.
    @staticmethod
    def forward(ctx, run_function, x_features, x_metadata, x_spatial_size):
        ctx.run_function = run_function
        ctx.save_for_backward(x_features, x_spatial_size)
        ctx.x_metadata=x_metadata
        # Run the wrapped network without tracking history; return only the
        # raw feature tensor so autograd sees a tensor output.
        with torch.no_grad():
            y = run_function(
                scn.SparseConvNetTensor
                (x_features, x_metadata, x_spatial_size))
        return y.features
    @staticmethod
    def backward(ctx, grad_y_features):
        x_features, x_spatial_size = ctx.saved_tensors
        # Detach and re-enable grad so the recomputed forward builds a fresh
        # graph rooted at x_features.
        x_features = x_features.detach()
        x_features.requires_grad = True
        with torch.enable_grad():
            y = ctx.run_function(
                scn.SparseConvNetTensor
                (x_features, ctx.x_metadata, x_spatial_size))
        torch.autograd.backward(y.features, grad_y_features,retain_graph=False)
        # Gradient only w.r.t. x_features; run_function, metadata and
        # spatial_size are non-differentiable inputs.
        return None, x_features.grad, None, None
def checkpoint101(run_function, x, down=1):
    """Apply ``run_function`` to SparseConvNetTensor ``x`` under gradient
    checkpointing (via ``checkpointFunction``), returning a new
    SparseConvNetTensor whose spatial size is divided by ``down``."""
    out_features = checkpointFunction.apply(
        run_function, x.features, x.metadata, x.spatial_size)
    out_size = x.spatial_size // down
    return scn.SparseConvNetTensor(out_features, x.metadata, out_size)
def matplotlib_cubes(ax, positions, colors):
    """Render one unit cube per voxel on a matplotlib 3d axis.

    Args:
        ax: a matplotlib 3d axis (e.g. ``fig.gca(projection='3d')``).
        positions: (N, 3) voxel coordinates (torch tensor, ndarray or list).
        colors: (N, C) face colors, one row per voxel.

    Example::

        import matplotlib.pyplot as plt
        fig = plt.figure(figsize=(15, 15))
        ax = fig.gca(projection='3d')
        matplotlib_cubes(ax, positions, colors)
        plt.show()
    """
    from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 (registers the '3d' projection)
    from mpl_toolkits.mplot3d.art3d import Poly3DCollection
    try:
        # Accept torch tensors as well as plain arrays/lists (fixed: the
        # old `.numpy()` calls crashed on ndarray input and the bare
        # `except:` silently hid it).
        positions = np.asarray(positions, dtype=np.float32)
        colors = np.asarray(colors)
        # The six quadrilateral faces of a unit cube centred on the origin.
        faces = np.array([[[0, 1, 0], [0, 0, 0], [1, 0, 0], [1, 1, 0]],
                          [[0, 0, 0], [0, 0, 1], [1, 0, 1], [1, 0, 0]],
                          [[1, 0, 1], [1, 0, 0], [1, 1, 0], [1, 1, 1]],
                          [[0, 0, 1], [0, 0, 0], [0, 1, 0], [0, 1, 1]],
                          [[0, 1, 0], [0, 1, 1], [1, 1, 1], [1, 1, 0]],
                          [[0, 1, 1], [0, 0, 1], [1, 0, 1], [1, 1, 1]]],
                         dtype=np.float32)[None] - 0.5
        # One set of six faces per voxel; reshape is safer than in-place resize.
        quads = (faces + positions[:, None, None, :]).reshape(-1, 4, 3)
        lo = positions.min(0)
        hi = positions.max(0) + 1
        ax.set_xlim([lo[0], hi[0]])
        ax.set_ylim([lo[1], hi[1]])
        ax.set_zlim([lo[2], hi[2]])
        ax.add_collection3d(Poly3DCollection(
            quads, facecolors=np.repeat(colors, 6, axis=0)))
    except Exception as e:  # best-effort: plotting failure must not kill the caller
        print('matplotlibcubes fail!?!', e)
    ax.set_axis_off()
def matplotlib_planes(ax, positions, colors):
    """Render one x-normal unit quad per position on a matplotlib 3d axis.

    Args:
        ax: a matplotlib 3d axis (e.g. ``fig.gca(projection='3d')``).
        positions: (N, 3) coordinates (torch tensor, ndarray or list).
        colors: (N, C) face colors, one row per quad.

    Example::

        import matplotlib.pyplot as plt
        fig = plt.figure(figsize=(15, 15))
        ax = fig.gca(projection='3d')
        matplotlib_planes(ax, positions, colors)
        plt.show()
    """
    from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 (registers the '3d' projection)
    from mpl_toolkits.mplot3d.art3d import Poly3DCollection
    try:
        # Accept torch tensors as well as plain arrays/lists (fixed: the
        # old `.numpy()` calls crashed on ndarray input, silently).
        positions = np.asarray(positions, dtype=np.float32)
        colors = np.asarray(colors)
        # A single unit quad lying in the plane normal to the x axis.
        face = np.array([[[0, -0.5, 0.5], [0, -0.5, -0.5],
                          [0, 0.5, -0.5], [0, 0.5, 0.5]]],
                        dtype=np.float32)[None]
        quads = (face + positions[:, None, None, :]).reshape(-1, 4, 3)
        lo = positions.min(0)
        hi = positions.max(0) + 1
        ax.set_xlim([lo[0], hi[0]])
        ax.set_ylim([lo[1], hi[1]])
        ax.set_zlim([lo[2], hi[2]])
        ax.add_collection3d(Poly3DCollection(
            quads, facecolors=np.repeat(colors, 1, axis=0)))
    except Exception:  # best-effort: deliberately silent, as in the original
        pass
    ax.set_axis_off()
def visdom_scatter(vis, xyz, rgb, win='3d', markersize=3):
    """Scatter-plot a colored point cloud in a visdom window.

    Args:
        vis: a visdom.Visdom connection.
        xyz: (N, 3) point coordinates.
        rgb: (N, 3) per-point marker colors.
        win: visdom window id to draw into.
        markersize: marker size passed through to visdom.
    """
    plot_opts = {'markersize': markersize, 'markercolor': rgb}
    vis.scatter(xyz, opts=plot_opts, win=win)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment