Commit de3743f6 authored by Benjamin Thomas Graham

Factor out CUDA code

parent f0407b36
@@ -4,7 +4,31 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
template <typename T>
void MaxPooling_ForwardPass(T *input_features, T *output_features, Int nPlanes,
Int input_stride, Int output_stride, Int *rules,
Int nHot) {
for (Int outSite = 0; outSite < nHot; outSite++) {
Int i = rules[2 * outSite] * input_stride;
Int o = rules[2 * outSite + 1] * output_stride;
for (Int plane = 0; plane < nPlanes; plane++)
if (output_features[o + plane] < input_features[i + plane])
output_features[o + plane] = input_features[i + plane];
}
}
template <typename T>
void MaxPooling_BackwardPass(T *input_features, T *d_input_features,
T *output_features, T *d_output_features,
Int nPlanes, Int input_stride, Int output_stride,
Int *rules, Int nHot) {
for (Int outSite = 0; outSite < nHot; outSite++) {
Int i = rules[2 * outSite] * input_stride;
Int o = rules[2 * outSite + 1] * output_stride;
for (Int plane = 0; plane < nPlanes; plane++)
if (output_features[o + plane] == input_features[i + plane])
d_input_features[i + plane] += d_output_features[o + plane];
}
}
template <typename T, Int Dimension>
void cpu_MaxPooling_updateOutput(
...
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CPU_MAXPOOLING_H
#define CPU_MAXPOOLING_H
template <typename T>
void MaxPooling_ForwardPass(T *input_features, T *output_features,
Int nPlanes, Int input_stride,
Int output_stride, Int *rules, Int nHot) {
for (Int outSite = 0; outSite < nHot; outSite++) {
Int i = rules[2 * outSite] * input_stride;
Int o = rules[2 * outSite + 1] * output_stride;
for (Int plane = 0; plane < nPlanes; plane++)
if (output_features[o + plane] < input_features[i + plane])
output_features[o + plane] = input_features[i + plane];
}
}
template <typename T>
void MaxPooling_BackwardPass(T *input_features, T *d_input_features,
T *output_features, T *d_output_features,
Int nPlanes, Int input_stride,
Int output_stride, Int *rules, Int nHot) {
for (Int outSite = 0; outSite < nHot; outSite++) {
Int i = rules[2 * outSite] * input_stride;
Int o = rules[2 * outSite + 1] * output_stride;
for (Int plane = 0; plane < nPlanes; plane++)
if (output_features[o + plane] == input_features[i + plane])
d_input_features[i + plane] += d_output_features[o + plane];
}
}
#endif /* CPU_MAXPOOLING_H */
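A minimal stand-alone driver (a sketch, not part of the commit) may help read the rulebook convention used above: rules is a flat array of (input row, output row) pairs, one pair per active output site. It assumes the header above is saved as CPU/MaxPooling.h and that Int is the library's plain integer typedef.
#include <cstdio>
typedef int Int;            // assumption: matches the library's Int alias
#include "CPU/MaxPooling.h" // assumption: path of the header shown above
int main() {
  float in[2 * 3] = {1, 5, 2, 4, 0, 7};        // two input sites, three planes
  float out[1 * 3] = {-1e30f, -1e30f, -1e30f}; // "-infinity" so any input wins
  Int rules[4] = {0, 0,  // input row 0 -> output row 0
                  1, 0}; // input row 1 -> output row 0
  MaxPooling_ForwardPass<float>(in, out, /*nPlanes=*/3, /*input_stride=*/3,
                                /*output_stride=*/3, rules, /*nHot=*/2);
  printf("%g %g %g\n", out[0], out[1], out[2]); // prints: 4 5 7
}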
@@ -4,7 +4,30 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
template <typename T>
void SparseToDense_ForwardPass(T *input_features, T *output_features,
Int nPlanes, Int spatialVolume, Int *rules,
int nHot) {
for (Int outSite = 0; outSite < nHot; outSite++) {
T *i = input_features + rules[2 * outSite] * nPlanes;
T *o = output_features + rules[2 * outSite + 1];
for (Int plane = 0; plane < nPlanes; plane++)
o[plane * spatialVolume] = i[plane];
}
}
template <typename T>
void SparseToDense_BackwardPass(T *d_input_features, T *d_output_features,
Int nPlanes, Int spatialVolume, Int *rules,
int nHot) {
for (Int outSite = 0; outSite < nHot; outSite++) {
T *d_i = d_input_features + rules[2 * outSite] * nPlanes;
T *d_o = d_output_features + rules[2 * outSite + 1];
for (Int plane = 0; plane < nPlanes; plane++)
d_i[plane] = d_o[plane * spatialVolume];
}
}
template <typename T, Int Dimension>
void cpu_SparseToDense_updateOutput(
...
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CPU_SPARSETODENSE_H
#define CPU_SPARSETODENSE_H
template <typename T>
void SparseToDense_ForwardPass(T *input_features, T *output_features,
Int nPlanes, Int spatialVolume, Int *rules,
int nHot) {
for (Int outSite = 0; outSite < nHot; outSite++) {
T *i = input_features + rules[2 * outSite] * nPlanes;
T *o = output_features + rules[2 * outSite + 1];
for (Int plane = 0; plane < nPlanes; plane++)
o[plane * spatialVolume] = i[plane];
}
}
template <typename T>
void SparseToDense_BackwardPass(T *d_input_features, T *d_output_features,
Int nPlanes, Int spatialVolume, Int *rules,
int nHot) {
for (Int outSite = 0; outSite < nHot; outSite++) {
T *d_i = d_input_features + rules[2 * outSite] * nPlanes;
T *d_o = d_output_features + rules[2 * outSite + 1];
for (Int plane = 0; plane < nPlanes; plane++)
d_i[plane] = d_o[plane * spatialVolume];
}
}
#endif /* CPU_SPARSETODENSE_H */
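The indexing above implies a plane-major dense layout: the output is an [nPlanes x spatialVolume] block, rules[2k] is a sparse row index and rules[2k + 1] a flattened spatial cell (any per-sample offset is assumed to be folded into it by the rulebook). A sketch under those assumptions, with the header above saved as CPU/SparseToDense.h:
#include <cstdio>
typedef int Int;               // assumption: matches the library's Int alias
#include "CPU/SparseToDense.h" // assumption: path of the header shown above
int main() {
  float sparse[2 * 2] = {1, 2, 3, 4}; // two active sites, two planes each
  float dense[2 * 4] = {0};           // plane-major: [plane][spatial cell]
  Int rules[4] = {0, 0,  // sparse row 0 -> spatial cell 0
                  1, 3}; // sparse row 1 -> spatial cell 3
  SparseToDense_ForwardPass<float>(sparse, dense, /*nPlanes=*/2,
                                   /*spatialVolume=*/4, rules, /*nHot=*/2);
  for (int i = 0; i < 8; i++) // prints: 1 0 0 3 2 0 0 4
    printf("%g ", dense[i]);
  printf("\n");
}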
@@ -4,7 +4,28 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
template <typename T>
void UnPooling_ForwardPass(T *input_features, T *output_features, Int nPlanes,
Int input_stride, Int output_stride, Int *rules,
Int nHot) {
for (Int outSite = 0; outSite < nHot; outSite++) {
Int i = rules[2 * outSite + 1] * input_stride;
Int o = rules[2 * outSite] * output_stride;
for (Int plane = 0; plane < nPlanes; plane++)
output_features[o + plane] += input_features[i + plane];
}
}
template <typename T>
void UnPooling_BackwardPass(T *d_input_features, T *d_output_features,
Int nPlanes, Int input_stride, Int output_stride,
Int *rules, Int nHot) {
for (Int outSite = 0; outSite < nHot; outSite++) {
Int i = rules[2 * outSite + 1] * input_stride;
Int o = rules[2 * outSite] * output_stride;
for (Int plane = 0; plane < nPlanes; plane++)
d_input_features[i + plane] += d_output_features[o + plane];
}
}
template <typename T, Int Dimension>
void cpu_UnPooling_updateOutput(
...
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CPU_UNPOOLING_H
#define CPU_UNPOOLING_H
template <typename T>
void UnPooling_ForwardPass(T *input_features, T *output_features, Int nPlanes,
Int input_stride, Int output_stride, Int *rules,
Int nHot) {
for (Int outSite = 0; outSite < nHot; outSite++) {
Int i = rules[2 * outSite + 1] * input_stride;
Int o = rules[2 * outSite] * output_stride;
for (Int plane = 0; plane < nPlanes; plane++)
output_features[o + plane] += input_features[i + plane];
}
}
template <typename T>
void UnPooling_BackwardPass(T *d_input_features, T *d_output_features,
Int nPlanes, Int input_stride, Int output_stride,
Int *rules, Int nHot) {
for (Int outSite = 0; outSite < nHot; outSite++) {
Int i = rules[2 * outSite + 1] * input_stride;
Int o = rules[2 * outSite] * output_stride;
for (Int plane = 0; plane < nPlanes; plane++)
d_input_features[i + plane] += d_output_features[o + plane];
}
}
#endif /* CPU_UNPOOLING_H */
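Note that UnPooling reads the pairs with the roles swapped relative to MaxPooling (rules[2k] indexes the output, rules[2k + 1] the input) and accumulates with +=, since one pooled site can fan out to several output sites. A sketch under the same assumptions as above, with the header saved as CPU/UnPooling.h:
#include <cstdio>
typedef int Int;           // assumption: matches the library's Int alias
#include "CPU/UnPooling.h" // assumption: path of the header shown above
int main() {
  float in[1 * 2] = {3, 5}; // one pooled site, two planes
  float out[2 * 2] = {0};   // two unpooled sites, two planes
  Int rules[4] = {0, 0,  // output row 0 <- input row 0
                  1, 0}; // output row 1 <- input row 0
  UnPooling_ForwardPass<float>(in, out, /*nPlanes=*/2, /*input_stride=*/2,
                               /*output_stride=*/2, rules, /*nHot=*/2);
  printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); // prints: 3 5 3 5
}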
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
template <typename T>
void ActivePooling_ForwardPass(T *input_features, T *output_features,
Int batchSize, Int maxActive, Int nPlanes,
Int *rules, bool average);
template <typename T>
void ActivePooling_BackwardPass(T *d_input_features, T *d_output_features,
Int batchSize, Int maxActive, Int nPlanes,
Int *rules, bool average);
template <typename T, Int Dimension>
void cuda_ActivePooling_updateOutput(
/*long*/ at::Tensor inputSize, Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features, bool average) {
Int nPlanes = input_features.size(1);
auto _rules = m.getActivePoolingRuleBook(inputSize);
Int batchSize = _rules[1][0];
Int maxActive = _rules[1][1];
output_features.resize_({batchSize, nPlanes});
output_features.zero_();
auto iF = input_features.data<T>();
auto oF = output_features.data<T>();
ActivePooling_ForwardPass<T>(iF, oF, batchSize, maxActive, nPlanes,
&_rules[0][0], average);
}
template <typename T, Int Dimension>
void cuda_ActivePooling_updateGradInput(
/*long*/ at::Tensor inputSize, Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor d_input_features,
/*cuda float*/ at::Tensor d_output_features, bool average) {
Int nPlanes = input_features.size(1);
auto _rules = m.getActivePoolingRuleBook(inputSize);
Int batchSize = _rules[1][0];
Int maxActive = _rules[1][1];
d_input_features.resize_as_(input_features);
d_input_features.zero_();
auto diF = d_input_features.data<T>();
auto doF = d_output_features.data<T>();
ActivePooling_BackwardPass<T>(diF, doF, batchSize, maxActive, nPlanes,
&_rules[0][0], average);
}
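The active-pooling rulebook layout, as inferred from the kernels that follow, is one row of (maxActive + 1) Ints per batch sample: the first entry is the sample's active-site count, the rest are input row indices. A CPU reference sketch of the forward reduction (an illustration under those assumptions, not library code):
#include <cstdio>
typedef int Int; // assumption: matches the library's Int alias
template <typename T>
void ActivePooling_ForwardPass_cpuRef(T *in, T *out, Int batchSize,
                                      Int maxActive, Int nPlanes, Int *rules,
                                      bool average) {
  for (Int b = 0; b < batchSize; b++) {
    Int *r = rules + b * (maxActive + 1); // row b: [nActive, site indices...]
    Int nActive = *r++;
    T mul = (average && nActive > 0) ? T(1) / nActive : T(1);
    for (Int a = 0; a < nActive; a++, r++)
      for (Int p = 0; p < nPlanes; p++)
        out[b * nPlanes + p] += in[*r * nPlanes + p] * mul;
  }
}
int main() {
  float in[3 * 2] = {1, 2, 3, 4, 5, 6}; // three sites, two planes
  float out[1 * 2] = {0, 0};
  Int rules[3] = {2, 0, 2}; // sample 0: two active sites, rows 0 and 2
  ActivePooling_ForwardPass_cpuRef<float>(in, out, 1, 2, 2, rules, true);
  printf("%g %g\n", out[0], out[1]); // prints: 3 4 (mean of rows 0 and 2)
}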
@@ -4,64 +4,73 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
template <typename T>
__global__ void ActivePooling_fp(T *input_features, T *output_features,
                                 Int maxActive, Int nPlanes, Int *rules,
                                 bool average) {
  T *out = &output_features[blockIdx.x * nPlanes];
  Int *r = &rules[blockIdx.x * (maxActive + 1)];
  Int nActive = *r++;
  T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
  while (nActive-- > 0) {
    T *inp = &input_features[(*r++) * nPlanes];
    for (Int plane = threadIdx.x; plane < nPlanes; plane += 32)
      out[plane] += inp[plane] * multiplier;
  }
}
template <typename T>
void ActivePooling_ForwardPass(T *input_features, T *output_features,
                               Int batchSize, Int maxActive, Int nPlanes,
                               Int *rules, bool average) {
  auto rulesBuffer = at::CUDA(at_kINT).tensor({1 << 22});
  Int *rb = rulesBuffer.data<Int>();
  Int rowBatchSize = std::min((Int)32768, (1 << 22) / (maxActive + 1));
  assert(rowBatchSize > 0);
  Int kernelBlockDim = std::min(nPlanes, (Int)32);
  for (Int o = 0; o < batchSize; o += rowBatchSize) {
    Int batchSize_ = std::min(rowBatchSize, (Int(batchSize - o)));
    cudaMemcpy(rb, rules + o * (maxActive + 1),
               sizeof(Int) * (maxActive + 1) * batchSize_,
               cudaMemcpyHostToDevice);
    ActivePooling_fp<T><<<batchSize_, kernelBlockDim>>>(
        input_features, output_features + o * nPlanes, maxActive, nPlanes, rb,
        average);
  }
}
template <typename T>
__global__ void ActivePooling_bp(T *d_input_features, T *d_output_features,
                                 Int maxActive, Int nPlanes, Int *rules,
                                 bool average) {
  T *out = &d_output_features[blockIdx.x * nPlanes];
  Int *r = &rules[blockIdx.x * (maxActive + 1)];
  Int nActive = *r++;
  T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
  while (nActive-- > 0) {
    T *inp = &d_input_features[(*r++) * nPlanes];
    for (Int plane = threadIdx.x; plane < nPlanes; plane += 32)
      inp[plane] = out[plane] * multiplier;
  }
}
template <typename T>
void ActivePooling_BackwardPass(T *d_input_features, T *d_output_features,
                                Int batchSize, Int maxActive, Int nPlanes,
                                Int *rules, bool average) {
  auto rulesBuffer = at::CUDA(at_kINT).tensor({1 << 22});
  Int *rb = rulesBuffer.data<Int>();
  Int rowBatchSize = std::min((Int)32768, (1 << 22) / (maxActive + 1));
  assert(rowBatchSize > 0);
  Int kernelBlockDim = std::min(nPlanes, (Int)32);
  for (Int o = 0; o < batchSize; o += rowBatchSize) {
    Int batchSize_ = std::min(rowBatchSize, (Int(batchSize - o)));
    cudaMemcpy(rb, rules + o * (maxActive + 1),
               sizeof(Int) * (maxActive + 1) * batchSize_,
               cudaMemcpyHostToDevice);
    ActivePooling_bp<T><<<batchSize_, kernelBlockDim>>>(
        d_input_features, d_output_features + o * nPlanes, maxActive, nPlanes,
        rb, average);
  }
}
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CUDA_ACTIVEPOOLING_H
#define CUDA_ACTIVEPOOLING_H
template <typename T>
__global__ void ActivePooling_fp(T *input_features, T *output_features,
Int maxActive, Int nPlanes, Int *rules,
bool average) {
T *out = &output_features[blockIdx.x * nPlanes];
Int *r = &rules[blockIdx.x * (maxActive + 1)];
Int nActive = *r++;
T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
while (nActive-- > 0) {
T *inp = &input_features[(*r++) * nPlanes];
for (Int plane = threadIdx.x; plane < nPlanes; plane += 32)
out[plane] += inp[plane] * multiplier;
}
}
template <typename T>
void ActivePooling_ForwardPass(T *input_features, T *output_features,
Int batchSize, Int maxActive, Int nPlanes,
Int *rules, bool average) {
Int kernelBlockDim = std::min(nPlanes, (Int)32);
ActivePooling_fp<T><<<batchSize, kernelBlockDim>>>(
input_features, output_features, maxActive, nPlanes, rules, average);
}
template <typename T>
__global__ void ActivePooling_bp(T *d_input_features, T *d_output_features,
Int maxActive, Int nPlanes, Int *rules,
bool average) {
T *out = &d_output_features[blockIdx.x * nPlanes];
Int *r = &rules[blockIdx.x * (maxActive + 1)];
Int nActive = *r++;
T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
while (nActive-- > 0) {
T *inp = &d_input_features[(*r++) * nPlanes];
for (Int plane = threadIdx.x; plane < nPlanes; plane += 32)
inp[plane] = out[plane] * multiplier;
}
}
template <typename T>
void ActivePooling_BackwardPass(T *d_input_features, T *d_output_features,
Int batchSize, Int maxActive, Int nPlanes,
Int *rules, bool average) {
Int kernelBlockDim = std::min(nPlanes, (Int)32);
ActivePooling_bp<T><<<batchSize, kernelBlockDim>>>(
d_input_features, d_output_features, maxActive, nPlanes, rules, average);
}
#endif /* CUDA_ACTIVEPOOLING_H */
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
// check if A+B is faster than just B
// check if loading affineBias into shared memory is faster than loading
// multiple times (if not try 64,16 backwards case)
template <typename T>
void dAffineReluTrivialConvolution_forward(T *inFeatures, T *outFeatures,
T *affineWeight, T *affineBias,
T *convWeight, Int input_nPlanes,
Int input_stride, Int output_nPlanes,
Int output_stride, Int nActive);
template <typename T>
void dAffineReluTrivialConvolution_backward_dW(
T *inFeatures, T *dInFeatures, T *dOutFeatures, T *affineWeight,
T *dAffineWeight, T *affineBias, T *dAffineBias, T *convWeight,
T *dConvWeight, Int input_nPlanes, Int input_stride, Int output_nPlanes,
Int output_stride, Int nActive, bool additiveGrad);
template <typename T>
double cuda_AffineReluTrivialConvolution_updateOutput(
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features,
/*cuda float*/ at::Tensor affineWeight,
/*cuda float*/ at::Tensor affineBias,
/*cuda float*/ at::Tensor convWeight) {
output_features.resize_({input_features.size(0), convWeight.size(1)});
dAffineReluTrivialConvolution_forward<T>(
input_features.data<T>(), output_features.data<T>(),
affineWeight.data<T>(), affineBias.data<T>(), convWeight.data<T>(),
convWeight.size(0), input_features.stride(0), convWeight.size(1),
output_features.size(1), input_features.size(0));
return input_features.size(0) * input_features.size(1) *
output_features.size(1);
}
template <typename T>
void cuda_AffineReluTrivialConvolution_backward(
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor d_input_features,
/*cuda float*/ at::Tensor d_output_features,
/*cuda float*/ at::Tensor affineWeight,
/*cuda float*/ at::Tensor d_affineWeight,
/*cuda float*/ at::Tensor affineBias,
/*cuda float*/ at::Tensor d_affineBias,
/*cuda float*/ at::Tensor convWeight,
/*cuda float*/ at::Tensor d_convWeight, bool additiveGrad) {
d_input_features.resize_as_(input_features);
dAffineReluTrivialConvolution_backward_dW<T>(
input_features.data<T>(), d_input_features.data<T>(),
d_output_features.data<T>(), affineWeight.data<T>(),
d_affineWeight.data<T>(), affineBias.data<T>(), d_affineBias.data<T>(),
convWeight.data<T>(), d_convWeight.data<T>(), convWeight.size(0),
input_features.stride(0), convWeight.size(1), d_output_features.stride(0),
input_features.size(0), additiveGrad);
}
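Functionally, the kernels declared above compute out = relu(in * affineWeight + affineBias) x convWeight: a per-plane affine and ReLU fused with a "trivial" (1x1) convolution, i.e. a plain matrix product with convWeight stored [input_nPlanes x output_nPlanes] row-major. A dense-stride CPU sketch of that contract (illustration only; strides and layouts are assumptions stated in the comments):
#include <cstdio>
typedef int Int; // assumption: matches the library's Int alias
template <typename T>
void affineReluTrivialConv_cpuRef(T *in, T *out, T *aw, T *ab, T *cw,
                                  Int inPlanes, Int outPlanes, Int nActive) {
  // assumes dense strides; cw[i * outPlanes + o] matches the kernels'
  // convWeight[k * output_nPlanes + tx] indexing
  for (Int r = 0; r < nActive; r++)
    for (Int o = 0; o < outPlanes; o++) {
      T acc = 0;
      for (Int i = 0; i < inPlanes; i++) {
        T x = in[r * inPlanes + i] * aw[i] + ab[i];     // affine
        acc += (x > 0 ? x : 0) * cw[i * outPlanes + o]; // ReLU, then 1x1 conv
      }
      out[r * outPlanes + o] = acc;
    }
}
int main() {
  float in[2] = {1, -1}, aw[2] = {1, 1}, ab[2] = {0, 0};
  float cw[2] = {2, 3}; // two input planes -> one output plane
  float out[1];
  affineReluTrivialConv_cpuRef<float>(in, out, aw, ab, cw, 2, 1, 1);
  printf("%g\n", out[0]); // prints: 2 (the ReLU kills the second term)
}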
@@ -4,44 +4,432 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
// check if A+B is faster than just B
// check if loading affineBias into shared memory is faster than loading
// multiple times (if not try 64,16 backwards case)
template <typename T, Int K, Int V>
__global__ void dAffineReluTrivialConvolution_forwardA(
T *inFeatures, T *outFeatures, T *affineWeight, T *affineBias,
T *convWeight, Int input_nPlanes, Int input_stride, Int output_nPlanes,
Int output_stride, Int nActive) {
// nActive must be a multiple of K!!
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// K is a multiple of V,
// nActive x KM -> nActive x KN - parallel over N,nActive - loop over M
Int M = input_nPlanes / K;
// N = gridDim.y == output_nPlanes/K
Int n = blockIdx.y;
outFeatures += n * K;
convWeight += n * K;
T O[V];
__shared__ T I[K][K];
__shared__ T AW[K];
__shared__ T AB[K];
__shared__ T CW[K][K];
const Int tx = threadIdx.x;
int ty[V];
#pragma unroll
for (int v = 0; v < V; v++)
ty[v] = threadIdx.y + v * (K / V);
for (int m = 0; m < M; m++) {
// Read affineWeight, affineBias and convWeight
if (ty[0] == 0) {
AW[tx] = affineWeight[tx];
AB[tx] = affineBias[tx];
}
#pragma unroll
for (int v = 0; v < V; v++)
CW[ty[v]][tx] = convWeight[ty[v] * output_nPlanes + tx];
__syncthreads();
for (Int s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
// Read input, do affine + relu, set O[]
#pragma unroll
for (int v = 0; v < V; v++) {
T i = inFeatures[(s + ty[v]) * input_stride + tx] * AW[tx] + AB[tx];
I[ty[v]][tx] = (i > 0) ? i : 0;
if (m == 0) {
O[v] = 0;
} else {
O[v] = outFeatures[(s + ty[v]) * output_stride + tx];
}
}
__syncthreads();
#pragma unroll
for (int k = 0; k < K; k++)
#pragma unroll
for (int v = 0; v < V; v++)
O[v] += I[ty[v]][k] * CW[k][tx];
#pragma unroll
for (int v = 0; v < V; v++)
outFeatures[(s + ty[v]) * output_stride + tx] = O[v];
__syncthreads();
}
affineWeight += K;
affineBias += K;
convWeight += K * output_nPlanes;
inFeatures += K;
}
}
template <typename T, Int K, Int V>
__global__ void dAffineReluTrivialConvolution_forwardB(
T *inFeatures, T *outFeatures, T *affineWeight, T *affineBias,
T *convWeight, Int input_nPlanes, Int input_stride, Int output_nPlanes,
Int output_stride, Int nActive) {
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// K is a multiple of V,
// nActive x KM -> nActive x KN - parallel over N,nActive - loop over M
Int M = input_nPlanes / K;
// N = gridDim.y == output_nPlanes/K
Int n = blockIdx.y;
outFeatures += n * K;
convWeight += n * K;
T O[V];
__shared__ T I[K][K]; // zz try K+1 trick A+B+backwards
__shared__ T AW[K];
__shared__ T AB[K];
__shared__ T CW[K][K];
const Int tx = threadIdx.x;
int ty[V];
#pragma unroll
for (int v = 0; v < V; v++)
ty[v] = threadIdx.y + v * (K / V);
for (int m = 0; m < M; m++) {
// Read affineWeight, affineBias and convWeight
if (ty[0] == 0) {
AW[tx] = affineWeight[tx];
AB[tx] = affineBias[tx];
}
#pragma unroll
for (int v = 0; v < V; v++)
CW[ty[v]][tx] = convWeight[ty[v] * output_nPlanes + tx];
__syncthreads();
for (Int s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
// Read input, do affine + relu, set O[]
#pragma unroll
for (int v = 0; v < V; v++) {
if (s + ty[v] < nActive) {
T i = inFeatures[(s + ty[v]) * input_stride + tx] * AW[tx] + AB[tx];
I[ty[v]][tx] = (i > 0) ? i : 0;
if (m == 0) {
O[v] = 0;
} else {
O[v] = outFeatures[(s + ty[v]) * output_stride + tx];
}
}
}
__syncthreads();
#pragma unroll
for (int k = 0; k < K; k++)
#pragma unroll
for (int v = 0; v < V; v++)
O[v] += I[ty[v]][k] * CW[k][tx];
#pragma unroll
for (int v = 0; v < V; v++)
if (s + ty[v] < nActive)
outFeatures[(s + ty[v]) * output_stride + tx] = O[v];
__syncthreads();
}
affineWeight += K;
affineBias += K;
convWeight += K * output_nPlanes;
inFeatures += K;
}
}
#define FOO(T, K, V) \
{ \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
Int o = (nActive / K) * K; \
if (o > 0) \
dAffineReluTrivialConvolution_forwardA< \
T, K, V><<<dim3(std::min(o / K, (Int)512), output_nPlanes / K), \
dim3(K, K / V)>>>( \
inFeatures, outFeatures, affineWeight, affineBias, convWeight, \
input_nPlanes, input_stride, output_nPlanes, output_stride, o); \
if (nActive > o) \
dAffineReluTrivialConvolution_forwardB< \
T, K, V><<<dim3(1, output_nPlanes / K), dim3(K, K / V)>>>( \
inFeatures + o * input_stride, outFeatures + o * output_stride, \
affineWeight, affineBias, convWeight, input_nPlanes, input_stride, \
output_nPlanes, output_stride, nActive - o); \
return; \
} \
}
template <typename T>
void dAffineReluTrivialConvolution_forward(T *inFeatures, T *outFeatures,
                                           T *affineWeight, T *affineBias,
                                           T *convWeight, Int input_nPlanes,
                                           Int input_stride, Int output_nPlanes,
                                           Int output_stride, Int nActive) {
  FOO(T, 64, 16)
  FOO(T, 32, 8)
  FOO(T, 16, 4)
  FOO(T, 8, 2)
  assert(false);
}
template <>
void dAffineReluTrivialConvolution_forward<double>(
    double *inFeatures, double *outFeatures, double *affineWeight,
    double *affineBias, double *convWeight, Int input_nPlanes, Int input_stride,
    Int output_nPlanes, Int output_stride, Int nActive) {
  FOO(double, 32, 8)
  FOO(double, 16, 4)
  FOO(double, 8, 2)
  assert(false);
}
#undef FOO
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template <typename T, Int K, Int V>
__global__ void dAffineReluTrivialConvolution_backward_dW_A(
T *inFeatures, T *dInFeatures, T *dOutFeatures, T *affineWeight,
T *dAffineWeight, T *affineBias, T *dAffineBias, T *convWeight,
T *dConvWeight, Int input_nPlanes, Int input_stride, Int output_nPlanes,
Int output_stride, Int nActive, bool additiveGrad) {
// M = gridDim.y == input_nPlanes / K
Int N = output_nPlanes / K;
Int m = blockIdx.y;
inFeatures += m * K;
dInFeatures += m * K;
convWeight += m * K * output_nPlanes;
dConvWeight += m * K * output_nPlanes;
affineWeight += m * K;
dAffineWeight += m * K;
affineBias += m * K;
dAffineBias += m * K;
T dI[V];
T dCW[V];
T i[V];
T dAW = 0;
T dAB = 0;
__shared__ T I[K][K];
__shared__ T dO[K][K];
__shared__ T AW[K];
__shared__ T AB[K];
__shared__ T CW[K][K];
const Int tx = threadIdx.x;
int ty[V];
#pragma unroll
for (int v = 0; v < V; v++)
ty[v] = threadIdx.y + v * (K / V);
if (ty[0] == 0) {
AW[tx] = affineWeight[tx];
AB[tx] = affineBias[tx];
}
for (int n = 0; n < N; n++) {
// Read w, reset dW
#pragma unroll
for (int v = 0; v < V; v++) {
CW[ty[v]][tx] = convWeight[ty[v] * output_nPlanes + tx];
dCW[v] = 0;
}
__syncthreads();
for (Int s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
#pragma unroll
for (int v = 0; v < V; v++)
dI[v] = 0;
__syncthreads();
// Read input and dOutput
#pragma unroll
for (int v = 0; v < V; v++) {
T i_ = inFeatures[(s + ty[v]) * input_stride + tx];
i[v] = i_;
i_ = i_ * AW[tx] + AB[tx];
I[ty[v]][tx] = (i_ > 0) ? i_ : 0;
dO[ty[v]][tx] = dOutFeatures[(s + ty[v]) * output_stride + tx];
}
__syncthreads();
#pragma unroll
for (int k = 0; k < K; k++)
#pragma unroll
for (int v = 0; v < V; v++) {
dI[v] += dO[ty[v]][k] * CW[tx][k];
dCW[v] += I[k][ty[v]] * dO[k][tx];
}
#pragma unroll
for (int v = 0; v < V; v++) {
dI[v] = (I[ty[v]][tx] > 0) ? dI[v] : 0;
dAW += i[v] * dI[v];
dAB += dI[v];
if (additiveGrad)
dInFeatures[(s + ty[v]) * input_stride + tx] += dI[v];
else
dInFeatures[(s + ty[v]) * input_stride + tx] = dI[v];
}
__syncthreads();
}
#pragma unroll
for (int v = 0; v < V; v++)
atomicAdd(&dConvWeight[ty[v] * output_nPlanes + tx], dCW[v]);
convWeight += K;
dConvWeight += K;
dOutFeatures += K;
__syncthreads();
}
atomicAdd(&dAffineWeight[tx], dAW);
atomicAdd(&dAffineBias[tx], dAB);
}
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template <typename T, Int K, Int V>
__global__ void dAffineReluTrivialConvolution_backward_dW_B(
T *inFeatures, T *dInFeatures, T *dOutFeatures, T *affineWeight,
T *dAffineWeight, T *affineBias, T *dAffineBias, T *convWeight,
T *dConvWeight, Int input_nPlanes, Int input_stride, Int output_nPlanes,
Int output_stride, Int nActive, bool additiveGrad) {
// M = gridDim.y == input_nPlanes / K
Int N = output_nPlanes / K;
Int m = blockIdx.y;
inFeatures += m * K;
dInFeatures += m * K;
convWeight += m * K * output_nPlanes;
dConvWeight += m * K * output_nPlanes;
affineWeight += m * K;
dAffineWeight += m * K;
affineBias += m * K;
dAffineBias += m * K;
T dI[V];
T dCW[V];
T i[V];
T dAW = 0;
T dAB = 0;
__shared__ T I[K][K];
__shared__ T dO[K][K];
__shared__ T AW[K];
__shared__ T AB[K];
__shared__ T CW[K][K];
const Int tx = threadIdx.x;
int ty[V];
#pragma unroll
for (int v = 0; v < V; v++)
ty[v] = threadIdx.y + v * (K / V);
if (ty[0] == 0) {
AW[tx] = affineWeight[tx];
AB[tx] = affineBias[tx];
}
for (int n = 0; n < N; n++) {
// Read w, reset dW
#pragma unroll
for (int v = 0; v < V; v++) {
CW[ty[v]][tx] = convWeight[ty[v] * output_nPlanes + tx];
dCW[v] = 0;
}
__syncthreads();
for (Int s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
#pragma unroll
for (int v = 0; v < V; v++)
dI[v] = 0;
__syncthreads();
// Read input and dOutput
#pragma unroll
for (int v = 0; v < V; v++)
if (s + ty[v] < nActive) {
T i_ = inFeatures[(s + ty[v]) * input_stride + tx];
i[v] = i_;
i_ = i_ * AW[tx] + AB[tx];
I[ty[v]][tx] = (i_ > 0) ? i_ : 0;
dO[ty[v]][tx] = dOutFeatures[(s + ty[v]) * output_stride + tx];
} else {
i[v] = 0;
I[ty[v]][tx] = 0;
dO[ty[v]][tx] = 0;
}
__syncthreads();
#pragma unroll
for (int k = 0; k < K; k++)
#pragma unroll
for (int v = 0; v < V; v++) {
dI[v] += dO[ty[v]][k] * CW[tx][k];
dCW[v] += I[k][ty[v]] * dO[k][tx];
}
#pragma unroll
for (int v = 0; v < V; v++)
if (s + ty[v] < nActive) {
dI[v] = (I[ty[v]][tx] > 0) ? dI[v] : 0;
dAW += i[v] * dI[v];
dAB += dI[v];
if (additiveGrad)
dInFeatures[(s + ty[v]) * input_stride + tx] += dI[v];
else
dInFeatures[(s + ty[v]) * input_stride + tx] = dI[v];
}
__syncthreads();
}
#pragma unroll
for (int v = 0; v < V; v++)
atomicAdd(&dConvWeight[ty[v] * output_nPlanes + tx], dCW[v]);
convWeight += K;
dConvWeight += K;
dOutFeatures += K;
__syncthreads();
}
atomicAdd(&dAffineWeight[tx], dAW);
atomicAdd(&dAffineBias[tx], dAB);
}
#define FOO(T, K, V) \
{ \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
Int o = (nActive / K) * K; \
if (o > 0) \
dAffineReluTrivialConvolution_backward_dW_A< \
T, K, V><<<dim3(std::min(o / K, (Int)512), input_nPlanes / K), \
dim3(K, K / V)>>>( \
inFeatures, dInFeatures, dOutFeatures, affineWeight, \
dAffineWeight, affineBias, dAffineBias, convWeight, dConvWeight, \
input_nPlanes, input_stride, output_nPlanes, output_stride, o, \
additiveGrad); \
if (nActive > o) \
dAffineReluTrivialConvolution_backward_dW_B< \
T, K, V><<<dim3(1, input_nPlanes / K), dim3(K, K / V)>>>( \
inFeatures + o * input_stride, dInFeatures + o * input_stride, \
dOutFeatures + o * output_stride, affineWeight, dAffineWeight, \
affineBias, dAffineBias, convWeight, dConvWeight, input_nPlanes, \
input_stride, output_nPlanes, output_stride, nActive - o, \
additiveGrad); \
return; \
} \
}
template <typename T>
void dAffineReluTrivialConvolution_backward_dW(
    T *inFeatures, T *dInFeatures, T *dOutFeatures, T *affineWeight,
    T *dAffineWeight, T *affineBias, T *dAffineBias, T *convWeight,
    T *dConvWeight, Int input_nPlanes, Int input_stride, Int output_nPlanes,
    Int output_stride, Int nActive, bool additiveGrad) {
  FOO(T, 32, 8)
  FOO(T, 16, 4)
  FOO(T, 8, 2)
}
#undef FOO
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CUDA_AFFINERELUTRIVIALCONVOLUTION_H
#define CUDA_AFFINERELUTRIVIALCONVOLUTION_H
// check if A+B is faster than just B
// check if loading affineBias into shared memory is faster than loading
// multiple times (if not try 64,16 backwards case)
template <typename T, Int K, Int V>
__global__ void dAffineReluTrivialConvolution_forwardA(
T *inFeatures, T *outFeatures, T *affineWeight, T *affineBias,
T *convWeight, Int input_nPlanes, Int input_stride, Int output_nPlanes,
Int output_stride, Int nActive) {
// nActive must be a multiple of K!!
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// K is a multiple of V,
// nActive x KM -> nActive x KN - parallel over N,nActive - loop over M
Int M = input_nPlanes / K;
// N = gridDim.y == output_nPlanes/K
Int n = blockIdx.y;
outFeatures += n * K;
convWeight += n * K;
T O[V];
__shared__ T I[K][K];
__shared__ T AW[K];
__shared__ T AB[K];
__shared__ T CW[K][K];
const Int tx = threadIdx.x;
int ty[V];
#pragma unroll
for (int v = 0; v < V; v++)
ty[v] = threadIdx.y + v * (K / V);
for (int m = 0; m < M; m++) {
// Read affineWeight, affineBias and convWeight
if (ty[0] == 0) {
AW[tx] = affineWeight[tx];
AB[tx] = affineBias[tx];
}
#pragma unroll
for (int v = 0; v < V; v++)
CW[ty[v]][tx] = convWeight[ty[v] * output_nPlanes + tx];
__syncthreads();
for (Int s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
// Read input, do affine + relu, set O[]
#pragma unroll
for (int v = 0; v < V; v++) {
T i = inFeatures[(s + ty[v]) * input_stride + tx] * AW[tx] + AB[tx];
I[ty[v]][tx] = (i > 0) ? i : 0;
if (m == 0) {
O[v] = 0;
} else {
O[v] = outFeatures[(s + ty[v]) * output_stride + tx];
}
}
__syncthreads();
#pragma unroll
for (int k = 0; k < K; k++)
#pragma unroll
for (int v = 0; v < V; v++)
O[v] += I[ty[v]][k] * CW[k][tx];
#pragma unroll
for (int v = 0; v < V; v++)
outFeatures[(s + ty[v]) * output_stride + tx] = O[v];
__syncthreads();
}
affineWeight += K;
affineBias += K;
convWeight += K * output_nPlanes;
inFeatures += K;
}
}
template <typename T, Int K, Int V>
__global__ void dAffineReluTrivialConvolution_forwardB(
T *inFeatures, T *outFeatures, T *affineWeight, T *affineBias,
T *convWeight, Int input_nPlanes, Int input_stride, Int output_nPlanes,
Int output_stride, Int nActive) {
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// K is a multiple of V,
// nActive x KM -> nActive x KN - parallel over N,nActive - loop over M
Int M = input_nPlanes / K;
// N = gridDim.y == output_nPlanes/K
Int n = blockIdx.y;
outFeatures += n * K;
convWeight += n * K;
T O[V];
__shared__ T I[K][K]; // zz try K+1 trick A+B+backwards
__shared__ T AW[K];
__shared__ T AB[K];
__shared__ T CW[K][K];
const Int tx = threadIdx.x;
int ty[V];
#pragma unroll
for (int v = 0; v < V; v++)
ty[v] = threadIdx.y + v * (K / V);
for (int m = 0; m < M; m++) {
// Read affineWeight, affineBias and convWeight
if (ty[0] == 0) {
AW[tx] = affineWeight[tx];
AB[tx] = affineBias[tx];
}
#pragma unroll
for (int v = 0; v < V; v++)
CW[ty[v]][tx] = convWeight[ty[v] * output_nPlanes + tx];
__syncthreads();
for (Int s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
// Read input, do affine + relu, set O[]
#pragma unroll
for (int v = 0; v < V; v++) {
if (s + ty[v] < nActive) {
T i = inFeatures[(s + ty[v]) * input_stride + tx] * AW[tx] + AB[tx];
I[ty[v]][tx] = (i > 0) ? i : 0;
if (m == 0) {
O[v] = 0;
} else {
O[v] = outFeatures[(s + ty[v]) * output_stride + tx];
}
}
}
__syncthreads();
#pragma unroll
for (int k = 0; k < K; k++)
#pragma unroll
for (int v = 0; v < V; v++)
O[v] += I[ty[v]][k] * CW[k][tx];
#pragma unroll
for (int v = 0; v < V; v++)
if (s + ty[v] < nActive)
outFeatures[(s + ty[v]) * output_stride + tx] = O[v];
__syncthreads();
}
affineWeight += K;
affineBias += K;
convWeight += K * output_nPlanes;
inFeatures += K;
}
}
#define FOO(T, K, V) \
{ \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
Int o = (nActive / K) * K; \
if (o > 0) \
dAffineReluTrivialConvolution_forwardA< \
T, K, V><<<dim3(std::min(o / K, (Int)512), output_nPlanes / K), \
dim3(K, K / V)>>>( \
inFeatures, outFeatures, affineWeight, affineBias, convWeight, \
input_nPlanes, input_stride, output_nPlanes, output_stride, o); \
if (nActive > o) \
dAffineReluTrivialConvolution_forwardB< \
T, K, V><<<dim3(1, output_nPlanes / K), dim3(K, K / V)>>>( \
inFeatures + o * input_stride, outFeatures + o * output_stride, \
affineWeight, affineBias, convWeight, input_nPlanes, input_stride, \
output_nPlanes, output_stride, nActive - o); \
return; \
} \
}
template <typename T>
void dAffineReluTrivialConvolution_forward(T *inFeatures, T *outFeatures,
T *affineWeight, T *affineBias,
T *convWeight, Int input_nPlanes,
Int input_stride,
Int output_nPlanes,
Int output_stride, Int nActive) {
FOO(T, 64, 16)
FOO(T, 32, 8)
FOO(T, 16, 4)
FOO(T, 8, 2)
assert(false);
}
template <>
void dAffineReluTrivialConvolution_forward<double>(
double *inFeatures, double *outFeatures, double *affineWeight,
double *affineBias, double *convWeight, Int input_nPlanes,
Int input_stride, Int output_nPlanes, Int output_stride, Int nActive) {
FOO(double, 32, 8)
FOO(double, 16, 4)
FOO(double, 8, 2)
assert(false);
}
#undef FOO
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template <typename T, Int K, Int V>
__global__ void dAffineReluTrivialConvolution_backward_dW_A(
T *inFeatures, T *dInFeatures, T *dOutFeatures, T *affineWeight,
T *dAffineWeight, T *affineBias, T *dAffineBias, T *convWeight,
T *dConvWeight, Int input_nPlanes, Int input_stride, Int output_nPlanes,
Int output_stride, Int nActive, bool additiveGrad) {
// M = gridDim.y == input_nPlanes / K
Int N = output_nPlanes / K;
Int m = blockIdx.y;
inFeatures += m * K;
dInFeatures += m * K;
convWeight += m * K * output_nPlanes;
dConvWeight += m * K * output_nPlanes;
affineWeight += m * K;
dAffineWeight += m * K;
affineBias += m * K;
dAffineBias += m * K;
T dI[V];
T dCW[V];
T i[V];
T dAW = 0;
T dAB = 0;
__shared__ T I[K][K];
__shared__ T dO[K][K];
__shared__ T AW[K];
__shared__ T AB[K];
__shared__ T CW[K][K];
const Int tx = threadIdx.x;
int ty[V];
#pragma unroll
for (int v = 0; v < V; v++)
ty[v] = threadIdx.y + v * (K / V);
if (ty[0] == 0) {
AW[tx] = affineWeight[tx];
AB[tx] = affineBias[tx];
}
for (int n = 0; n < N; n++) {
// Read w, reset dW
#pragma unroll
for (int v = 0; v < V; v++) {
CW[ty[v]][tx] = convWeight[ty[v] * output_nPlanes + tx];
dCW[v] = 0;
}
__syncthreads();
for (Int s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
#pragma unroll
for (int v = 0; v < V; v++)
dI[v] = 0;
__syncthreads();
// Read input and dOutput
#pragma unroll
for (int v = 0; v < V; v++) {
T i_ = inFeatures[(s + ty[v]) * input_stride + tx];
i[v] = i_;
i_ = i_ * AW[tx] + AB[tx];
I[ty[v]][tx] = (i_ > 0) ? i_ : 0;
dO[ty[v]][tx] = dOutFeatures[(s + ty[v]) * output_stride + tx];
}
__syncthreads();
#pragma unroll
for (int k = 0; k < K; k++)
#pragma unroll
for (int v = 0; v < V; v++) {
dI[v] += dO[ty[v]][k] * CW[tx][k];
dCW[v] += I[k][ty[v]] * dO[k][tx];
}
#pragma unroll
for (int v = 0; v < V; v++) {
dI[v] = (I[ty[v]][tx] > 0) ? dI[v] : 0;
dAW += i[v] * dI[v];
dAB += dI[v];
if (additiveGrad)
dInFeatures[(s + ty[v]) * input_stride + tx] += dI[v];
else
dInFeatures[(s + ty[v]) * input_stride + tx] = dI[v];
}
__syncthreads();
}
#pragma unroll
for (int v = 0; v < V; v++)
atomicAdd(&dConvWeight[ty[v] * output_nPlanes + tx], dCW[v]);
convWeight += K;
dConvWeight += K;
dOutFeatures += K;
__syncthreads();
}
atomicAdd(&dAffineWeight[tx], dAW);
atomicAdd(&dAffineBias[tx], dAB);
}
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template <typename T, Int K, Int V>
__global__ void dAffineReluTrivialConvolution_backward_dW_B(
T *inFeatures, T *dInFeatures, T *dOutFeatures, T *affineWeight,
T *dAffineWeight, T *affineBias, T *dAffineBias, T *convWeight,
T *dConvWeight, Int input_nPlanes, Int input_stride, Int output_nPlanes,
Int output_stride, Int nActive, bool additiveGrad) {
// M = gridDim.y == input_nPlanes / K
Int N = output_nPlanes / K;
Int m = blockIdx.y;
inFeatures += m * K;
dInFeatures += m * K;
convWeight += m * K * output_nPlanes;
dConvWeight += m * K * output_nPlanes;
affineWeight += m * K;
dAffineWeight += m * K;
affineBias += m * K;
dAffineBias += m * K;
T dI[V];
T dCW[V];
T i[V];
T dAW = 0;
T dAB = 0;
__shared__ T I[K][K];
__shared__ T dO[K][K];
__shared__ T AW[K];
__shared__ T AB[K];
__shared__ T CW[K][K];
const Int tx = threadIdx.x;
int ty[V];
#pragma unroll
for (int v = 0; v < V; v++)
ty[v] = threadIdx.y + v * (K / V);
if (ty[0] == 0) {
AW[tx] = affineWeight[tx];
AB[tx] = affineBias[tx];
}
for (int n = 0; n < N; n++) {
// Read w, reset dW
#pragma unroll
for (int v = 0; v < V; v++) {
CW[ty[v]][tx] = convWeight[ty[v] * output_nPlanes + tx];
dCW[v] = 0;
}
__syncthreads();
for (Int s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
#pragma unroll
for (int v = 0; v < V; v++)
dI[v] = 0;
__syncthreads();
// Read input and dOutput
#pragma unroll
for (int v = 0; v < V; v++)
if (s + ty[v] < nActive) {
T i_ = inFeatures[(s + ty[v]) * input_stride + tx];
i[v] = i_;
i_ = i_ * AW[tx] + AB[tx];
I[ty[v]][tx] = (i_ > 0) ? i_ : 0;
dO[ty[v]][tx] = dOutFeatures[(s + ty[v]) * output_stride + tx];
} else {
i[v] = 0;
I[ty[v]][tx] = 0;
dO[ty[v]][tx] = 0;
}
__syncthreads();
#pragma unroll
for (int k = 0; k < K; k++)
#pragma unroll
for (int v = 0; v < V; v++) {
dI[v] += dO[ty[v]][k] * CW[tx][k];
dCW[v] += I[k][ty[v]] * dO[k][tx];
}
#pragma unroll
for (int v = 0; v < V; v++)
if (s + ty[v] < nActive) {
dI[v] = (I[ty[v]][tx] > 0) ? dI[v] : 0;
dAW += i[v] * dI[v];
dAB += dI[v];
if (additiveGrad)
dInFeatures[(s + ty[v]) * input_stride + tx] += dI[v];
else
dInFeatures[(s + ty[v]) * input_stride + tx] = dI[v];
}
__syncthreads();
}
#pragma unroll
for (int v = 0; v < V; v++)
atomicAdd(&dConvWeight[ty[v] * output_nPlanes + tx], dCW[v]);
convWeight += K;
dConvWeight += K;
dOutFeatures += K;
__syncthreads();
}
atomicAdd(&dAffineWeight[tx], dAW);
atomicAdd(&dAffineBias[tx], dAB);
}
#define FOO(T, K, V) \
{ \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
Int o = (nActive / K) * K; \
if (o > 0) \
dAffineReluTrivialConvolution_backward_dW_A< \
T, K, V><<<dim3(std::min(o / K, (Int)512), input_nPlanes / K), \
dim3(K, K / V)>>>( \
inFeatures, dInFeatures, dOutFeatures, affineWeight, \
dAffineWeight, affineBias, dAffineBias, convWeight, dConvWeight, \
input_nPlanes, input_stride, output_nPlanes, output_stride, o, \
additiveGrad); \
if (nActive > o) \
dAffineReluTrivialConvolution_backward_dW_B< \
T, K, V><<<dim3(1, input_nPlanes / K), dim3(K, K / V)>>>( \
inFeatures + o * input_stride, dInFeatures + o * input_stride, \
dOutFeatures + o * output_stride, affineWeight, dAffineWeight, \
affineBias, dAffineBias, convWeight, dConvWeight, input_nPlanes, \
input_stride, output_nPlanes, output_stride, nActive - o, \
additiveGrad); \
return; \
} \
}
template <typename T>
void dAffineReluTrivialConvolution_backward_dW(
T *inFeatures, T *dInFeatures, T *dOutFeatures, T *affineWeight,
T *dAffineWeight, T *affineBias, T *dAffineBias, T *convWeight,
T *dConvWeight, Int input_nPlanes, Int input_stride, Int output_nPlanes,
Int output_stride, Int nActive, bool additiveGrad) {
FOO(T, 32, 8)
FOO(T, 16, 4)
FOO(T, 8, 2)
}
#undef FOO
#endif /* CUDA_AFFINERELUTRIVIALCONVOLUTION_H */
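To make the FOO dispatch concrete: it walks the (K, V) cascade from the largest tile down, takes the first K dividing both plane counts, runs kernel A (no bounds checks) on the K-aligned prefix of nActive, and kernel B (bounds-checked) on the ragged tail. A stand-alone sketch of the resulting launch geometry, with illustrative sizes (not from the commit):
#include <algorithm>
#include <cstdio>
#include <initializer_list>
typedef int Int; // assumption: matches the library's Int alias
int main() {
  Int input_nPlanes = 48, output_nPlanes = 16, nActive = 1000; // example only
  for (Int K : {64, 32, 16, 8}) { // same order as the FOO(T, K, V) cascade
    if (input_nPlanes % K || output_nPlanes % K)
      continue;                // K must divide both plane counts
    Int V = K / 4;             // the (K, V) pairs above are all V = K / 4
    Int o = (nActive / K) * K; // K-aligned prefix handled by kernel A
    printf("K=%d V=%d | A: rows=%d grid=(%d,%d) block=(%d,%d) | B: rows=%d\n",
           K, V, o, std::min(o / K, (Int)512), output_nPlanes / K, K, K / V,
           nActive - o); // here: K=16 V=4, A covers 992 rows, B the last 8
    break;               // first (largest) admissible K wins
  }
}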
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
template <typename T>
void cuda_AveragePooling_ForwardPass(T *input_features, T *output_features,
Int nPlanes, Int input_stride,
Int output_stride, RuleBook _rules,
Int filterVolume);
template <typename T>
void cuda_AveragePooling_BackwardPass(T *d_input_features, T *d_output_features,
Int nPlanes, Int input_stride,
Int output_stride, RuleBook _rules,
Int filterVolume);
template <typename T, Int Dimension>
void cuda_AveragePooling_updateOutput(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor poolSize,
/*long*/ at::Tensor poolStride, Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features, long nFeaturesToDrop) {
Int nPlanes = input_features.size(1) - nFeaturesToDrop;
auto _rules =
m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
Int nActive = m.getNActive(outputSize);
output_features.resize_({nActive, input_features.size(1) - nFeaturesToDrop});
output_features.zero_();
auto iF = input_features.data<T>() + nFeaturesToDrop;
auto oF = output_features.data<T>();
cuda_AveragePooling_ForwardPass<T>(iF, oF, nPlanes, input_features.size(1),
output_features.size(1), _rules,
_rules.size());
}
template <typename T, Int Dimension>
void cuda_AveragePooling_updateGradInput(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor poolSize,
/*long*/ at::Tensor poolStride, Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor d_input_features,
/*cuda float*/ at::Tensor d_output_features, long nFeaturesToDrop) {
Int nPlanes = input_features.size(1) - nFeaturesToDrop;
auto _rules =
m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
d_input_features.resize_as_(input_features);
d_input_features.zero_();
auto diF = d_input_features.data<T>() + nFeaturesToDrop;
auto doF = d_output_features.data<T>();
cuda_AveragePooling_BackwardPass<T>(diF, doF, nPlanes, input_features.size(1),
d_output_features.size(1), _rules,
_rules.size());
}
@@ -4,51 +4,73 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "RuleBookIterator.h"
// NTX must be >=2 so r is filled properly
template <typename T, Int NTX, Int NTY>
__global__ void AveragePooling_fp(T *input_features, T *output_features,
                                  Int nPlanes, Int input_stride,
                                  Int output_stride, Int *rules, Int nHot,
                                  T alpha) {
  __shared__ Int r[NTY * 2];
  for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
    {
      Int i = threadIdx.x + NTX * threadIdx.y;
      if (i < NTY * 2 and i < 2 * (nHot - n))
        r[i] = rules[2 * n + i];
    }
    __syncthreads();
    if (n + threadIdx.y < nHot) {
      Int i = r[2 * threadIdx.y] * input_stride;
      Int o = r[2 * threadIdx.y + 1] * output_stride;
      for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX)
        atomicAdd(&output_features[o + plane],
                  alpha * input_features[i + plane]);
    }
    __syncthreads();
  }
}
template <typename T>
void cuda_AveragePooling_ForwardPass(T *input_features, T *output_features,
                                     Int nPlanes, Int input_stride,
                                     Int output_stride, RuleBook _rules,
                                     Int filterVolume) {
  RULEBOOKITERATOR((AveragePooling_fp<T, 32, 32><<<32, dim3(32, 32)>>>(
      input_features, output_features, nPlanes, input_stride, output_stride,
      rbB, nHotB, 1.0 / filterVolume));
                   , )
}
template <typename T, Int NTX, Int NTY>
__global__ void AveragePooling_bp(T *d_input_features, T *d_output_features,
Int nPlanes, Int input_stride,
Int output_stride, Int *rules, Int nHot,
T alpha) {
__shared__ Int r[NTY * 2];
for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
{
Int i = threadIdx.x + NTX * threadIdx.y;
if (i < NTY * 2 and i < 2 * (nHot - n))
r[i] = rules[2 * n + i];
}
__syncthreads();
if (n + threadIdx.y < nHot) {
Int i = r[2 * threadIdx.y] * input_stride;
Int o = r[2 * threadIdx.y + 1] * output_stride;
for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX)
d_input_features[i + plane] += alpha * d_output_features[o + plane];
}
__syncthreads();
}
}
template <typename T>
void cuda_AveragePooling_BackwardPass(T *d_input_features, T *d_output_features,
                                      Int nPlanes, Int input_stride,
                                      Int output_stride, RuleBook _rules,
                                      Int filterVolume) {
  RULEBOOKITERATOR((AveragePooling_bp<T, 32, 32><<<32, dim3(32, 32)>>>(
      d_input_features, d_output_features, nPlanes, input_stride, output_stride,
      rbB, nHotB, 1.0 / filterVolume));
                   , )
}
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CUDA_AVERAGEPOOLING_H
#define CUDA_AVERAGEPOOLING_H
// NTX must be >=2 so r is filled properly
template <typename T, Int NTX, Int NTY>
__global__ void AveragePooling_fp(T *input_features, T *output_features,
Int nPlanes, Int input_stride,
Int output_stride, Int *rules, Int nHot,
T alpha) {
__shared__ Int r[NTY * 2];
for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
{
Int i = threadIdx.x + NTX * threadIdx.y;
if (i < NTY * 2 and i < 2 * (nHot - n))
r[i] = rules[2 * n + i];
}
__syncthreads();
if (n + threadIdx.y < nHot) {
Int i = r[2 * threadIdx.y] * input_stride;
Int o = r[2 * threadIdx.y + 1] * output_stride;
for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX)
atomicAdd(&output_features[o + plane],
alpha * input_features[i + plane]);
}
__syncthreads();
}
}
template <typename T>
void cuda_AveragePooling_ForwardPass(T *input_features, T *output_features,
Int nPlanes, Int input_stride,
Int output_stride, Int *rules, Int nHot,
Int filterVolume) {
AveragePooling_fp<T, 32, 32><<<32, dim3(32, 32)>>>(
input_features, output_features, nPlanes, input_stride, output_stride,
rules, nHot, 1.0 / filterVolume);
}
template <typename T, Int NTX, Int NTY>
__global__ void AveragePooling_bp(T *d_input_features, T *d_output_features,
Int nPlanes, Int input_stride,
Int output_stride, Int *rules, Int nHot,
T alpha) {
__shared__ Int r[NTY * 2];
for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
{
Int i = threadIdx.x + NTX * threadIdx.y;
if (i < NTY * 2 and i < 2 * (nHot - n))
r[i] = rules[2 * n + i];
}
__syncthreads();
if (n + threadIdx.y < nHot) {
Int i = r[2 * threadIdx.y] * input_stride;
Int o = r[2 * threadIdx.y + 1] * output_stride;
for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX)
d_input_features[i + plane] += alpha * d_output_features[o + plane];
}
__syncthreads();
}
}
template <typename T>
void cuda_AveragePooling_BackwardPass(T *d_input_features, T *d_output_features,
Int nPlanes, Int input_stride,
Int output_stride, Int *rules,
Int nHot, Int filterVolume) {
AveragePooling_bp<T, 32, 32><<<32, dim3(32, 32)>>>(
d_input_features, d_output_features, nPlanes, input_stride, output_stride,
rules, nHot, 1.0 / filterVolume);
}
#endif /* CUDA_AVERAGEPOOLING_H */
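A scalar CPU reference of the two kernels above (a sketch for checking, not library code): with alpha = 1/filterVolume the forward scatters input/filterVolume into each pooled site, and the backward hands every contributing input site d_output/filterVolume.
typedef int Int; // assumption: matches the library's Int alias
template <typename T>
void AveragePooling_fp_cpuRef(T *input_features, T *output_features,
                              Int nPlanes, Int input_stride, Int output_stride,
                              Int *rules, Int nHot, T alpha) {
  for (Int n = 0; n < nHot; n++) {
    T *i = input_features + rules[2 * n] * input_stride;
    T *o = output_features + rules[2 * n + 1] * output_stride;
    for (Int p = 0; p < nPlanes; p++)
      o[p] += alpha * i[p]; // serial equivalent of the forward atomicAdd
  }
}
template <typename T>
void AveragePooling_bp_cpuRef(T *d_input_features, T *d_output_features,
                              Int nPlanes, Int input_stride, Int output_stride,
                              Int *rules, Int nHot, T alpha) {
  for (Int n = 0; n < nHot; n++) {
    T *di = d_input_features + rules[2 * n] * input_stride;
    T *dout = d_output_features + rules[2 * n + 1] * output_stride;
    for (Int p = 0; p < nPlanes; p++)
      di[p] += alpha * dout[p]; // spread the gradient back uniformly
  }
}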
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
template <typename T>
void bn_f(T *iF, T *oF, Int nPlanes, Int input_stride, Int output_stride,
Int nActive, T *saveMean, T *saveInvStd, T *runningMean,
T *runningVar, T *weight, T *bias, T eps, T momentum, bool train,
T leakiness);
template <typename T>
void bn_b(T *input_features, T *d_input_features, T *output_features,
T *d_output_features, Int nPlanes, Int input_stride,
Int output_stride, Int nActive, T *saveMean, T *saveInvStd,
T *runningMean, T *runningVar, T *weight, T *bias, T *d_weight,
T *d_bias, T leakiness);
template <typename T>
void cuda_BatchNormalization_updateOutput(
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features,
/*cuda float*/ at::Tensor saveMean,
/*cuda float*/ at::Tensor saveInvStd, /*cuda float*/ at::Tensor runningMean,
/*cuda float*/ at::Tensor runningVar,
/*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor bias, T eps,
T momentum, bool train, T leakiness) {
output_features.resize_as_(input_features);
if (input_features.ndimension() == 2) {
auto nActive = input_features.size(0);
auto nPlanes = input_features.size(1);
auto input_stride = input_features.stride(0);
auto output_stride = output_features.stride(0);
bn_f(input_features.data<T>(), output_features.data<T>(), nPlanes,
input_stride, output_stride, nActive, saveMean.data<T>(),
saveInvStd.data<T>(), runningMean.data<T>(), runningVar.data<T>(),
OptionalTensorData<T>(weight), OptionalTensorData<T>(bias), eps,
momentum, train, leakiness);
}
}
template <typename T>
void cuda_BatchNormalization_backward(
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor d_input_features,
/*cuda float*/ at::Tensor output_features,
/*cuda float*/ at::Tensor d_output_features,
/*cuda float*/ at::Tensor saveMean, /*cuda float*/ at::Tensor saveInvStd,
/*cuda float*/ at::Tensor runningMean,
/*cuda float*/ at::Tensor runningVar, /*cuda float*/ at::Tensor weight,
/*cuda float*/ at::Tensor bias,
/*cuda float*/ at::Tensor d_weight, /*cuda float*/ at::Tensor d_bias,
T leakiness) {
d_input_features.resize_as_(d_output_features);
if (input_features.ndimension() == 2) {
auto nActive = input_features.size(0);
auto nPlanes = input_features.size(1);
auto input_stride = input_features.stride(0);
auto output_stride = output_features.stride(0);
bn_b(input_features.data<T>(), d_input_features.data<T>(),
output_features.data<T>(), d_output_features.data<T>(), nPlanes,
input_stride, output_stride, nActive, saveMean.data<T>(),
saveInvStd.data<T>(), runningMean.data<T>(), runningVar.data<T>(),
OptionalTensorData<T>(weight), OptionalTensorData<T>(bias),
OptionalTensorData<T>(d_weight), OptionalTensorData<T>(d_bias),
leakiness);
}
}
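A scalar sketch of what bn_f computes per plane in training mode, inferred from the BatchNormalization_f_train kernel below (illustration, not library code; it assumes input_stride == output_stride == stride, and note the momentum convention here weights the old running value):
#include <cmath>
typedef int Int; // assumption: matches the library's Int alias
template <typename T>
void bn_f_train_cpuRef(T *iF, T *oF, Int nPlanes, Int stride, Int nActive,
                       T *runningMean, T *runningVar, T *weight, T *bias,
                       T eps, T momentum, T leakiness) {
  for (Int p = 0; p < nPlanes; p++) {
    T s = 0, s2 = 0;
    for (Int r = 0; r < nActive; r++) { // batch statistics for plane p
      T x = iF[r * stride + p];
      s += x;
      s2 += x * x;
    }
    T mean = s / nActive;
    T varN = s2 - mean * mean * nActive; // nActive times the biased variance
    runningMean[p] = momentum * runningMean[p] + (1 - momentum) * mean;
    runningVar[p] =
        momentum * runningVar[p] + (1 - momentum) * varN / (nActive - 1);
    T invStd = std::pow(varN / nActive + eps, T(-0.5));
    T W = invStd * (weight ? weight[p] : T(1)); // fused scale
    T B = (bias ? bias[p] : T(0)) - mean * W;   // fused shift
    for (Int r = 0; r < nActive; r++) {
      T out = W * iF[r * stride + p] + B;
      oF[r * stride + p] = out > 0 ? out : out * leakiness; // leaky ReLU
    }
  }
}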
@@ -4,56 +4,215 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include <cassert>
// input_stride and output_stride are normally the same as nPlanes; allow
// larger values to act on a subset of columns, e.g. in-place DenseNet blocks
// NTX ~ 16 - nPlanes must be a multiple of this
// NTY ~ 64 - at least 4
template <typename T, Int NTX, Int NTY>
__global__ void BatchNormalization_f_train(
T *input_features, T *output_features, Int nPlanes, Int input_stride,
Int output_stride, Int nActive, T *saveMean, T *saveInvStd, T *runningMean,
T *runningVar, T *weight, T *bias, T eps, T momentum, T leakiness) {
__shared__ T t[NTY][NTX];
__shared__ T t2[NTY][NTX];
for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
plane += gridDim.x * NTX) {
t[threadIdx.y][threadIdx.x] = 0;
t2[threadIdx.y][threadIdx.x] = 0;
for (Int row = threadIdx.y, c = plane + threadIdx.y * input_stride;
row < nActive; row += NTY, c += input_stride * NTY) {
T i = input_features[c];
t[threadIdx.y][threadIdx.x] += i;
t2[threadIdx.y][threadIdx.x] += i * i;
}
__syncthreads();
T _saveMean = 0;
T _saveInvStd = 0;
for (Int row = 0; row < NTY; row++) {
_saveMean += t[row][threadIdx.x];
_saveInvStd += t2[row][threadIdx.x];
}
_saveMean /= nActive;
_saveInvStd = _saveInvStd - _saveMean * _saveMean * nActive;
if (threadIdx.y == 0) {
saveMean[plane] = _saveMean;
runningMean[plane] =
momentum * runningMean[plane] + (1 - momentum) * _saveMean;
runningVar[plane] = momentum * runningVar[plane] +
(1 - momentum) * _saveInvStd / (nActive - 1);
}
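    // runningVar above uses the unbiased (n - 1) estimate; the saved inverse
    // standard deviation below uses the biased 1/n estimate plus eps.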
_saveInvStd = pow(_saveInvStd / nActive + eps, -0.5);
if (threadIdx.y == 0)
saveInvStd[plane] = _saveInvStd;
__syncthreads();
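    // Fold normalization and the affine transform into a single multiply-add:
    // out = W * x + B, with W = invStd * weight and B = bias - mean * W.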
if (threadIdx.y == 0) {
t[0][threadIdx.x] = _saveInvStd * (weight ? weight[plane] : 1);
t[1][threadIdx.x] =
-_saveMean * t[0][threadIdx.x] + (bias ? bias[plane] : 0);
}
__syncthreads();
T W = t[0][threadIdx.x];
T B = t[1][threadIdx.x];
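    // Stream the rows once, applying the fused scale/shift and leaky ReLU.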
for (Int row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
co = plane + threadIdx.y * output_stride;
row < nActive;
row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
T out = W * input_features[ci] + B;
output_features[co] = (out > 0) ? out : (out * leakiness);
}
__syncthreads();
}
}
template <typename T, Int NTX, Int NTY>
__global__ void BatchNormalization_f_test(
T *input_features, T *output_features, Int nPlanes, Int input_stride,
Int output_stride, Int nActive, T *saveMean, T *saveInvStd, T *runningMean,
T *runningVar, T *weight, T *bias, T eps, T momentum, T leakiness) {
__shared__ T W[NTX];
__shared__ T B[NTX];
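  // Inference path: fold the running statistics and affine parameters into a
  // per-plane scale W and shift B, computed once per plane tile.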
for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
plane += gridDim.x * NTX) {
if (threadIdx.y == 0) {
W[threadIdx.x] =
pow(runningVar[plane] + eps, -0.5) * (weight ? weight[plane] : 1);
B[threadIdx.x] =
(bias ? bias[plane] : 0) - runningMean[plane] * W[threadIdx.x];
}
__syncthreads();
    T w = W[threadIdx.x], b = B[threadIdx.x]; // T, not float: avoids precision loss when T is double
for (Int row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
co = plane + threadIdx.y * output_stride;
row < nActive;
row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
T out = w * input_features[ci] + b;
output_features[co] = (out > 0) ? out : (out * leakiness);
}
__syncthreads();
}
}
template <typename T, Int NTX, Int NTY>
void BatchNormalization_ForwardPass(T *input_features, T *output_features,
Int nPlanes, Int input_stride,
Int output_stride, Int nActive, T *saveMean,
T *saveInvStd, T *runningMean,
T *runningVar, T *weight, T *bias, T eps,
T momentum, bool train, T leakiness) {
if (train) {
BatchNormalization_f_train<
T, NTX, NTY><<<std::min((Int)16, nPlanes / NTX), dim3(NTX, NTY)>>>(
input_features, output_features, nPlanes, input_stride, output_stride,
nActive, saveMean, saveInvStd, runningMean, runningVar, weight, bias,
eps, momentum, leakiness);
} else {
BatchNormalization_f_test<
T, NTX, NTY><<<std::min((Int)16, nPlanes / NTX), dim3(NTX, NTY)>>>(
input_features, output_features, nPlanes, input_stride, output_stride,
nActive, saveMean, saveInvStd, runningMean, runningVar, weight, bias,
eps, momentum, leakiness);
}
}
template <typename T, Int NTX, Int NTY>
__global__ void
BatchNormalization_b(T *input_features, T *d_input_features, T *output_features,
T *d_output_features, Int nPlanes, Int input_stride,
Int output_stride, Int nActive, T *saveMean, T *saveInvStd,
T *runningMean, T *runningVar, T *weight, T *bias,
T *d_weight, T *d_bias, T leakiness) {
__shared__ T t[NTY][NTX];
__shared__ T t2[NTY][NTX];
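  // The shared buffers are reused per plane tile: first to broadcast the saved
  // statistics to all threads, then as scratch for the gradMean/dotp reductions.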
for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
plane += gridDim.x * NTX) {
if (threadIdx.y == 0) {
t[0][threadIdx.x] = saveMean[plane];
t[1][threadIdx.x] = saveInvStd[plane];
t[2][threadIdx.x] = (weight ? weight[plane] : 1);
}
__syncthreads();
T _saveMean = t[0][threadIdx.x];
T _saveInvStd = t[1][threadIdx.x];
T _weight = t[2][threadIdx.x];
__syncthreads();
t[threadIdx.y][threadIdx.x] = 0;
t2[threadIdx.y][threadIdx.x] = 0;
for (Int row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
co = plane + threadIdx.y * output_stride;
row < nActive;
row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
T d = d_output_features[co];
d = (output_features[co] > 0) ? d : (d * leakiness);
d_output_features[co] = d;
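      // The leaky ReLU gradient is written back into d_output_features, so the
      // incoming gradient buffer is modified in place.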
t[threadIdx.y][threadIdx.x] += d;
t2[threadIdx.y][threadIdx.x] += (input_features[ci] - _saveMean) * d;
}
__syncthreads();
T gradMean = 0;
T dotp = 0;
for (int row = 0; row < NTY; row++) {
gradMean += t[row][threadIdx.x];
dotp += t2[row][threadIdx.x];
}
__syncthreads();
if (d_weight)
d_weight[plane] = dotp * _saveInvStd;
if (d_bias)
      d_bias[plane] = gradMean; // still the raw sum here; divided by nActive below
gradMean /= nActive;
T k = dotp * _saveInvStd * _saveInvStd / nActive;
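    // Standard batch-norm input gradient:
    // dx = (dy - mean(dy) - (x - mean) * k) * invStd * weight,
    // where k = dotp * invStd^2 / nActive.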
for (Int row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
co = plane + threadIdx.y * output_stride;
row < nActive;
row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
d_input_features[ci] = (d_output_features[co] - gradMean -
(input_features[ci] - _saveMean) * k) *
_saveInvStd * _weight;
}
__syncthreads();
}
}
template <typename T, Int NTX, Int NTY>
void BatchNormalization_BackwardPass(T *input_features, T *d_input_features,
T *output_features, T *d_output_features,
Int nPlanes, Int input_stride,
Int output_stride, Int nActive,
T *saveMean, T *saveInvStd, T *runningMean,
T *runningVar, T *weight, T *bias,
T *d_weight, T *d_bias, T leakiness) {
BatchNormalization_b<
T, NTX, NTY><<<std::min((Int)16, nPlanes / NTX), dim3(NTX, NTY)>>>(
input_features, d_input_features, output_features, d_output_features,
nPlanes, input_stride, output_stride, nActive, saveMean, saveInvStd,
runningMean, runningVar, weight, bias, d_weight, d_bias, leakiness);
}
#define BN_F_MACRO(N)                                                          \
  if (nPlanes % N == 0) {                                                      \
    BatchNormalization_ForwardPass<T, N, 64>(                                  \
-        input_features.data<T>(), output_features.data<T>(), nPlanes,         \
-        input_stride, output_stride, nActive, saveMean.data<T>(),             \
-        saveInvStd.data<T>(), runningMean.data<T>(), runningVar.data<T>(),    \
-        OptionalTensorData<T>(weight), OptionalTensorData<T>(bias), eps, momentum, \
        iF, oF, nPlanes, input_stride, output_stride, nActive, saveMean,       \
        saveInvStd, runningMean, runningVar, weight, bias, eps, momentum,      \
        train, leakiness);                                                     \
  }
template <typename T>
-void cuda_BatchNormalization_updateOutput(
-    /*cuda float*/ at::Tensor input_features,
-    /*cuda float*/ at::Tensor output_features,
-    /*cuda float*/ at::Tensor saveMean,
-    /*cuda float*/ at::Tensor saveInvStd, /*cuda float*/ at::Tensor runningMean,
-    /*cuda float*/ at::Tensor runningVar,
-    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor bias, T eps,
-    T momentum, bool train, T leakiness) {
-  output_features.resize_as_(input_features);
-  if (input_features.ndimension() == 2) {
-    auto nActive = input_features.size(0);
-    auto nPlanes = input_features.size(1);
-    auto input_stride = input_features.stride(0);
-    auto output_stride = output_features.stride(0);
-    BN_F_MACRO(16)
-    else BN_F_MACRO(12) else BN_F_MACRO(8) else BN_F_MACRO(4) else BN_F_MACRO(1)
-  }
-}
-template <typename T>
-void cuda_BatchNormalizationInTensor_updateOutput(
-    /*cuda float*/ at::Tensor input_features,
-    /*cuda float*/ at::Tensor output_features,
-    /*cuda float*/ at::Tensor saveMean,
-    /*cuda float*/ at::Tensor saveInvStd, /*cuda float*/ at::Tensor runningMean,
-    /*cuda float*/ at::Tensor runningVar,
-    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor bias, T eps,
-    T momentum, bool train, T leakiness) {
-  if (input_features.ndimension() == 2) {
-    auto nActive = input_features.size(0);
-    auto nPlanes = input_features.size(1);
-    auto input_stride = input_features.stride(0);
-    auto output_stride = output_features.stride(0);
-    BN_F_MACRO(16)
-    else BN_F_MACRO(12) else BN_F_MACRO(8) else BN_F_MACRO(4) else BN_F_MACRO(1)
-  }
void bn_f(T *iF, T *oF, Int nPlanes, Int input_stride, Int output_stride,
          Int nActive, T *saveMean, T *saveInvStd, T *runningMean,
          T *runningVar, T *weight, T *bias, T eps, T momentum, bool train,
          T leakiness) {
  BN_F_MACRO(16)
  else BN_F_MACRO(12) else BN_F_MACRO(8) else BN_F_MACRO(4) else BN_F_MACRO(1)
}
#undef BN_F_MACRO
@@ -61,34 +220,19 @@ void cuda_BatchNormalizationInTensor_updateOutput(
#define BN_B_MACRO(N)                                                          \
  if (nPlanes % N == 0) {                                                      \
    BatchNormalization_BackwardPass<T, N, 64>(                                 \
-        input_features.data<T>(), d_input_features.data<T>(),                 \
-        output_features.data<T>(), d_output_features.data<T>(), nPlanes,      \
-        input_stride, output_stride, nActive, saveMean.data<T>(),             \
-        saveInvStd.data<T>(), runningMean.data<T>(), runningVar.data<T>(),    \
-        OptionalTensorData<T>(weight), OptionalTensorData<T>(bias),           \
-        OptionalTensorData<T>(d_weight), OptionalTensorData<T>(d_bias), leakiness); \
        input_features, d_input_features, output_features, d_output_features,  \
        nPlanes, input_stride, output_stride, nActive, saveMean, saveInvStd,   \
        runningMean, runningVar, weight, bias, d_weight, d_bias, leakiness);   \
  }
template <typename T>
-void cuda_BatchNormalization_backward(
-    /*cuda float*/ at::Tensor input_features,
-    /*cuda float*/ at::Tensor d_input_features,
-    /*cuda float*/ at::Tensor output_features,
-    /*cuda float*/ at::Tensor d_output_features,
-    /*cuda float*/ at::Tensor saveMean, /*cuda float*/ at::Tensor saveInvStd,
-    /*cuda float*/ at::Tensor runningMean,
-    /*cuda float*/ at::Tensor runningVar, /*cuda float*/ at::Tensor weight,
-    /*cuda float*/ at::Tensor bias,
-    /*cuda float*/ at::Tensor d_weight, /*cuda float*/ at::Tensor d_bias,
-    T leakiness) {
-  d_input_features.resize_as_(d_output_features);
-  if (input_features.ndimension() == 2) {
-    auto nActive = input_features.size(0);
-    auto nPlanes = input_features.size(1);
-    auto input_stride = input_features.stride(0);
-    auto output_stride = output_features.stride(0);
-    BN_B_MACRO(16)
-    else BN_B_MACRO(12) else BN_B_MACRO(8) else BN_B_MACRO(4) else BN_B_MACRO(1)
-  }
void bn_b(T *input_features, T *d_input_features, T *output_features,
          T *d_output_features, Int nPlanes, Int input_stride,
          Int output_stride, Int nActive, T *saveMean, T *saveInvStd,
          T *runningMean, T *runningVar, T *weight, T *bias, T *d_weight,
          T *d_bias, T leakiness) {
  BN_B_MACRO(16)
  else BN_B_MACRO(12) else BN_B_MACRO(8) else BN_B_MACRO(4) else BN_B_MACRO(1)
}
#undef BN_B_MACRO
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CUDA_BATCHNORMALIZATION_H
#define CUDA_BATCHNORMALIZATION_H
#include <cassert>
// input_stride and output_stride are normally equal to nPlanes; larger values
// let the kernels act on a subset of columns, e.g. for in-place DenseNet blocks.
// NTX ~ 16: nPlanes must be a multiple of this.
// NTY ~ 64: must be at least 4.
template <typename T, Int NTX, Int NTY>
__global__ void
BatchNormalization_f_train(T *input_features, T *output_features, Int nPlanes,
Int input_stride, Int output_stride, Int nActive,
T *saveMean, T *saveInvStd, T *runningMean,
T *runningVar, T *weight, T *bias, T eps, T momentum,
T leakiness) {
__shared__ T t[NTY][NTX];
__shared__ T t2[NTY][NTX];
for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
plane += gridDim.x * NTX) {
t[threadIdx.y][threadIdx.x] = 0;
t2[threadIdx.y][threadIdx.x] = 0;
for (Int row = threadIdx.y, c = plane + threadIdx.y * input_stride;
row < nActive; row += NTY, c += input_stride * NTY) {
T i = input_features[c];
t[threadIdx.y][threadIdx.x] += i;
t2[threadIdx.y][threadIdx.x] += i * i;
}
__syncthreads();
T _saveMean = 0;
T _saveInvStd = 0;
for (Int row = 0; row < NTY; row++) {
_saveMean += t[row][threadIdx.x];
_saveInvStd += t2[row][threadIdx.x];
}
_saveMean /= nActive;
_saveInvStd = _saveInvStd - _saveMean * _saveMean * nActive;
if (threadIdx.y == 0) {
saveMean[plane] = _saveMean;
runningMean[plane] =
momentum * runningMean[plane] + (1 - momentum) * _saveMean;
runningVar[plane] = momentum * runningVar[plane] +
(1 - momentum) * _saveInvStd / (nActive - 1);
}
_saveInvStd = pow(_saveInvStd / nActive + eps, -0.5);
if (threadIdx.y == 0)
saveInvStd[plane] = _saveInvStd;
__syncthreads();
if (threadIdx.y == 0) {
t[0][threadIdx.x] = _saveInvStd * (weight ? weight[plane] : 1);
t[1][threadIdx.x] =
-_saveMean * t[0][threadIdx.x] + (bias ? bias[plane] : 0);
}
__syncthreads();
T W = t[0][threadIdx.x];
T B = t[1][threadIdx.x];
for (Int row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
co = plane + threadIdx.y * output_stride;
row < nActive;
row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
T out = W * input_features[ci] + B;
output_features[co] = (out > 0) ? out : (out * leakiness);
}
__syncthreads();
}
}
template <typename T, Int NTX, Int NTY>
__global__ void
BatchNormalization_f_test(T *input_features, T *output_features, Int nPlanes,
Int input_stride, Int output_stride, Int nActive,
T *saveMean, T *saveInvStd, T *runningMean,
T *runningVar, T *weight, T *bias, T eps, T momentum,
T leakiness) {
__shared__ T W[NTX];
__shared__ T B[NTX];
for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
plane += gridDim.x * NTX) {
if (threadIdx.y == 0) {
W[threadIdx.x] =
pow(runningVar[plane] + eps, -0.5) * (weight ? weight[plane] : 1);
B[threadIdx.x] =
(bias ? bias[plane] : 0) - runningMean[plane] * W[threadIdx.x];
}
__syncthreads();
    T w = W[threadIdx.x], b = B[threadIdx.x]; // T, not float: avoids precision loss when T is double
for (Int row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
co = plane + threadIdx.y * output_stride;
row < nActive;
row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
T out = w * input_features[ci] + b;
output_features[co] = (out > 0) ? out : (out * leakiness);
}
__syncthreads();
}
}
template <typename T, Int NTX, Int NTY>
void BatchNormalization_ForwardPass(T *input_features, T *output_features,
Int nPlanes, Int input_stride,
Int output_stride, Int nActive,
T *saveMean, T *saveInvStd, T *runningMean,
T *runningVar, T *weight, T *bias, T eps,
T momentum, bool train, T leakiness) {
if (train) {
BatchNormalization_f_train<
T, NTX, NTY><<<std::min((Int)16, nPlanes / NTX), dim3(NTX, NTY)>>>(
input_features, output_features, nPlanes, input_stride, output_stride,
nActive, saveMean, saveInvStd, runningMean, runningVar, weight, bias,
eps, momentum, leakiness);
} else {
BatchNormalization_f_test<
T, NTX, NTY><<<std::min((Int)16, nPlanes / NTX), dim3(NTX, NTY)>>>(
input_features, output_features, nPlanes, input_stride, output_stride,
nActive, saveMean, saveInvStd, runningMean, runningVar, weight, bias,
eps, momentum, leakiness);
}
}
template <typename T, Int NTX, Int NTY>
__global__ void
BatchNormalization_b(T *input_features, T *d_input_features, T *output_features,
T *d_output_features, Int nPlanes, Int input_stride,
Int output_stride, Int nActive, T *saveMean,
T *saveInvStd, T *runningMean, T *runningVar, T *weight,
T *bias, T *d_weight, T *d_bias, T leakiness) {
__shared__ T t[NTY][NTX];
__shared__ T t2[NTY][NTX];
for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
plane += gridDim.x * NTX) {
if (threadIdx.y == 0) {
t[0][threadIdx.x] = saveMean[plane];
t[1][threadIdx.x] = saveInvStd[plane];
t[2][threadIdx.x] = (weight ? weight[plane] : 1);
}
__syncthreads();
T _saveMean = t[0][threadIdx.x];
T _saveInvStd = t[1][threadIdx.x];
T _weight = t[2][threadIdx.x];
__syncthreads();
t[threadIdx.y][threadIdx.x] = 0;
t2[threadIdx.y][threadIdx.x] = 0;
for (Int row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
co = plane + threadIdx.y * output_stride;
row < nActive;
row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
T d = d_output_features[co];
d = (output_features[co] > 0) ? d : (d * leakiness);
d_output_features[co] = d;
t[threadIdx.y][threadIdx.x] += d;
t2[threadIdx.y][threadIdx.x] += (input_features[ci] - _saveMean) * d;
}
__syncthreads();
T gradMean = 0;
T dotp = 0;
for (int row = 0; row < NTY; row++) {
gradMean += t[row][threadIdx.x];
dotp += t2[row][threadIdx.x];
}
__syncthreads();
if (d_weight)
d_weight[plane] = dotp * _saveInvStd;
if (d_bias)
      d_bias[plane] = gradMean; // still the raw sum here; divided by nActive below
gradMean /= nActive;
T k = dotp * _saveInvStd * _saveInvStd / nActive;
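    // k = dotp * invStd^2 / nActive is the coefficient of the (x - mean)
    // correction term in the batch-norm input gradient computed below.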
for (Int row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
co = plane + threadIdx.y * output_stride;
row < nActive;
row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
d_input_features[ci] = (d_output_features[co] - gradMean -
(input_features[ci] - _saveMean) * k) *
_saveInvStd * _weight;
}
__syncthreads();
}
}
template <typename T, Int NTX, Int NTY>
void BatchNormalization_BackwardPass(T *input_features, T *d_input_features,
T *output_features, T *d_output_features,
Int nPlanes, Int input_stride,
Int output_stride, Int nActive,
T *saveMean, T *saveInvStd, T *runningMean,
T *runningVar, T *weight, T *bias,
T *d_weight, T *d_bias, T leakiness) {
BatchNormalization_b<
T, NTX, NTY><<<std::min((Int)16, nPlanes / NTX), dim3(NTX, NTY)>>>(
input_features, d_input_features, output_features, d_output_features,
nPlanes, input_stride, output_stride, nActive, saveMean, saveInvStd,
runningMean, runningVar, weight, bias, d_weight, d_bias, leakiness);
}
#undef NTX
#undef NTY
#endif /* CUDA_BATCHNORMALIZATION_H */
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
template <typename T>
void bmd_f(T *input_features, T *output_features, T *noise, Int nActive,
Int nPlanes, T alpha);
template <typename T>
void bmd_b(T *input_features, T *d_input_features, T *d_output_features,
T *noise, Int nActive, Int nPlanes, T alpha);
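// bmd_f and bmd_b are only declared here; their definitions (the CUDA kernels
// and launchers) appear in the .cu file later in this diff.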
template <typename T>
void cuda_BatchwiseMultiplicativeDropout_updateOutput(
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features, /*cuda float*/ at::Tensor noise,
T alpha) {
output_features.resize_as_(input_features);
auto nActive = input_features.size(0);
auto nPlanes = input_features.size(1);
bmd_f(input_features.data<T>(), output_features.data<T>(), noise.data<T>(),
nActive, nPlanes, alpha);
}
template <typename T>
void cuda_BatchwiseMultiplicativeDropout_updateGradInput(
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor d_input_features,
/*cuda float*/ at::Tensor d_output_features,
/*cuda float*/ at::Tensor noise, T alpha) {
d_input_features.resize_as_(d_output_features);
auto nActive = input_features.size(0);
auto nPlanes = input_features.size(1);
bmd_b(input_features.data<T>(), d_input_features.data<T>(),
d_output_features.data<T>(), noise.data<T>(), nActive, nPlanes, alpha);
}
@@ -4,28 +4,63 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
-#include "BatchwiseMultiplicativeDropout.h"
template <typename T, Int NTX, Int NTY>
__global__ void BatchwiseMultiplicativeDropout_fp(T *input_features,
T *output_features, T *noise,
Int nActive, Int nPlanes,
Int input_stride,
Int output_stride, T alpha) {
__shared__ T nz[NTX];
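  // One multiplicative noise value per plane, staged in shared memory so every
  // row handled by this block reads it from on-chip storage.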
for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
plane += gridDim.x * NTX) {
if (threadIdx.y == 0)
nz[threadIdx.x] = noise[plane];
__syncthreads();
for (Int row = threadIdx.y + blockIdx.y * NTY; row < nActive;
row += gridDim.y * NTY) {
Int i = row * input_stride + plane;
Int o = row * output_stride + plane;
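      // Fused batchwise dropout and leaky ReLU scaling:
      // out = x * noise[plane] * (x > 0 ? 1 : alpha).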
output_features[o] = input_features[i] * nz[threadIdx.x] *
((input_features[i] > 0) ? 1 : alpha);
}
__syncthreads();
}
}
template <typename T, Int NTX, Int NTY>
__global__ void BatchwiseMultiplicativeDropout_bp(
T *input_features, T *d_input_features, T *d_output_features, T *noise,
Int nActive, Int nPlanes, Int input_stride, Int output_stride, T alpha) {
__shared__ T nz[NTX];
for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
plane += gridDim.x * NTX) {
if (threadIdx.y == 0)
nz[threadIdx.x] = noise[plane];
__syncthreads();
for (Int row = threadIdx.y + blockIdx.y * NTY; row < nActive;
row += gridDim.y * NTY) {
Int i = row * input_stride + plane;
Int o = row * output_stride + plane;
d_input_features[i] = d_output_features[o] * nz[threadIdx.x] *
((input_features[i] > 0) ? 1 : alpha);
}
__syncthreads();
}
}
#define SPARSECONVNET_FOO(NTX, NTY)                                            \
  {                                                                            \
    if (nPlanes % NTX == 0) {                                                  \
-      BatchwiseMultiplicativeDropout_fp<                                      \
-          T, NTX,                                                             \
-          NTY><<<dim3(std::min(16L, nPlanes / NTX), 16), dim3(NTX, NTY)>>>(   \
-          input_features.data<T>(), output_features.data<T>(),                \
-          noise.data<T>(), nActive, nPlanes, nPlanes, nPlanes, alpha);        \
      BatchwiseMultiplicativeDropout_fp<T, NTX, NTY><<<                        \
          dim3(std::min((Int)16, nPlanes / NTX), 16), dim3(NTX, NTY)>>>(       \
          input_features, output_features, noise, nActive, nPlanes, nPlanes,   \
          nPlanes, alpha);                                                     \
      return;                                                                  \
    }                                                                          \
  }
template <typename T>
-void cuda_BatchwiseMultiplicativeDropout_updateOutput(
-    /*cuda float*/ at::Tensor input_features,
-    /*cuda float*/ at::Tensor output_features, /*cuda float*/ at::Tensor noise,
-    float alpha) {
-  output_features.resize_as_(input_features);
-  auto nActive = input_features.size(0);
-  auto nPlanes = input_features.size(1);
void bmd_f(T *input_features, T *output_features, T *noise, Int nActive,
           Int nPlanes, T alpha) {
  SPARSECONVNET_FOO(32, 32)
  SPARSECONVNET_FOO(24, 32)
  SPARSECONVNET_FOO(16, 64)
@@ -39,25 +74,17 @@ void cuda_BatchwiseMultiplicativeDropout_updateOutput(
#define SPARSECONVNET_FOO(NTX, NTY)                                            \
  {                                                                            \
    if (nPlanes % NTX == 0) {                                                  \
-      BatchwiseMultiplicativeDropout_bp<                                      \
-          T, NTX,                                                             \
-          NTY><<<dim3(std::min(16L, nPlanes / NTX), 16), dim3(NTX, NTY)>>>(   \
-          input_features.data<T>(), d_input_features.data<T>(),               \
-          d_output_features.data<T>(), noise.data<T>(), nActive, nPlanes,     \
-          nPlanes, nPlanes, alpha);                                           \
      BatchwiseMultiplicativeDropout_bp<T, NTX, NTY><<<                        \
          dim3(std::min((Int)16, nPlanes / NTX), 16), dim3(NTX, NTY)>>>(       \
          input_features, d_input_features, d_output_features, noise, nActive, \
          nPlanes, nPlanes, nPlanes, alpha);                                   \
      return;                                                                  \
    }                                                                          \
  }
-template <typename T>
-void cuda_BatchwiseMultiplicativeDropout_updateGradInput(
-    /*cuda float*/ at::Tensor input_features,
-    /*cuda float*/ at::Tensor d_input_features,
-    /*cuda float*/ at::Tensor d_output_features,
-    /*cuda float*/ at::Tensor noise, float alpha) {
-  d_input_features.resize_as_(d_output_features);
-  auto nActive = input_features.size(0);
-  auto nPlanes = input_features.size(1);
template <typename T>
void bmd_b(T *input_features, T *d_input_features, T *d_output_features,
           T *noise, Int nActive, Int nPlanes, T alpha) {
  SPARSECONVNET_FOO(32, 32)
  SPARSECONVNET_FOO(24, 32)
  SPARSECONVNET_FOO(16, 64)
@@ -66,4 +93,5 @@ void cuda_BatchwiseMultiplicativeDropout_updateGradInput(
  SPARSECONVNET_FOO(4, 64)
  SPARSECONVNET_FOO(1, 64)
}
#undef SPARSECONVNET_FOO