Commit de3743f6 authored by Benjamin Thomas Graham

Factor out CUDA code

parent f0407b36
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CUDA_BATCHWISEMULTIPLICATIVEDROPOUT_H
#define CUDA_BATCHWISEMULTIPLICATIVEDROPOUT_H
template <typename T, Int NTX, Int NTY>
__global__ void BatchwiseMultiplicativeDropout_fp(T *input_features,
T *output_features, T *noise,
Int nActive, Int nPlanes,
Int input_stride,
Int output_stride, T alpha) {
__shared__ T nz[NTX];
for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
plane += gridDim.x * NTX) {
if (threadIdx.y == 0)
nz[threadIdx.x] = noise[plane];
__syncthreads();
for (Int row = threadIdx.y + blockIdx.y * NTY; row < nActive;
row += gridDim.y * NTY) {
Int i = row * input_stride + plane;
Int o = row * output_stride + plane;
output_features[o] = input_features[i] * nz[threadIdx.x] *
((input_features[i] > 0) ? 1 : alpha);
}
__syncthreads();
}
}
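// The kernel above and its backward counterpart below apply the same factor:
// out = in * noise[plane] * (in > 0 ? 1 : alpha), i.e. batchwise
// multiplicative dropout fused with a leaky-ReLU-style slope. One noise value
// per feature plane is staged in shared memory and reused by all NTY rows
// handled by the block; the backward kernel applies the identical factor to
// the incoming gradient.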
template <typename T, Int NTX, Int NTY>
__global__ void
BatchwiseMultiplicativeDropout_bp(T *input_features, T *d_input_features,
T *d_output_features, T *noise, Int nActive,
Int nPlanes, Int input_stride,
Int output_stride, T alpha) {
__shared__ T nz[NTX];
for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
plane += gridDim.x * NTX) {
if (threadIdx.y == 0)
nz[threadIdx.x] = noise[plane];
__syncthreads();
for (Int row = threadIdx.y + blockIdx.y * NTY; row < nActive;
row += gridDim.y * NTY) {
Int i = row * input_stride + plane;
Int o = row * output_stride + plane;
d_input_features[i] = d_output_features[o] * nz[threadIdx.x] *
((input_features[i] > 0) ? 1 : alpha);
}
__syncthreads();
}
}
#endif /* CUDA_BATCHWISEMULTIPLICATIVEDROPOUT_H */
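// A minimal host-side launch sketch for the kernels above (illustration only,
// not part of this commit; the launch shape and the std::min/<algorithm>
// usage are assumptions). The __syncthreads() calls sit inside the plane
// loop, so every thread of a block must make the same number of trips around
// it; this sketch therefore assumes nPlanes is a multiple of NTX. Both grid
// dimensions may be smaller than the problem, since the kernel strides by
// gridDim.
template <typename T>
void BatchwiseMultiplicativeDropout_fp_launch(T *input_features,
                                              T *output_features, T *noise,
                                              Int nActive, Int nPlanes,
                                              T alpha) {
  const Int NTX = 32, NTY = 32; // 32 planes x 32 rows = 1024 threads per block
  dim3 grid(std::min(nPlanes / NTX, (Int)4096),
            std::min((nActive + NTY - 1) / NTY, (Int)4096));
  dim3 block(NTX, NTY);
  // Densely packed features: input and output strides both equal nPlanes.
  BatchwiseMultiplicativeDropout_fp<T, NTX, NTY><<<grid, block>>>(
      input_features, output_features, noise, nActive, nPlanes, nPlanes,
      nPlanes, alpha);
}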
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
template <typename T>
void Convolution_fp_bias(T *of, T *b, Int nPlanes, Int nActiveOut);
template <typename T>
void Convolution_bp_bias(T *matrix, T *target, Int nPlanes, Int nPlanesStride,
                         Int nActive);
template <typename T>
double dConvolution_forward2(T *inFeatures, T *outFeatures, T *w,
RuleBook _rules, Int input_nPlanes,
Int input_stride, Int output_nPlanes,
Int output_stride);
template <typename T>
void dConvolution_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
T *w, T *dw, RuleBook _rules, Int input_nPlanes,
Int input_stride, Int output_nPlanes,
Int output_stride);
template <typename T, Int Dimension>
double cuda_Convolution_updateOutput(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor filterSize,
/*long*/ at::Tensor filterStride, Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features, /*cuda float*/ at::Tensor weight,
/*cuda float*/ at::Tensor bias) {
auto _rules =
m.getRuleBook(inputSize, outputSize, filterSize, filterStride, true);
Int nActiveOut = m.getNActive(outputSize);
if (nActiveOut) {
Int ip = weight.size(1);
Int op = weight.size(2);
output_features.resize_({nActiveOut, op});
auto iF = input_features.data<T>();
auto oF = output_features.data<T>();
auto w = weight.data<T>();
if (bias.numel())
Convolution_fp_bias(oF, bias.data<T>(), op, nActiveOut);
else
output_features.zero_();
return dConvolution_forward2<T>(iF, oF, w, _rules, ip, ip, op, op);
} else {
return 0;
}
}
template <typename T, Int Dimension>
void cuda_Convolution_backward(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor filterSize,
/*long*/ at::Tensor filterStride, Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor d_input_features,
/*cuda float*/ at::Tensor d_output_features,
/*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor d_weight,
/*cuda float*/ at::Tensor d_bias) {
auto _rules =
m.getRuleBook(inputSize, outputSize, filterSize, filterStride, true);
Int nActiveIn = m.getNActive(inputSize);
Int nActiveOut = m.getNActive(outputSize);
if (nActiveOut) {
Int ip = weight.size(1);
Int op = weight.size(2);
d_input_features.resize_({nActiveIn, ip});
d_input_features.zero_();
auto iF = input_features.data<T>();
auto diF = d_input_features.data<T>();
auto doF = d_output_features.data<T>();
auto w = weight.data<T>();
auto dw = d_weight.data<T>();
dConvolution_backward_dW2<T>(iF, diF, doF, w, dw, _rules, ip, ip, op, op);
if (d_bias.numel()) {
auto db = d_bias.data<T>();
Convolution_bp_bias(doF, db, op, op, nActiveOut);
}
}
}
template <typename T, Int Dimension>
double cuda_SubmanifoldConvolution_updateOutput(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor filterSize,
Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features, /*cuda float*/ at::Tensor weight,
/*cuda float*/ at::Tensor bias) {
auto _rules = m.getSubmanifoldRuleBook(inputSize, filterSize, true);
Int nActive = m.getNActive(inputSize);
if (nActive) {
Int ip = weight.size(1);
Int op = weight.size(2);
output_features.resize_({nActive, op});
auto iF = input_features.data<T>();
auto oF = output_features.data<T>();
auto w = weight.data<T>();
if (bias.numel())
Convolution_fp_bias(oF, bias.data<T>(), op, nActive);
else
output_features.zero_();
return dConvolution_forward2<T>(iF, oF, w, _rules, ip, ip, op, op);
} else {
return 0;
}
}
template <typename T, Int Dimension>
void cuda_SubmanifoldConvolution_backward(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor filterSize,
Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor d_input_features,
/*cuda float*/ at::Tensor d_output_features,
/*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor d_weight,
/*cuda float*/ at::Tensor d_bias) {
auto _rules = m.getSubmanifoldRuleBook(inputSize, filterSize, true);
Int nActive = m.getNActive(inputSize);
if (nActive) {
Int ip = weight.size(1);
Int op = weight.size(2);
d_input_features.resize_({nActive, ip});
d_input_features.zero_();
auto iF = input_features.data<T>();
auto diF = d_input_features.data<T>();
auto doF = d_output_features.data<T>();
auto w = weight.data<T>();
auto dw = d_weight.data<T>();
dConvolution_backward_dW2<T>(iF, diF, doF, w, dw, _rules, ip, ip, op, op);
if (d_bias.numel()) {
auto db = d_bias.data<T>();
Convolution_bp_bias(doF, db, op, op, nActive);
}
}
}
template <typename T, Int Dimension>
double cuda_FullConvolution_updateOutput(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor filterSize,
/*long*/ at::Tensor filterStride, Metadata<Dimension> &mIn,
Metadata<Dimension> &mOut,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features, /*cuda float*/ at::Tensor weight,
/*cuda float*/ at::Tensor bias) {
auto _rules = mIn.getFullConvolutionRuleBook(inputSize, outputSize,
filterSize, filterStride, mOut);
Int nActiveOut = mOut.getNActive(outputSize);
if (nActiveOut) {
Int ip = weight.size(1);
Int op = weight.size(2);
output_features.resize_({nActiveOut, op});
auto iF = input_features.data<T>();
auto oF = output_features.data<T>();
auto w = weight.data<T>();
if (bias.numel())
Convolution_fp_bias(oF, bias.data<T>(), op, nActiveOut);
else
output_features.zero_();
return dConvolution_forward2<T>(iF, oF, w, _rules, ip, ip, op, op);
} else {
return 0;
}
}
template <typename T, Int Dimension>
void cuda_FullConvolution_backward(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor filterSize,
/*long*/ at::Tensor filterStride, Metadata<Dimension> &mIn,
Metadata<Dimension> &mOut,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor d_input_features,
/*cuda float*/ at::Tensor d_output_features,
/*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor d_weight,
/*cuda float*/ at::Tensor d_bias) {
auto _rules = mIn.getFullConvolutionRuleBook(inputSize, outputSize,
filterSize, filterStride, mOut);
Int nActiveIn = mIn.getNActive(inputSize);
Int nActiveOut = mOut.getNActive(outputSize);
if (nActiveOut) {
Int ip = weight.size(1);
Int op = weight.size(2);
d_input_features.resize_({nActiveIn, ip});
d_input_features.zero_();
auto iF = input_features.data<T>();
auto diF = d_input_features.data<T>();
auto doF = d_output_features.data<T>();
auto w = weight.data<T>();
auto dw = d_weight.data<T>();
dConvolution_backward_dW2<T>(iF, diF, doF, w, dw, _rules, ip, ip, op, op);
if (d_bias.numel()) {
auto db = d_bias.data<T>();
Convolution_bp_bias(doF, db, op, op, nActiveOut);
}
}
}
template <typename T, Int Dimension>
double cuda_RandomizedStrideConvolution_updateOutput(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor filterSize,
/*long*/ at::Tensor filterStride, Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features,
/*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor bias) {
auto _rules = m.getRandomizedStrideRuleBook(inputSize, outputSize, filterSize,
filterStride, true);
Int nActiveOut = m.getNActive(outputSize);
if (nActiveOut) {
Int ip = weight.size(1);
Int op = weight.size(2);
output_features.resize_({nActiveOut, op});
auto iF = input_features.data<T>();
auto oF = output_features.data<T>();
auto w = weight.data<T>();
if (bias.numel())
Convolution_fp_bias(oF, bias.data<T>(), op, nActiveOut);
else
output_features.zero_();
return dConvolution_forward2<T>(iF, oF, w, _rules, ip, ip, op, op);
} else {
return 0;
}
}
template <typename T, Int Dimension>
void cuda_RandomizedStrideConvolution_backward(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor filterSize,
/*long*/ at::Tensor filterStride, Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor d_input_features,
/*cuda float*/ at::Tensor d_output_features,
/*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor d_weight,
/*cuda float*/ at::Tensor d_bias) {
auto _rules = m.getRandomizedStrideRuleBook(inputSize, outputSize, filterSize,
filterStride, true);
Int nActiveIn = m.getNActive(inputSize);
Int nActiveOut = m.getNActive(outputSize);
if (nActiveOut) {
Int ip = weight.size(1);
Int op = weight.size(2);
d_input_features.resize_({nActiveIn, ip});
d_input_features.zero_();
auto iF = input_features.data<T>();
auto diF = d_input_features.data<T>();
auto doF = d_output_features.data<T>();
auto w = weight.data<T>();
auto dw = d_weight.data<T>();
dConvolution_backward_dW2<T>(iF, diF, doF, w, dw, _rules, ip, ip, op, op);
if (d_bias.numel()) {
auto db = d_bias.data<T>();
Convolution_bp_bias(doF, db, op, op, nActiveOut);
}
}
}
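// Conventions shared by the wrappers above (explanatory note, inferred from
// the code): ip = weight.size(1) and op = weight.size(2), i.e. weight is
// viewed as (filterVolume, ip, op); each rulebook entry is an
// (inputRow, outputRow) pair of feature-matrix rows; and the value returned
// by the updateOutput functions is the multiply-accumulate count, the sum
// over filter offsets of nHot * ip * op.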
@@ -4,315 +4,649 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "RuleBookIterator.h"

template <typename T>
__global__ void Convolution_fp_bias_(T *output_features, T *bias, Int nPlanes,
                                     Int nActive) {
  Int n = blockIdx.x * 32 + threadIdx.x;
  T b = bias[n];
  output_features += n;
  for (Int row = blockIdx.y; row < nActive; row += gridDim.y) {
    output_features[row * nPlanes] = b;
  }
}

template <typename T>
void Convolution_fp_bias(T *oF, T *b, Int nPlanes, Int nActive) {
  if (nPlanes / 32 > 0)
    Convolution_fp_bias_<<<dim3(nPlanes / 32, 4096), 32>>>(oF, b, nPlanes,
                                                           nActive);
  if (nPlanes % 32 > 0) {
    Int o = nPlanes / 32 * 32;
    Convolution_fp_bias_<<<dim3(1, 4096), nPlanes - o>>>(oF + o, b + o, nPlanes,
                                                         nActive);
  }
}

template <typename T>
__global__ void dColumnSum(T *matrix, T *target, Int nRows, Int nColumns,
                           Int nCOLUMNS) {
  Int i = blockIdx.x * 32 + threadIdx.x;
  T t = 0;
  for (Int j = blockIdx.y; j < nRows; j += 32)
    t += matrix[j * nCOLUMNS + i];
  atomicAdd(&target[i], t);
}

template <typename T>
void Convolution_bp_bias(T *matrix, T *target, Int nPlanes, Int nPlanesStride,
                         Int nActive) {
  // Column sums of the (nActive x nPlanes) gradient matrix. The call sites
  // pass (d_output, d_bias, op, op, nActive), so dColumnSum receives
  // nRows = nActive and row stride = nPlanesStride.
  if (nPlanes / 32 > 0)
    dColumnSum<<<dim3(nPlanes / 32, 32), 32>>>(matrix, target, nActive,
                                               nPlanes, nPlanesStride);
  if (nPlanes % 32 > 0) {
    Int o = nPlanes / 32 * 32;
    dColumnSum<<<dim3(1, 32), nPlanes - o>>>(matrix + o, target + o, nActive,
                                             nPlanes, nPlanesStride);
  }
}

template <typename T, Int K, Int V>
__global__ void
dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules,
                            Int nHot, Int input_nPlanes, Int input_stride,
                            Int output_nPlanes, Int output_stride) {
  // nHot must be a multiple of K!!
  // Input x Weight -> Output
  // blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
  // K is a multiple of V,
  // nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
  Int M = input_nPlanes / K;
  // N = gridDim.y == output_nPlanes/K
  Int n = blockIdx.y;
  outFeatures += n * K;
  w += n * K;
  T O[V];
  __shared__ T W[K][K];
  __shared__ T I[K][K];
  Int R0[V];
  Int R1[V];
  const int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int m = 0; m < M; m++) {
// Read w
#pragma unroll
    for (int v = 0; v < V; v++)
      W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
    for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
      for (int v = 0; v < V; v++) {
        R0[v] = rules[2 * (s + ty[v])];
        R1[v] = rules[2 * (s + ty[v]) + 1];
      }
      __syncthreads();
// Read input, reset O[]
#pragma unroll
      for (int v = 0; v < V; v++) {
        I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
        O[v] = 0;
      }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          O[v] += I[ty[v]][k] * W[k][tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        O[v] += outFeatures[R1[v] * output_stride + tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        outFeatures[R1[v] * output_stride + tx] = O[v];
      __syncthreads();
    }
    w += K * output_nPlanes;
    inFeatures += K;
  }
}

template <typename T, Int K, Int V>
__global__ void
dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules,
                            Int nHot, Int input_nPlanes, Int input_stride,
                            Int output_nPlanes, Int output_stride) {
  // Input x Weight -> Output
  // blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
  // K is a multiple of V,
  // nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
  Int M = input_nPlanes / K;
  // N = gridDim.y == output_nPlanes/K
  Int n = blockIdx.y;
  outFeatures += n * K;
  w += n * K;
  T O[V];
  __shared__ T W[K][K];
  __shared__ T I[K][K];
  Int R0[V];
  Int R1[V];
  const int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int m = 0; m < M; m++) {
// Read w
#pragma unroll
    for (int v = 0; v < V; v++)
      W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
    for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (s + ty[v] < nHot) {
          R0[v] = rules[2 * (s + ty[v])];
          R1[v] = rules[2 * (s + ty[v]) + 1];
        }
      }
      __syncthreads();
// Read input, reset O[]
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (s + ty[v] < nHot)
          I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
        O[v] = 0;
      }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          O[v] += I[ty[v]][k] * W[k][tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nHot)
          O[v] += outFeatures[R1[v] * output_stride + tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nHot)
          outFeatures[R1[v] * output_stride + tx] = O[v];
      __syncthreads();
    }
    w += K * output_nPlanes;
    inFeatures += K;
  }
}

#define FOO(T, K, V)                                                           \
  {                                                                            \
    if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {                  \
      Int o = (nHot / K) * K;                                                  \
      if (o >= K)                                                              \
        dConvolution_KMxKN_forwardA<                                           \
            T, K, V><<<dim3(std::min(o / K, (Int)512), output_nPlanes / K),    \
                       dim3(K, K / V)>>>(inFeatures, outFeatures, w, rules, o, \
                                         input_nPlanes, input_stride,          \
                                         output_nPlanes, output_stride);       \
      if (nHot > o)                                                            \
        dConvolution_KMxKN_forwardB<                                           \
            T, K, V><<<dim3(1, output_nPlanes / K), dim3(K, K / V)>>>(         \
            inFeatures, outFeatures, w, rules + 2 * o, nHot - o,               \
            input_nPlanes, input_stride, output_nPlanes, output_stride);       \
      return;                                                                  \
    }                                                                          \
  }
template <typename T>
void dConvolution_forward(T *inFeatures, T *outFeatures, T *w, Int *rules,
                          Int nHot, Int input_nPlanes, Int input_stride,
                          Int output_nPlanes, Int output_stride) {
  FOO(T, 64, 16)
  FOO(T, 32, 8)
  FOO(T, 16, 4)
  FOO(T, 8, 2)
  assert(false);
}
template <>
void dConvolution_forward<double>(double *inFeatures, double *outFeatures,
                                  double *w, Int *rules, Int nHot,
                                  Int input_nPlanes, Int input_stride,
                                  Int output_nPlanes, Int output_stride) {
  FOO(double, 32, 8)
  FOO(double, 16, 4)
  FOO(double, 8, 2)
  assert(false);
}
#undef FOO

// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template <typename T, Int K, Int V>
__global__ void
dConvolution_KMxKN_backward_dW_A(T *inFeatures, T *dInFeatures,
                                 T *dOutFeatures, T *w, T *dw, Int *rules,
                                 Int nHot, Int input_nPlanes, Int input_stride,
                                 Int output_nPlanes, Int output_stride) {
  // M = gridDim.y == input_nPlanes / K
  Int N = output_nPlanes / K;
  Int m = blockIdx.y;
  inFeatures += m * K;
  dInFeatures += m * K;
  w += m * K * output_nPlanes;
  dw += m * K * output_nPlanes;
  T dI[V];
  T dW[V];
  __shared__ T I[K][K];
  __shared__ T dO[K][K];
  __shared__ T W[K][K];
  Int R0[V];
  Int R1[V];
  const int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int n = 0; n < N; n++) {
// Read w, reset dW
#pragma unroll
    for (int v = 0; v < V; v++) {
      W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
      dW[v] = 0;
    }
    for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
      for (int v = 0; v < V; v++) {
        R0[v] = rules[2 * (s + ty[v])];
        R1[v] = rules[2 * (s + ty[v]) + 1];
        dI[v] = 0;
      }
      __syncthreads();
// Read input and dOutput
#pragma unroll
      for (int v = 0; v < V; v++) {
        I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
        dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx];
      }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++) {
          dI[v] += dO[ty[v]][k] * W[tx][k];
          dW[v] += I[k][ty[v]] * dO[k][tx];
        }
#pragma unroll
      for (int v = 0; v < V; v++)
        dI[v] += dInFeatures[R0[v] * input_stride + tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        dInFeatures[R0[v] * input_stride + tx] = dI[v];
      __syncthreads();
    }
#pragma unroll
    for (int v = 0; v < V; v++)
      atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
    w += K;
    dw += K;
    dOutFeatures += K;
  }
}

// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template <typename T, Int K, Int V>
__global__ void
dConvolution_KMxKN_backward_dW_B(T *inFeatures, T *dInFeatures,
                                 T *dOutFeatures, T *w, T *dw, Int *rules,
                                 Int nHot, Int input_nPlanes, Int input_stride,
                                 Int output_nPlanes, Int output_stride) {
  // M = gridDim.y == input_nPlanes / K
  Int N = output_nPlanes / K;
  Int m = blockIdx.y;
  inFeatures += m * K;
  dInFeatures += m * K;
  w += m * K * output_nPlanes;
  dw += m * K * output_nPlanes;
  T dI[V];
  T dW[V];
  __shared__ T I[K][K];
  __shared__ T dO[K][K];
  __shared__ T W[K][K];
  Int R0[V];
  Int R1[V];
  const int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int n = 0; n < N; n++) {
// Read w, reset dW
#pragma unroll
    for (int v = 0; v < V; v++) {
      W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
      dW[v] = 0;
    }
    for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (s + ty[v] < nHot) {
          R0[v] = rules[2 * (s + ty[v])];
          R1[v] = rules[2 * (s + ty[v]) + 1];
        }
        dI[v] = 0;
      }
      __syncthreads();
// Read input and dOutput
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nHot) {
          I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
          dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx];
        } else {
          I[ty[v]][tx] = 0;
          dO[ty[v]][tx] = 0;
        }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++) {
          dI[v] += dO[ty[v]][k] * W[tx][k];
          dW[v] += I[k][ty[v]] * dO[k][tx];
        }
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nHot)
          dI[v] += dInFeatures[R0[v] * input_stride + tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nHot)
          dInFeatures[R0[v] * input_stride + tx] = dI[v];
      __syncthreads();
    }
#pragma unroll
    for (int v = 0; v < V; v++)
      atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
    w += K;
    dw += K;
    dOutFeatures += K;
  }
}

#define FOO(T, K, V)                                                           \
  {                                                                            \
    if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {                  \
      Int o = (nHot / K) * K;                                                  \
      if (o >= K)                                                              \
        dConvolution_KMxKN_backward_dW_A<                                      \
            T, K, V><<<dim3(std::min(o / K, (Int)512), input_nPlanes / K),     \
                       dim3(K, K / V)>>>(                                      \
            inFeatures, dInFeatures, dOutFeatures, w, dw, rules, o,            \
            input_nPlanes, input_stride, output_nPlanes, output_stride);       \
      if (nHot > o)                                                            \
        dConvolution_KMxKN_backward_dW_B<                                      \
            T, K, V><<<dim3(1, input_nPlanes / K), dim3(K, K / V)>>>(          \
            inFeatures, dInFeatures, dOutFeatures, w, dw, rules + 2 * o,       \
            nHot - o, input_nPlanes, input_stride, output_nPlanes,             \
            output_stride);                                                    \
      return;                                                                  \
    }                                                                          \
  }
template <typename T>
void dConvolution_backward_dW(T *inFeatures, T *dInFeatures, T *dOutFeatures,
                              T *w, T *dw, Int *rules, Int nHot,
                              Int input_nPlanes, Int input_stride,
                              Int output_nPlanes, Int output_stride) {
  FOO(T, 32, 8)
  FOO(T, 16, 4)
  FOO(T, 8, 2)
  assert(false);
}
#undef FOO

template <typename T, Int K, Int V>
__global__ void
dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
                            Int nHot, Int input_nPlanes, Int input_stride,
                            Int output_nPlanes, Int output_stride) {
  // Input x Weight -> Output
  // blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
  // K is a multiple of V,
  // nHot x input_nplanes<=KM -> nHot x output_nPlanes<=KN
  // - parallel over N,nHot - loop over M
  Int M = (input_nPlanes + K - 1) / K;
  // N = gridDim.y ~ output_nPlanes/K
  Int n = blockIdx.y;
  outFeatures += n * K;
  w += n * K;
  Int KO = min(K, output_nPlanes - K * n);
  T O[V];
  __shared__ T W[K][K];
  __shared__ T I[K][K];
  __shared__ Int R[K * 2];
  const int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int m = 0; m < M; m++) {
    Int KI = min(K, input_nPlanes - K * m);
// Read w
#pragma unroll
    for (int v = 0; v < V; v++)
      if (ty[v] < KI and tx < KO)
        W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
    for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
// Read rules for K input/output pairs
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (ty[v] < 2) {
          int q = ty[v] * K + tx;
          if (s + q / 2 < nHot)
            R[q] = rules[2 * s + q];
        }
      }
      __syncthreads();
// Read input, reset O[]
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (tx < KI and s + ty[v] < nHot)
          I[ty[v]][tx] = inFeatures[R[2 * ty[v]] * input_stride + tx];
        O[v] = 0;
      }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < KI; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          O[v] += I[ty[v]][k] * W[k][tx];
      __syncthreads();
#pragma unroll
      for (int v = 0; v < V; v++)
        if (tx < KO and s + ty[v] < nHot)
          outFeatures[R[2 * ty[v] + 1] * output_stride + tx] += O[v];
      __syncthreads();
    }
    w += K * output_nPlanes;
    inFeatures += K;
  }
}

// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template <typename T, Int K, Int V>
__global__ void
dConvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
                                T *w, T *dw, Int *rules, Int nHot,
                                Int input_nPlanes, Int input_stride,
                                Int output_nPlanes, Int output_stride) {
  // M = gridDim.y == input_nPlanes / K
  Int N = (output_nPlanes + K - 1) / K;
  Int m = blockIdx.y;
  inFeatures += m * K;
  dInFeatures += m * K;
  w += m * K * output_nPlanes;
  dw += m * K * output_nPlanes;
  Int KI = min(K, input_nPlanes - K * m);
  T dI[V];
  T dW[V];
  __shared__ T I[K][K];
  __shared__ T dO[K][K];
  __shared__ T W[K][K];
  __shared__ Int R[K * 2];
  const int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int n = 0; n < N; n++) {
    Int KO = min(K, output_nPlanes - K * n);
// Read w, reset dW
#pragma unroll
    for (int v = 0; v < V; v++) {
      if (ty[v] < KI and tx < KO)
        W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
      dW[v] = 0;
    }
    for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
// Read rules for K input/output pairs, reset dI[]
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (ty[v] < 2) {
          int q = ty[v] * K + tx;
          if (s + q / 2 < nHot)
            R[q] = rules[2 * s + q];
        }
        dI[v] = 0;
      }
      __syncthreads();
// Read input and dOutput
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (tx < KI and s + ty[v] < nHot)
          I[ty[v]][tx] = inFeatures[R[2 * ty[v]] * input_stride + tx];
        else
          I[ty[v]][tx] = 0;
        if (tx < KO and s + ty[v] < nHot)
          dO[ty[v]][tx] = dOutFeatures[R[2 * ty[v] + 1] * output_stride + tx];
        else
          dO[ty[v]][tx] = 0;
      }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < KO; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          dI[v] += dO[ty[v]][k] * W[tx][k];
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          dW[v] += I[k][ty[v]] * dO[k][tx];
      __syncthreads();
#pragma unroll
      for (int v = 0; v < V; v++)
        if (tx < KI and s + ty[v] < nHot)
          dInFeatures[R[2 * ty[v]] * input_stride + tx] += dI[v];
      __syncthreads();
    }
#pragma unroll
    for (int v = 0; v < V; v++)
      if (ty[v] < KI and tx < KO)
        atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
    w += K;
    dw += K;
    dOutFeatures += K;
  }
}

template <typename T>
double dConvolution_forward2(T *inFeatures, T *outFeatures, T *w,
                             RuleBook _rules, Int input_nPlanes,
                             Int input_stride, Int output_nPlanes,
                             Int output_stride) {
  Int c = input_nPlanes * output_nPlanes;
  double flops = 0;
  if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) {
    const int K = 16;
    const int V = 4;
    RULEBOOKITERATOR(
        (dConvolution_KMxKN_forward2<
            T, K,
            V><<<dim3(128, (output_nPlanes + K - 1) / K), dim3(K, K / V)>>>(
            inFeatures, outFeatures, w, rbB, nHotB, input_nPlanes, input_stride,
            output_nPlanes, output_stride));
        , w += c; flops += nHotB * c;)
  } else {
    RULEBOOKITERATOR(dConvolution_forward(inFeatures, outFeatures, w, rbB,
                                          nHotB, input_nPlanes, input_stride,
                                          output_nPlanes, output_stride);
                     , w += c; flops += nHotB * c;)
  }
  return flops;
}

template <typename T>
void dConvolution_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
                               T *w, T *dw, RuleBook _rules, Int input_nPlanes,
                               Int input_stride, Int output_nPlanes,
                               Int output_stride) {
  Int c = input_nPlanes * output_nPlanes;
  if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) {
    const int K = 16;
    const int V = 4;
    RULEBOOKITERATOR(
        (dConvolution_KMxKN_backward_dW2<
            T, K,
            V><<<dim3(128, (input_nPlanes + K - 1) / K), dim3(K, K / V)>>>(
            inFeatures, dInFeatures, dOutFeatures, w, dw, rbB, nHotB,
            input_nPlanes, input_stride, output_nPlanes, output_stride));
        , w += c; dw += c;)
  } else {
    RULEBOOKITERATOR(dConvolution_backward_dW(inFeatures, dInFeatures,
                                              dOutFeatures, w, dw, rbB, nHotB,
                                              input_nPlanes, input_stride,
                                              output_nPlanes, output_stride);
                     , w += c; dw += c;)
  }
}
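// Contract of the RULEBOOKITERATOR macro as used above (inferred from its
// call sites, not defined in this file): the first argument is executed once
// per filter offset with rbB bound to that offset's device rule array and
// nHotB to its number of (inputRow, outputRow) pairs; the second argument
// runs after each offset and is used here to advance w (and dw) by
// c = input_nPlanes * output_nPlanes.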
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CUDA_CONVOLUTION_H
#define CUDA_CONVOLUTION_H
template <typename T>
__global__ void Convolution_fp_bias(T *output_features, T *bias, Int nPlanes,
Int output_stride, Int nActive) {
__shared__ T b[32];
b[threadIdx.x] = bias[threadIdx.x];
for (Int row = blockIdx.x; row < nActive; row += 1 << 12) {
output_features[row * output_stride + threadIdx.x] = b[threadIdx.x];
}
}
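// The fixed row stride 1 << 12 == 4096 matches this kernel's launch sites,
// which used gridDim.x = min(4096, nActive) and blockDim.x = min(32, op - i)
// per 32-plane chunk.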
template <typename T>
__global__ void dColumnSum(T *matrix, T *target, Int nRows, Int nColumns,
Int nCOLUMNS) {
Int i = blockIdx.x * 32 + threadIdx.x;
T t = 0;
for (Int j = blockIdx.y; j < nRows; j += 32)
t += matrix[j * nCOLUMNS + i];
atomicAdd(&target[i], t);
}
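// dColumnSum computes column sums into target: the 32 grid.y slices each
// accumulate every 32nd row, and the partial sums are combined with
// atomicAdd, so target is accumulated into rather than overwritten.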
template <typename T>
void Convolution_bp_bias(T *matrix, T *target, Int nPlanes, Int nPlanesStride,
                         Int nActive) {
  // Column sums of the (nActive x nPlanes) gradient matrix; the call sites
  // pass (d_output, d_bias, op, op, nActive), so dColumnSum receives
  // nRows = nActive and row stride = nPlanesStride.
  if (nPlanes / 32 > 0)
    dColumnSum<<<dim3(nPlanes / 32, 32), 32>>>(matrix, target, nActive,
                                               nPlanes, nPlanesStride);
  if (nPlanes % 32 > 0) {
    Int o = nPlanes / 32 * 32;
    dColumnSum<<<dim3(1, 32), nPlanes - o>>>(matrix + o, target + o, nActive,
                                             nPlanes, nPlanesStride);
  }
}
template <typename T, Int K, Int V>
__global__ void
dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
// nHot must be a multiple of K!!
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// K is a multiple of V,
// nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
Int M = input_nPlanes / K;
// N = gridDim.y == output_nPlanes/K
Int n = blockIdx.y;
outFeatures += n * K;
w += n * K;
T O[V];
__shared__ T W[K][K];
__shared__ T I[K][K];
Int R0[V];
Int R1[V];
const int tx = threadIdx.x;
int ty[V];
#pragma unroll
for (int v = 0; v < V; v++)
ty[v] = threadIdx.y + v * (K / V);
for (int m = 0; m < M; m++) {
// Read w
#pragma unroll
for (int v = 0; v < V; v++)
W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
for (int v = 0; v < V; v++) {
R0[v] = rules[2 * (s + ty[v])];
R1[v] = rules[2 * (s + ty[v]) + 1];
}
__syncthreads();
// Read input, reset O[]
#pragma unroll
for (int v = 0; v < V; v++) {
I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
O[v] = 0;
}
__syncthreads();
#pragma unroll
for (int k = 0; k < K; k++)
#pragma unroll
for (int v = 0; v < V; v++)
O[v] += I[ty[v]][k] * W[k][tx];
#pragma unroll
for (int v = 0; v < V; v++)
O[v] += outFeatures[R1[v] * output_stride + tx];
#pragma unroll
for (int v = 0; v < V; v++)
outFeatures[R1[v] * output_stride + tx] = O[v];
__syncthreads();
}
w += K * output_nPlanes;
inFeatures += K;
}
}
template <typename T, Int K, Int V>
__global__ void
dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// K is a multiple of V,
// nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
Int M = input_nPlanes / K;
// N = gridDim.y == output_nPlanes/K
Int n = blockIdx.y;
outFeatures += n * K;
w += n * K;
T O[V];
__shared__ T W[K][K];
__shared__ T I[K][K];
Int R0[V];
Int R1[V];
const int tx = threadIdx.x;
int ty[V];
#pragma unroll
for (int v = 0; v < V; v++)
ty[v] = threadIdx.y + v * (K / V);
for (int m = 0; m < M; m++) {
// Read w
#pragma unroll
for (int v = 0; v < V; v++)
W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
for (int v = 0; v < V; v++) {
if (s + ty[v] < nHot) {
R0[v] = rules[2 * (s + ty[v])];
R1[v] = rules[2 * (s + ty[v]) + 1];
}
}
__syncthreads();
// Read input, reset O[]
#pragma unroll
for (int v = 0; v < V; v++) {
if (s + ty[v] < nHot)
I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
O[v] = 0;
}
__syncthreads();
#pragma unroll
for (int k = 0; k < K; k++)
#pragma unroll
for (int v = 0; v < V; v++)
O[v] += I[ty[v]][k] * W[k][tx];
#pragma unroll
for (int v = 0; v < V; v++)
if (s + ty[v] < nHot)
O[v] += outFeatures[R1[v] * output_stride + tx];
#pragma unroll
for (int v = 0; v < V; v++)
if (s + ty[v] < nHot)
outFeatures[R1[v] * output_stride + tx] = O[v];
__syncthreads();
}
w += K * output_nPlanes;
inFeatures += K;
}
}
#define FOO(T, K, V) \
{ \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
Int o = (nHot / K) * K; \
if (o >= K) \
dConvolution_KMxKN_forwardA< \
T, K, V><<<dim3(std::min(o / K, (Int)512), output_nPlanes / K), \
dim3(K, K / V)>>>( \
inFeatures, outFeatures, w, rules, o, input_nPlanes, input_stride, \
output_nPlanes, output_stride); \
if (nHot > o) \
dConvolution_KMxKN_forwardB< \
T, K, \
V><<<dim3(1, output_nPlanes / K), dim3(K, K / V)>>>( \
inFeatures, outFeatures, w, rules + 2 * o, nHot - o, \
input_nPlanes, input_stride, output_nPlanes, output_stride); \
return; \
} \
}
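// FOO instantiates one (K, V) tile size: forwardA processes the largest
// multiple of K rulebook rows with no bounds checks, and forwardB mops up the
// remaining nHot % K rows with per-row guards. The macro returns from the
// enclosing function at the first K that divides both plane counts.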
template <typename T>
void dConvolution_forward(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
FOO(T, 64, 16)
FOO(T, 32, 8)
FOO(T, 16, 4)
FOO(T, 8, 2)
assert(false);
}
template <>
void dConvolution_forward<double>(double *inFeatures, double *outFeatures,
double *w, Int *rules, Int nHot,
Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
FOO(double, 32, 8)
FOO(double, 16, 4)
FOO(double, 8, 2)
assert(false);
}
#undef FOO
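// Why the double specialization stops at K = 32 (explanatory note, an
// inference): the two K x K shared-memory tiles would occupy
// 2 * 64 * 64 * 8 = 64 KiB at K = 64 in double precision, above the usual
// 48 KiB per-block shared-memory limit, whereas K = 32 needs only 16 KiB.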
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template <typename T, Int K, Int V>
__global__ void
dConvolution_KMxKN_backward_dW_A(T *inFeatures, T *dInFeatures, T *dOutFeatures,
T *w, T *dw, Int *rules, Int nHot,
Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
// M = gridDim.y == input_nPlanes / K
Int N = output_nPlanes / K;
Int m = blockIdx.y;
inFeatures += m * K;
dInFeatures += m * K;
w += m * K * output_nPlanes;
dw += m * K * output_nPlanes;
T dI[V];
T dW[V];
__shared__ T I[K][K];
__shared__ T dO[K][K];
__shared__ T W[K][K];
Int R0[V];
Int R1[V];
const int tx = threadIdx.x;
int ty[V];
#pragma unroll
for (int v = 0; v < V; v++)
ty[v] = threadIdx.y + v * (K / V);
for (int n = 0; n < N; n++) {
// Read w, reset dW
#pragma unroll
for (int v = 0; v < V; v++) {
W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
dW[v] = 0;
}
for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
for (int v = 0; v < V; v++) {
R0[v] = rules[2 * (s + ty[v])];
R1[v] = rules[2 * (s + ty[v]) + 1];
dI[v] = 0;
}
__syncthreads();
// Read input and dOutput
#pragma unroll
for (int v = 0; v < V; v++) {
I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx];
}
__syncthreads();
#pragma unroll
for (int k = 0; k < K; k++)
#pragma unroll
for (int v = 0; v < V; v++) {
dI[v] += dO[ty[v]][k] * W[tx][k];
dW[v] += I[k][ty[v]] * dO[k][tx];
}
#pragma unroll
for (int v = 0; v < V; v++)
dI[v] += dInFeatures[R0[v] * input_stride + tx];
#pragma unroll
for (int v = 0; v < V; v++)
dInFeatures[R0[v] * input_stride + tx] = dI[v];
__syncthreads();
}
#pragma unroll
for (int v = 0; v < V; v++)
atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
w += K;
dw += K;
dOutFeatures += K;
}
}
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template <typename T, Int K, Int V>
__global__ void
dConvolution_KMxKN_backward_dW_B(T *inFeatures, T *dInFeatures, T *dOutFeatures,
T *w, T *dw, Int *rules, Int nHot,
Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
// M = gridDim.y == input_nPlanes / K
Int N = output_nPlanes / K;
Int m = blockIdx.y;
inFeatures += m * K;
dInFeatures += m * K;
w += m * K * output_nPlanes;
dw += m * K * output_nPlanes;
T dI[V];
T dW[V];
__shared__ T I[K][K];
__shared__ T dO[K][K];
__shared__ T W[K][K];
Int R0[V];
Int R1[V];
const int tx = threadIdx.x;
int ty[V];
#pragma unroll
for (int v = 0; v < V; v++)
ty[v] = threadIdx.y + v * (K / V);
for (int n = 0; n < N; n++) {
// Read w, reset dW
#pragma unroll
for (int v = 0; v < V; v++) {
W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
dW[v] = 0;
}
for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
for (int v = 0; v < V; v++) {
if (s + ty[v] < nHot) {
R0[v] = rules[2 * (s + ty[v])];
R1[v] = rules[2 * (s + ty[v]) + 1];
}
dI[v] = 0;
}
__syncthreads();
// Read input and dOutput
#pragma unroll
for (int v = 0; v < V; v++)
if (s + ty[v] < nHot) {
I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx];
} else {
I[ty[v]][tx] = 0;
dO[ty[v]][tx] = 0;
}
__syncthreads();
#pragma unroll
for (int k = 0; k < K; k++)
#pragma unroll
for (int v = 0; v < V; v++) {
dI[v] += dO[ty[v]][k] * W[tx][k];
dW[v] += I[k][ty[v]] * dO[k][tx];
}
#pragma unroll
for (int v = 0; v < V; v++)
if (s + ty[v] < nHot)
dI[v] += dInFeatures[R0[v] * input_stride + tx];
#pragma unroll
for (int v = 0; v < V; v++)
if (s + ty[v] < nHot)
dInFeatures[R0[v] * input_stride + tx] = dI[v];
__syncthreads();
}
#pragma unroll
for (int v = 0; v < V; v++)
atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
w += K;
dw += K;
dOutFeatures += K;
}
}
#define FOO(T, K, V) \
{ \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
Int o = (nHot / K) * K; \
if (o >= K) \
dConvolution_KMxKN_backward_dW_A< \
T, K, V><<<dim3(std::min(o / K, (Int)512), input_nPlanes / K), \
dim3(K, K / V)>>>( \
inFeatures, dInFeatures, dOutFeatures, w, dw, rules, o, \
input_nPlanes, input_stride, output_nPlanes, output_stride); \
if (nHot > o) \
dConvolution_KMxKN_backward_dW_B< \
T, K, \
V><<<dim3(1, input_nPlanes / K), dim3(K, K / V)>>>( \
inFeatures, dInFeatures, dOutFeatures, w, dw, rules + 2 * o, \
nHot - o, input_nPlanes, input_stride, output_nPlanes, \
output_stride); \
return; \
} \
}
template <typename T>
void dConvolution_backward_dW(T *inFeatures, T *dInFeatures, T *dOutFeatures,
T *w, T *dw, Int *rules, Int nHot,
Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
FOO(T, 32, 8)
FOO(T, 16, 4)
FOO(T, 8, 2)
assert(false);
}
#undef FOO
template <typename T, Int K, Int V>
__global__ void
dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// K is a multiple of V,
// nHot x input_nplanes<=KM -> nHot x output_nPlanes<=KN
// - parallel over N,nHot - loop over M
Int M = (input_nPlanes + K - 1) / K;
// N = gridDim.y ~ output_nPlanes/K
Int n = blockIdx.y;
outFeatures += n * K;
w += n * K;
Int KO = min(K, output_nPlanes - K * n);
T O[V];
__shared__ T W[K][K];
__shared__ T I[K][K];
__shared__ Int R[K * 2];
const int tx = threadIdx.x;
int ty[V];
#pragma unroll
for (int v = 0; v < V; v++)
ty[v] = threadIdx.y + v * (K / V);
for (int m = 0; m < M; m++) {
Int KI = min(K, input_nPlanes - K * m);
// Read w
#pragma unroll
for (int v = 0; v < V; v++)
if (ty[v] < KI and tx < KO)
W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
// Read rules for K input/output pairs
#pragma unroll
for (int v = 0; v < V; v++) {
if (ty[v] < 2) {
int q = ty[v] * K + tx;
if (s + q / 2 < nHot)
R[q] = rules[2 * s + q];
}
}
__syncthreads();
// Read input, reset O[]
#pragma unroll
for (int v = 0; v < V; v++) {
if (tx < KI and s + ty[v] < nHot)
I[ty[v]][tx] = inFeatures[R[2 * ty[v]] * input_stride + tx];
O[v] = 0;
}
__syncthreads();
#pragma unroll
for (int k = 0; k < KI; k++)
#pragma unroll
for (int v = 0; v < V; v++)
O[v] += I[ty[v]][k] * W[k][tx];
__syncthreads();
#pragma unroll
for (int v = 0; v < V; v++)
if (tx < KO and s + ty[v] < nHot)
outFeatures[R[2 * ty[v] + 1] * output_stride + tx] += O[v];
__syncthreads();
}
w += K * output_nPlanes;
inFeatures += K;
}
}
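// Unlike forwardA/forwardB, forward2 tolerates plane counts that are not
// multiples of K: each tile is clamped to KI x KO, the 2K rule entries for a
// tile of rows are staged in shared memory, and results are accumulated into
// outFeatures with += instead of the read-add-write sequence used above.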
template <typename T>
void dConvolution_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) {
const int K = 16;
const int V = 4;
dConvolution_KMxKN_forward2<
T, K, V><<<dim3(128, (output_nPlanes + K - 1) / K), dim3(K, K / V)>>>(
inFeatures, outFeatures, w, rules, nHot, input_nPlanes, input_stride,
output_nPlanes, output_stride);
return;
} else {
dConvolution_forward(inFeatures, outFeatures, w, rules, nHot, input_nPlanes,
input_stride, output_nPlanes, output_stride);
}
}
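// Dispatch rule: plane counts divisible by 8 take the unguarded tiled path
// (dConvolution_forward); anything else falls back to the bounds-checked
// forward2 kernel with a fixed K = 16, V = 4 tile.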
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template <typename T, Int K, Int V>
__global__ void
dConvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
T *w, T *dw, Int *rules, Int nHot,
Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
// M = gridDim.y == input_nPlanes / K
Int N = (output_nPlanes + K - 1) / K;
Int m = blockIdx.y;
inFeatures += m * K;
dInFeatures += m * K;
w += m * K * output_nPlanes;
dw += m * K * output_nPlanes;
Int KI = min(K, input_nPlanes - K * m);
T dI[V];
T dW[V];
__shared__ T I[K][K];
__shared__ T dO[K][K];
__shared__ T W[K][K];
__shared__ Int R[K * 2];
const int tx = threadIdx.x;
int ty[V];
#pragma unroll
for (int v = 0; v < V; v++)
ty[v] = threadIdx.y + v * (K / V);
for (int n = 0; n < N; n++) {
Int KO = min(K, output_nPlanes - K * n);
// Read w, reset dW
#pragma unroll
for (int v = 0; v < V; v++) {
if (ty[v] < KI and tx < KO)
W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
dW[v] = 0;
}
for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
// Read rules for K input/output pairs, reset dI[]
#pragma unroll
for (int v = 0; v < V; v++) {
if (ty[v] < 2) {
int q = ty[v] * K + tx;
if (s + q / 2 < nHot)
R[q] = rules[2 * s + q];
}
dI[v] = 0;
}
__syncthreads();
// Read input and dOutput
#pragma unroll
for (int v = 0; v < V; v++) {
if (tx < KI and s + ty[v] < nHot)
I[ty[v]][tx] = inFeatures[R[2 * ty[v]] * input_stride + tx];
else
I[ty[v]][tx] = 0;
if (tx < KO and s + ty[v] < nHot)
dO[ty[v]][tx] = dOutFeatures[R[2 * ty[v] + 1] * output_stride + tx];
else
dO[ty[v]][tx] = 0;
}
__syncthreads();
#pragma unroll
for (int k = 0; k < KO; k++)
#pragma unroll
for (int v = 0; v < V; v++)
dI[v] += dO[ty[v]][k] * W[tx][k];
#pragma unroll
for (int k = 0; k < K; k++)
#pragma unroll
for (int v = 0; v < V; v++)
dW[v] += I[k][ty[v]] * dO[k][tx];
__syncthreads();
#pragma unroll
for (int v = 0; v < V; v++)
if (tx < KI and s + ty[v] < nHot)
dInFeatures[R[2 * ty[v]] * input_stride + tx] += dI[v];
__syncthreads();
}
#pragma unroll
for (int v = 0; v < V; v++)
if (ty[v] < KI and tx < KO)
atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
w += K;
dw += K;
dOutFeatures += K;
}
}
template <typename T>
void dConvolution_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
T *w, T *dw, Int *rules, Int nHot,
Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) {
const int K = 16;
const int V = 4;
dConvolution_KMxKN_backward_dW2<T, K, V><<<
dim3(128, (input_nPlanes + K - 1) / K), dim3(K, K / V)>>>(
inFeatures, dInFeatures, dOutFeatures, w, dw, rules, nHot,
input_nPlanes, input_stride, output_nPlanes, output_stride);
return;
} else {
dConvolution_backward_dW(inFeatures, dInFeatures, dOutFeatures, w, dw,
rules, nHot, input_nPlanes, input_stride,
output_nPlanes, output_stride);
}
}
#endif /* CUDA_CONVOLUTION_H */
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
template <typename T>
double dDeconvolution_forward2(T *inFeatures, T *outFeatures, T *w,
RuleBook _rules, Int input_nPlanes,
Int input_stride, Int output_nPlanes,
Int output_stride);
template <typename T>
void dDeconvolution_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
T *w, T *dw, RuleBook _rules,
Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride);
template <typename T, Int Dimension>
double cuda_Deconvolution_updateOutput(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor filterSize,
/*long*/ at::Tensor filterStride, Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features, /*cuda float*/ at::Tensor weight,
/*cuda float*/ at::Tensor bias) {
auto _rules =
m.getRuleBook(outputSize, inputSize, filterSize, filterStride, true);
Int nActiveOut = m.getNActive(outputSize);
if (nActiveOut) {
Int ip = weight.size(1);
Int op = weight.size(2);
output_features.resize_({nActiveOut, op});
auto iF = input_features.data<T>();
auto oF = output_features.data<T>();
auto w = weight.data<T>();
if (bias.numel())
Convolution_fp_bias(oF, bias.data<T>(), op, nActiveOut);
else
output_features.zero_();
return dDeconvolution_forward2<T>(iF, oF, w, _rules, ip, ip, op, op);
} else {
return 0;
}
}
template <typename T, Int Dimension>
void cuda_Deconvolution_backward(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor filterSize,
/*long*/ at::Tensor filterStride, Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor d_input_features,
/*cuda float*/ at::Tensor d_output_features,
/*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor d_weight,
/*cuda float*/ at::Tensor d_bias) {
auto _rules =
m.getRuleBook(outputSize, inputSize, filterSize, filterStride, true);
Int nActiveIn = m.getNActive(inputSize);
Int nActiveOut = m.getNActive(outputSize);
if (nActiveOut) {
Int ip = weight.size(1);
Int op = weight.size(2);
d_input_features.resize_({nActiveIn, ip});
d_input_features.zero_();
auto iF = input_features.data<T>();
auto diF = d_input_features.data<T>();
auto doF = d_output_features.data<T>();
auto w = weight.data<T>();
auto dw = d_weight.data<T>();
dDeconvolution_backward_dW2<T>(iF, diF, doF, w, dw, _rules, ip, ip, op, op);
if (d_bias.numel()) {
auto db = d_bias.data<T>();
Convolution_bp_bias(doF, db, op, op, nActiveOut);
}
}
}
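// Deconvolution reuses the convolution rulebook machinery with the roles of
// input and output swapped: note the getRuleBook(outputSize, inputSize, ...)
// calls above, and, in the kernels below, each rule pair is read in reverse
// order (output row first), making this the transpose of the convolution
// forward pass.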
@@ -4,80 +4,600 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
template <typename T, Int K, Int V>
__global__ void
dDeconvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules,
                              Int nHot, Int input_nPlanes, Int input_stride,
                              Int output_nPlanes, Int output_stride) {
  // nHot must be a multiple of K!!
  // Input x Weight -> Output
  // blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
  // K is a multiple of V,
  // nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
  Int M = input_nPlanes / K;
  // N = gridDim.y == output_nPlanes/K
  Int n = blockIdx.y;
  outFeatures += n * K;
  w += n * K;
  T O[V];
  __shared__ T W[K][K];
  __shared__ T I[K][K];
  Int R0[V];
  Int R1[V];
  const int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int m = 0; m < M; m++) {
// Read w
#pragma unroll
    for (int v = 0; v < V; v++)
      W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
    for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
      for (int v = 0; v < V; v++) {
        R1[v] = rules[2 * (s + ty[v])];
        R0[v] = rules[2 * (s + ty[v]) + 1];
      }
      __syncthreads();
// Read input, reset O[]
#pragma unroll
      for (int v = 0; v < V; v++) {
        I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
        O[v] = 0;
      }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          O[v] += I[ty[v]][k] * W[k][tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        O[v] += outFeatures[R1[v] * output_stride + tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        outFeatures[R1[v] * output_stride + tx] = O[v];
      __syncthreads();
    }
    w += K * output_nPlanes;
    inFeatures += K;
  }
}
template <typename T, Int K, Int V>
__global__ void
dDeconvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules,
                              Int nHot, Int input_nPlanes, Int input_stride,
                              Int output_nPlanes, Int output_stride) {
  // Input x Weight -> Output
  // blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
  // K is a multiple of V,
  // nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
  Int M = input_nPlanes / K;
  // N = gridDim.y == output_nPlanes/K
  Int n = blockIdx.y;
  outFeatures += n * K;
  w += n * K;
  T O[V];
  __shared__ T W[K][K];
  __shared__ T I[K][K];
  Int R0[V];
  Int R1[V];
  const int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int m = 0; m < M; m++) {
// Read w
#pragma unroll
    for (int v = 0; v < V; v++)
      W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
    for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (s + ty[v] < nHot) {
          R1[v] = rules[2 * (s + ty[v])];
          R0[v] = rules[2 * (s + ty[v]) + 1];
        }
      }
      __syncthreads();
// Read input, reset O[]
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (s + ty[v] < nHot)
          I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
        O[v] = 0;
      }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          O[v] += I[ty[v]][k] * W[k][tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nHot)
          O[v] += outFeatures[R1[v] * output_stride + tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nHot)
          outFeatures[R1[v] * output_stride + tx] = O[v];
      __syncthreads();
    }
    w += K * output_nPlanes;
    inFeatures += K;
  }
}
#define FOO(T, K, V) \
{ \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
Int o = (nHot / K) * K; \
if (o >= K) \
dDeconvolution_KMxKN_forwardA< \
T, K, V><<<dim3(std::min(o / K, (Int)512), output_nPlanes / K), \
dim3(K, K / V)>>>(inFeatures, outFeatures, w, rules, o, \
input_nPlanes, input_stride, \
output_nPlanes, output_stride); \
if (nHot > o) \
dDeconvolution_KMxKN_forwardB< \
T, K, V><<<dim3(1, output_nPlanes / K), dim3(K, K / V)>>>( \
inFeatures, outFeatures, w, rules + 2 * o, nHot - o, \
input_nPlanes, input_stride, output_nPlanes, output_stride); \
return; \
} \
}
template <typename T>
void dDeconvolution_forward(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
FOO(T, 64, 16)
FOO(T, 32, 8)
FOO(T, 16, 4)
FOO(T, 8, 2)
assert(false);
}
template <>
void dDeconvolution_forward<double>(double *inFeatures, double *outFeatures,
double *w, Int *rules, Int nHot,
Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
FOO(double, 32, 8)
FOO(double, 16, 4)
FOO(double, 8, 2)
assert(false);
}
#undef FOO
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template <typename T, Int K, Int V>
__global__ void dDeconvolution_KMxKN_backward_dW_A(
T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride, Int output_nPlanes,
Int output_stride) {
// M = gridDim.y == input_nPlanes / K
Int N = output_nPlanes / K;
Int m = blockIdx.y;
inFeatures += m * K;
dInFeatures += m * K;
w += m * K * output_nPlanes;
dw += m * K * output_nPlanes;
T dI[V];
T dW[V];
__shared__ T I[K][K];
__shared__ T dO[K][K];
__shared__ T W[K][K];
Int R0[V];
Int R1[V];
const int tx = threadIdx.x;
int ty[V];
#pragma unroll
for (int v = 0; v < V; v++)
ty[v] = threadIdx.y + v * (K / V);
for (int n = 0; n < N; n++) {
// Read w, reset dW
#pragma unroll
for (int v = 0; v < V; v++) {
W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
dW[v] = 0;
} }
for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
for (int v = 0; v < V; v++) {
R1[v] = rules[2 * (s + ty[v])];
R0[v] = rules[2 * (s + ty[v]) + 1];
dI[v] = 0;
}
__syncthreads();
// Read input and dOutput
#pragma unroll
for (int v = 0; v < V; v++) {
I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx];
}
__syncthreads();
#pragma unroll
for (int k = 0; k < K; k++)
#pragma unroll
for (int v = 0; v < V; v++) {
dI[v] += dO[ty[v]][k] * W[tx][k];
dW[v] += I[k][ty[v]] * dO[k][tx];
}
#pragma unroll
for (int v = 0; v < V; v++)
dI[v] += dInFeatures[R0[v] * input_stride + tx];
#pragma unroll
for (int v = 0; v < V; v++)
dInFeatures[R0[v] * input_stride + tx] = dI[v];
__syncthreads();
}
#pragma unroll
for (int v = 0; v < V; v++)
atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
w += K;
dw += K;
dOutFeatures += K;
}
}
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template <typename T, Int K, Int V>
__global__ void dDeconvolution_KMxKN_backward_dW_B(
T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride, Int output_nPlanes,
Int output_stride) {
// M = gridDim.y == input_nPlanes / K
Int N = output_nPlanes / K;
Int m = blockIdx.y;
inFeatures += m * K;
dInFeatures += m * K;
w += m * K * output_nPlanes;
dw += m * K * output_nPlanes;
T dI[V];
T dW[V];
__shared__ T I[K][K];
__shared__ T dO[K][K];
__shared__ T W[K][K];
Int R0[V];
Int R1[V];
const int tx = threadIdx.x;
int ty[V];
#pragma unroll
for (int v = 0; v < V; v++)
ty[v] = threadIdx.y + v * (K / V);
for (int n = 0; n < N; n++) {
// Read w, reset dW
#pragma unroll
for (int v = 0; v < V; v++) {
W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
dW[v] = 0;
}
for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
for (int v = 0; v < V; v++) {
if (s + ty[v] < nHot) {
R1[v] = rules[2 * (s + ty[v])];
R0[v] = rules[2 * (s + ty[v]) + 1];
}
dI[v] = 0;
}
__syncthreads();
// Read input and dOutput
#pragma unroll
for (int v = 0; v < V; v++)
if (s + ty[v] < nHot) {
I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx];
} else {
I[ty[v]][tx] = 0;
dO[ty[v]][tx] = 0;
}
__syncthreads();
#pragma unroll
for (int k = 0; k < K; k++)
#pragma unroll
for (int v = 0; v < V; v++) {
dI[v] += dO[ty[v]][k] * W[tx][k];
dW[v] += I[k][ty[v]] * dO[k][tx];
}
#pragma unroll
for (int v = 0; v < V; v++)
if (s + ty[v] < nHot)
dI[v] += dInFeatures[R0[v] * input_stride + tx];
#pragma unroll
for (int v = 0; v < V; v++)
if (s + ty[v] < nHot)
dInFeatures[R0[v] * input_stride + tx] = dI[v];
__syncthreads();
}
#pragma unroll
for (int v = 0; v < V; v++)
atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
w += K;
dw += K;
dOutFeatures += K;
}
}
#define FOO(T, K, V) \
{ \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
Int o = (nHot / K) * K; \
if (o >= K) \
dDeconvolution_KMxKN_backward_dW_A< \
T, K, V><<<dim3(std::min(o / K, (Int)512), input_nPlanes / K), \
dim3(K, K / V)>>>( \
inFeatures, dInFeatures, dOutFeatures, w, dw, rules, o, \
input_nPlanes, input_stride, output_nPlanes, output_stride); \
if (nHot > o) \
dDeconvolution_KMxKN_backward_dW_B< \
T, K, V><<<dim3(1, input_nPlanes / K), dim3(K, K / V)>>>( \
inFeatures, dInFeatures, dOutFeatures, w, dw, rules + 2 * o, \
nHot - o, input_nPlanes, input_stride, output_nPlanes, \
output_stride); \
return; \
} \
}
template <typename T>
void dDeconvolution_backward_dW(T *inFeatures, T *dInFeatures, T *dOutFeatures,
T *w, T *dw, Int *rules, Int nHot,
Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
FOO(T, 32, 8)
FOO(T, 16, 4)
FOO(T, 8, 2)
assert(false);
}
#undef FOO
template <typename T, Int K, Int V>
__global__ void
dDeconvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// K is a multiple of V,
// nHot x input_nplanes<=KM -> nHot x output_nPlanes<=KN
// - parallel over N,nHot - loop over M
Int M = (input_nPlanes + K - 1) / K;
// N = gridDim.y ~ output_nPlanes/K
Int n = blockIdx.y;
outFeatures += n * K;
w += n * K;
Int KO = min(K, output_nPlanes - K * n);
T O[V];
__shared__ T W[K][K];
__shared__ T I[K][K];
__shared__ Int R[K * 2];
const int tx = threadIdx.x;
int ty[V];
#pragma unroll
for (int v = 0; v < V; v++)
ty[v] = threadIdx.y + v * (K / V);
for (int m = 0; m < M; m++) {
Int KI = min(K, input_nPlanes - K * m);
// Read w
#pragma unroll
for (int v = 0; v < V; v++)
if (ty[v] < KI and tx < KO)
W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
// Read rules for K input/output pairs
#pragma unroll
for (int v = 0; v < V; v++) {
if (ty[v] < 2) {
int q = ty[v] * K + tx;
if (s + q / 2 < nHot)
R[q] = rules[2 * s + q];
}
}
__syncthreads();
// Read input, reset O[]
#pragma unroll
for (int v = 0; v < V; v++) {
if (tx < KI and s + ty[v] < nHot)
I[ty[v]][tx] = inFeatures[R[2 * ty[v] + 1] * input_stride + tx];
O[v] = 0;
}
__syncthreads();
#pragma unroll
for (int k = 0; k < KI; k++)
#pragma unroll
for (int v = 0; v < V; v++)
O[v] += I[ty[v]][k] * W[k][tx];
__syncthreads();
#pragma unroll
for (int v = 0; v < V; v++)
if (tx < KO and s + ty[v] < nHot)
outFeatures[R[2 * ty[v]] * output_stride + tx] += O[v];
__syncthreads();
}
w += K * output_nPlanes;
inFeatures += K;
}
}
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template <typename T, Int K, Int V>
__global__ void
dDeconvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures,
T *dOutFeatures, T *w, T *dw, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
// M = gridDim.y == input_nPlanes / K
Int N = (output_nPlanes + K - 1) / K;
Int m = blockIdx.y;
inFeatures += m * K;
dInFeatures += m * K;
w += m * K * output_nPlanes;
dw += m * K * output_nPlanes;
Int KI = min(K, input_nPlanes - K * m);
T dI[V];
T dW[V];
__shared__ T I[K][K];
__shared__ T dO[K][K];
__shared__ T W[K][K];
__shared__ Int R[K * 2];
const int tx = threadIdx.x;
int ty[V];
#pragma unroll
for (int v = 0; v < V; v++)
ty[v] = threadIdx.y + v * (K / V);
for (int n = 0; n < N; n++) {
Int KO = min(K, output_nPlanes - K * n);
// Read w, reset dW
#pragma unroll
for (int v = 0; v < V; v++) {
if (ty[v] < KI and tx < KO)
W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
dW[v] = 0;
}
for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
// Read rules for K input/output pairs, reset dI[]
#pragma unroll
for (int v = 0; v < V; v++) {
if (ty[v] < 2) {
int q = ty[v] * K + tx;
if (s + q / 2 < nHot)
R[q] = rules[2 * s + q];
}
dI[v] = 0;
}
__syncthreads();
// Read input and dOutput
#pragma unroll
for (int v = 0; v < V; v++) {
if (tx < KI and s + ty[v] < nHot)
I[ty[v]][tx] = inFeatures[R[2 * ty[v] + 1] * input_stride + tx];
else
I[ty[v]][tx] = 0;
if (tx < KO and s + ty[v] < nHot)
dO[ty[v]][tx] = dOutFeatures[R[2 * ty[v]] * output_stride + tx];
else
dO[ty[v]][tx] = 0;
}
__syncthreads();
#pragma unroll
for (int k = 0; k < KO; k++)
#pragma unroll
for (int v = 0; v < V; v++)
dI[v] += dO[ty[v]][k] * W[tx][k];
#pragma unroll
for (int k = 0; k < K; k++)
#pragma unroll
for (int v = 0; v < V; v++)
dW[v] += I[k][ty[v]] * dO[k][tx];
__syncthreads();
#pragma unroll
for (int v = 0; v < V; v++)
if (tx < KI and s + ty[v] < nHot)
dInFeatures[R[2 * ty[v] + 1] * input_stride + tx] += dI[v];
__syncthreads();
}
#pragma unroll
for (int v = 0; v < V; v++)
if (ty[v] < KI and tx < KO)
atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
w += K;
dw += K;
dOutFeatures += K;
}
}
template <typename T>
double dDeconvolution_forward2(T *inFeatures, T *outFeatures, T *w,
RuleBook _rules, Int input_nPlanes,
Int input_stride, Int output_nPlanes,
Int output_stride) {
Int c = input_nPlanes * output_nPlanes;
double flops = 0;
if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) {
const int K = 16;
const int V = 4;
RULEBOOKITERATOR(
(dDeconvolution_KMxKN_forward2<
T, K,
V><<<dim3(128, (output_nPlanes + K - 1) / K), dim3(K, K / V)>>>(
inFeatures, outFeatures, w, rbB, nHotB, input_nPlanes, input_stride,
output_nPlanes, output_stride));
, w += c; flops += nHotB * c;)
} else {
RULEBOOKITERATOR(dDeconvolution_forward(inFeatures, outFeatures, w, rbB,
nHotB, input_nPlanes, input_stride,
output_nPlanes, output_stride);
, w += c; flops += nHotB * c;)
} }
Int c = ip * op;
RULEBOOKITERATOR(
dDeconvolution_forward2<T>(iF, oF, w, rbB, nHotB, ip, ip, op, op);
, w += c; flops += nHotB * c;)
return flops; return flops;
} }
template <typename T, Int Dimension> template <typename T>
void cuda_Deconvolution_backward( void dDeconvolution_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize, T *w, T *dw, RuleBook _rules,
/*long*/ at::Tensor filterSize, Int input_nPlanes, Int input_stride,
/*long*/ at::Tensor filterStride, Metadata<Dimension> &m, Int output_nPlanes, Int output_stride) {
/*cuda float*/ at::Tensor input_features, Int c = input_nPlanes * output_nPlanes;
/*cuda float*/ at::Tensor d_input_features, if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) {
/*cuda float*/ at::Tensor d_output_features, const int K = 16;
/*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor d_weight, const int V = 4;
/*cuda float*/ at::Tensor d_bias) { RULEBOOKITERATOR(
(dDeconvolution_KMxKN_backward_dW2<
auto _rules = T, K,
m.getRuleBook(outputSize, inputSize, filterSize, filterStride, true); V><<<dim3(128, (input_nPlanes + K - 1) / K), dim3(K, K / V)>>>(
Int nActive = m.getNActive(outputSize); inFeatures, dInFeatures, dOutFeatures, w, dw, rbB, nHotB,
d_input_features.resize_as_(input_features); input_nPlanes, input_stride, output_nPlanes, output_stride));
d_input_features.zero_(); , w += c; dw += c;)
} else {
auto iF = input_features.data<T>(); RULEBOOKITERATOR(dDeconvolution_backward_dW(inFeatures, dInFeatures,
auto diF = d_input_features.data<T>(); dOutFeatures, w, dw, rbB, nHotB,
auto doF = d_output_features.data<T>(); input_nPlanes, input_stride,
Int ip = input_features.size(1); output_nPlanes, output_stride);
Int op = d_output_features.size(1); , w += c; dw += c;)
auto w = weight.data<T>();
auto dw = d_weight.data<T>();
Int c = ip * op;
RULEBOOKITERATOR(dDeconvolution_backward_dW2<T>(iF, diF, doF, w, dw, rbB,
nHotB, ip, ip, op, op);
, w += c; dw += c;)
if (d_bias.numel()) {
auto db = d_bias.data<T>();
Convolution_bp_bias(doF, db, op, op, nActive);
} }
} }
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CUDA_DECONVOLUTION_H
#define CUDA_DECONVOLUTION_H
#include "Convolution.h"
template <typename T, Int K, Int V>
__global__ void
dDeconvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
// nHot must be a multiple of K!!
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// K is a multiple of V,
// nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
Int M = input_nPlanes / K;
// N = gridDim.y == output_nPlanes/K
Int n = blockIdx.y;
outFeatures += n * K;
w += n * K;
T O[V];
__shared__ T W[K][K];
__shared__ T I[K][K];
Int R0[V];
Int R1[V];
const int tx = threadIdx.x;
int ty[V];
#pragma unroll
for (int v = 0; v < V; v++)
ty[v] = threadIdx.y + v * (K / V);
for (int m = 0; m < M; m++) {
// Read w
#pragma unroll
for (int v = 0; v < V; v++)
W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
for (int v = 0; v < V; v++) {
R1[v] = rules[2 * (s + ty[v])];
R0[v] = rules[2 * (s + ty[v]) + 1];
}
__syncthreads();
// Read input, reset O[]
#pragma unroll
for (int v = 0; v < V; v++) {
I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
O[v] = 0;
}
__syncthreads();
#pragma unroll
for (int k = 0; k < K; k++)
#pragma unroll
for (int v = 0; v < V; v++)
O[v] += I[ty[v]][k] * W[k][tx];
#pragma unroll
for (int v = 0; v < V; v++)
O[v] += outFeatures[R1[v] * output_stride + tx];
#pragma unroll
for (int v = 0; v < V; v++)
outFeatures[R1[v] * output_stride + tx] = O[v];
__syncthreads();
}
w += K * output_nPlanes;
inFeatures += K;
}
}
template <typename T, Int K, Int V>
__global__ void
dDeconvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// K is a multiple of V,
// nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
Int M = input_nPlanes / K;
// N = gridDim.y == output_nPlanes/K
Int n = blockIdx.y;
outFeatures += n * K;
w += n * K;
T O[V];
__shared__ T W[K][K];
__shared__ T I[K][K];
Int R0[V];
Int R1[V];
const int tx = threadIdx.x;
int ty[V];
#pragma unroll
for (int v = 0; v < V; v++)
ty[v] = threadIdx.y + v * (K / V);
for (int m = 0; m < M; m++) {
// Read w
#pragma unroll
for (int v = 0; v < V; v++)
W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
for (int v = 0; v < V; v++) {
if (s + ty[v] < nHot) {
R1[v] = rules[2 * (s + ty[v])];
R0[v] = rules[2 * (s + ty[v]) + 1];
}
}
__syncthreads();
// Read input, reset O[]
#pragma unroll
for (int v = 0; v < V; v++) {
if (s + ty[v] < nHot)
I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
O[v] = 0;
}
__syncthreads();
#pragma unroll
for (int k = 0; k < K; k++)
#pragma unroll
for (int v = 0; v < V; v++)
O[v] += I[ty[v]][k] * W[k][tx];
#pragma unroll
for (int v = 0; v < V; v++)
if (s + ty[v] < nHot)
O[v] += outFeatures[R1[v] * output_stride + tx];
#pragma unroll
for (int v = 0; v < V; v++)
if (s + ty[v] < nHot)
outFeatures[R1[v] * output_stride + tx] = O[v];
__syncthreads();
}
w += K * output_nPlanes;
inFeatures += K;
}
}
#define FOO(T, K, V) \
{ \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
Int o = (nHot / K) * K; \
if (o >= K) \
dDeconvolution_KMxKN_forwardA< \
T, K, V><<<dim3(std::min(o / K, (Int)512), output_nPlanes / K), \
dim3(K, K / V)>>>( \
inFeatures, outFeatures, w, rules, o, input_nPlanes, input_stride, \
output_nPlanes, output_stride); \
if (nHot > o) \
dDeconvolution_KMxKN_forwardB< \
T, K, \
V><<<dim3(1, output_nPlanes / K), dim3(K, K / V)>>>( \
inFeatures, outFeatures, w, rules + 2 * o, nHot - o, \
input_nPlanes, input_stride, output_nPlanes, output_stride); \
return; \
} \
}
template <typename T>
void dDeconvolution_forward(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
FOO(T, 64, 16)
FOO(T, 32, 8)
FOO(T, 16, 4)
FOO(T, 8, 2)
assert(false);
}
template <>
void dDeconvolution_forward<double>(double *inFeatures, double *outFeatures,
double *w, Int *rules, Int nHot,
Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
FOO(double, 32, 8)
FOO(double, 16, 4)
FOO(double, 8, 2)
assert(false);
}
#undef FOO
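// The FOO dispatch above amounts to choosing the widest tile K (with V
// outputs per thread) that divides both plane counts -- starting from 32
// rather than 64 for double, presumably to bound shared-memory use -- then
// running the unchecked kernel A on the K-aligned prefix of the nHot rule
// pairs and the bounds-checked kernel B on the remainder. The selection,
// written out as a plain function (illustrative only):
inline Int pickTileWidthK(Int input_nPlanes, Int output_nPlanes) {
  const Int Ks[4] = {64, 32, 16, 8};
  for (Int K : Ks)
    if (input_nPlanes % K == 0 and output_nPlanes % K == 0)
      return K;
  return 0; // no aligned tile; the dispatcher above hits assert(false)
}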
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template <typename T, Int K, Int V>
__global__ void dDeconvolution_KMxKN_backward_dW_A(
T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride, Int output_nPlanes,
Int output_stride) {
// M = gridDim.y == input_nPlanes / K
Int N = output_nPlanes / K;
Int m = blockIdx.y;
inFeatures += m * K;
dInFeatures += m * K;
w += m * K * output_nPlanes;
dw += m * K * output_nPlanes;
T dI[V];
T dW[V];
__shared__ T I[K][K];
__shared__ T dO[K][K];
__shared__ T W[K][K];
Int R0[V];
Int R1[V];
const int tx = threadIdx.x;
int ty[V];
#pragma unroll
for (int v = 0; v < V; v++)
ty[v] = threadIdx.y + v * (K / V);
for (int n = 0; n < N; n++) {
// Read w, reset dW
#pragma unroll
for (int v = 0; v < V; v++) {
W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
dW[v] = 0;
}
for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
for (int v = 0; v < V; v++) {
R1[v] = rules[2 * (s + ty[v])];
R0[v] = rules[2 * (s + ty[v]) + 1];
dI[v] = 0;
}
__syncthreads();
// Read input and dOutput
#pragma unroll
for (int v = 0; v < V; v++) {
I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx];
}
__syncthreads();
#pragma unroll
for (int k = 0; k < K; k++)
#pragma unroll
for (int v = 0; v < V; v++) {
dI[v] += dO[ty[v]][k] * W[tx][k];
dW[v] += I[k][ty[v]] * dO[k][tx];
}
#pragma unroll
for (int v = 0; v < V; v++)
dI[v] += dInFeatures[R0[v] * input_stride + tx];
#pragma unroll
for (int v = 0; v < V; v++)
dInFeatures[R0[v] * input_stride + tx] = dI[v];
__syncthreads();
}
#pragma unroll
for (int v = 0; v < V; v++)
atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
w += K;
dw += K;
dOutFeatures += K;
}
}
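// Note on the atomicAdd above: blocks with different blockIdx.x walk disjoint
// slices of the rule pairs but all accumulate into the same K x K tile of dw,
// so the per-block register accumulators dW[] must be flushed atomically.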
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template <typename T, Int K, Int V>
__global__ void dDeconvolution_KMxKN_backward_dW_B(
T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride, Int output_nPlanes,
Int output_stride) {
// M = gridDim.y == input_nPlanes / K
Int N = output_nPlanes / K;
Int m = blockIdx.y;
inFeatures += m * K;
dInFeatures += m * K;
w += m * K * output_nPlanes;
dw += m * K * output_nPlanes;
T dI[V];
T dW[V];
__shared__ T I[K][K];
__shared__ T dO[K][K];
__shared__ T W[K][K];
Int R0[V];
Int R1[V];
const int tx = threadIdx.x;
int ty[V];
#pragma unroll
for (int v = 0; v < V; v++)
ty[v] = threadIdx.y + v * (K / V);
for (int n = 0; n < N; n++) {
// Read w, reset dW
#pragma unroll
for (int v = 0; v < V; v++) {
W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
dW[v] = 0;
}
for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
for (int v = 0; v < V; v++) {
if (s + ty[v] < nHot) {
R1[v] = rules[2 * (s + ty[v])];
R0[v] = rules[2 * (s + ty[v]) + 1];
}
dI[v] = 0;
}
__syncthreads();
// Read input and dOutput
#pragma unroll
for (int v = 0; v < V; v++)
if (s + ty[v] < nHot) {
I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx];
} else {
I[ty[v]][tx] = 0;
dO[ty[v]][tx] = 0;
}
__syncthreads();
#pragma unroll
for (int k = 0; k < K; k++)
#pragma unroll
for (int v = 0; v < V; v++) {
dI[v] += dO[ty[v]][k] * W[tx][k];
dW[v] += I[k][ty[v]] * dO[k][tx];
}
#pragma unroll
for (int v = 0; v < V; v++)
if (s + ty[v] < nHot)
dI[v] += dInFeatures[R0[v] * input_stride + tx];
#pragma unroll
for (int v = 0; v < V; v++)
if (s + ty[v] < nHot)
dInFeatures[R0[v] * input_stride + tx] = dI[v];
__syncthreads();
}
#pragma unroll
for (int v = 0; v < V; v++)
atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
w += K;
dw += K;
dOutFeatures += K;
}
}
#define FOO(T, K, V) \
{ \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
Int o = (nHot / K) * K; \
if (o >= K) \
dDeconvolution_KMxKN_backward_dW_A< \
T, K, V><<<dim3(std::min(o / K, (Int)512), input_nPlanes / K), \
dim3(K, K / V)>>>( \
inFeatures, dInFeatures, dOutFeatures, w, dw, rules, o, \
input_nPlanes, input_stride, output_nPlanes, output_stride); \
if (nHot > o) \
dDeconvolution_KMxKN_backward_dW_B< \
T, K, \
V><<<dim3(1, input_nPlanes / K), dim3(K, K / V)>>>( \
inFeatures, dInFeatures, dOutFeatures, w, dw, rules + 2 * o, \
nHot - o, input_nPlanes, input_stride, output_nPlanes, \
output_stride); \
return; \
} \
}
template <typename T>
void dDeconvolution_backward_dW(T *inFeatures, T *dInFeatures, T *dOutFeatures,
T *w, T *dw, Int *rules, Int nHot,
Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
FOO(T, 32, 8)
FOO(T, 16, 4)
FOO(T, 8, 2)
assert(false);
}
#undef FOO
template <typename T, Int K, Int V>
__global__ void
dDeconvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// K is a multiple of V,
// nHot x input_nplanes<=KM -> nHot x output_nPlanes<=KN
// - parallel over N,nHot - loop over M
Int M = (input_nPlanes + K - 1) / K;
// N = gridDim.y ~ output_nPlanes/K
Int n = blockIdx.y;
outFeatures += n * K;
w += n * K;
Int KO = min(K, output_nPlanes - K * n);
T O[V];
__shared__ T W[K][K];
__shared__ T I[K][K];
__shared__ Int R[K * 2];
const int tx = threadIdx.x;
int ty[V];
#pragma unroll
for (int v = 0; v < V; v++)
ty[v] = threadIdx.y + v * (K / V);
for (int m = 0; m < M; m++) {
Int KI = min(K, input_nPlanes - K * m);
// Read w
#pragma unroll
for (int v = 0; v < V; v++)
if (ty[v] < KI and tx < KO)
W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
// Read rules for K input/output pairs
#pragma unroll
for (int v = 0; v < V; v++) {
if (ty[v] < 2) {
int q = ty[v] * K + tx;
if (s + q / 2 < nHot)
R[q] = rules[2 * s + q];
}
}
__syncthreads();
// Read input, reset O[]
#pragma unroll
for (int v = 0; v < V; v++) {
if (tx < KI and s + ty[v] < nHot)
I[ty[v]][tx] = inFeatures[R[2 * ty[v] + 1] * input_stride + tx];
O[v] = 0;
}
__syncthreads();
#pragma unroll
for (int k = 0; k < KI; k++)
#pragma unroll
for (int v = 0; v < V; v++)
O[v] += I[ty[v]][k] * W[k][tx];
__syncthreads();
#pragma unroll
for (int v = 0; v < V; v++)
if (tx < KO and s + ty[v] < nHot)
outFeatures[R[2 * ty[v]] * output_stride + tx] += O[v];
__syncthreads();
}
w += K * output_nPlanes;
inFeatures += K;
}
}
template <typename T>
void dDeconvolution_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) {
const int K = 16;
const int V = 4;
dDeconvolution_KMxKN_forward2<T, K, V><<<
dim3(128, (output_nPlanes + K - 1) / K), dim3(K, K / V)>>>(
inFeatures, outFeatures, w, rules, nHot, input_nPlanes, input_stride,
output_nPlanes, output_stride);
return;
} else {
dDeconvolution_forward(inFeatures, outFeatures, w, rules, nHot,
input_nPlanes, input_stride, output_nPlanes,
output_stride);
}
}
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template <typename T, Int K, Int V>
__global__ void dDeconvolution_KMxKN_backward_dW2(
T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride, Int output_nPlanes,
Int output_stride) {
// M = gridDim.y == input_nPlanes / K
Int N = (output_nPlanes + K - 1) / K;
Int m = blockIdx.y;
inFeatures += m * K;
dInFeatures += m * K;
w += m * K * output_nPlanes;
dw += m * K * output_nPlanes;
Int KI = min(K, input_nPlanes - K * m);
T dI[V];
T dW[V];
__shared__ T I[K][K];
__shared__ T dO[K][K];
__shared__ T W[K][K];
__shared__ Int R[K * 2];
const int tx = threadIdx.x;
int ty[V];
#pragma unroll
for (int v = 0; v < V; v++)
ty[v] = threadIdx.y + v * (K / V);
for (int n = 0; n < N; n++) {
Int KO = min(K, output_nPlanes - K * n);
// Read w, reset dW
#pragma unroll
for (int v = 0; v < V; v++) {
if (ty[v] < KI and tx < KO)
W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
dW[v] = 0;
}
for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
// Read rules for K input/output pairs, reset dI[]
#pragma unroll
for (int v = 0; v < V; v++) {
if (ty[v] < 2) {
int q = ty[v] * K + tx;
if (s + q / 2 < nHot)
R[q] = rules[2 * s + q];
}
dI[v] = 0;
}
__syncthreads();
// Read input and dOutput
#pragma unroll
for (int v = 0; v < V; v++) {
if (tx < KI and s + ty[v] < nHot)
I[ty[v]][tx] = inFeatures[R[2 * ty[v] + 1] * input_stride + tx];
else
I[ty[v]][tx] = 0;
if (tx < KO and s + ty[v] < nHot)
dO[ty[v]][tx] = dOutFeatures[R[2 * ty[v]] * output_stride + tx];
else
dO[ty[v]][tx] = 0;
}
__syncthreads();
#pragma unroll
for (int k = 0; k < KO; k++)
#pragma unroll
for (int v = 0; v < V; v++)
dI[v] += dO[ty[v]][k] * W[tx][k];
#pragma unroll
for (int k = 0; k < K; k++)
#pragma unroll
for (int v = 0; v < V; v++)
dW[v] += I[k][ty[v]] * dO[k][tx];
__syncthreads();
#pragma unroll
for (int v = 0; v < V; v++)
if (tx < KI and s + ty[v] < nHot)
dInFeatures[R[2 * ty[v] + 1] * input_stride + tx] += dI[v];
__syncthreads();
}
#pragma unroll
for (int v = 0; v < V; v++)
if (ty[v] < KI and tx < KO)
atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
w += K;
dw += K;
dOutFeatures += K;
}
}
template <typename T>
void dDeconvolution_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
T *w, T *dw, Int *rules, Int nHot,
Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) {
const int K = 16;
const int V = 4;
dDeconvolution_KMxKN_backward_dW2<T, K, V><<<
dim3(128, (input_nPlanes + K - 1) / K), dim3(K, K / V)>>>(
inFeatures, dInFeatures, dOutFeatures, w, dw, rules, nHot,
input_nPlanes, input_stride, output_nPlanes, output_stride);
return;
} else {
dDeconvolution_backward_dW(inFeatures, dInFeatures, dOutFeatures, w, dw,
rules, nHot, input_nPlanes, input_stride,
output_nPlanes, output_stride);
}
}
#endif /* CUDA_DECONVOLUTION_H */
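// Sizing note for the tiles in the forward kernels above: with K = 32, V = 8
// the launch uses dim3(K, K / V) = 128 threads per block; each thread keeps
// V = 8 partial sums in registers, so a block covers a full 32 x 32 output
// tile while staging the two 32 x 32 arrays W and I (8 KB at fp32) in shared
// memory -- the Volkov-style blocking the kernel comments refer to.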
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
template <typename T>
void InputLayer_fp(T *input_features, T *output_features, Int nRows,
Int maxActive, Int nPlanes, Int *rules_cpu, Int *rules_gpu,
bool average);
template <typename T>
void InputLayer_bp(T *d_input_features, T *d_output_features, Int nRows,
Int maxActive, Int nPlanes, Int *rules_cpu, Int *rules_gpu,
bool average);
template <typename T, Int Dimension>
void cuda_InputLayer_updateOutput(Metadata<Dimension> &m,
/*long*/ at::Tensor spatialSize,
/*long*/ at::Tensor input_coords,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features,
long batchSize, long mode) {
m.inputLayer(spatialSize, input_coords, batchSize, mode);
Int nPlanes = input_features.size(1);
auto &rules = m.inputLayerRuleBook;
Int maxActive = rules[0][1];
Int nRows = rules[0][3];
if (mode == 0) {
output_features.resize_as_(input_features);
output_features.copy_(input_features);
} else {
output_features.resize_({*m.inputNActive, nPlanes});
output_features.zero_();
auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
auto iF = input_features.data<T>();
auto oF = output_features.data<T>();
Int *rb = rulesBuffer.data<Int>();
InputLayer_fp<T>(iF, oF, nRows, maxActive, nPlanes, &rules[1][0], rb,
mode == 4);
}
}
template <typename T, Int Dimension>
void cuda_InputLayer_updateGradInput(
Metadata<Dimension> &m,
/*cuda float*/ at::Tensor d_input_features,
/*cuda float*/ at::Tensor d_output_features) {
auto &rules = m.inputLayerRuleBook;
Int nPlanes = d_output_features.size(1);
auto mode = rules[0][0];
Int maxActive = rules[0][1];
Int nRows = rules[0][3];
if (mode == 0) {
d_input_features.resize_as_(d_output_features);
d_input_features.copy_(d_output_features);
} else {
d_input_features.resize_({rules[0][2], nPlanes});
d_input_features.zero_();
auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
auto diF = d_input_features.data<T>();
auto doF = d_output_features.data<T>();
Int *rb = rulesBuffer.data<Int>();
InputLayer_bp(diF, doF, nRows, maxActive, nPlanes, &rules[1][0], rb,
mode == 4);
}
}
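// The output layer below is the input layer's transpose: its forward pass
// reuses InputLayer_bp (rulebook scatter) and its backward pass reuses
// InputLayer_fp (rulebook gather), both with average == false.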
template <typename T, Int Dimension>
void cuda_OutputLayer_updateOutput(Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features) {
auto &rules = m.inputLayerRuleBook;
Int nPlanes = input_features.size(1);
auto mode = rules[0][0];
auto maxActive = rules[0][1];
auto nRows = rules[0][3];
if (mode == 0) {
output_features.resize_as_(input_features);
output_features.copy_(input_features);
} else {
output_features.resize_({rules[0][2], nPlanes});
output_features.zero_();
auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
auto iF = input_features.data<T>();
auto oF = output_features.data<T>();
Int *rb = rulesBuffer.data<Int>();
InputLayer_bp(oF, iF, nRows, maxActive, nPlanes, &rules[1][0], rb, false);
}
}
template <typename T, Int Dimension>
void cuda_OutputLayer_updateGradInput(
Metadata<Dimension> &m,
/*cuda float*/ at::Tensor d_input_features,
/*cuda float*/ at::Tensor d_output_features) {
auto &rules = m.inputLayerRuleBook;
Int nPlanes = d_output_features.size(1);
auto mode = rules[0][0];
auto maxActive = rules[0][1];
auto nRows = rules[0][3];
if (mode == 0) {
d_input_features.resize_as_(d_output_features);
d_input_features.copy_(d_output_features);
} else {
d_input_features.resize_({nRows, nPlanes});
d_input_features.zero_();
auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
auto diF = d_input_features.data<T>();
auto doF = d_output_features.data<T>();
Int *rb = rulesBuffer.data<Int>();
InputLayer_fp<T>(doF, diF, nRows, maxActive, nPlanes, &rules[1][0], rb,
false);
}
}
template <typename T, Int Dimension>
void cuda_BLInputLayer_updateOutput(Metadata<Dimension> &m,
/*long*/ at::Tensor spatialSize,
/*long*/ at::Tensor input_coords,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features,
long mode) {
m.blLayer(spatialSize, input_coords, mode);
Int nPlanes = input_features.size(2);
output_features.resize_({*m.inputNActive, nPlanes});
output_features.zero_();
auto &rules = m.blLayerRuleBook;
Int maxActive = rules[0][1];
Int nRows = rules[0][4];
if (mode == 0) {
output_features.resize_as_(input_features);
output_features.copy_(input_features);
output_features.resize_({*m.inputNActive, nPlanes});
} else {
auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
auto iF = input_features.data<T>();
auto oF = output_features.data<T>();
Int *rb = rulesBuffer.data<Int>();
InputLayer_fp<T>(iF, oF, nRows, maxActive, nPlanes, &rules[1][0], rb,
mode == 4);
}
}
template <typename T, Int Dimension>
void cuda_BLInputLayer_updateGradInput(
Metadata<Dimension> &m,
/*cuda float*/ at::Tensor d_input_features,
/*cuda float*/ at::Tensor d_output_features) {
auto &rules = m.blLayerRuleBook;
Int nPlanes = d_output_features.size(1);
Int mode = rules[0][0];
Int maxActive = rules[0][1];
Int nRows = rules[0][4];
if (mode == 0) {
d_input_features.resize_as_(d_output_features);
d_input_features.copy_(d_output_features);
d_input_features.resize_({rules[0][2], rules[0][3], nPlanes});
} else {
d_input_features.resize_({rules[0][2], rules[0][3], nPlanes});
d_input_features.zero_();
auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
auto diF = d_input_features.data<T>();
auto doF = d_output_features.data<T>();
Int *rb = rulesBuffer.data<Int>();
InputLayer_bp(diF, doF, nRows, maxActive, nPlanes, &rules[1][0], rb,
mode == 4);
}
}
template <typename T, Int Dimension>
void cuda_BLOutputLayer_updateOutput(
Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features) {
auto &rules = m.blLayerRuleBook;
Int nPlanes = input_features.size(1);
auto mode = rules[0][0];
Int maxActive = rules[0][1];
Int nRows = rules[0][4];
if (mode == 0) {
output_features.resize_as_(input_features);
output_features.copy_(input_features);
output_features.resize_({rules[0][2], rules[0][3], nPlanes});
} else {
output_features.resize_({rules[0][2], rules[0][3], nPlanes});
output_features.zero_();
auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
auto iF = input_features.data<T>();
auto oF = output_features.data<T>();
Int *rb = rulesBuffer.data<Int>();
InputLayer_bp(oF, iF, nRows, maxActive, nPlanes, &rules[1][0], rb, false);
}
}
template <typename T, Int Dimension>
void cuda_BLOutputLayer_updateGradInput(
Metadata<Dimension> &m,
/*cuda float*/ at::Tensor d_input_features,
/*cuda float*/ at::Tensor d_output_features) {
auto &rules = m.blLayerRuleBook;
Int nPlanes = d_output_features.size(2);
Int mode = rules[0][0];
Int maxActive = rules[0][1];
Int nRows = rules[0][4];
if (mode == 0) {
d_input_features.resize_as_(d_output_features);
d_input_features.copy_(d_output_features);
d_input_features.resize_({nRows, nPlanes});
} else {
d_input_features.resize_({nRows, nPlanes});
d_input_features.zero_();
auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
auto diF = d_input_features.data<T>();
auto doF = d_output_features.data<T>();
Int *rb = rulesBuffer.data<Int>();
InputLayer_fp<T>(doF, diF, nRows, maxActive, nPlanes, &rules[1][0], rb,
false);
}
}
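// A CPU sketch of the InputLayer forward gather driven by rules[1], assuming
// its layout of nRows records of (count, maxActive input-row slots) as read
// by the InputLayer_fp kernel; the function name is illustrative only.
template <typename T>
void inputLayerReference(const T *in, T *out, Int nRows, Int maxActive,
                         Int nPlanes, const Int *rules, bool average) {
  for (Int row = 0; row < nRows; row++) {
    const Int *r = rules + row * (1 + maxActive);
    Int nActive = r[0];
    T mult = (average and nActive > 0) ? T(1) / nActive : T(1);
    for (Int i = 1; i <= nActive; i++)
      for (Int p = 0; p < nPlanes; p++)
        out[row * nPlanes + p] += mult * in[r[i] * nPlanes + p];
  }
}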
@@ -4,241 +4,67 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
// --- old version of this file ---
#include "IOLayers.h"
template <typename T, Int Dimension>
void cuda_InputLayer_updateOutput(Metadata<Dimension> &m,
                                  /*long*/ at::Tensor spatialSize,
                                  /*long*/ at::Tensor input_coords,
                                  /*cuda float*/ at::Tensor input_features,
                                  /*cuda float*/ at::Tensor output_features,
                                  long batchSize, long mode) {
  m.inputLayer(spatialSize, input_coords, batchSize, mode);
  Int nPlanes = input_features.size(1);
  auto &rules = m.inputLayerRuleBook;
  Int maxActive = rules[0][1];
  Int nRows = rules[0][3];
  if (mode == 0) {
    output_features.resize_as_(input_features);
    output_features.copy_(input_features);
  } else {
    output_features.resize_({*m.inputNActive, nPlanes});
    output_features.zero_();
    auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
    auto iF = input_features.data<T>();
    auto oF = output_features.data<T>();
    Int *rb = rulesBuffer.data<Int>();
    cudaMemcpy(rb, &rules[1][0], sizeof(Int) * rules[1].size(),
               cudaMemcpyHostToDevice);
    InputLayer_fp<
        T><<<std::min(nRows, (Int)32768), std::min(nPlanes, (Int)32)>>>(
        iF, oF, nRows, maxActive, nPlanes, rb, mode == 4);
  }
}
// The old file defined the remaining seven wrappers
// (cuda_InputLayer_updateGradInput, cuda_OutputLayer_updateOutput,
// cuda_OutputLayer_updateGradInput and the four cuda_BL* variants) in the
// same way: identical to the new versions shown above except that each one
// issued the cudaMemcpy of rules[1] and the
// <<<std::min(nRows, (Int)32768), std::min(nPlanes, (Int)32)>>> launch of
// the old InputLayer_fp / InputLayer_bp __global__ kernels inline, instead
// of calling the new host-side InputLayer_fp / InputLayer_bp wrappers.
// --- new version of this file ---
// Rulebook Format
// rules[0][0] == mode
// rules[0][1] == maxActive per spatial location (==1 for modes 0,1,2)
// rules[0][2] == nInputRows
// rules[0][3] == nOutputRows
// rules[1] nOutputRows x (1+maxActive)
template <typename T>
__global__ void InputLayer_fp_(T *input_features, T *output_features, Int nRows,
                               Int maxActive, Int nPlanes, Int *rules,
                               bool average) {
  for (int row = blockIdx.x; row < nRows; row += gridDim.x) {
    T *out = output_features + row * nPlanes;
    Int *r = rules + row * (1 + maxActive);
    Int nActive = r[0];
    T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
    for (int i = 1; i <= nActive; i++) {
      T *inp = input_features + r[i] * nPlanes;
      for (Int plane = threadIdx.x; plane < nPlanes; plane += blockDim.x)
        out[plane] += multiplier * inp[plane];
    }
  }
}
template <typename T>
void InputLayer_fp(T *input_features, T *output_features, Int nRows,
                   Int maxActive, Int nPlanes, Int *rules_cpu, Int *rules_gpu,
                   bool average) {
  cudaMemcpy(rules_gpu, rules_cpu, sizeof(Int) * nRows * (1 + maxActive),
             cudaMemcpyHostToDevice);
  InputLayer_fp_<
      T><<<std::min(nRows, (Int)32768), std::min(nPlanes, (Int)32)>>>(
      input_features, output_features, nRows, maxActive, nPlanes, rules_gpu,
      average);
}
template <typename T>
__global__ void InputLayer_bp_(T *d_input_features, T *d_output_features,
                               Int nRows, Int maxActive, Int nPlanes,
                               Int *rules, bool average) {
  for (int row = blockIdx.x; row < nRows; row += gridDim.x) {
    T *out = d_output_features + row * nPlanes;
    Int *r = rules + row * (1 + maxActive);
    Int nActive = r[0];
    T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
    for (int i = 1; i <= nActive; i++) {
      T *inp = d_input_features + r[i] * nPlanes;
      for (Int plane = threadIdx.x; plane < nPlanes; plane += blockDim.x)
        atomicAdd(&inp[plane], multiplier * out[plane]);
    }
  }
}
template <typename T>
void InputLayer_bp(T *d_input_features, T *d_output_features, Int nRows,
                   Int maxActive, Int nPlanes, Int *rules_cpu, Int *rules_gpu,
                   bool average) {
  cudaMemcpy(rules_gpu, rules_cpu, sizeof(Int) * nRows * (1 + maxActive),
             cudaMemcpyHostToDevice);
  InputLayer_bp_<
      T><<<std::min(nRows, (Int)32768), std::min(nPlanes, (Int)32)>>>(
      d_input_features, d_output_features, nRows, maxActive, nPlanes, rules_gpu,
      average);
}
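// Two things worth noting about the factored-out wrappers above: the
// host-to-device copy of the rulebook now happens once inside
// InputLayer_fp / InputLayer_bp (callers pass both the CPU and GPU rule
// buffers), and the backward scatter must use atomicAdd because several
// output rows may list the same input row when maxActive > 1.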
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CUDA_IOLAYERS_H
#define CUDA_IOLAYERS_H
template <typename T>
__global__ void InputLayer_fp(T *input_features, T *output_features,
Int nRows, Int maxActive, Int nPlanes,
Int *rules, bool average) {
for (int row = blockIdx.x; row < nRows; row += gridDim.x) {
T *out = output_features + row * nPlanes;
Int *r = rules + row * (1 + maxActive);
Int nActive = r[0];
T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
for (int i = 1; i <= nActive; i++) {
T *inp = input_features + r[i] * nPlanes;
for (Int plane = threadIdx.x; plane < nPlanes; plane += blockDim.x)
out[plane] += multiplier * inp[plane];
}
}
}
template <typename T>
__global__ void InputLayer_bp(T *d_input_features, T *d_output_features,
Int nRows, Int maxActive, Int nPlanes,
Int *rules, bool average) {
for (int row = blockIdx.x; row < nRows; row += gridDim.x) {
T *out = d_output_features + row * nPlanes;
Int *r = rules + row * (1 + maxActive);
Int nActive = r[0];
T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
for (int i = 1; i <= nActive; i++) {
T *inp = d_input_features + r[i] * nPlanes;
for (Int plane = threadIdx.x; plane < nPlanes; plane += blockDim.x)
atomicAdd(&inp[plane], multiplier * out[plane]);
}
}
}
#endif /* CUDA_IOLAYERS_H */
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
template <typename T>
void LeakyReLU_fp(T *input_features, T *output_features, Int n, T alpha);
template <typename T>
void LeakyReLU_bp(T *input_features, T *d_input_features, T *output_features,
Int n, T alpha);
template <typename T>
void cuda_LeakyReLU_updateOutput(/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features,
T alpha) {
output_features.resize_as_(input_features);
auto n = input_features.numel();
LeakyReLU_fp<T>(input_features.data<T>(), output_features.data<T>(), n,
alpha);
}
template <typename T>
void cuda_LeakyReLU_updateGradInput(
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor d_input_features,
/*cuda float*/ at::Tensor d_output_features, T alpha) {
d_input_features.resize_as_(d_output_features);
auto n = d_input_features.numel();
LeakyReLU_bp<T>(input_features.data<T>(), d_input_features.data<T>(),
d_output_features.data<T>(), n, alpha);
}
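// A self-contained sketch of the same grid-stride LeakyReLU pattern, for
// reference only; every name below is local to the sketch, and alpha = 0.1f
// is an arbitrary sample value. Compiles with nvcc as a standalone program.
#include <cstdio>
#include <cuda_runtime.h>
__global__ void leakyReLUDemo(const float *in, float *out, int n,
                              float alpha) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += gridDim.x * blockDim.x) // generic form of the 16 * 1024 stride
    out[i] = (in[i] > 0) ? in[i] : in[i] * alpha;
}
int main() {
  const int n = 8;
  float h[n] = {-2, -1, -0.5f, 0, 0.5f, 1, 2, 3}, r[n];
  float *dIn, *dOut;
  cudaMalloc(&dIn, n * sizeof(float));
  cudaMalloc(&dOut, n * sizeof(float));
  cudaMemcpy(dIn, h, n * sizeof(float), cudaMemcpyHostToDevice);
  leakyReLUDemo<<<16, 1024>>>(dIn, dOut, n, 0.1f);
  cudaMemcpy(r, dOut, n * sizeof(float), cudaMemcpyDeviceToHost);
  for (int i = 0; i < n; i++)
    printf("% .2f -> % .2f\n", h[i], r[i]);
  cudaFree(dIn);
  cudaFree(dOut);
  return 0;
}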
@@ -4,26 +4,28 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
// --- old version of this file ---
#include "LeakyReLU.h"
template <typename T>
void cuda_LeakyReLU_updateOutput(/*cuda float*/ at::Tensor input_features,
                                 /*cuda float*/ at::Tensor output_features,
                                 float alpha) {
  output_features.resize_as_(input_features);
  auto n = input_features.numel();
  LeakyReLU_fp<T><<<16, 1024>>>(input_features.data<T>(),
                                output_features.data<T>(), n, alpha);
}
template <typename T>
void cuda_LeakyReLU_updateGradInput(
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features, float alpha) {
  d_input_features.resize_as_(d_output_features);
  auto n = d_input_features.numel();
  LeakyReLU_bp<T><<<16, 1024>>>(input_features.data<T>(),
                                d_input_features.data<T>(),
                                d_output_features.data<T>(), n, alpha);
}
// --- new version of this file ---
template <typename T>
__global__ void LeakyReLU_fp_(T *input_features, T *output_features, Int n,
                              T alpha) {
  for (Int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += 16 * 1024)
    output_features[i] = (input_features[i] > 0) ? input_features[i]
                                                 : (input_features[i] * alpha);
}
template <typename T>
void LeakyReLU_fp(T *input_features, T *output_features, Int n, T alpha) {
  LeakyReLU_fp_<T><<<16, 1024>>>(input_features, output_features, n, alpha);
}
template <typename T>
__global__ void LeakyReLU_bp_(T *input_features, T *d_input_features,
                              T *d_output_features, Int n, T alpha) {
  for (Int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += 16 * 1024)
    d_input_features[i] = (input_features[i] > 0)
                              ? d_output_features[i]
                              : (d_output_features[i] * alpha);
}
template <typename T>
void LeakyReLU_bp(T *input_features, T *d_input_features, T *output_features,
                  Int n, T alpha) {
  LeakyReLU_bp_<T><<<16, 1024>>>(input_features, d_input_features,
                                 output_features, n, alpha);
}
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef LEAKYRELU_H
#define LEAKYRELU_H
template <typename T>
__global__ void LeakyReLU_fp(T *input_features, T *output_features, Int n,
T alpha) {
for (Int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += 16 * 1024)
output_features[i] = (input_features[i] > 0) ? input_features[i]
: (input_features[i] * alpha);
}
template <typename T>
__global__ void LeakyReLU_bp(T *input_features, T *d_input_features,
T *d_output_features, Int n, T alpha) {
for (Int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += 16 * 1024)
d_input_features[i] = (input_features[i] > 0)
? d_output_features[i]
: (d_output_features[i] * alpha);
}
#endif
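// The literal stride 16 * 1024 in both kernels above is gridDim.x * blockDim.x
// written out for the fixed <<<16, 1024>>> launch used at every call site;
// changing the launch configuration would require changing the stride too.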
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
template <typename T>
void cuda_MaxPooling_ForwardPass(T *input_features, T *output_features,
Int nPlanes, Int input_stride,
Int output_stride, RuleBook _rules);
template <typename T>
void cuda_MaxPooling_BackwardPass(T *input_features, T *d_input_features,
T *output_features, T *d_output_features,
Int nPlanes, Int input_stride,
Int output_stride, RuleBook _rules);
template <typename T, Int Dimension>
void cuda_MaxPooling_updateOutput(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor poolSize,
/*long*/ at::Tensor poolStride, Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features, long nFeaturesToDrop) {
Int nPlanes = input_features.size(1) - nFeaturesToDrop;
auto _rules =
m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
Int nActive = m.getNActive(outputSize);
output_features.resize_({nActive, nPlanes});
output_features.zero_();
auto iF = input_features.data<T>() + nFeaturesToDrop;
auto oF = output_features.data<T>();
cuda_MaxPooling_ForwardPass<T>(iF, oF, nPlanes, input_features.size(1),
output_features.size(1), _rules);
}
template <typename T, Int Dimension>
void cuda_MaxPooling_updateGradInput(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor poolSize,
/*long*/ at::Tensor poolStride, Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor d_input_features,
/*cuda float*/ at::Tensor output_features,
/*cuda float*/ at::Tensor d_output_features, long nFeaturesToDrop) {
Int nPlanes = input_features.size(1) - nFeaturesToDrop;
auto _rules =
m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
d_input_features.resize_as_(input_features);
d_input_features.zero_();
auto iF = input_features.data<T>();
auto oF = output_features.data<T>();
auto diF = d_input_features.data<T>();
auto doF = d_output_features.data<T>();
cuda_MaxPooling_BackwardPass<T>(iF, diF, oF, doF, nPlanes,
input_features.size(1),
d_output_features.size(1), _rules);
}
template <typename T, Int Dimension>
void cuda_RandomizedStrideMaxPooling_updateOutput(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor poolSize,
/*long*/ at::Tensor poolStride, Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features, long nFeaturesToDrop) {
Int nPlanes = input_features.size(1) - nFeaturesToDrop;
auto _rules = m.getRandomizedStrideRuleBook(inputSize, outputSize, poolSize,
poolStride, true);
Int nActive = m.getNActive(outputSize);
output_features.resize_({nActive, nPlanes});
output_features.zero_();
auto iF = input_features.data<T>() + nFeaturesToDrop;
auto oF = output_features.data<T>();
cuda_MaxPooling_ForwardPass<T>(iF, oF, nPlanes, input_features.size(1),
output_features.size(1), _rules);
}
template <typename T, Int Dimension>
void cuda_RandomizedStrideMaxPooling_updateGradInput(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor poolSize,
/*long*/ at::Tensor poolStride, Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor d_input_features,
/*cuda float*/ at::Tensor output_features,
/*cuda float*/ at::Tensor d_output_features, long nFeaturesToDrop) {
Int nPlanes = input_features.size(1) - nFeaturesToDrop;
auto _rules = m.getRandomizedStrideRuleBook(inputSize, outputSize, poolSize,
poolStride, true);
d_input_features.resize_as_(input_features);
d_input_features.zero_();
auto iF = input_features.data<T>();
auto oF = output_features.data<T>();
auto diF = d_input_features.data<T>();
auto doF = d_output_features.data<T>();
cuda_MaxPooling_BackwardPass<T>(iF, diF, oF, doF, nPlanes,
input_features.size(1),
d_output_features.size(1), _rules);
}
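// A CPU sketch of the rulebook-driven max pooling used above, assuming (as in
// the MaxPooling_fp kernel below) rule pairs of (input row, output row) and a
// zero-initialised output; the function name is illustrative only.
template <typename T>
void maxPoolReference(const T *in, T *out, Int nPlanes, Int input_stride,
                      Int output_stride, const Int *rules, Int nHot) {
  for (Int k = 0; k < nHot; k++) {
    const T *i = in + rules[2 * k] * input_stride;
    T *o = out + rules[2 * k + 1] * output_stride;
    for (Int p = 0; p < nPlanes; p++)
      if (o[p] < i[p])
        o[p] = i[p];
  }
}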
@@ -4,100 +4,74 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
// --- old version of this file ---
#include "MaxPooling.h"
#include "RuleBookIterator.h"
template <typename T, Int Dimension>
void cuda_MaxPooling_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor poolSize,
    /*long*/ at::Tensor poolStride, Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features, long nFeaturesToDrop) {
  Int nPlanes = input_features.size(1) - nFeaturesToDrop;
  auto _rules =
      m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  Int nActive = m.getNActive(outputSize);
  output_features.resize_({nActive, nPlanes});
  output_features.zero_();
  auto iF = input_features.data<T>() + nFeaturesToDrop;
  auto oF = output_features.data<T>();
  RULEBOOKITERATOR(
      cuda_MaxPooling_ForwardPass<T>(iF, oF, nPlanes, input_features.size(1),
                                     output_features.size(1), rbB, nHotB);
      , )
}
template <typename T, Int Dimension>
void cuda_MaxPooling_updateGradInput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor poolSize,
    /*long*/ at::Tensor poolStride, Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor output_features,
    /*cuda float*/ at::Tensor d_output_features, long nFeaturesToDrop) {
  Int nPlanes = input_features.size(1) - nFeaturesToDrop;
  auto _rules =
      m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();
  auto iF = input_features.data<T>();
  auto oF = output_features.data<T>();
  auto diF = d_input_features.data<T>();
  auto doF = d_output_features.data<T>();
  RULEBOOKITERATOR(cuda_MaxPooling_BackwardPass<T>(
                       iF, diF, oF, doF, nPlanes, input_features.size(1),
                       d_output_features.size(1), rbB, nHotB);
                   , )
}
// --- new version of this file ---
#include "RuleBookIterator.h"
// NTX must be >=2 so r is filled properly
template <typename T, Int NTX, Int NTY>
__global__ void MaxPooling_fp(T *input_features, T *output_features,
                              Int nPlanes, Int input_stride, Int output_stride,
                              Int *rules, Int nHot) {
  __shared__ Int r[NTY * 2];
  for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
    {
      Int i = threadIdx.x + NTX * threadIdx.y;
      if (i < NTY * 2 and i < 2 * (nHot - n))
        r[i] = rules[2 * n + i];
    }
    __syncthreads();
    if (n + threadIdx.y < nHot) {
      Int i = r[2 * threadIdx.y] * input_stride;
      Int o = r[2 * threadIdx.y + 1] * output_stride;
      for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX) {
        T inp = input_features[i + plane];
        if (output_features[o + plane] < inp)
          output_features[o + plane] = inp;
      }
    }
    __syncthreads();
  }
}
template <typename T>
void cuda_MaxPooling_ForwardPass(T *input_features, T *output_features,
                                 Int nPlanes, Int input_stride,
                                 Int output_stride, RuleBook _rules) {
  RULEBOOKITERATOR((MaxPooling_fp<T, 32, 32><<<32, dim3(32, 32)>>>(
                       input_features, output_features, nPlanes, input_stride,
                       output_stride, rbB, nHotB));
                   , )
}
template <typename T, Int Dimension> template <typename T, Int NTX, Int NTY>
void cuda_RandomizedStrideMaxPooling_updateOutput( __global__ void MaxPooling_bp(T *input_features, T *d_input_features,
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize, T *output_features, T *d_output_features,
/*long*/ at::Tensor poolSize, Int nPlanes, Int input_stride, Int output_stride,
/*long*/ at::Tensor poolStride, Metadata<Dimension> &m, Int *rules, Int nHot) {
/*cuda float*/ at::Tensor input_features, __shared__ Int r[NTY * 2];
/*cuda float*/ at::Tensor output_features, long nFeaturesToDrop) { for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
{
Int nPlanes = input_features.size(1) - nFeaturesToDrop; Int i = threadIdx.x + NTX * threadIdx.y;
auto _rules = m.getRandomizedStrideRuleBook(inputSize, outputSize, poolSize, if (i < NTY * 2 and i < 2 * (nHot - n))
poolStride, true); r[i] = rules[2 * n + i];
Int nActive = m.getNActive(outputSize); }
output_features.resize_({nActive, nPlanes}); __syncthreads();
output_features.zero_(); if (n + threadIdx.y < nHot) {
Int i = r[2 * threadIdx.y] * input_stride;
auto iF = input_features.data<T>() + nFeaturesToDrop; Int o = r[2 * threadIdx.y + 1] * output_stride;
auto oF = output_features.data<T>(); for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX)
RULEBOOKITERATOR( if (output_features[o + plane] == input_features[i + plane])
cuda_MaxPooling_ForwardPass<T>(iF, oF, nPlanes, input_features.size(1), d_input_features[i + plane] += d_output_features[o + plane];
output_features.size(1), rbB, nHotB); }
, ) __syncthreads();
}
} }
template <typename T, Int Dimension>
void cuda_RandomizedStrideMaxPooling_updateGradInput(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor poolSize,
/*long*/ at::Tensor poolStride, Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor d_input_features,
/*cuda float*/ at::Tensor output_features,
/*cuda float*/ at::Tensor d_output_features, long nFeaturesToDrop) {
Int nPlanes = input_features.size(1) - nFeaturesToDrop;
auto _rules = m.getRandomizedStrideRuleBook(inputSize, outputSize, poolSize,
poolStride, true);
d_input_features.resize_as_(input_features);
d_input_features.zero_();
auto iF = input_features.data<T>(); template <typename T>
auto oF = output_features.data<T>(); void cuda_MaxPooling_BackwardPass(T *input_features, T *d_input_features,
auto diF = d_input_features.data<T>(); T *output_features, T *d_output_features,
auto doF = d_output_features.data<T>(); Int nPlanes, Int input_stride,
RULEBOOKITERATOR(cuda_MaxPooling_BackwardPass<T>( Int output_stride, RuleBook _rules) {
iF, diF, oF, doF, nPlanes, input_features.size(1), RULEBOOKITERATOR((MaxPooling_bp<T, 32, 32><<<32, dim3(32, 32)>>>(
d_output_features.size(1), rbB, nHotB); input_features, d_input_features, output_features, d_output_features,
nPlanes, input_stride, output_stride, rbB, nHotB));
, ) , )
} }
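// A note on the macro used above (defined in RuleBookIterator.h, not shown
// in this commit): from its call sites, RULEBOOKITERATOR(X, Y) appears to
// loop over the batches of the rule book, binding rbB to each batch's device
// rule array and nHotB to that batch's rule count, executing X once per
// batch and Y between batches. A rough sketch of the assumed expansion
// (deviceRulesFor is a hypothetical helper):
//
//   for (auto &batch : _rules) {
//     Int *rbB = deviceRulesFor(batch); // device pointer to this batch
//     Int nHotB = batch.size() / 2;     // rules are (in, out) index pairs
//     X;
//     Y;
//   }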
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CUDA_MAXPOOLING_H
#define CUDA_MAXPOOLING_H
// NTX must be >=2 so r is filled properly
template <typename T, Int NTX, Int NTY>
__global__ void MaxPooling_fp(T *input_features, T *output_features,
Int nPlanes, Int input_stride, Int output_stride,
Int *rules, Int nHot) {
__shared__ Int r[NTY * 2];
for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
{
Int i = threadIdx.x + NTX * threadIdx.y;
if (i < NTY * 2 and i < 2 * (nHot - n))
r[i] = rules[2 * n + i];
}
__syncthreads();
if (n + threadIdx.y < nHot) {
Int i = r[2 * threadIdx.y] * input_stride;
Int o = r[2 * threadIdx.y + 1] * output_stride;
for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX) {
T inp = input_features[i + plane];
if (output_features[o + plane] < inp)
output_features[o + plane] = inp;
}
}
__syncthreads();
}
}
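// Above: each block stages up to NTY (inputRow, outputRow) rule pairs into
// the shared array r, then NTX threads per pair sweep the feature planes
// taking elementwise maxima. The 2 * NTY shared entries are written by
// threads with threadIdx.x + NTX * threadIdx.y < 2 * NTY, which is why NTX
// must be >= 2: with NTX == 1 only NTY of the 2 * NTY entries would ever be
// filled.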
template <typename T>
void cuda_MaxPooling_ForwardPass(T *input_features, T *output_features,
Int nPlanes, Int input_stride,
Int output_stride, Int *rules, Int nHot) {
MaxPooling_fp<T, 32, 32><<<32, dim3(32, 32)>>>(
input_features, output_features, nPlanes, input_stride, output_stride,
rules, nHot);
}
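// The launch configuration is fixed: a grid of 32 blocks, each of 32x32
// threads (NTX = NTY = 32). The grid-stride loop over n and the plane-stride
// loop inside the kernel make this correct for any nHot and nPlanes, at the
// cost of some idle threads when the workload is small.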
template <typename T, Int NTX, Int NTY>
__global__ void MaxPooling_bp(T *input_features, T *d_input_features,
T *output_features, T *d_output_features,
Int nPlanes, Int input_stride, Int output_stride,
Int *rules, Int nHot) {
__shared__ Int r[NTY * 2];
for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
{
Int i = threadIdx.x + NTX * threadIdx.y;
if (i < NTY * 2 and i < 2 * (nHot - n))
r[i] = rules[2 * n + i];
}
__syncthreads();
if (n + threadIdx.y < nHot) {
Int i = r[2 * threadIdx.y] * input_stride;
Int o = r[2 * threadIdx.y + 1] * output_stride;
for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX)
if (output_features[o + plane] == input_features[i + plane])
d_input_features[i + plane] += d_output_features[o + plane];
}
__syncthreads();
}
}
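// The backward kernel routes gradient only to inputs that equal the pooled
// maximum; if several inputs tie, each matching input accumulates the full
// output gradient (note the +=).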
template <typename T>
void cuda_MaxPooling_BackwardPass(T *input_features, T *d_input_features,
T *output_features, T *d_output_features,
Int nPlanes, Int input_stride,
Int output_stride, Int *rules, Int nHot) {
MaxPooling_bp<T, 32, 32><<<32, dim3(32, 32)>>>(
input_features, d_input_features, output_features, d_output_features,
nPlanes, input_stride, output_stride, rules, nHot);
}
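// Illustrative only -- not part of the library: a minimal host-side use of
// this header, assuming iF and oF point to device buffers with three feature
// planes each and that two (inputRow, outputRow) rule pairs are needed:
//
//   Int rulesHost[] = {0, 0, 1, 0}; // rows 0 and 1 both pool into row 0
//   Int *rulesDev;
//   cudaMalloc(&rulesDev, sizeof(rulesHost));
//   cudaMemcpy(rulesDev, rulesHost, sizeof(rulesHost),
//              cudaMemcpyHostToDevice);
//   cuda_MaxPooling_ForwardPass<float>(iF, oF, /*nPlanes=*/3,
//                                      /*input_stride=*/3,
//                                      /*output_stride=*/3, rulesDev,
//                                      /*nHot=*/2);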
#endif /* CUDA_MAXPOOLING_H */
@@ -4,8 +4,6 @@
 // This source code is licensed under the license found in the
 // LICENSE file in the root directory of this source tree.
-#include "Convolution.h"
 #include <algorithm>
 template <typename T>
...
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
template <typename T>
void cuda_SparseToDense_ForwardPass(T *input_features, T *output_features,
Int nPlanes, Int spatialVolume,
RuleBook _rules);
template <typename T>
void cuda_SparseToDense_BackwardPass(T *d_input_features, T *d_output_features,
Int nPlanes, Int spatialVolume,
RuleBook _rules);
template <typename T, Int Dimension>
void cuda_SparseToDense_updateOutput(
/*long*/ at::Tensor inputSize, Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features, long nPlanes) {
{
std::array<long, Dimension + 2> sz;
sz[0] = m.grids.begin()->second.size(); // batch size
sz[1] = nPlanes;
long *in_sz = inputSize.data<long>();
for (Int i = 0; i < Dimension; ++i)
sz[i + 2] = in_sz[i];
output_features.resize_(sz);
output_features.zero_();
}
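  // The dense output is laid out batch-first, channels-second:
  // {batchSize, nPlanes, spatialSize...}, matching the plane-major writes in
  // the CUDA kernel (stride spatialVolume between planes).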
if (input_features.ndimension() == 2) {
auto _rules = m.getSparseToDenseRuleBook(inputSize, true);
Int _nPlanes = input_features.size(1);
auto iF = input_features.data<T>();
auto oF = output_features.data<T>();
long spatialVolume = inputSize.prod().data<long>()[0];
cuda_SparseToDense_ForwardPass<T>(iF, oF, _nPlanes, spatialVolume, _rules);
}
}
template <typename T, Int Dimension>
void cuda_SparseToDense_updateGradInput(
/*long*/ at::Tensor inputSize, Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor d_input_features,
/*cuda float*/ at::Tensor d_output_features) {
d_input_features.resize_as_(input_features);
d_input_features.zero_();
if (input_features.ndimension() == 2) {
auto _rules = m.getSparseToDenseRuleBook(inputSize, true);
long spatialVolume = inputSize.prod().data<long>()[0];
Int _nPlanes = d_input_features.size(1);
auto diF = d_input_features.data<T>();
auto doF = d_output_features.data<T>();
cuda_SparseToDense_BackwardPass<T>(diF, doF, _nPlanes, spatialVolume,
_rules);
}
}
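// Forward scatters each active (sparse) row into its site in the dense
// tensor; backward gathers the corresponding gradients back into the sparse
// rows. spatialVolume is the number of sites per plane, i.e. the product of
// the spatial dimensions in inputSize.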
@@ -4,53 +4,66 @@
 // This source code is licensed under the license found in the
 // LICENSE file in the root directory of this source tree.
-#include "SparseToDense.h"
-
-template <typename T, Int Dimension>
-void cuda_SparseToDense_updateOutput(
-    /*long*/ at::Tensor inputSize, Metadata<Dimension> &m,
-    /*cuda float*/ at::Tensor input_features,
-    /*cuda float*/ at::Tensor output_features, long nPlanes) {
-  {
-    std::array<long, Dimension + 2> sz;
-    sz[0] = m.grids.begin()->second.size(); // batch size
-    sz[1] = nPlanes;
-    long *in_sz = inputSize.data<long>();
-    for (Int i = 0; i < Dimension; ++i)
-      sz[i + 2] = in_sz[i];
-    output_features.resize_(sz);
-    output_features.zero_();
-  }
-  if (input_features.ndimension() == 2) {
-    auto _rules = m.getSparseToDenseRuleBook(inputSize, true);
-    Int _nPlanes = input_features.size(1);
-    auto iF = input_features.data<T>();
-    auto oF = output_features.data<T>();
-    long spatialVolume = inputSize.prod().data<long>()[0];
-    RULEBOOKITERATOR(SparseToDense_ForwardPass<T>(iF, oF, _nPlanes,
-                                                  spatialVolume, rbB, nHotB);
-                     , oF += _nPlanes * spatialVolume;)
-  }
-}
-template <typename T, Int Dimension>
-void cuda_SparseToDense_updateGradInput(
-    /*long*/ at::Tensor inputSize, Metadata<Dimension> &m,
-    /*cuda float*/ at::Tensor input_features,
-    /*cuda float*/ at::Tensor d_input_features,
-    /*cuda float*/ at::Tensor d_output_features) {
-  d_input_features.resize_as_(input_features);
-  d_input_features.zero_();
-  if (input_features.ndimension() == 2) {
-    auto _rules = m.getSparseToDenseRuleBook(inputSize, true);
-    long spatialVolume = inputSize.prod().data<long>()[0];
-    Int _nPlanes = d_input_features.size(1);
-    auto diF = d_input_features.data<T>();
-    auto doF = d_output_features.data<T>();
-    RULEBOOKITERATOR(SparseToDense_BackwardPass<T>(diF, doF, _nPlanes,
-                                                   spatialVolume, rbB, nHotB);
-                     , doF += _nPlanes * spatialVolume;)
-  }
-}
+// NTX must be >=2 so r is filled properly
+template <typename T, Int NTX, Int NTY>
+__global__ void SparseToDense_fp(T *input_features, T *output_features,
+                                 Int nPlanes, Int spatialVolume, Int *rules,
+                                 Int nHot) {
+  __shared__ Int r[NTY * 2];
+  for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
+    {
+      Int i = threadIdx.x + NTX * threadIdx.y;
+      if (i < NTY * 2 and i < 2 * (nHot - n))
+        r[i] = rules[2 * n + i];
+    }
+    __syncthreads();
+    if (n + threadIdx.y < nHot) {
+      T *i = input_features + r[2 * threadIdx.y] * nPlanes;
+      T *o = output_features + r[2 * threadIdx.y + 1];
+      for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX)
+        o[plane * spatialVolume] = i[plane];
+    }
+    __syncthreads();
+  }
+}
+template <typename T>
+void cuda_SparseToDense_ForwardPass(T *input_features, T *output_features,
+                                    Int nPlanes, Int spatialVolume,
+                                    RuleBook _rules) {
+  RULEBOOKITERATOR((SparseToDense_fp<T, 32, 32><<<32, dim3(32, 32)>>>(
+      input_features, output_features, nPlanes, spatialVolume, rbB, nHotB));
+                   , output_features += nPlanes * spatialVolume;)
+}
+// NTX must be >=2 so r is filled properly
+template <typename T, Int NTX, Int NTY>
+__global__ void SparseToDense_bp(T *d_input_features, T *d_output_features,
+                                 Int nPlanes, Int spatialVolume, Int *rules,
+                                 Int nHot) {
+  __shared__ Int r[NTY * 2];
+  for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
+    {
+      Int i = threadIdx.x + NTX * threadIdx.y;
+      if (i < NTY * 2 and i < 2 * (nHot - n))
+        r[i] = rules[2 * n + i];
+    }
+    __syncthreads();
+    if (n + threadIdx.y < nHot) {
+      T *d_i = d_input_features + r[2 * threadIdx.y] * nPlanes;
+      T *d_o = d_output_features + r[2 * threadIdx.y + 1];
+      for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX)
+        d_i[plane] = d_o[plane * spatialVolume];
+    }
+    __syncthreads();
+  }
+}
+template <typename T>
+void cuda_SparseToDense_BackwardPass(T *d_input_features, T *d_output_features,
+                                     Int nPlanes, Int spatialVolume,
+                                     RuleBook _rules) {
+  RULEBOOKITERATOR((SparseToDense_bp<T, 32, 32><<<32, dim3(32, 32)>>>(
+      d_input_features, d_output_features, nPlanes, spatialVolume, rbB,
+      nHotB));
+                   , d_output_features += nPlanes * spatialVolume;)
+}
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CUDA_SPARSETODENSE_H
#define CUDA_SPARSETODENSE_H
// NTX must be >=2 so r is filled properly
template <typename T, Int NTX, Int NTY>
__global__ void SparseToDense_fp(T *input_features, T *output_features,
Int nPlanes, Int spatialVolume, Int *rules,
Int nHot) {
__shared__ Int r[NTY * 2];
for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
{
Int i = threadIdx.x + NTX * threadIdx.y;
if (i < NTY * 2 and i < 2 * (nHot - n))
r[i] = rules[2 * n + i];
}
__syncthreads();
if (n + threadIdx.y < nHot) {
T *i = input_features + r[2 * threadIdx.y] * nPlanes;
T *o = output_features + r[2 * threadIdx.y + 1];
for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX)
o[plane * spatialVolume] = i[plane];
}
__syncthreads();
}
}
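// Above: as in MaxPooling, each block stages up to NTY (sparseRow, denseSite)
// rule pairs in shared memory; threads then copy one feature vector per pair,
// writing plane p of a sparse row to offset p * spatialVolume past the dense
// site (channels-first layout).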
template <typename T>
void SparseToDense_ForwardPass(T *input_features, T *output_features,
Int nPlanes, Int spatialVolume, Int *rules,
Int nHot) {
SparseToDense_fp<T, 32, 32><<<32, dim3(32, 32)>>>(
input_features, output_features, nPlanes, spatialVolume, rules, nHot);
}
// NTX must be >=2 so r is filled properly
template <typename T, Int NTX, Int NTY>
__global__ void SparseToDense_bp(T *d_input_features, T *d_output_features,
Int nPlanes, Int spatialVolume, Int *rules,
Int nHot) {
__shared__ Int r[NTY * 2];
for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
{
Int i = threadIdx.x + NTX * threadIdx.y;
if (i < NTY * 2 and i < 2 * (nHot - n))
r[i] = rules[2 * n + i];
}
__syncthreads();
if (n + threadIdx.y < nHot) {
T *d_i = d_input_features + r[2 * threadIdx.y] * nPlanes;
T *d_o = d_output_features + r[2 * threadIdx.y + 1];
for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX)
d_i[plane] = d_o[plane * spatialVolume];
}
__syncthreads();
}
}
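// The backward copy is the exact transpose of the forward scatter: gradients
// are gathered from the dense grid back into the rows of the sparse gradient
// tensor.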
template <typename T>
void SparseToDense_BackwardPass(T *d_input_features, T *d_output_features,
Int nPlanes, Int spatialVolume, Int *rules,
Int nHot) {
SparseToDense_bp<T, 32, 32><<<32, dim3(32, 32)>>>(
d_input_features, d_output_features, nPlanes, spatialVolume, rules, nHot);
}
#endif /* CUDA_SPARSETODENSE_H */