Commit 66986767 authored by Benjamin Thomas Graham's avatar Benjamin Thomas Graham
Browse files

fixes

parent edf89af3
...@@ -17,14 +17,14 @@ void InputLayer_ForwardPass(T *input_features, T *output_features, Int nRows, ...@@ -17,14 +17,14 @@ void InputLayer_ForwardPass(T *input_features, T *output_features, Int nRows,
for (row = 0; row < nRows; row++) { for (row = 0; row < nRows; row++) {
auto nActive = rules[0]; auto nActive = rules[0];
T multiplier = (average and nActive > 0) ? (T)1 / nActive : (T)1; T multiplier = (average and nActive > 0) ? (T)1 / nActive : (T)1;
auto out_f = output_features + row * nPlanes;
auto r = rules + row * (1 + maxActive);
for (Int i = 1; i <= nActive; ++i) { for (Int i = 1; i <= nActive; ++i) {
auto in_f = input_features + nPlanes * rules[i]; auto in_f = input_features + r[i] * nPlanes;
for (Int plane = 0; plane < nPlanes; plane++) { for (Int plane = 0; plane < nPlanes; plane++) {
output_features[plane] += multiplier * in_f[plane]; out_f[plane] += multiplier * in_f[plane];
} }
} }
output_features += nPlanes;
rules += 1 + maxActive;
} }
} }
template <typename T> template <typename T>
...@@ -36,13 +36,13 @@ void InputLayer_BackwardPass(T *d_input_features, T *d_output_features, ...@@ -36,13 +36,13 @@ void InputLayer_BackwardPass(T *d_input_features, T *d_output_features,
for (row = 0; row < nRows; row++) { for (row = 0; row < nRows; row++) {
auto nActive = rules[0]; auto nActive = rules[0];
T multiplier = (average and nActive > 0) ? (T)1 / nActive : (T)1; T multiplier = (average and nActive > 0) ? (T)1 / nActive : (T)1;
auto d_out_f = d_output_features + row * nPlanes;
auto r = rules + row * (1 + maxActive);
for (Int i = 1; i <= nActive; ++i) { for (Int i = 1; i <= nActive; ++i) {
auto d_in_f = d_input_features + nPlanes * rules[i]; auto d_in_f = d_input_features + r[i] * nPlanes;
for (Int plane = 0; plane < nPlanes; plane++) for (Int plane = 0; plane < nPlanes; plane++)
d_in_f[plane] += multiplier * d_output_features[plane]; d_in_f[plane] += multiplier * d_out_f[plane];
} }
d_output_features += nPlanes;
rules += 1 + maxActive;
} }
} }
......
...@@ -5,10 +5,9 @@ ...@@ -5,10 +5,9 @@
// LICENSE file in the root directory of this source tree. // LICENSE file in the root directory of this source tree.
template <typename T> template <typename T>
void Convolution_fp_bias(T *of, T *b, Int nPlanes, Int nActiveOut); void Convolution_fp_bias(T *oF, T *b, Int nPlanes, Int nActive);
template <typename T> template <typename T>
void Convolution_bp_bias(T *matrix, T *target, Int nRows, Int nColumns, void Convolution_bp_bias(T *d_oF, T *d_b, Int nPlanes, Int nActive);
Int nCOLUMNS);
template <typename T> template <typename T>
double dConvolution_forward2(T *inFeatures, T *outFeatures, T *w, double dConvolution_forward2(T *inFeatures, T *outFeatures, T *w,
RuleBook _rules, Int input_nPlanes, RuleBook _rules, Int input_nPlanes,
...@@ -84,7 +83,7 @@ void cuda_Convolution_backward( ...@@ -84,7 +83,7 @@ void cuda_Convolution_backward(
if (d_bias.numel()) { if (d_bias.numel()) {
auto db = d_bias.data<T>(); auto db = d_bias.data<T>();
Convolution_bp_bias(doF, db, op, op, nActiveOut); Convolution_bp_bias(doF, db, op, nActiveOut);
} }
} }
} }
...@@ -147,7 +146,7 @@ void cuda_SubmanifoldConvolution_backward( ...@@ -147,7 +146,7 @@ void cuda_SubmanifoldConvolution_backward(
if (d_bias.numel()) { if (d_bias.numel()) {
auto db = d_bias.data<T>(); auto db = d_bias.data<T>();
Convolution_bp_bias(doF, db, op, op, nActive); Convolution_bp_bias(doF, db, op, nActive);
} }
} }
} }
...@@ -216,7 +215,7 @@ void cuda_FullConvolution_backward( ...@@ -216,7 +215,7 @@ void cuda_FullConvolution_backward(
if (d_bias.numel()) { if (d_bias.numel()) {
auto db = d_bias.data<T>(); auto db = d_bias.data<T>();
Convolution_bp_bias(doF, db, op, op, nActiveOut); Convolution_bp_bias(doF, db, op, nActiveOut);
} }
} }
} }
...@@ -283,7 +282,7 @@ void cuda_RandomizedStrideConvolution_backward( ...@@ -283,7 +282,7 @@ void cuda_RandomizedStrideConvolution_backward(
if (d_bias.numel()) { if (d_bias.numel()) {
auto db = d_bias.data<T>(); auto db = d_bias.data<T>();
Convolution_bp_bias(doF, db, op, op, nActiveOut); Convolution_bp_bias(doF, db, op, nActiveOut);
} }
} }
} }
...@@ -5,10 +5,11 @@ ...@@ -5,10 +5,11 @@
// LICENSE file in the root directory of this source tree. // LICENSE file in the root directory of this source tree.
#include "RuleBookIterator.h" #include "RuleBookIterator.h"
#define TACC double
template <typename T> template <typename T>
__global__ void Convolution_fp_bias_(T *output_features, T *bias, Int nPlanes, __global__ void Convolution_fp_bias_(T *output_features, T *bias, Int nPlanes,
Int nActive) { Int nActive) {
Int n = blockIdx.x * 32 + threadIdx.x; Int n = blockIdx.x * 32 + threadIdx.x;
T b = bias[n]; T b = bias[n];
output_features += n; output_features += n;
...@@ -21,41 +22,38 @@ template <typename T> ...@@ -21,41 +22,38 @@ template <typename T>
void Convolution_fp_bias(T *oF, T *b, Int nPlanes, Int nActive) { void Convolution_fp_bias(T *oF, T *b, Int nPlanes, Int nActive) {
if (nPlanes / 32 > 0) if (nPlanes / 32 > 0)
Convolution_fp_bias_<<<dim3(nPlanes / 32, 4096), 32>>>(oF, b, nPlanes, Convolution_fp_bias_<<<dim3(nPlanes / 32, 4096), 32>>>(oF, b, nPlanes,
nActive); nActive);
if (nPlanes % 32 > 0) { if (nPlanes % 32 > 0) {
Int o = nPlanes / 32 * 32; Int o = nPlanes / 32 * 32;
Convolution_fp_bias_<<<dim3(1, 4096), nPlanes - o>>>(oF + o, b + o, nPlanes, Convolution_fp_bias_<<<dim3(1, 4096), nPlanes - o>>>(oF + o, b + o, nPlanes,
nActive); nActive);
} }
} }
template <typename T> template <typename T>
__global__ void dColumnSum(T *matrix, T *target, Int nRows, Int nColumns, __global__ void Convolution_bp_bias_(T *d_oF, T *d_b, Int nPlanes, Int nActive) {
Int nCOLUMNS) { Int n = blockIdx.x * 32 + threadIdx.x;
Int i = blockIdx.x * 32 + threadIdx.x; d_oF+=n;
T t = 0; TACC t = 0;
for (Int j = blockIdx.y; j < nRows; j += 32) for (Int row = blockIdx.y; row < nActive; row += gridDim.y)
t += matrix[j * nCOLUMNS + i]; t += d_oF[row * nPlanes ];
atomicAdd(&target[i], t); atomicAdd(&d_b[n], t);
} }
template <typename T> template <typename T>
void Convolution_bp_bias(T *matrix, T *target, Int nRows, Int nColumns, void Convolution_bp_bias(T *d_oF, T *d_b, Int nPlanes, Int nActive) {
Int nCOLUMNS) { if (nPlanes / 32 > 0)
if (nColumns / 32 > 0) Convolution_bp_bias_<<<dim3(nPlanes / 32, 32), 32>>>(d_oF, d_b, nPlanes, nActive);
dColumnSum<<<dim3(nColumns / 32, 32), 32>>>(matrix, target, nRows, nColumns, if (nPlanes % 32 > 0) {
nCOLUMNS); Int o = nPlanes / 32 * 32;
if (nColumns % 32 > 0) { Convolution_bp_bias_<<<dim3(1, 32), nPlanes - o>>>(d_oF + o, d_b + o, nPlanes, nActive);
Int o = nColumns / 32 * 32;
dColumnSum<<<dim3(1, 32), nColumns - o>>>(matrix + o, target + o, nRows,
nColumns, nCOLUMNS);
} }
} }
template <typename T, Int K, Int V> template <typename T, Int K, Int V>
__global__ void __global__ void
dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules, dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride, Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) { Int output_nPlanes, Int output_stride) {
// nHot must be a multiple of K!! // nHot must be a multiple of K!!
// Input x Weight -> Output // Input x Weight -> Output
...@@ -70,7 +68,7 @@ dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules, ...@@ -70,7 +68,7 @@ dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules,
outFeatures += n * K; outFeatures += n * K;
w += n * K; w += n * K;
T O[V]; TACC O[V];
__shared__ T W[K][K]; __shared__ T W[K][K];
__shared__ T I[K][K]; __shared__ T I[K][K];
Int R0[V]; Int R0[V];
...@@ -90,31 +88,31 @@ dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules, ...@@ -90,31 +88,31 @@ dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules,
for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) { for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll #pragma unroll
for (int v = 0; v < V; v++) { for (int v = 0; v < V; v++) {
R0[v] = rules[2 * (s + ty[v])]; R0[v] = rules[2 * (s + ty[v])];
R1[v] = rules[2 * (s + ty[v]) + 1]; R1[v] = rules[2 * (s + ty[v]) + 1];
} }
__syncthreads(); __syncthreads();
// Read input, reset O[] // Read input, reset O[]
#pragma unroll #pragma unroll
for (int v = 0; v < V; v++) { for (int v = 0; v < V; v++) {
I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx]; I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
O[v] = 0; O[v] = 0;
} }
__syncthreads(); __syncthreads();
#pragma unroll #pragma unroll
for (int k = 0; k < K; k++) for (int k = 0; k < K; k++)
#pragma unroll #pragma unroll
for (int v = 0; v < V; v++) for (int v = 0; v < V; v++)
O[v] += I[ty[v]][k] * W[k][tx]; O[v] += I[ty[v]][k] * W[k][tx];
#pragma unroll #pragma unroll
for (int v = 0; v < V; v++) for (int v = 0; v < V; v++)
O[v] += outFeatures[R1[v] * output_stride + tx]; O[v] += outFeatures[R1[v] * output_stride + tx];
#pragma unroll #pragma unroll
for (int v = 0; v < V; v++) for (int v = 0; v < V; v++)
outFeatures[R1[v] * output_stride + tx] = O[v]; outFeatures[R1[v] * output_stride + tx] = O[v];
__syncthreads(); __syncthreads();
} }
w += K * output_nPlanes; w += K * output_nPlanes;
...@@ -124,8 +122,8 @@ dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules, ...@@ -124,8 +122,8 @@ dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules,
template <typename T, Int K, Int V> template <typename T, Int K, Int V>
__global__ void __global__ void
dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules, dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride, Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) { Int output_nPlanes, Int output_stride) {
// Input x Weight -> Output // Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks // blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// K is a multiple of V, // K is a multiple of V,
...@@ -138,7 +136,7 @@ dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules, ...@@ -138,7 +136,7 @@ dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules,
outFeatures += n * K; outFeatures += n * K;
w += n * K; w += n * K;
T O[V]; TACC O[V];
__shared__ T W[K][K]; __shared__ T W[K][K];
__shared__ T I[K][K]; __shared__ T I[K][K];
Int R0[V]; Int R0[V];
...@@ -158,36 +156,36 @@ dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules, ...@@ -158,36 +156,36 @@ dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules,
for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) { for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll #pragma unroll
for (int v = 0; v < V; v++) { for (int v = 0; v < V; v++) {
if (s + ty[v] < nHot) { if (s + ty[v] < nHot) {
R0[v] = rules[2 * (s + ty[v])]; R0[v] = rules[2 * (s + ty[v])];
R1[v] = rules[2 * (s + ty[v]) + 1]; R1[v] = rules[2 * (s + ty[v]) + 1];
} }
} }
__syncthreads(); __syncthreads();
// Read input, reset O[] // Read input, reset O[]
#pragma unroll #pragma unroll
for (int v = 0; v < V; v++) { for (int v = 0; v < V; v++) {
if (s + ty[v] < nHot) if (s + ty[v] < nHot)
I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx]; I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
O[v] = 0; O[v] = 0;
} }
__syncthreads(); __syncthreads();
#pragma unroll #pragma unroll
for (int k = 0; k < K; k++) for (int k = 0; k < K; k++)
#pragma unroll #pragma unroll
for (int v = 0; v < V; v++) for (int v = 0; v < V; v++)
O[v] += I[ty[v]][k] * W[k][tx]; O[v] += I[ty[v]][k] * W[k][tx];
#pragma unroll #pragma unroll
for (int v = 0; v < V; v++) for (int v = 0; v < V; v++)
if (s + ty[v] < nHot) if (s + ty[v] < nHot)
O[v] += outFeatures[R1[v] * output_stride + tx]; O[v] += outFeatures[R1[v] * output_stride + tx];
#pragma unroll #pragma unroll
for (int v = 0; v < V; v++) for (int v = 0; v < V; v++)
if (s + ty[v] < nHot) if (s + ty[v] < nHot)
outFeatures[R1[v] * output_stride + tx] = O[v]; outFeatures[R1[v] * output_stride + tx] = O[v];
__syncthreads(); __syncthreads();
} }
w += K * output_nPlanes; w += K * output_nPlanes;
...@@ -200,24 +198,24 @@ dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules, ...@@ -200,24 +198,24 @@ dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules,
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \ if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
Int o = (nHot / K) * K; \ Int o = (nHot / K) * K; \
if (o >= K) \ if (o >= K) \
dConvolution_KMxKN_forwardA< \ dConvolution_KMxKN_forwardA< \
T, K, V><<<dim3(std::min(o / K, (Int)512), output_nPlanes / K), \ T, K, V><<<dim3(std::min(o / K, (Int)512), output_nPlanes / K), \
dim3(K, K / V)>>>(inFeatures, outFeatures, w, rules, o, \ dim3(K, K / V)>>>(inFeatures, outFeatures, w, rules, o, \
input_nPlanes, input_stride, \ input_nPlanes, input_stride, \
output_nPlanes, output_stride); \ output_nPlanes, output_stride); \
if (nHot > o) \ if (nHot > o) \
dConvolution_KMxKN_forwardB< \ dConvolution_KMxKN_forwardB< \
T, K, V><<<dim3(1, output_nPlanes / K), dim3(K, K / V)>>>( \ T, K, V><<<dim3(1, output_nPlanes / K), dim3(K, K / V)>>>( \
inFeatures, outFeatures, w, rules + 2 * o, nHot - o, \ inFeatures, outFeatures, w, rules + 2 * o, nHot - o, \
input_nPlanes, input_stride, output_nPlanes, output_stride); \ input_nPlanes, input_stride, output_nPlanes, output_stride); \
return; \ return; \
} \ } \
} }
template <typename T> template <typename T>
void dConvolution_forward(T *inFeatures, T *outFeatures, T *w, Int *rules, void dConvolution_forward(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride, Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) { Int output_nPlanes, Int output_stride) {
FOO(T, 64, 16) FOO(T, 64, 16)
FOO(T, 32, 8) FOO(T, 32, 8)
FOO(T, 16, 4) FOO(T, 16, 4)
...@@ -226,9 +224,9 @@ void dConvolution_forward(T *inFeatures, T *outFeatures, T *w, Int *rules, ...@@ -226,9 +224,9 @@ void dConvolution_forward(T *inFeatures, T *outFeatures, T *w, Int *rules,
} }
template <> template <>
void dConvolution_forward<double>(double *inFeatures, double *outFeatures, void dConvolution_forward<double>(double *inFeatures, double *outFeatures,
double *w, Int *rules, Int nHot, double *w, Int *rules, Int nHot,
Int input_nPlanes, Int input_stride, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) { Int output_nPlanes, Int output_stride) {
FOO(double, 32, 8) FOO(double, 32, 8)
FOO(double, 16, 4) FOO(double, 16, 4)
FOO(double, 8, 2) FOO(double, 8, 2)
...@@ -242,9 +240,9 @@ void dConvolution_forward<double>(double *inFeatures, double *outFeatures, ...@@ -242,9 +240,9 @@ void dConvolution_forward<double>(double *inFeatures, double *outFeatures,
template <typename T, Int K, Int V> template <typename T, Int K, Int V>
__global__ void __global__ void
dConvolution_KMxKN_backward_dW_A(T *inFeatures, T *dInFeatures, T *dOutFeatures, dConvolution_KMxKN_backward_dW_A(T *inFeatures, T *dInFeatures, T *dOutFeatures,
T *w, T *dw, Int *rules, Int nHot, T *w, T *dw, Int *rules, Int nHot,
Int input_nPlanes, Int input_stride, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) { Int output_nPlanes, Int output_stride) {
// M = gridDim.y == input_nPlanes / K // M = gridDim.y == input_nPlanes / K
Int N = output_nPlanes / K; Int N = output_nPlanes / K;
Int m = blockIdx.y; Int m = blockIdx.y;
...@@ -253,8 +251,8 @@ dConvolution_KMxKN_backward_dW_A(T *inFeatures, T *dInFeatures, T *dOutFeatures, ...@@ -253,8 +251,8 @@ dConvolution_KMxKN_backward_dW_A(T *inFeatures, T *dInFeatures, T *dOutFeatures,
w += m * K * output_nPlanes; w += m * K * output_nPlanes;
dw += m * K * output_nPlanes; dw += m * K * output_nPlanes;
T dI[V]; TACC dI[V];
T dW[V]; TACC dW[V];
__shared__ T I[K][K]; __shared__ T I[K][K];
__shared__ T dO[K][K]; __shared__ T dO[K][K];
__shared__ T W[K][K]; __shared__ T W[K][K];
...@@ -277,31 +275,31 @@ dConvolution_KMxKN_backward_dW_A(T *inFeatures, T *dInFeatures, T *dOutFeatures, ...@@ -277,31 +275,31 @@ dConvolution_KMxKN_backward_dW_A(T *inFeatures, T *dInFeatures, T *dOutFeatures,
for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) { for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll #pragma unroll
for (int v = 0; v < V; v++) { for (int v = 0; v < V; v++) {
R0[v] = rules[2 * (s + ty[v])]; R0[v] = rules[2 * (s + ty[v])];
R1[v] = rules[2 * (s + ty[v]) + 1]; R1[v] = rules[2 * (s + ty[v]) + 1];
dI[v] = 0; dI[v] = 0;
} }
__syncthreads(); __syncthreads();
// Read input and dOutput // Read input and dOutput
#pragma unroll #pragma unroll
for (int v = 0; v < V; v++) { for (int v = 0; v < V; v++) {
I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx]; I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx]; dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx];
} }
__syncthreads(); __syncthreads();
#pragma unroll #pragma unroll
for (int k = 0; k < K; k++) for (int k = 0; k < K; k++)
#pragma unroll #pragma unroll
for (int v = 0; v < V; v++) { for (int v = 0; v < V; v++) {
dI[v] += dO[ty[v]][k] * W[tx][k]; dI[v] += dO[ty[v]][k] * W[tx][k];
dW[v] += I[k][ty[v]] * dO[k][tx]; dW[v] += I[k][ty[v]] * dO[k][tx];
} }
#pragma unroll #pragma unroll
for (int v = 0; v < V; v++) for (int v = 0; v < V; v++)
dI[v] += dInFeatures[R0[v] * input_stride + tx]; dI[v] += dInFeatures[R0[v] * input_stride + tx];
#pragma unroll #pragma unroll
for (int v = 0; v < V; v++) for (int v = 0; v < V; v++)
dInFeatures[R0[v] * input_stride + tx] = dI[v]; dInFeatures[R0[v] * input_stride + tx] = dI[v];
__syncthreads(); __syncthreads();
} }
#pragma unroll #pragma unroll
...@@ -319,9 +317,9 @@ dConvolution_KMxKN_backward_dW_A(T *inFeatures, T *dInFeatures, T *dOutFeatures, ...@@ -319,9 +317,9 @@ dConvolution_KMxKN_backward_dW_A(T *inFeatures, T *dInFeatures, T *dOutFeatures,
template <typename T, Int K, Int V> template <typename T, Int K, Int V>
__global__ void __global__ void
dConvolution_KMxKN_backward_dW_B(T *inFeatures, T *dInFeatures, T *dOutFeatures, dConvolution_KMxKN_backward_dW_B(T *inFeatures, T *dInFeatures, T *dOutFeatures,
T *w, T *dw, Int *rules, Int nHot, T *w, T *dw, Int *rules, Int nHot,
Int input_nPlanes, Int input_stride, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) { Int output_nPlanes, Int output_stride) {
// M = gridDim.y == input_nPlanes / K // M = gridDim.y == input_nPlanes / K
Int N = output_nPlanes / K; Int N = output_nPlanes / K;
Int m = blockIdx.y; Int m = blockIdx.y;
...@@ -330,8 +328,8 @@ dConvolution_KMxKN_backward_dW_B(T *inFeatures, T *dInFeatures, T *dOutFeatures, ...@@ -330,8 +328,8 @@ dConvolution_KMxKN_backward_dW_B(T *inFeatures, T *dInFeatures, T *dOutFeatures,
w += m * K * output_nPlanes; w += m * K * output_nPlanes;
dw += m * K * output_nPlanes; dw += m * K * output_nPlanes;
T dI[V]; TACC dI[V];
T dW[V]; TACC dW[V];
__shared__ T I[K][K]; __shared__ T I[K][K];
__shared__ T dO[K][K]; __shared__ T dO[K][K];
__shared__ T W[K][K]; __shared__ T W[K][K];
...@@ -354,39 +352,39 @@ dConvolution_KMxKN_backward_dW_B(T *inFeatures, T *dInFeatures, T *dOutFeatures, ...@@ -354,39 +352,39 @@ dConvolution_KMxKN_backward_dW_B(T *inFeatures, T *dInFeatures, T *dOutFeatures,
for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) { for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll #pragma unroll
for (int v = 0; v < V; v++) { for (int v = 0; v < V; v++) {
if (s + ty[v] < nHot) { if (s + ty[v] < nHot) {
R0[v] = rules[2 * (s + ty[v])]; R0[v] = rules[2 * (s + ty[v])];
R1[v] = rules[2 * (s + ty[v]) + 1]; R1[v] = rules[2 * (s + ty[v]) + 1];
} }
dI[v] = 0; dI[v] = 0;
} }
__syncthreads(); __syncthreads();
// Read input and dOutput // Read input and dOutput
#pragma unroll #pragma unroll
for (int v = 0; v < V; v++) for (int v = 0; v < V; v++)
if (s + ty[v] < nHot) { if (s + ty[v] < nHot) {
I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx]; I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx]; dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx];
} else { } else {
I[ty[v]][tx] = 0; I[ty[v]][tx] = 0;
dO[ty[v]][tx] = 0; dO[ty[v]][tx] = 0;
} }
__syncthreads(); __syncthreads();
#pragma unroll #pragma unroll
for (int k = 0; k < K; k++) for (int k = 0; k < K; k++)
#pragma unroll #pragma unroll
for (int v = 0; v < V; v++) { for (int v = 0; v < V; v++) {
dI[v] += dO[ty[v]][k] * W[tx][k]; dI[v] += dO[ty[v]][k] * W[tx][k];
dW[v] += I[k][ty[v]] * dO[k][tx]; dW[v] += I[k][ty[v]] * dO[k][tx];
} }
#pragma unroll #pragma unroll
for (int v = 0; v < V; v++) for (int v = 0; v < V; v++)
if (s + ty[v] < nHot) if (s + ty[v] < nHot)
dI[v] += dInFeatures[R0[v] * input_stride + tx]; dI[v] += dInFeatures[R0[v] * input_stride + tx];
#pragma unroll #pragma unroll
for (int v = 0; v < V; v++) for (int v = 0; v < V; v++)
if (s + ty[v] < nHot) if (s + ty[v] < nHot)
dInFeatures[R0[v] * input_stride + tx] = dI[v]; dInFeatures[R0[v] * input_stride + tx] = dI[v];
__syncthreads(); __syncthreads();
} }
#pragma unroll #pragma unroll
...@@ -403,26 +401,26 @@ dConvolution_KMxKN_backward_dW_B(T *inFeatures, T *dInFeatures, T *dOutFeatures, ...@@ -403,26 +401,26 @@ dConvolution_KMxKN_backward_dW_B(T *inFeatures, T *dInFeatures, T *dOutFeatures,
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \ if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
Int o = (nHot / K) * K; \ Int o = (nHot / K) * K; \
if (o >= K) \ if (o >= K) \
dConvolution_KMxKN_backward_dW_A< \ dConvolution_KMxKN_backward_dW_A< \
T, K, V><<<dim3(std::min(o / K, (Int)512), input_nPlanes / K), \ T, K, V><<<dim3(std::min(o / K, (Int)512), input_nPlanes / K), \
dim3(K, K / V)>>>( \ dim3(K, K / V)>>>( \
inFeatures, dInFeatures, dOutFeatures, w, dw, rules, o, \ inFeatures, dInFeatures, dOutFeatures, w, dw, rules, o, \
input_nPlanes, input_stride, output_nPlanes, output_stride); \ input_nPlanes, input_stride, output_nPlanes, output_stride); \
if (nHot > o) \ if (nHot > o) \
dConvolution_KMxKN_backward_dW_B< \ dConvolution_KMxKN_backward_dW_B< \
T, K, V><<<dim3(1, input_nPlanes / K), dim3(K, K / V)>>>( \ T, K, V><<<dim3(1, input_nPlanes / K), dim3(K, K / V)>>>( \
inFeatures, dInFeatures, dOutFeatures, w, dw, rules + 2 * o, \ inFeatures, dInFeatures, dOutFeatures, w, dw, rules + 2 * o, \
nHot - o, input_nPlanes, input_stride, output_nPlanes, \ nHot - o, input_nPlanes, input_stride, output_nPlanes, \
output_stride); \ output_stride); \
return; \ return; \
} \ } \
} }
template <typename T> template <typename T>
void dConvolution_backward_dW(T *inFeatures, T *dInFeatures, T *dOutFeatures, void dConvolution_backward_dW(T *inFeatures, T *dInFeatures, T *dOutFeatures,
T *w, T *dw, Int *rules, Int nHot, T *w, T *dw, Int *rules, Int nHot,
Int input_nPlanes, Int input_stride, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) { Int output_nPlanes, Int output_stride) {
FOO(T, 32, 8) FOO(T, 32, 8)
FOO(T, 16, 4) FOO(T, 16, 4)
FOO(T, 8, 2) FOO(T, 8, 2)
...@@ -433,8 +431,8 @@ void dConvolution_backward_dW(T *inFeatures, T *dInFeatures, T *dOutFeatures, ...@@ -433,8 +431,8 @@ void dConvolution_backward_dW(T *inFeatures, T *dInFeatures, T *dOutFeatures,
template <typename T, Int K, Int V> template <typename T, Int K, Int V>
__global__ void __global__ void
dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules, dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride, Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) { Int output_nPlanes, Int output_stride) {
// Input x Weight -> Output // Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks // blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// K is a multiple of V, // K is a multiple of V,
...@@ -449,7 +447,7 @@ dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules, ...@@ -449,7 +447,7 @@ dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
w += n * K; w += n * K;
Int KO = min(K, output_nPlanes - K * n); Int KO = min(K, output_nPlanes - K * n);
T O[V]; TACC O[V];
__shared__ T W[K][K]; __shared__ T W[K][K];
__shared__ T I[K][K]; __shared__ T I[K][K];
__shared__ Int R[K * 2]; __shared__ Int R[K * 2];
...@@ -466,40 +464,40 @@ dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules, ...@@ -466,40 +464,40 @@ dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
#pragma unroll #pragma unroll
for (int v = 0; v < V; v++) for (int v = 0; v < V; v++)
if (ty[v] < KI and tx < KO) if (ty[v] < KI and tx < KO)
W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx]; W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) { for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
// Read rules for K input/output pairs // Read rules for K input/output pairs
#pragma unroll #pragma unroll
for (int v = 0; v < V; v++) { for (int v = 0; v < V; v++) {
if (ty[v] < 2) { if (ty[v] < 2) {
int q = ty[v] * K + tx; int q = ty[v] * K + tx;
if (s + q / 2 < nHot) if (s + q / 2 < nHot)
R[q] = rules[2 * s + q]; R[q] = rules[2 * s + q];
} }
} }
__syncthreads(); __syncthreads();
// Read input, reset O[] // Read input, reset O[]
#pragma unroll #pragma unroll
for (int v = 0; v < V; v++) { for (int v = 0; v < V; v++) {
if (tx < KI and s + ty[v] < nHot) if (tx < KI and s + ty[v] < nHot)
I[ty[v]][tx] = inFeatures[R[2 * ty[v]] * input_stride + tx]; I[ty[v]][tx] = inFeatures[R[2 * ty[v]] * input_stride + tx];
O[v] = 0; O[v] = 0;
} }
__syncthreads(); __syncthreads();
#pragma unroll #pragma unroll
for (int k = 0; k < KI; k++) for (int k = 0; k < KI; k++)
#pragma unroll #pragma unroll
for (int v = 0; v < V; v++) for (int v = 0; v < V; v++)
O[v] += I[ty[v]][k] * W[k][tx]; O[v] += I[ty[v]][k] * W[k][tx];
__syncthreads(); __syncthreads();
#pragma unroll #pragma unroll
for (int v = 0; v < V; v++) for (int v = 0; v < V; v++)
if (tx < KO and s + ty[v] < nHot) if (tx < KO and s + ty[v] < nHot)
outFeatures[R[2 * ty[v] + 1] * output_stride + tx] += O[v]; outFeatures[R[2 * ty[v] + 1] * output_stride + tx] += O[v];
__syncthreads(); __syncthreads();
} }
w += K * output_nPlanes; w += K * output_nPlanes;
...@@ -513,9 +511,9 @@ dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules, ...@@ -513,9 +511,9 @@ dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
template <typename T, Int K, Int V> template <typename T, Int K, Int V>
__global__ void __global__ void
dConvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures, dConvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
T *w, T *dw, Int *rules, Int nHot, T *w, T *dw, Int *rules, Int nHot,
Int input_nPlanes, Int input_stride, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) { Int output_nPlanes, Int output_stride) {
// M = gridDim.y == input_nPlanes / K // M = gridDim.y == input_nPlanes / K
Int N = (output_nPlanes + K - 1) / K; Int N = (output_nPlanes + K - 1) / K;
Int m = blockIdx.y; Int m = blockIdx.y;
...@@ -525,8 +523,8 @@ dConvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures, ...@@ -525,8 +523,8 @@ dConvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
dw += m * K * output_nPlanes; dw += m * K * output_nPlanes;
Int KI = min(K, input_nPlanes - K * m); Int KI = min(K, input_nPlanes - K * m);
T dI[V]; TACC dI[V];
T dW[V]; TACC dW[V];
__shared__ T I[K][K]; __shared__ T I[K][K];
__shared__ T dO[K][K]; __shared__ T dO[K][K];
__shared__ T W[K][K]; __shared__ T W[K][K];
...@@ -544,7 +542,7 @@ dConvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures, ...@@ -544,7 +542,7 @@ dConvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
#pragma unroll #pragma unroll
for (int v = 0; v < V; v++) { for (int v = 0; v < V; v++) {
if (ty[v] < KI and tx < KO) if (ty[v] < KI and tx < KO)
W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx]; W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
dW[v] = 0; dW[v] = 0;
} }
...@@ -552,48 +550,48 @@ dConvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures, ...@@ -552,48 +550,48 @@ dConvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
// Read rules for K input/output pairs, reset dI[] // Read rules for K input/output pairs, reset dI[]
#pragma unroll #pragma unroll
for (int v = 0; v < V; v++) { for (int v = 0; v < V; v++) {
if (ty[v] < 2) { if (ty[v] < 2) {
int q = ty[v] * K + tx; int q = ty[v] * K + tx;
if (s + q / 2 < nHot) if (s + q / 2 < nHot)
R[q] = rules[2 * s + q]; R[q] = rules[2 * s + q];
} }
dI[v] = 0; dI[v] = 0;
} }
__syncthreads(); __syncthreads();
// Read input and dOutput // Read input and dOutput
#pragma unroll #pragma unroll
for (int v = 0; v < V; v++) { for (int v = 0; v < V; v++) {
if (tx < KI and s + ty[v] < nHot) if (tx < KI and s + ty[v] < nHot)
I[ty[v]][tx] = inFeatures[R[2 * ty[v]] * input_stride + tx]; I[ty[v]][tx] = inFeatures[R[2 * ty[v]] * input_stride + tx];
else else
I[ty[v]][tx] = 0; I[ty[v]][tx] = 0;
if (tx < KO and s + ty[v] < nHot) if (tx < KO and s + ty[v] < nHot)
dO[ty[v]][tx] = dOutFeatures[R[2 * ty[v] + 1] * output_stride + tx]; dO[ty[v]][tx] = dOutFeatures[R[2 * ty[v] + 1] * output_stride + tx];
else else
dO[ty[v]][tx] = 0; dO[ty[v]][tx] = 0;
} }
__syncthreads(); __syncthreads();
#pragma unroll #pragma unroll
for (int k = 0; k < KO; k++) for (int k = 0; k < KO; k++)
#pragma unroll #pragma unroll
for (int v = 0; v < V; v++) for (int v = 0; v < V; v++)
dI[v] += dO[ty[v]][k] * W[tx][k]; dI[v] += dO[ty[v]][k] * W[tx][k];
#pragma unroll #pragma unroll
for (int k = 0; k < K; k++) for (int k = 0; k < K; k++)
#pragma unroll #pragma unroll
for (int v = 0; v < V; v++) for (int v = 0; v < V; v++)
dW[v] += I[k][ty[v]] * dO[k][tx]; dW[v] += I[k][ty[v]] * dO[k][tx];
__syncthreads(); __syncthreads();
#pragma unroll #pragma unroll
for (int v = 0; v < V; v++) for (int v = 0; v < V; v++)
if (tx < KI and s + ty[v] < nHot) if (tx < KI and s + ty[v] < nHot)
dInFeatures[R[2 * ty[v]] * input_stride + tx] += dI[v]; dInFeatures[R[2 * ty[v]] * input_stride + tx] += dI[v];
__syncthreads(); __syncthreads();
} }
#pragma unroll #pragma unroll
for (int v = 0; v < V; v++) for (int v = 0; v < V; v++)
if (ty[v] < KI and tx < KO) if (ty[v] < KI and tx < KO)
atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]); atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
w += K; w += K;
dw += K; dw += K;
dOutFeatures += K; dOutFeatures += K;
...@@ -602,51 +600,52 @@ dConvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures, ...@@ -602,51 +600,52 @@ dConvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
template <typename T> template <typename T>
double dConvolution_forward2(T *inFeatures, T *outFeatures, T *w, double dConvolution_forward2(T *inFeatures, T *outFeatures, T *w,
RuleBook _rules, Int input_nPlanes, RuleBook _rules, Int input_nPlanes,
Int input_stride, Int output_nPlanes, Int input_stride, Int output_nPlanes,
Int output_stride) { Int output_stride) {
Int c = input_nPlanes * output_nPlanes; Int c = input_nPlanes * output_nPlanes;
double flops = 0; double flops = 0;
if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) { if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) {
const int K = 16; const int K = 16;
const int V = 4; const int V = 4;
RULEBOOKITERATOR( RULEBOOKITERATOR(
(dConvolution_KMxKN_forward2< (dConvolution_KMxKN_forward2<
T, K, T, K,
V><<<dim3(128, (output_nPlanes + K - 1) / K), dim3(K, K / V)>>>( V><<<dim3(128, (output_nPlanes + K - 1) / K), dim3(K, K / V)>>>(
inFeatures, outFeatures, w, rbB, nHotB, input_nPlanes, input_stride, inFeatures, outFeatures, w, rbB, nHotB, input_nPlanes, input_stride,
output_nPlanes, output_stride)); output_nPlanes, output_stride));
, w += c; flops += nHotB * c;) , w += c; flops += nHotB * c;)
} else { } else {
RULEBOOKITERATOR(dConvolution_forward(inFeatures, outFeatures, w, rbB, RULEBOOKITERATOR(dConvolution_forward(inFeatures, outFeatures, w, rbB,
nHotB, input_nPlanes, input_stride, nHotB, input_nPlanes, input_stride,
output_nPlanes, output_stride); output_nPlanes, output_stride);
, w += c; flops += nHotB * c;) , w += c; flops += nHotB * c;)
} }
return flops; return flops;
} }
template <typename T> template <typename T>
void dConvolution_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures, void dConvolution_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
T *w, T *dw, RuleBook _rules, Int input_nPlanes, T *w, T *dw, RuleBook _rules, Int input_nPlanes,
Int input_stride, Int output_nPlanes, Int input_stride, Int output_nPlanes,
Int output_stride) { Int output_stride) {
Int c = input_nPlanes * output_nPlanes; Int c = input_nPlanes * output_nPlanes;
if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) { if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) {
const int K = 16; const int K = 16;
const int V = 4; const int V = 4;
RULEBOOKITERATOR( RULEBOOKITERATOR(
(dConvolution_KMxKN_backward_dW2< (dConvolution_KMxKN_backward_dW2<
T, K, T, K,
V><<<dim3(128, (input_nPlanes + K - 1) / K), dim3(K, K / V)>>>( V><<<dim3(128, (input_nPlanes + K - 1) / K), dim3(K, K / V)>>>(
inFeatures, dInFeatures, dOutFeatures, w, dw, rbB, nHotB, inFeatures, dInFeatures, dOutFeatures, w, dw, rbB, nHotB,
input_nPlanes, input_stride, output_nPlanes, output_stride)); input_nPlanes, input_stride, output_nPlanes, output_stride));
, w += c; dw += c;) , w += c; dw += c;)
} else { } else {
RULEBOOKITERATOR(dConvolution_backward_dW(inFeatures, dInFeatures, RULEBOOKITERATOR(dConvolution_backward_dW(inFeatures, dInFeatures,
dOutFeatures, w, dw, rbB, nHotB, dOutFeatures, w, dw, rbB, nHotB,
input_nPlanes, input_stride, input_nPlanes, input_stride,
output_nPlanes, output_stride); output_nPlanes, output_stride);
, w += c; dw += c;) , w += c; dw += c;)
} }
} }
#undef TACC
\ No newline at end of file
...@@ -78,7 +78,7 @@ void cuda_Deconvolution_backward( ...@@ -78,7 +78,7 @@ void cuda_Deconvolution_backward(
dDeconvolution_backward_dW2<T>(iF, diF, doF, w, dw, _rules, ip, ip, op, op); dDeconvolution_backward_dW2<T>(iF, diF, doF, w, dw, _rules, ip, ip, op, op);
if (d_bias.numel()) { if (d_bias.numel()) {
auto db = d_bias.data<T>(); auto db = d_bias.data<T>();
Convolution_bp_bias(doF, db, op, op, nActiveOut); Convolution_bp_bias(doF, db, op, nActiveOut);
} }
} }
} }
...@@ -4,11 +4,13 @@ ...@@ -4,11 +4,13 @@
// This source code is licensed under the license found in the // This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree. // LICENSE file in the root directory of this source tree.
#define TACC double
template <typename T, Int K, Int V> template <typename T, Int K, Int V>
__global__ void __global__ void
dDeconvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules, dDeconvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride, Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) { Int output_nPlanes, Int output_stride) {
// nHot must be a multiple of K!! // nHot must be a multiple of K!!
// Input x Weight -> Output // Input x Weight -> Output
...@@ -23,7 +25,7 @@ dDeconvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules, ...@@ -23,7 +25,7 @@ dDeconvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules,
outFeatures += n * K; outFeatures += n * K;
w += n * K; w += n * K;
T O[V]; TACC O[V];
__shared__ T W[K][K]; __shared__ T W[K][K];
__shared__ T I[K][K]; __shared__ T I[K][K];
Int R0[V]; Int R0[V];
...@@ -43,31 +45,31 @@ dDeconvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules, ...@@ -43,31 +45,31 @@ dDeconvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules,
for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) { for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll #pragma unroll
for (int v = 0; v < V; v++) { for (int v = 0; v < V; v++) {
R1[v] = rules[2 * (s + ty[v])]; R1[v] = rules[2 * (s + ty[v])];
R0[v] = rules[2 * (s + ty[v]) + 1]; R0[v] = rules[2 * (s + ty[v]) + 1];
} }
__syncthreads(); __syncthreads();
// Read input, reset O[] // Read input, reset O[]
#pragma unroll #pragma unroll
for (int v = 0; v < V; v++) { for (int v = 0; v < V; v++) {
I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx]; I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
O[v] = 0; O[v] = 0;
} }
__syncthreads(); __syncthreads();
#pragma unroll #pragma unroll
for (int k = 0; k < K; k++) for (int k = 0; k < K; k++)
#pragma unroll #pragma unroll
for (int v = 0; v < V; v++) for (int v = 0; v < V; v++)
O[v] += I[ty[v]][k] * W[k][tx]; O[v] += I[ty[v]][k] * W[k][tx];
#pragma unroll #pragma unroll
for (int v = 0; v < V; v++) for (int v = 0; v < V; v++)
O[v] += outFeatures[R1[v] * output_stride + tx]; O[v] += outFeatures[R1[v] * output_stride + tx];
#pragma unroll #pragma unroll
for (int v = 0; v < V; v++) for (int v = 0; v < V; v++)
outFeatures[R1[v] * output_stride + tx] = O[v]; outFeatures[R1[v] * output_stride + tx] = O[v];
__syncthreads(); __syncthreads();
} }
w += K * output_nPlanes; w += K * output_nPlanes;
...@@ -77,8 +79,8 @@ dDeconvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules, ...@@ -77,8 +79,8 @@ dDeconvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules,
template <typename T, Int K, Int V> template <typename T, Int K, Int V>
__global__ void __global__ void
dDeconvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules, dDeconvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride, Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) { Int output_nPlanes, Int output_stride) {
// Input x Weight -> Output // Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks // blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// K is a multiple of V, // K is a multiple of V,
...@@ -91,7 +93,7 @@ dDeconvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules, ...@@ -91,7 +93,7 @@ dDeconvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules,
outFeatures += n * K; outFeatures += n * K;
w += n * K; w += n * K;
T O[V]; TACC O[V];
__shared__ T W[K][K]; __shared__ T W[K][K];
__shared__ T I[K][K]; __shared__ T I[K][K];
Int R0[V]; Int R0[V];
...@@ -111,36 +113,36 @@ dDeconvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules, ...@@ -111,36 +113,36 @@ dDeconvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules,
for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) { for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll #pragma unroll
for (int v = 0; v < V; v++) { for (int v = 0; v < V; v++) {
if (s + ty[v] < nHot) { if (s + ty[v] < nHot) {
R1[v] = rules[2 * (s + ty[v])]; R1[v] = rules[2 * (s + ty[v])];
R0[v] = rules[2 * (s + ty[v]) + 1]; R0[v] = rules[2 * (s + ty[v]) + 1];
} }
} }
__syncthreads(); __syncthreads();
// Read input, reset O[] // Read input, reset O[]
#pragma unroll #pragma unroll
for (int v = 0; v < V; v++) { for (int v = 0; v < V; v++) {
if (s + ty[v] < nHot) if (s + ty[v] < nHot)
I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx]; I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
O[v] = 0; O[v] = 0;
} }
__syncthreads(); __syncthreads();
#pragma unroll #pragma unroll
for (int k = 0; k < K; k++) for (int k = 0; k < K; k++)
#pragma unroll #pragma unroll
for (int v = 0; v < V; v++) for (int v = 0; v < V; v++)
O[v] += I[ty[v]][k] * W[k][tx]; O[v] += I[ty[v]][k] * W[k][tx];
#pragma unroll #pragma unroll
for (int v = 0; v < V; v++) for (int v = 0; v < V; v++)
if (s + ty[v] < nHot) if (s + ty[v] < nHot)
O[v] += outFeatures[R1[v] * output_stride + tx]; O[v] += outFeatures[R1[v] * output_stride + tx];
#pragma unroll #pragma unroll
for (int v = 0; v < V; v++) for (int v = 0; v < V; v++)
if (s + ty[v] < nHot) if (s + ty[v] < nHot)
outFeatures[R1[v] * output_stride + tx] = O[v]; outFeatures[R1[v] * output_stride + tx] = O[v];
__syncthreads(); __syncthreads();
} }
w += K * output_nPlanes; w += K * output_nPlanes;
...@@ -153,24 +155,24 @@ dDeconvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules, ...@@ -153,24 +155,24 @@ dDeconvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules,
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \ if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
Int o = (nHot / K) * K; \ Int o = (nHot / K) * K; \
if (o >= K) \ if (o >= K) \
dDeconvolution_KMxKN_forwardA< \ dDeconvolution_KMxKN_forwardA< \
T, K, V><<<dim3(std::min(o / K, (Int)512), output_nPlanes / K), \ T, K, V><<<dim3(std::min(o / K, (Int)512), output_nPlanes / K), \
dim3(K, K / V)>>>(inFeatures, outFeatures, w, rules, o, \ dim3(K, K / V)>>>(inFeatures, outFeatures, w, rules, o, \
input_nPlanes, input_stride, \ input_nPlanes, input_stride, \
output_nPlanes, output_stride); \ output_nPlanes, output_stride); \
if (nHot > o) \ if (nHot > o) \
dDeconvolution_KMxKN_forwardB< \ dDeconvolution_KMxKN_forwardB< \
T, K, V><<<dim3(1, output_nPlanes / K), dim3(K, K / V)>>>( \ T, K, V><<<dim3(1, output_nPlanes / K), dim3(K, K / V)>>>( \
inFeatures, outFeatures, w, rules + 2 * o, nHot - o, \ inFeatures, outFeatures, w, rules + 2 * o, nHot - o, \
input_nPlanes, input_stride, output_nPlanes, output_stride); \ input_nPlanes, input_stride, output_nPlanes, output_stride); \
return; \ return; \
} \ } \
} }
template <typename T> template <typename T>
void dDeconvolution_forward(T *inFeatures, T *outFeatures, T *w, Int *rules, void dDeconvolution_forward(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride, Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) { Int output_nPlanes, Int output_stride) {
FOO(T, 64, 16) FOO(T, 64, 16)
FOO(T, 32, 8) FOO(T, 32, 8)
FOO(T, 16, 4) FOO(T, 16, 4)
...@@ -179,9 +181,9 @@ void dDeconvolution_forward(T *inFeatures, T *outFeatures, T *w, Int *rules, ...@@ -179,9 +181,9 @@ void dDeconvolution_forward(T *inFeatures, T *outFeatures, T *w, Int *rules,
} }
template <> template <>
void dDeconvolution_forward<double>(double *inFeatures, double *outFeatures, void dDeconvolution_forward<double>(double *inFeatures, double *outFeatures,
double *w, Int *rules, Int nHot, double *w, Int *rules, Int nHot,
Int input_nPlanes, Int input_stride, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) { Int output_nPlanes, Int output_stride) {
FOO(double, 32, 8) FOO(double, 32, 8)
FOO(double, 16, 4) FOO(double, 16, 4)
FOO(double, 8, 2) FOO(double, 8, 2)
...@@ -205,8 +207,8 @@ __global__ void dDeconvolution_KMxKN_backward_dW_A( ...@@ -205,8 +207,8 @@ __global__ void dDeconvolution_KMxKN_backward_dW_A(
w += m * K * output_nPlanes; w += m * K * output_nPlanes;
dw += m * K * output_nPlanes; dw += m * K * output_nPlanes;
T dI[V]; TACC dI[V];
T dW[V]; TACC dW[V];
__shared__ T I[K][K]; __shared__ T I[K][K];
__shared__ T dO[K][K]; __shared__ T dO[K][K];
__shared__ T W[K][K]; __shared__ T W[K][K];
...@@ -229,31 +231,31 @@ __global__ void dDeconvolution_KMxKN_backward_dW_A( ...@@ -229,31 +231,31 @@ __global__ void dDeconvolution_KMxKN_backward_dW_A(
for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) { for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll #pragma unroll
for (int v = 0; v < V; v++) { for (int v = 0; v < V; v++) {
R1[v] = rules[2 * (s + ty[v])]; R1[v] = rules[2 * (s + ty[v])];
R0[v] = rules[2 * (s + ty[v]) + 1]; R0[v] = rules[2 * (s + ty[v]) + 1];
dI[v] = 0; dI[v] = 0;
} }
__syncthreads(); __syncthreads();
// Read input and dOutput // Read input and dOutput
#pragma unroll #pragma unroll
for (int v = 0; v < V; v++) { for (int v = 0; v < V; v++) {
I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx]; I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx]; dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx];
} }
__syncthreads(); __syncthreads();
#pragma unroll #pragma unroll
for (int k = 0; k < K; k++) for (int k = 0; k < K; k++)
#pragma unroll #pragma unroll
for (int v = 0; v < V; v++) { for (int v = 0; v < V; v++) {
dI[v] += dO[ty[v]][k] * W[tx][k]; dI[v] += dO[ty[v]][k] * W[tx][k];
dW[v] += I[k][ty[v]] * dO[k][tx]; dW[v] += I[k][ty[v]] * dO[k][tx];
} }
#pragma unroll #pragma unroll
for (int v = 0; v < V; v++) for (int v = 0; v < V; v++)
dI[v] += dInFeatures[R0[v] * input_stride + tx]; dI[v] += dInFeatures[R0[v] * input_stride + tx];
#pragma unroll #pragma unroll
for (int v = 0; v < V; v++) for (int v = 0; v < V; v++)
dInFeatures[R0[v] * input_stride + tx] = dI[v]; dInFeatures[R0[v] * input_stride + tx] = dI[v];
__syncthreads(); __syncthreads();
} }
#pragma unroll #pragma unroll
...@@ -281,8 +283,8 @@ __global__ void dDeconvolution_KMxKN_backward_dW_B( ...@@ -281,8 +283,8 @@ __global__ void dDeconvolution_KMxKN_backward_dW_B(
w += m * K * output_nPlanes; w += m * K * output_nPlanes;
dw += m * K * output_nPlanes; dw += m * K * output_nPlanes;
T dI[V]; TACC dI[V];
T dW[V]; TACC dW[V];
__shared__ T I[K][K]; __shared__ T I[K][K];
__shared__ T dO[K][K]; __shared__ T dO[K][K];
__shared__ T W[K][K]; __shared__ T W[K][K];
...@@ -305,39 +307,39 @@ __global__ void dDeconvolution_KMxKN_backward_dW_B( ...@@ -305,39 +307,39 @@ __global__ void dDeconvolution_KMxKN_backward_dW_B(
for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) { for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll #pragma unroll
for (int v = 0; v < V; v++) { for (int v = 0; v < V; v++) {
if (s + ty[v] < nHot) { if (s + ty[v] < nHot) {
R1[v] = rules[2 * (s + ty[v])]; R1[v] = rules[2 * (s + ty[v])];
R0[v] = rules[2 * (s + ty[v]) + 1]; R0[v] = rules[2 * (s + ty[v]) + 1];
} }
dI[v] = 0; dI[v] = 0;
} }
__syncthreads(); __syncthreads();
// Read input and dOutput // Read input and dOutput
#pragma unroll #pragma unroll
for (int v = 0; v < V; v++) for (int v = 0; v < V; v++)
if (s + ty[v] < nHot) { if (s + ty[v] < nHot) {
I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx]; I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx]; dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx];
} else { } else {
I[ty[v]][tx] = 0; I[ty[v]][tx] = 0;
dO[ty[v]][tx] = 0; dO[ty[v]][tx] = 0;
} }
__syncthreads(); __syncthreads();
#pragma unroll #pragma unroll
for (int k = 0; k < K; k++) for (int k = 0; k < K; k++)
#pragma unroll #pragma unroll
for (int v = 0; v < V; v++) { for (int v = 0; v < V; v++) {
dI[v] += dO[ty[v]][k] * W[tx][k]; dI[v] += dO[ty[v]][k] * W[tx][k];
dW[v] += I[k][ty[v]] * dO[k][tx]; dW[v] += I[k][ty[v]] * dO[k][tx];
} }
#pragma unroll #pragma unroll
for (int v = 0; v < V; v++) for (int v = 0; v < V; v++)
if (s + ty[v] < nHot) if (s + ty[v] < nHot)
dI[v] += dInFeatures[R0[v] * input_stride + tx]; dI[v] += dInFeatures[R0[v] * input_stride + tx];
#pragma unroll #pragma unroll
for (int v = 0; v < V; v++) for (int v = 0; v < V; v++)
if (s + ty[v] < nHot) if (s + ty[v] < nHot)
dInFeatures[R0[v] * input_stride + tx] = dI[v]; dInFeatures[R0[v] * input_stride + tx] = dI[v];
__syncthreads(); __syncthreads();
} }
#pragma unroll #pragma unroll
...@@ -354,26 +356,26 @@ __global__ void dDeconvolution_KMxKN_backward_dW_B( ...@@ -354,26 +356,26 @@ __global__ void dDeconvolution_KMxKN_backward_dW_B(
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \ if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
Int o = (nHot / K) * K; \ Int o = (nHot / K) * K; \
if (o >= K) \ if (o >= K) \
dDeconvolution_KMxKN_backward_dW_A< \ dDeconvolution_KMxKN_backward_dW_A< \
T, K, V><<<dim3(std::min(o / K, (Int)512), input_nPlanes / K), \ T, K, V><<<dim3(std::min(o / K, (Int)512), input_nPlanes / K), \
dim3(K, K / V)>>>( \ dim3(K, K / V)>>>( \
inFeatures, dInFeatures, dOutFeatures, w, dw, rules, o, \ inFeatures, dInFeatures, dOutFeatures, w, dw, rules, o, \
input_nPlanes, input_stride, output_nPlanes, output_stride); \ input_nPlanes, input_stride, output_nPlanes, output_stride); \
if (nHot > o) \ if (nHot > o) \
dDeconvolution_KMxKN_backward_dW_B< \ dDeconvolution_KMxKN_backward_dW_B< \
T, K, V><<<dim3(1, input_nPlanes / K), dim3(K, K / V)>>>( \ T, K, V><<<dim3(1, input_nPlanes / K), dim3(K, K / V)>>>( \
inFeatures, dInFeatures, dOutFeatures, w, dw, rules + 2 * o, \ inFeatures, dInFeatures, dOutFeatures, w, dw, rules + 2 * o, \
nHot - o, input_nPlanes, input_stride, output_nPlanes, \ nHot - o, input_nPlanes, input_stride, output_nPlanes, \
output_stride); \ output_stride); \
return; \ return; \
} \ } \
} }
template <typename T> template <typename T>
void dDeconvolution_backward_dW(T *inFeatures, T *dInFeatures, T *dOutFeatures, void dDeconvolution_backward_dW(T *inFeatures, T *dInFeatures, T *dOutFeatures,
T *w, T *dw, Int *rules, Int nHot, T *w, T *dw, Int *rules, Int nHot,
Int input_nPlanes, Int input_stride, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) { Int output_nPlanes, Int output_stride) {
FOO(T, 32, 8) FOO(T, 32, 8)
FOO(T, 16, 4) FOO(T, 16, 4)
FOO(T, 8, 2) FOO(T, 8, 2)
...@@ -384,8 +386,8 @@ void dDeconvolution_backward_dW(T *inFeatures, T *dInFeatures, T *dOutFeatures, ...@@ -384,8 +386,8 @@ void dDeconvolution_backward_dW(T *inFeatures, T *dInFeatures, T *dOutFeatures,
template <typename T, Int K, Int V> template <typename T, Int K, Int V>
__global__ void __global__ void
dDeconvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules, dDeconvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride, Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) { Int output_nPlanes, Int output_stride) {
// Input x Weight -> Output // Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks // blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// K is a multiple of V, // K is a multiple of V,
...@@ -400,7 +402,7 @@ dDeconvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules, ...@@ -400,7 +402,7 @@ dDeconvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
w += n * K; w += n * K;
Int KO = min(K, output_nPlanes - K * n); Int KO = min(K, output_nPlanes - K * n);
T O[V]; TACC O[V];
__shared__ T W[K][K]; __shared__ T W[K][K];
__shared__ T I[K][K]; __shared__ T I[K][K];
__shared__ Int R[K * 2]; __shared__ Int R[K * 2];
...@@ -417,40 +419,40 @@ dDeconvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules, ...@@ -417,40 +419,40 @@ dDeconvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
#pragma unroll #pragma unroll
for (int v = 0; v < V; v++) for (int v = 0; v < V; v++)
if (ty[v] < KI and tx < KO) if (ty[v] < KI and tx < KO)
W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx]; W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) { for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
// Read rules for K input/output pairs // Read rules for K input/output pairs
#pragma unroll #pragma unroll
for (int v = 0; v < V; v++) { for (int v = 0; v < V; v++) {
if (ty[v] < 2) { if (ty[v] < 2) {
int q = ty[v] * K + tx; int q = ty[v] * K + tx;
if (s + q / 2 < nHot) if (s + q / 2 < nHot)
R[q] = rules[2 * s + q]; R[q] = rules[2 * s + q];
} }
} }
__syncthreads(); __syncthreads();
// Read input, reset O[] // Read input, reset O[]
#pragma unroll #pragma unroll
for (int v = 0; v < V; v++) { for (int v = 0; v < V; v++) {
if (tx < KI and s + ty[v] < nHot) if (tx < KI and s + ty[v] < nHot)
I[ty[v]][tx] = inFeatures[R[2 * ty[v] + 1] * input_stride + tx]; I[ty[v]][tx] = inFeatures[R[2 * ty[v] + 1] * input_stride + tx];
O[v] = 0; O[v] = 0;
} }
__syncthreads(); __syncthreads();
#pragma unroll #pragma unroll
for (int k = 0; k < KI; k++) for (int k = 0; k < KI; k++)
#pragma unroll #pragma unroll
for (int v = 0; v < V; v++) for (int v = 0; v < V; v++)
O[v] += I[ty[v]][k] * W[k][tx]; O[v] += I[ty[v]][k] * W[k][tx];
__syncthreads(); __syncthreads();
#pragma unroll #pragma unroll
for (int v = 0; v < V; v++) for (int v = 0; v < V; v++)
if (tx < KO and s + ty[v] < nHot) if (tx < KO and s + ty[v] < nHot)
outFeatures[R[2 * ty[v]] * output_stride + tx] += O[v]; outFeatures[R[2 * ty[v]] * output_stride + tx] += O[v];
__syncthreads(); __syncthreads();
} }
w += K * output_nPlanes; w += K * output_nPlanes;
...@@ -464,9 +466,9 @@ dDeconvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules, ...@@ -464,9 +466,9 @@ dDeconvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
template <typename T, Int K, Int V> template <typename T, Int K, Int V>
__global__ void __global__ void
dDeconvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures, dDeconvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures,
T *dOutFeatures, T *w, T *dw, Int *rules, T *dOutFeatures, T *w, T *dw, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride, Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) { Int output_nPlanes, Int output_stride) {
// M = gridDim.y == input_nPlanes / K // M = gridDim.y == input_nPlanes / K
Int N = (output_nPlanes + K - 1) / K; Int N = (output_nPlanes + K - 1) / K;
Int m = blockIdx.y; Int m = blockIdx.y;
...@@ -476,8 +478,8 @@ dDeconvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures, ...@@ -476,8 +478,8 @@ dDeconvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures,
dw += m * K * output_nPlanes; dw += m * K * output_nPlanes;
Int KI = min(K, input_nPlanes - K * m); Int KI = min(K, input_nPlanes - K * m);
T dI[V]; TACC dI[V];
T dW[V]; TACC dW[V];
__shared__ T I[K][K]; __shared__ T I[K][K];
__shared__ T dO[K][K]; __shared__ T dO[K][K];
__shared__ T W[K][K]; __shared__ T W[K][K];
...@@ -495,7 +497,7 @@ dDeconvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures, ...@@ -495,7 +497,7 @@ dDeconvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures,
#pragma unroll #pragma unroll
for (int v = 0; v < V; v++) { for (int v = 0; v < V; v++) {
if (ty[v] < KI and tx < KO) if (ty[v] < KI and tx < KO)
W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx]; W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
dW[v] = 0; dW[v] = 0;
} }
...@@ -503,48 +505,48 @@ dDeconvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures, ...@@ -503,48 +505,48 @@ dDeconvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures,
// Read rules for K input/output pairs, reset dI[] // Read rules for K input/output pairs, reset dI[]
#pragma unroll #pragma unroll
for (int v = 0; v < V; v++) { for (int v = 0; v < V; v++) {
if (ty[v] < 2) { if (ty[v] < 2) {
int q = ty[v] * K + tx; int q = ty[v] * K + tx;
if (s + q / 2 < nHot) if (s + q / 2 < nHot)
R[q] = rules[2 * s + q]; R[q] = rules[2 * s + q];
} }
dI[v] = 0; dI[v] = 0;
} }
__syncthreads(); __syncthreads();
// Read input and dOutput // Read input and dOutput
#pragma unroll #pragma unroll
for (int v = 0; v < V; v++) { for (int v = 0; v < V; v++) {
if (tx < KI and s + ty[v] < nHot) if (tx < KI and s + ty[v] < nHot)
I[ty[v]][tx] = inFeatures[R[2 * ty[v] + 1] * input_stride + tx]; I[ty[v]][tx] = inFeatures[R[2 * ty[v] + 1] * input_stride + tx];
else else
I[ty[v]][tx] = 0; I[ty[v]][tx] = 0;
if (tx < KO and s + ty[v] < nHot) if (tx < KO and s + ty[v] < nHot)
dO[ty[v]][tx] = dOutFeatures[R[2 * ty[v]] * output_stride + tx]; dO[ty[v]][tx] = dOutFeatures[R[2 * ty[v]] * output_stride + tx];
else else
dO[ty[v]][tx] = 0; dO[ty[v]][tx] = 0;
} }
__syncthreads(); __syncthreads();
#pragma unroll #pragma unroll
for (int k = 0; k < KO; k++) for (int k = 0; k < KO; k++)
#pragma unroll #pragma unroll
for (int v = 0; v < V; v++) for (int v = 0; v < V; v++)
dI[v] += dO[ty[v]][k] * W[tx][k]; dI[v] += dO[ty[v]][k] * W[tx][k];
#pragma unroll #pragma unroll
for (int k = 0; k < K; k++) for (int k = 0; k < K; k++)
#pragma unroll #pragma unroll
for (int v = 0; v < V; v++) for (int v = 0; v < V; v++)
dW[v] += I[k][ty[v]] * dO[k][tx]; dW[v] += I[k][ty[v]] * dO[k][tx];
__syncthreads(); __syncthreads();
#pragma unroll #pragma unroll
for (int v = 0; v < V; v++) for (int v = 0; v < V; v++)
if (tx < KI and s + ty[v] < nHot) if (tx < KI and s + ty[v] < nHot)
dInFeatures[R[2 * ty[v] + 1] * input_stride + tx] += dI[v]; dInFeatures[R[2 * ty[v] + 1] * input_stride + tx] += dI[v];
__syncthreads(); __syncthreads();
} }
#pragma unroll #pragma unroll
for (int v = 0; v < V; v++) for (int v = 0; v < V; v++)
if (ty[v] < KI and tx < KO) if (ty[v] < KI and tx < KO)
atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]); atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
w += K; w += K;
dw += K; dw += K;
dOutFeatures += K; dOutFeatures += K;
...@@ -553,51 +555,53 @@ dDeconvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures, ...@@ -553,51 +555,53 @@ dDeconvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures,
template <typename T> template <typename T>
double dDeconvolution_forward2(T *inFeatures, T *outFeatures, T *w, double dDeconvolution_forward2(T *inFeatures, T *outFeatures, T *w,
RuleBook _rules, Int input_nPlanes, RuleBook _rules, Int input_nPlanes,
Int input_stride, Int output_nPlanes, Int input_stride, Int output_nPlanes,
Int output_stride) { Int output_stride) {
Int c = input_nPlanes * output_nPlanes; Int c = input_nPlanes * output_nPlanes;
double flops = 0; double flops = 0;
if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) { if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) {
const int K = 16; const int K = 16;
const int V = 4; const int V = 4;
RULEBOOKITERATOR( RULEBOOKITERATOR(
(dDeconvolution_KMxKN_forward2< (dDeconvolution_KMxKN_forward2<
T, K, T, K,
V><<<dim3(128, (output_nPlanes + K - 1) / K), dim3(K, K / V)>>>( V><<<dim3(128, (output_nPlanes + K - 1) / K), dim3(K, K / V)>>>(
inFeatures, outFeatures, w, rbB, nHotB, input_nPlanes, input_stride, inFeatures, outFeatures, w, rbB, nHotB, input_nPlanes, input_stride,
output_nPlanes, output_stride)); output_nPlanes, output_stride));
, w += c; flops += nHotB * c;) , w += c; flops += nHotB * c;)
} else { } else {
RULEBOOKITERATOR(dDeconvolution_forward(inFeatures, outFeatures, w, rbB, RULEBOOKITERATOR(dDeconvolution_forward(inFeatures, outFeatures, w, rbB,
nHotB, input_nPlanes, input_stride, nHotB, input_nPlanes, input_stride,
output_nPlanes, output_stride); output_nPlanes, output_stride);
, w += c; flops += nHotB * c;) , w += c; flops += nHotB * c;)
} }
return flops; return flops;
} }
template <typename T> template <typename T>
void dDeconvolution_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures, void dDeconvolution_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
T *w, T *dw, RuleBook _rules, T *w, T *dw, RuleBook _rules,
Int input_nPlanes, Int input_stride, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) { Int output_nPlanes, Int output_stride) {
Int c = input_nPlanes * output_nPlanes; Int c = input_nPlanes * output_nPlanes;
if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) { if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) {
const int K = 16; const int K = 16;
const int V = 4; const int V = 4;
RULEBOOKITERATOR( RULEBOOKITERATOR(
(dDeconvolution_KMxKN_backward_dW2< (dDeconvolution_KMxKN_backward_dW2<
T, K, T, K,
V><<<dim3(128, (input_nPlanes + K - 1) / K), dim3(K, K / V)>>>( V><<<dim3(128, (input_nPlanes + K - 1) / K), dim3(K, K / V)>>>(
inFeatures, dInFeatures, dOutFeatures, w, dw, rbB, nHotB, inFeatures, dInFeatures, dOutFeatures, w, dw, rbB, nHotB,
input_nPlanes, input_stride, output_nPlanes, output_stride)); input_nPlanes, input_stride, output_nPlanes, output_stride));
, w += c; dw += c;) , w += c; dw += c;)
} else { } else {
RULEBOOKITERATOR(dDeconvolution_backward_dW(inFeatures, dInFeatures, RULEBOOKITERATOR(dDeconvolution_backward_dW(inFeatures, dInFeatures,
dOutFeatures, w, dw, rbB, nHotB, dOutFeatures, w, dw, rbB, nHotB,
input_nPlanes, input_stride, input_nPlanes, input_stride,
output_nPlanes, output_stride); output_nPlanes, output_stride);
, w += c; dw += c;) , w += c; dw += c;)
} }
} }
#undef TACC
\ No newline at end of file
...@@ -15,15 +15,15 @@ ...@@ -15,15 +15,15 @@
#include "CUDA/UnPooling.cu" #include "CUDA/UnPooling.cu"
template void ActivePooling_ForwardPass<float>(float *input_features, template void ActivePooling_ForwardPass<float>(float *input_features,
float *output_features, float *output_features,
Int batchSize, Int maxActive, Int batchSize, Int maxActive,
Int nPlanes, Int *rules, Int nPlanes, Int *rules,
bool average); bool average);
template void ActivePooling_BackwardPass<float>(float *d_input_features, template void ActivePooling_BackwardPass<float>(float *d_input_features,
float *d_output_features, float *d_output_features,
Int batchSize, Int maxActive, Int batchSize, Int maxActive,
Int nPlanes, Int *rules, Int nPlanes, Int *rules,
bool average); bool average);
template void dAffineReluTrivialConvolution_forward<float>( template void dAffineReluTrivialConvolution_forward<float>(
float *inFeatures, float *outFeatures, float *affineWeight, float *inFeatures, float *outFeatures, float *affineWeight,
...@@ -43,10 +43,10 @@ template void cuda_AveragePooling_BackwardPass<float>( ...@@ -43,10 +43,10 @@ template void cuda_AveragePooling_BackwardPass<float>(
float *d_input_features, float *d_output_features, Int nPlanes, float *d_input_features, float *d_output_features, Int nPlanes,
Int input_stride, Int output_stride, RuleBook _rules, Int filterVolume); Int input_stride, Int output_stride, RuleBook _rules, Int filterVolume);
template void Convolution_fp_bias<float>(float *of, float *b, Int op, template void Convolution_fp_bias<float>(float *oF, float *b, Int nPlanes,
Int nActive); Int nActive);
template void Convolution_bp_bias<float>(float *matrix, float *target, template void Convolution_bp_bias<float>(float *d_oF, float *d_b,
Int nRows, Int nColumns, Int nCOLUMNS); Int nPlanes, Int nActive);
template double dConvolution_forward2<float>( template double dConvolution_forward2<float>(
float *inFeatures, float *outFeatures, float *w, RuleBook _rules, float *inFeatures, float *outFeatures, float *w, RuleBook _rules,
Int input_nPlanes, Int input_stride, Int output_nPlanes, Int output_stride); Int input_nPlanes, Int input_stride, Int output_nPlanes, Int output_stride);
...@@ -66,65 +66,65 @@ template void dDeconvolution_backward_dW2<float>( ...@@ -66,65 +66,65 @@ template void dDeconvolution_backward_dW2<float>(
Int output_nPlanes, Int output_stride); Int output_nPlanes, Int output_stride);
template void InputLayer_fp<float>(float *input_features, template void InputLayer_fp<float>(float *input_features,
float *output_features, Int nRows, float *output_features, Int nRows,
Int maxActive, Int nPlanes, Int *rules_cpu, Int maxActive, Int nPlanes, Int *rules_cpu,
Int *rules_gpu, bool average); Int *rules_gpu, bool average);
template void InputLayer_bp<float>(float *d_input_features, template void InputLayer_bp<float>(float *d_input_features,
float *d_output_features, Int nRows, float *d_output_features, Int nRows,
Int maxActive, Int nPlanes, Int *rules_cpu, Int maxActive, Int nPlanes, Int *rules_cpu,
Int *rules_gpu, bool average); Int *rules_gpu, bool average);
template void LeakyReLU_fp<float>(float *input_features, float *output_features, template void LeakyReLU_fp<float>(float *input_features, float *output_features,
Int n, float alpha); Int n, float alpha);
template void LeakyReLU_bp<float>(float *input_features, template void LeakyReLU_bp<float>(float *input_features,
float *d_input_features, float *d_input_features,
float *output_features, Int n, float alpha); float *output_features, Int n, float alpha);
template void cuda_MaxPooling_ForwardPass<float>(float *input_features, template void cuda_MaxPooling_ForwardPass<float>(float *input_features,
float *output_features, float *output_features,
Int nPlanes, Int input_stride, Int nPlanes, Int input_stride,
Int output_stride, Int output_stride,
RuleBook _rules); RuleBook _rules);
template void cuda_MaxPooling_BackwardPass<float>( template void cuda_MaxPooling_BackwardPass<float>(
float *input_features, float *d_input_features, float *output_features, float *input_features, float *d_input_features, float *output_features,
float *d_output_features, Int nPlanes, Int input_stride, Int output_stride, float *d_output_features, Int nPlanes, Int input_stride, Int output_stride,
RuleBook _rules); RuleBook _rules);
template void cuda_SparseToDense_ForwardPass<float>(float *input_features, template void cuda_SparseToDense_ForwardPass<float>(float *input_features,
float *output_features, float *output_features,
Int nPlanes, Int nPlanes,
Int spatialVolume, Int spatialVolume,
RuleBook _rules); RuleBook _rules);
template void cuda_SparseToDense_BackwardPass<float>(float *d_input_features, template void cuda_SparseToDense_BackwardPass<float>(float *d_input_features,
float *d_output_features, float *d_output_features,
Int nPlanes, Int nPlanes,
Int spatialVolume, Int spatialVolume,
RuleBook _rules); RuleBook _rules);
template void cuda_UnPooling_ForwardPass<float>(float *input_features, template void cuda_UnPooling_ForwardPass<float>(float *input_features,
float *output_features, float *output_features,
Int nPlanes, Int input_stride, Int nPlanes, Int input_stride,
Int output_stride, Int output_stride,
RuleBook _rules); RuleBook _rules);
template void cuda_UnPooling_BackwardPass<float>(float *d_input_features, template void cuda_UnPooling_BackwardPass<float>(float *d_input_features,
float *d_output_features, float *d_output_features,
Int nPlanes, Int input_stride, Int nPlanes, Int input_stride,
Int output_stride, Int output_stride,
RuleBook _rules); RuleBook _rules);
template void bn_f<float>(float *iF, float *oF, Int nPlanes, Int input_stride, template void bn_f<float>(float *iF, float *oF, Int nPlanes, Int input_stride,
Int output_stride, Int nActive, float *saveMean, Int output_stride, Int nActive, float *saveMean,
float *saveInvStd, float *runningMean, float *saveInvStd, float *runningMean,
float *runningVar, float *weight, float *bias, float *runningVar, float *weight, float *bias,
float eps, float momentum, bool train, float eps, float momentum, bool train,
float leakiness); float leakiness);
template void bn_b<float>(float *input_features, float *d_input_features, template void bn_b<float>(float *input_features, float *d_input_features,
float *output_features, float *d_output_features, float *output_features, float *d_output_features,
Int nPlanes, Int input_stride, Int output_stride, Int nPlanes, Int input_stride, Int output_stride,
Int nActive, float *saveMean, float *saveInvStd, Int nActive, float *saveMean, float *saveInvStd,
float *runningMean, float *runningVar, float *weight, float *runningMean, float *runningVar, float *weight,
float *bias, float *d_weight, float *d_bias, float *bias, float *d_weight, float *d_bias,
float leakiness); float leakiness);
template void bmd_f<float>(float *input_features, float *output_features, template void bmd_f<float>(float *input_features, float *output_features,
float *noise, Int nActive, Int nPlanes, float alpha); float *noise, Int nActive, Int nPlanes, float alpha);
template void bmd_b<float>(float *input_features, float *d_input_features, template void bmd_b<float>(float *input_features, float *d_input_features,
float *d_output_features, float *noise, Int nActive, float *d_output_features, float *noise, Int nActive,
Int nPlanes, float alpha); Int nPlanes, float alpha);
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
forward_pass_multiplyAdd_count = 0 forward_pass_multiplyAdd_count = 0
forward_pass_hidden_states = 0 forward_pass_hidden_states = 0
from .activations import Tanh, Sigmoid, ReLU, ELU, SELU, BatchNormELU from .activations import Tanh, Sigmoid, ReLU, LeakyReLU, ELU, SELU, BatchNormELU
from .averagePooling import AveragePooling from .averagePooling import AveragePooling
from .batchNormalization import BatchNormalization, BatchNormReLU, BatchNormLeakyReLU from .batchNormalization import BatchNormalization, BatchNormReLU, BatchNormLeakyReLU
from .classificationTrainValidate import ClassificationTrainValidate from .classificationTrainValidate import ClassificationTrainValidate
......
...@@ -22,6 +22,18 @@ class Sigmoid(Module): ...@@ -22,6 +22,18 @@ class Sigmoid(Module):
return output return output
class LeakyReLU(Module):
def __init__(self,leak=1/3):
Module.__init__(self)
self.leak=leak
def forward(self, input):
output = SparseConvNetTensor()
output.features = F.leaky_relu(input.features,self.leak)
output.metadata = input.metadata
output.spatial_size = input.spatial_size
return output
class Tanh(Module): class Tanh(Module):
def forward(self, input): def forward(self, input):
output = SparseConvNetTensor() output = SparseConvNetTensor()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment