"examples/llm/vscode:/vscode.git/clone" did not exist on "942a0fb9100ac006c67cbf42807917ea9863cbcf"
Commit 66986767 authored by Benjamin Thomas Graham's avatar Benjamin Thomas Graham
Browse files

fixes

parent edf89af3
......@@ -17,14 +17,14 @@ void InputLayer_ForwardPass(T *input_features, T *output_features, Int nRows,
for (row = 0; row < nRows; row++) {
auto nActive = rules[0];
T multiplier = (average and nActive > 0) ? (T)1 / nActive : (T)1;
auto out_f = output_features + row * nPlanes;
auto r = rules + row * (1 + maxActive);
for (Int i = 1; i <= nActive; ++i) {
auto in_f = input_features + nPlanes * rules[i];
auto in_f = input_features + r[i] * nPlanes;
for (Int plane = 0; plane < nPlanes; plane++) {
output_features[plane] += multiplier * in_f[plane];
out_f[plane] += multiplier * in_f[plane];
}
}
output_features += nPlanes;
rules += 1 + maxActive;
}
}
template <typename T>
......@@ -36,13 +36,13 @@ void InputLayer_BackwardPass(T *d_input_features, T *d_output_features,
for (row = 0; row < nRows; row++) {
auto nActive = rules[0];
T multiplier = (average and nActive > 0) ? (T)1 / nActive : (T)1;
auto d_out_f = d_output_features + row * nPlanes;
auto r = rules + row * (1 + maxActive);
for (Int i = 1; i <= nActive; ++i) {
auto d_in_f = d_input_features + nPlanes * rules[i];
auto d_in_f = d_input_features + r[i] * nPlanes;
for (Int plane = 0; plane < nPlanes; plane++)
d_in_f[plane] += multiplier * d_output_features[plane];
d_in_f[plane] += multiplier * d_out_f[plane];
}
d_output_features += nPlanes;
rules += 1 + maxActive;
}
}
......
......@@ -5,10 +5,9 @@
// LICENSE file in the root directory of this source tree.
template <typename T>
void Convolution_fp_bias(T *of, T *b, Int nPlanes, Int nActiveOut);
void Convolution_fp_bias(T *oF, T *b, Int nPlanes, Int nActive);
template <typename T>
void Convolution_bp_bias(T *matrix, T *target, Int nRows, Int nColumns,
Int nCOLUMNS);
void Convolution_bp_bias(T *d_oF, T *d_b, Int nPlanes, Int nActive);
template <typename T>
double dConvolution_forward2(T *inFeatures, T *outFeatures, T *w,
RuleBook _rules, Int input_nPlanes,
......@@ -84,7 +83,7 @@ void cuda_Convolution_backward(
if (d_bias.numel()) {
auto db = d_bias.data<T>();
Convolution_bp_bias(doF, db, op, op, nActiveOut);
Convolution_bp_bias(doF, db, op, nActiveOut);
}
}
}
......@@ -147,7 +146,7 @@ void cuda_SubmanifoldConvolution_backward(
if (d_bias.numel()) {
auto db = d_bias.data<T>();
Convolution_bp_bias(doF, db, op, op, nActive);
Convolution_bp_bias(doF, db, op, nActive);
}
}
}
......@@ -216,7 +215,7 @@ void cuda_FullConvolution_backward(
if (d_bias.numel()) {
auto db = d_bias.data<T>();
Convolution_bp_bias(doF, db, op, op, nActiveOut);
Convolution_bp_bias(doF, db, op, nActiveOut);
}
}
}
......@@ -283,7 +282,7 @@ void cuda_RandomizedStrideConvolution_backward(
if (d_bias.numel()) {
auto db = d_bias.data<T>();
Convolution_bp_bias(doF, db, op, op, nActiveOut);
Convolution_bp_bias(doF, db, op, nActiveOut);
}
}
}
......@@ -5,10 +5,11 @@
// LICENSE file in the root directory of this source tree.
#include "RuleBookIterator.h"
#define TACC double
template <typename T>
__global__ void Convolution_fp_bias_(T *output_features, T *bias, Int nPlanes,
Int nActive) {
Int nActive) {
Int n = blockIdx.x * 32 + threadIdx.x;
T b = bias[n];
output_features += n;
......@@ -21,41 +22,38 @@ template <typename T>
void Convolution_fp_bias(T *oF, T *b, Int nPlanes, Int nActive) {
if (nPlanes / 32 > 0)
Convolution_fp_bias_<<<dim3(nPlanes / 32, 4096), 32>>>(oF, b, nPlanes,
nActive);
nActive);
if (nPlanes % 32 > 0) {
Int o = nPlanes / 32 * 32;
Convolution_fp_bias_<<<dim3(1, 4096), nPlanes - o>>>(oF + o, b + o, nPlanes,
nActive);
nActive);
}
}
template <typename T>
__global__ void dColumnSum(T *matrix, T *target, Int nRows, Int nColumns,
Int nCOLUMNS) {
Int i = blockIdx.x * 32 + threadIdx.x;
T t = 0;
for (Int j = blockIdx.y; j < nRows; j += 32)
t += matrix[j * nCOLUMNS + i];
atomicAdd(&target[i], t);
__global__ void Convolution_bp_bias_(T *d_oF, T *d_b, Int nPlanes, Int nActive) {
Int n = blockIdx.x * 32 + threadIdx.x;
d_oF+=n;
TACC t = 0;
for (Int row = blockIdx.y; row < nActive; row += gridDim.y)
t += d_oF[row * nPlanes ];
atomicAdd(&d_b[n], t);
}
template <typename T>
void Convolution_bp_bias(T *matrix, T *target, Int nRows, Int nColumns,
Int nCOLUMNS) {
if (nColumns / 32 > 0)
dColumnSum<<<dim3(nColumns / 32, 32), 32>>>(matrix, target, nRows, nColumns,
nCOLUMNS);
if (nColumns % 32 > 0) {
Int o = nColumns / 32 * 32;
dColumnSum<<<dim3(1, 32), nColumns - o>>>(matrix + o, target + o, nRows,
nColumns, nCOLUMNS);
void Convolution_bp_bias(T *d_oF, T *d_b, Int nPlanes, Int nActive) {
if (nPlanes / 32 > 0)
Convolution_bp_bias_<<<dim3(nPlanes / 32, 32), 32>>>(d_oF, d_b, nPlanes, nActive);
if (nPlanes % 32 > 0) {
Int o = nPlanes / 32 * 32;
Convolution_bp_bias_<<<dim3(1, 32), nPlanes - o>>>(d_oF + o, d_b + o, nPlanes, nActive);
}
}
template <typename T, Int K, Int V>
__global__ void
dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
// nHot must be a multiple of K!!
// Input x Weight -> Output
......@@ -70,7 +68,7 @@ dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules,
outFeatures += n * K;
w += n * K;
T O[V];
TACC O[V];
__shared__ T W[K][K];
__shared__ T I[K][K];
Int R0[V];
......@@ -90,31 +88,31 @@ dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules,
for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
for (int v = 0; v < V; v++) {
R0[v] = rules[2 * (s + ty[v])];
R1[v] = rules[2 * (s + ty[v]) + 1];
R0[v] = rules[2 * (s + ty[v])];
R1[v] = rules[2 * (s + ty[v]) + 1];
}
__syncthreads();
// Read input, reset O[]
#pragma unroll
for (int v = 0; v < V; v++) {
I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
O[v] = 0;
I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
O[v] = 0;
}
__syncthreads();
#pragma unroll
for (int k = 0; k < K; k++)
#pragma unroll
for (int v = 0; v < V; v++)
O[v] += I[ty[v]][k] * W[k][tx];
for (int v = 0; v < V; v++)
O[v] += I[ty[v]][k] * W[k][tx];
#pragma unroll
for (int v = 0; v < V; v++)
O[v] += outFeatures[R1[v] * output_stride + tx];
O[v] += outFeatures[R1[v] * output_stride + tx];
#pragma unroll
for (int v = 0; v < V; v++)
outFeatures[R1[v] * output_stride + tx] = O[v];
outFeatures[R1[v] * output_stride + tx] = O[v];
__syncthreads();
}
w += K * output_nPlanes;
......@@ -124,8 +122,8 @@ dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules,
template <typename T, Int K, Int V>
__global__ void
dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// K is a multiple of V,
......@@ -138,7 +136,7 @@ dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules,
outFeatures += n * K;
w += n * K;
T O[V];
TACC O[V];
__shared__ T W[K][K];
__shared__ T I[K][K];
Int R0[V];
......@@ -158,36 +156,36 @@ dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules,
for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
for (int v = 0; v < V; v++) {
if (s + ty[v] < nHot) {
R0[v] = rules[2 * (s + ty[v])];
R1[v] = rules[2 * (s + ty[v]) + 1];
}
if (s + ty[v] < nHot) {
R0[v] = rules[2 * (s + ty[v])];
R1[v] = rules[2 * (s + ty[v]) + 1];
}
}
__syncthreads();
// Read input, reset O[]
#pragma unroll
for (int v = 0; v < V; v++) {
if (s + ty[v] < nHot)
I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
O[v] = 0;
if (s + ty[v] < nHot)
I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
O[v] = 0;
}
__syncthreads();
#pragma unroll
for (int k = 0; k < K; k++)
#pragma unroll
for (int v = 0; v < V; v++)
O[v] += I[ty[v]][k] * W[k][tx];
for (int v = 0; v < V; v++)
O[v] += I[ty[v]][k] * W[k][tx];
#pragma unroll
for (int v = 0; v < V; v++)
if (s + ty[v] < nHot)
O[v] += outFeatures[R1[v] * output_stride + tx];
if (s + ty[v] < nHot)
O[v] += outFeatures[R1[v] * output_stride + tx];
#pragma unroll
for (int v = 0; v < V; v++)
if (s + ty[v] < nHot)
outFeatures[R1[v] * output_stride + tx] = O[v];
if (s + ty[v] < nHot)
outFeatures[R1[v] * output_stride + tx] = O[v];
__syncthreads();
}
w += K * output_nPlanes;
......@@ -200,24 +198,24 @@ dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules,
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
Int o = (nHot / K) * K; \
if (o >= K) \
dConvolution_KMxKN_forwardA< \
T, K, V><<<dim3(std::min(o / K, (Int)512), output_nPlanes / K), \
dim3(K, K / V)>>>(inFeatures, outFeatures, w, rules, o, \
input_nPlanes, input_stride, \
output_nPlanes, output_stride); \
dConvolution_KMxKN_forwardA< \
T, K, V><<<dim3(std::min(o / K, (Int)512), output_nPlanes / K), \
dim3(K, K / V)>>>(inFeatures, outFeatures, w, rules, o, \
input_nPlanes, input_stride, \
output_nPlanes, output_stride); \
if (nHot > o) \
dConvolution_KMxKN_forwardB< \
T, K, V><<<dim3(1, output_nPlanes / K), dim3(K, K / V)>>>( \
inFeatures, outFeatures, w, rules + 2 * o, nHot - o, \
input_nPlanes, input_stride, output_nPlanes, output_stride); \
dConvolution_KMxKN_forwardB< \
T, K, V><<<dim3(1, output_nPlanes / K), dim3(K, K / V)>>>( \
inFeatures, outFeatures, w, rules + 2 * o, nHot - o, \
input_nPlanes, input_stride, output_nPlanes, output_stride); \
return; \
} \
}
template <typename T>
void dConvolution_forward(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
FOO(T, 64, 16)
FOO(T, 32, 8)
FOO(T, 16, 4)
......@@ -226,9 +224,9 @@ void dConvolution_forward(T *inFeatures, T *outFeatures, T *w, Int *rules,
}
template <>
void dConvolution_forward<double>(double *inFeatures, double *outFeatures,
double *w, Int *rules, Int nHot,
Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
double *w, Int *rules, Int nHot,
Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
FOO(double, 32, 8)
FOO(double, 16, 4)
FOO(double, 8, 2)
......@@ -242,9 +240,9 @@ void dConvolution_forward<double>(double *inFeatures, double *outFeatures,
template <typename T, Int K, Int V>
__global__ void
dConvolution_KMxKN_backward_dW_A(T *inFeatures, T *dInFeatures, T *dOutFeatures,
T *w, T *dw, Int *rules, Int nHot,
Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
T *w, T *dw, Int *rules, Int nHot,
Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
// M = gridDim.y == input_nPlanes / K
Int N = output_nPlanes / K;
Int m = blockIdx.y;
......@@ -253,8 +251,8 @@ dConvolution_KMxKN_backward_dW_A(T *inFeatures, T *dInFeatures, T *dOutFeatures,
w += m * K * output_nPlanes;
dw += m * K * output_nPlanes;
T dI[V];
T dW[V];
TACC dI[V];
TACC dW[V];
__shared__ T I[K][K];
__shared__ T dO[K][K];
__shared__ T W[K][K];
......@@ -277,31 +275,31 @@ dConvolution_KMxKN_backward_dW_A(T *inFeatures, T *dInFeatures, T *dOutFeatures,
for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
for (int v = 0; v < V; v++) {
R0[v] = rules[2 * (s + ty[v])];
R1[v] = rules[2 * (s + ty[v]) + 1];
dI[v] = 0;
R0[v] = rules[2 * (s + ty[v])];
R1[v] = rules[2 * (s + ty[v]) + 1];
dI[v] = 0;
}
__syncthreads();
// Read input and dOutput
#pragma unroll
for (int v = 0; v < V; v++) {
I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx];
I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx];
}
__syncthreads();
#pragma unroll
for (int k = 0; k < K; k++)
#pragma unroll
for (int v = 0; v < V; v++) {
dI[v] += dO[ty[v]][k] * W[tx][k];
dW[v] += I[k][ty[v]] * dO[k][tx];
}
for (int v = 0; v < V; v++) {
dI[v] += dO[ty[v]][k] * W[tx][k];
dW[v] += I[k][ty[v]] * dO[k][tx];
}
#pragma unroll
for (int v = 0; v < V; v++)
dI[v] += dInFeatures[R0[v] * input_stride + tx];
dI[v] += dInFeatures[R0[v] * input_stride + tx];
#pragma unroll
for (int v = 0; v < V; v++)
dInFeatures[R0[v] * input_stride + tx] = dI[v];
dInFeatures[R0[v] * input_stride + tx] = dI[v];
__syncthreads();
}
#pragma unroll
......@@ -319,9 +317,9 @@ dConvolution_KMxKN_backward_dW_A(T *inFeatures, T *dInFeatures, T *dOutFeatures,
template <typename T, Int K, Int V>
__global__ void
dConvolution_KMxKN_backward_dW_B(T *inFeatures, T *dInFeatures, T *dOutFeatures,
T *w, T *dw, Int *rules, Int nHot,
Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
T *w, T *dw, Int *rules, Int nHot,
Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
// M = gridDim.y == input_nPlanes / K
Int N = output_nPlanes / K;
Int m = blockIdx.y;
......@@ -330,8 +328,8 @@ dConvolution_KMxKN_backward_dW_B(T *inFeatures, T *dInFeatures, T *dOutFeatures,
w += m * K * output_nPlanes;
dw += m * K * output_nPlanes;
T dI[V];
T dW[V];
TACC dI[V];
TACC dW[V];
__shared__ T I[K][K];
__shared__ T dO[K][K];
__shared__ T W[K][K];
......@@ -354,39 +352,39 @@ dConvolution_KMxKN_backward_dW_B(T *inFeatures, T *dInFeatures, T *dOutFeatures,
for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
for (int v = 0; v < V; v++) {
if (s + ty[v] < nHot) {
R0[v] = rules[2 * (s + ty[v])];
R1[v] = rules[2 * (s + ty[v]) + 1];
}
dI[v] = 0;
if (s + ty[v] < nHot) {
R0[v] = rules[2 * (s + ty[v])];
R1[v] = rules[2 * (s + ty[v]) + 1];
}
dI[v] = 0;
}
__syncthreads();
// Read input and dOutput
#pragma unroll
for (int v = 0; v < V; v++)
if (s + ty[v] < nHot) {
I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx];
} else {
I[ty[v]][tx] = 0;
dO[ty[v]][tx] = 0;
}
if (s + ty[v] < nHot) {
I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx];
} else {
I[ty[v]][tx] = 0;
dO[ty[v]][tx] = 0;
}
__syncthreads();
#pragma unroll
for (int k = 0; k < K; k++)
#pragma unroll
for (int v = 0; v < V; v++) {
dI[v] += dO[ty[v]][k] * W[tx][k];
dW[v] += I[k][ty[v]] * dO[k][tx];
}
for (int v = 0; v < V; v++) {
dI[v] += dO[ty[v]][k] * W[tx][k];
dW[v] += I[k][ty[v]] * dO[k][tx];
}
#pragma unroll
for (int v = 0; v < V; v++)
if (s + ty[v] < nHot)
dI[v] += dInFeatures[R0[v] * input_stride + tx];
if (s + ty[v] < nHot)
dI[v] += dInFeatures[R0[v] * input_stride + tx];
#pragma unroll
for (int v = 0; v < V; v++)
if (s + ty[v] < nHot)
dInFeatures[R0[v] * input_stride + tx] = dI[v];
if (s + ty[v] < nHot)
dInFeatures[R0[v] * input_stride + tx] = dI[v];
__syncthreads();
}
#pragma unroll
......@@ -403,26 +401,26 @@ dConvolution_KMxKN_backward_dW_B(T *inFeatures, T *dInFeatures, T *dOutFeatures,
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
Int o = (nHot / K) * K; \
if (o >= K) \
dConvolution_KMxKN_backward_dW_A< \
T, K, V><<<dim3(std::min(o / K, (Int)512), input_nPlanes / K), \
dim3(K, K / V)>>>( \
inFeatures, dInFeatures, dOutFeatures, w, dw, rules, o, \
input_nPlanes, input_stride, output_nPlanes, output_stride); \
dConvolution_KMxKN_backward_dW_A< \
T, K, V><<<dim3(std::min(o / K, (Int)512), input_nPlanes / K), \
dim3(K, K / V)>>>( \
inFeatures, dInFeatures, dOutFeatures, w, dw, rules, o, \
input_nPlanes, input_stride, output_nPlanes, output_stride); \
if (nHot > o) \
dConvolution_KMxKN_backward_dW_B< \
T, K, V><<<dim3(1, input_nPlanes / K), dim3(K, K / V)>>>( \
inFeatures, dInFeatures, dOutFeatures, w, dw, rules + 2 * o, \
nHot - o, input_nPlanes, input_stride, output_nPlanes, \
output_stride); \
dConvolution_KMxKN_backward_dW_B< \
T, K, V><<<dim3(1, input_nPlanes / K), dim3(K, K / V)>>>( \
inFeatures, dInFeatures, dOutFeatures, w, dw, rules + 2 * o, \
nHot - o, input_nPlanes, input_stride, output_nPlanes, \
output_stride); \
return; \
} \
}
template <typename T>
void dConvolution_backward_dW(T *inFeatures, T *dInFeatures, T *dOutFeatures,
T *w, T *dw, Int *rules, Int nHot,
Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
T *w, T *dw, Int *rules, Int nHot,
Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
FOO(T, 32, 8)
FOO(T, 16, 4)
FOO(T, 8, 2)
......@@ -433,8 +431,8 @@ void dConvolution_backward_dW(T *inFeatures, T *dInFeatures, T *dOutFeatures,
template <typename T, Int K, Int V>
__global__ void
dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// K is a multiple of V,
......@@ -449,7 +447,7 @@ dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
w += n * K;
Int KO = min(K, output_nPlanes - K * n);
T O[V];
TACC O[V];
__shared__ T W[K][K];
__shared__ T I[K][K];
__shared__ Int R[K * 2];
......@@ -466,40 +464,40 @@ dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
#pragma unroll
for (int v = 0; v < V; v++)
if (ty[v] < KI and tx < KO)
W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
// Read rules for K input/output pairs
#pragma unroll
for (int v = 0; v < V; v++) {
if (ty[v] < 2) {
int q = ty[v] * K + tx;
if (s + q / 2 < nHot)
R[q] = rules[2 * s + q];
}
if (ty[v] < 2) {
int q = ty[v] * K + tx;
if (s + q / 2 < nHot)
R[q] = rules[2 * s + q];
}
}
__syncthreads();
// Read input, reset O[]
#pragma unroll
for (int v = 0; v < V; v++) {
if (tx < KI and s + ty[v] < nHot)
I[ty[v]][tx] = inFeatures[R[2 * ty[v]] * input_stride + tx];
O[v] = 0;
if (tx < KI and s + ty[v] < nHot)
I[ty[v]][tx] = inFeatures[R[2 * ty[v]] * input_stride + tx];
O[v] = 0;
}
__syncthreads();
#pragma unroll
for (int k = 0; k < KI; k++)
#pragma unroll
for (int v = 0; v < V; v++)
O[v] += I[ty[v]][k] * W[k][tx];
for (int v = 0; v < V; v++)
O[v] += I[ty[v]][k] * W[k][tx];
__syncthreads();
#pragma unroll
for (int v = 0; v < V; v++)
if (tx < KO and s + ty[v] < nHot)
outFeatures[R[2 * ty[v] + 1] * output_stride + tx] += O[v];
if (tx < KO and s + ty[v] < nHot)
outFeatures[R[2 * ty[v] + 1] * output_stride + tx] += O[v];
__syncthreads();
}
w += K * output_nPlanes;
......@@ -513,9 +511,9 @@ dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
template <typename T, Int K, Int V>
__global__ void
dConvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
T *w, T *dw, Int *rules, Int nHot,
Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
T *w, T *dw, Int *rules, Int nHot,
Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
// M = gridDim.y == input_nPlanes / K
Int N = (output_nPlanes + K - 1) / K;
Int m = blockIdx.y;
......@@ -525,8 +523,8 @@ dConvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
dw += m * K * output_nPlanes;
Int KI = min(K, input_nPlanes - K * m);
T dI[V];
T dW[V];
TACC dI[V];
TACC dW[V];
__shared__ T I[K][K];
__shared__ T dO[K][K];
__shared__ T W[K][K];
......@@ -544,7 +542,7 @@ dConvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
#pragma unroll
for (int v = 0; v < V; v++) {
if (ty[v] < KI and tx < KO)
W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
dW[v] = 0;
}
......@@ -552,48 +550,48 @@ dConvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
// Read rules for K input/output pairs, reset dI[]
#pragma unroll
for (int v = 0; v < V; v++) {
if (ty[v] < 2) {
int q = ty[v] * K + tx;
if (s + q / 2 < nHot)
R[q] = rules[2 * s + q];
}
dI[v] = 0;
if (ty[v] < 2) {
int q = ty[v] * K + tx;
if (s + q / 2 < nHot)
R[q] = rules[2 * s + q];
}
dI[v] = 0;
}
__syncthreads();
// Read input and dOutput
#pragma unroll
for (int v = 0; v < V; v++) {
if (tx < KI and s + ty[v] < nHot)
I[ty[v]][tx] = inFeatures[R[2 * ty[v]] * input_stride + tx];
else
I[ty[v]][tx] = 0;
if (tx < KO and s + ty[v] < nHot)
dO[ty[v]][tx] = dOutFeatures[R[2 * ty[v] + 1] * output_stride + tx];
else
dO[ty[v]][tx] = 0;
if (tx < KI and s + ty[v] < nHot)
I[ty[v]][tx] = inFeatures[R[2 * ty[v]] * input_stride + tx];
else
I[ty[v]][tx] = 0;
if (tx < KO and s + ty[v] < nHot)
dO[ty[v]][tx] = dOutFeatures[R[2 * ty[v] + 1] * output_stride + tx];
else
dO[ty[v]][tx] = 0;
}
__syncthreads();
#pragma unroll
for (int k = 0; k < KO; k++)
#pragma unroll
for (int v = 0; v < V; v++)
dI[v] += dO[ty[v]][k] * W[tx][k];
for (int v = 0; v < V; v++)
dI[v] += dO[ty[v]][k] * W[tx][k];
#pragma unroll
for (int k = 0; k < K; k++)
#pragma unroll
for (int v = 0; v < V; v++)
dW[v] += I[k][ty[v]] * dO[k][tx];
for (int v = 0; v < V; v++)
dW[v] += I[k][ty[v]] * dO[k][tx];
__syncthreads();
#pragma unroll
for (int v = 0; v < V; v++)
if (tx < KI and s + ty[v] < nHot)
dInFeatures[R[2 * ty[v]] * input_stride + tx] += dI[v];
if (tx < KI and s + ty[v] < nHot)
dInFeatures[R[2 * ty[v]] * input_stride + tx] += dI[v];
__syncthreads();
}
#pragma unroll
for (int v = 0; v < V; v++)
if (ty[v] < KI and tx < KO)
atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
w += K;
dw += K;
dOutFeatures += K;
......@@ -602,51 +600,52 @@ dConvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
template <typename T>
double dConvolution_forward2(T *inFeatures, T *outFeatures, T *w,
RuleBook _rules, Int input_nPlanes,
Int input_stride, Int output_nPlanes,
Int output_stride) {
RuleBook _rules, Int input_nPlanes,
Int input_stride, Int output_nPlanes,
Int output_stride) {
Int c = input_nPlanes * output_nPlanes;
double flops = 0;
if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) {
const int K = 16;
const int V = 4;
RULEBOOKITERATOR(
(dConvolution_KMxKN_forward2<
T, K,
V><<<dim3(128, (output_nPlanes + K - 1) / K), dim3(K, K / V)>>>(
inFeatures, outFeatures, w, rbB, nHotB, input_nPlanes, input_stride,
output_nPlanes, output_stride));
, w += c; flops += nHotB * c;)
(dConvolution_KMxKN_forward2<
T, K,
V><<<dim3(128, (output_nPlanes + K - 1) / K), dim3(K, K / V)>>>(
inFeatures, outFeatures, w, rbB, nHotB, input_nPlanes, input_stride,
output_nPlanes, output_stride));
, w += c; flops += nHotB * c;)
} else {
RULEBOOKITERATOR(dConvolution_forward(inFeatures, outFeatures, w, rbB,
nHotB, input_nPlanes, input_stride,
output_nPlanes, output_stride);
, w += c; flops += nHotB * c;)
nHotB, input_nPlanes, input_stride,
output_nPlanes, output_stride);
, w += c; flops += nHotB * c;)
}
return flops;
}
template <typename T>
void dConvolution_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
T *w, T *dw, RuleBook _rules, Int input_nPlanes,
Int input_stride, Int output_nPlanes,
Int output_stride) {
T *w, T *dw, RuleBook _rules, Int input_nPlanes,
Int input_stride, Int output_nPlanes,
Int output_stride) {
Int c = input_nPlanes * output_nPlanes;
if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) {
const int K = 16;
const int V = 4;
RULEBOOKITERATOR(
(dConvolution_KMxKN_backward_dW2<
T, K,
V><<<dim3(128, (input_nPlanes + K - 1) / K), dim3(K, K / V)>>>(
inFeatures, dInFeatures, dOutFeatures, w, dw, rbB, nHotB,
input_nPlanes, input_stride, output_nPlanes, output_stride));
, w += c; dw += c;)
(dConvolution_KMxKN_backward_dW2<
T, K,
V><<<dim3(128, (input_nPlanes + K - 1) / K), dim3(K, K / V)>>>(
inFeatures, dInFeatures, dOutFeatures, w, dw, rbB, nHotB,
input_nPlanes, input_stride, output_nPlanes, output_stride));
, w += c; dw += c;)
} else {
RULEBOOKITERATOR(dConvolution_backward_dW(inFeatures, dInFeatures,
dOutFeatures, w, dw, rbB, nHotB,
input_nPlanes, input_stride,
output_nPlanes, output_stride);
, w += c; dw += c;)
dOutFeatures, w, dw, rbB, nHotB,
input_nPlanes, input_stride,
output_nPlanes, output_stride);
, w += c; dw += c;)
}
}
#undef TACC
\ No newline at end of file
......@@ -78,7 +78,7 @@ void cuda_Deconvolution_backward(
dDeconvolution_backward_dW2<T>(iF, diF, doF, w, dw, _rules, ip, ip, op, op);
if (d_bias.numel()) {
auto db = d_bias.data<T>();
Convolution_bp_bias(doF, db, op, op, nActiveOut);
Convolution_bp_bias(doF, db, op, nActiveOut);
}
}
}
......@@ -4,11 +4,13 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#define TACC double
template <typename T, Int K, Int V>
__global__ void
dDeconvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
// nHot must be a multiple of K!!
// Input x Weight -> Output
......@@ -23,7 +25,7 @@ dDeconvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules,
outFeatures += n * K;
w += n * K;
T O[V];
TACC O[V];
__shared__ T W[K][K];
__shared__ T I[K][K];
Int R0[V];
......@@ -43,31 +45,31 @@ dDeconvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules,
for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
for (int v = 0; v < V; v++) {
R1[v] = rules[2 * (s + ty[v])];
R0[v] = rules[2 * (s + ty[v]) + 1];
R1[v] = rules[2 * (s + ty[v])];
R0[v] = rules[2 * (s + ty[v]) + 1];
}
__syncthreads();
// Read input, reset O[]
#pragma unroll
for (int v = 0; v < V; v++) {
I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
O[v] = 0;
I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
O[v] = 0;
}
__syncthreads();
#pragma unroll
for (int k = 0; k < K; k++)
#pragma unroll
for (int v = 0; v < V; v++)
O[v] += I[ty[v]][k] * W[k][tx];
for (int v = 0; v < V; v++)
O[v] += I[ty[v]][k] * W[k][tx];
#pragma unroll
for (int v = 0; v < V; v++)
O[v] += outFeatures[R1[v] * output_stride + tx];
O[v] += outFeatures[R1[v] * output_stride + tx];
#pragma unroll
for (int v = 0; v < V; v++)
outFeatures[R1[v] * output_stride + tx] = O[v];
outFeatures[R1[v] * output_stride + tx] = O[v];
__syncthreads();
}
w += K * output_nPlanes;
......@@ -77,8 +79,8 @@ dDeconvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules,
template <typename T, Int K, Int V>
__global__ void
dDeconvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// K is a multiple of V,
......@@ -91,7 +93,7 @@ dDeconvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules,
outFeatures += n * K;
w += n * K;
T O[V];
TACC O[V];
__shared__ T W[K][K];
__shared__ T I[K][K];
Int R0[V];
......@@ -111,36 +113,36 @@ dDeconvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules,
for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
for (int v = 0; v < V; v++) {
if (s + ty[v] < nHot) {
R1[v] = rules[2 * (s + ty[v])];
R0[v] = rules[2 * (s + ty[v]) + 1];
}
if (s + ty[v] < nHot) {
R1[v] = rules[2 * (s + ty[v])];
R0[v] = rules[2 * (s + ty[v]) + 1];
}
}
__syncthreads();
// Read input, reset O[]
#pragma unroll
for (int v = 0; v < V; v++) {
if (s + ty[v] < nHot)
I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
O[v] = 0;
if (s + ty[v] < nHot)
I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
O[v] = 0;
}
__syncthreads();
#pragma unroll
for (int k = 0; k < K; k++)
#pragma unroll
for (int v = 0; v < V; v++)
O[v] += I[ty[v]][k] * W[k][tx];
for (int v = 0; v < V; v++)
O[v] += I[ty[v]][k] * W[k][tx];
#pragma unroll
for (int v = 0; v < V; v++)
if (s + ty[v] < nHot)
O[v] += outFeatures[R1[v] * output_stride + tx];
if (s + ty[v] < nHot)
O[v] += outFeatures[R1[v] * output_stride + tx];
#pragma unroll
for (int v = 0; v < V; v++)
if (s + ty[v] < nHot)
outFeatures[R1[v] * output_stride + tx] = O[v];
if (s + ty[v] < nHot)
outFeatures[R1[v] * output_stride + tx] = O[v];
__syncthreads();
}
w += K * output_nPlanes;
......@@ -153,24 +155,24 @@ dDeconvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules,
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
Int o = (nHot / K) * K; \
if (o >= K) \
dDeconvolution_KMxKN_forwardA< \
T, K, V><<<dim3(std::min(o / K, (Int)512), output_nPlanes / K), \
dim3(K, K / V)>>>(inFeatures, outFeatures, w, rules, o, \
input_nPlanes, input_stride, \
output_nPlanes, output_stride); \
dDeconvolution_KMxKN_forwardA< \
T, K, V><<<dim3(std::min(o / K, (Int)512), output_nPlanes / K), \
dim3(K, K / V)>>>(inFeatures, outFeatures, w, rules, o, \
input_nPlanes, input_stride, \
output_nPlanes, output_stride); \
if (nHot > o) \
dDeconvolution_KMxKN_forwardB< \
T, K, V><<<dim3(1, output_nPlanes / K), dim3(K, K / V)>>>( \
inFeatures, outFeatures, w, rules + 2 * o, nHot - o, \
input_nPlanes, input_stride, output_nPlanes, output_stride); \
dDeconvolution_KMxKN_forwardB< \
T, K, V><<<dim3(1, output_nPlanes / K), dim3(K, K / V)>>>( \
inFeatures, outFeatures, w, rules + 2 * o, nHot - o, \
input_nPlanes, input_stride, output_nPlanes, output_stride); \
return; \
} \
}
template <typename T>
void dDeconvolution_forward(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
FOO(T, 64, 16)
FOO(T, 32, 8)
FOO(T, 16, 4)
......@@ -179,9 +181,9 @@ void dDeconvolution_forward(T *inFeatures, T *outFeatures, T *w, Int *rules,
}
template <>
void dDeconvolution_forward<double>(double *inFeatures, double *outFeatures,
double *w, Int *rules, Int nHot,
Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
double *w, Int *rules, Int nHot,
Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
FOO(double, 32, 8)
FOO(double, 16, 4)
FOO(double, 8, 2)
......@@ -205,8 +207,8 @@ __global__ void dDeconvolution_KMxKN_backward_dW_A(
w += m * K * output_nPlanes;
dw += m * K * output_nPlanes;
T dI[V];
T dW[V];
TACC dI[V];
TACC dW[V];
__shared__ T I[K][K];
__shared__ T dO[K][K];
__shared__ T W[K][K];
......@@ -229,31 +231,31 @@ __global__ void dDeconvolution_KMxKN_backward_dW_A(
for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
for (int v = 0; v < V; v++) {
R1[v] = rules[2 * (s + ty[v])];
R0[v] = rules[2 * (s + ty[v]) + 1];
dI[v] = 0;
R1[v] = rules[2 * (s + ty[v])];
R0[v] = rules[2 * (s + ty[v]) + 1];
dI[v] = 0;
}
__syncthreads();
// Read input and dOutput
#pragma unroll
for (int v = 0; v < V; v++) {
I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx];
I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx];
}
__syncthreads();
#pragma unroll
for (int k = 0; k < K; k++)
#pragma unroll
for (int v = 0; v < V; v++) {
dI[v] += dO[ty[v]][k] * W[tx][k];
dW[v] += I[k][ty[v]] * dO[k][tx];
}
for (int v = 0; v < V; v++) {
dI[v] += dO[ty[v]][k] * W[tx][k];
dW[v] += I[k][ty[v]] * dO[k][tx];
}
#pragma unroll
for (int v = 0; v < V; v++)
dI[v] += dInFeatures[R0[v] * input_stride + tx];
dI[v] += dInFeatures[R0[v] * input_stride + tx];
#pragma unroll
for (int v = 0; v < V; v++)
dInFeatures[R0[v] * input_stride + tx] = dI[v];
dInFeatures[R0[v] * input_stride + tx] = dI[v];
__syncthreads();
}
#pragma unroll
......@@ -281,8 +283,8 @@ __global__ void dDeconvolution_KMxKN_backward_dW_B(
w += m * K * output_nPlanes;
dw += m * K * output_nPlanes;
T dI[V];
T dW[V];
TACC dI[V];
TACC dW[V];
__shared__ T I[K][K];
__shared__ T dO[K][K];
__shared__ T W[K][K];
......@@ -305,39 +307,39 @@ __global__ void dDeconvolution_KMxKN_backward_dW_B(
for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
for (int v = 0; v < V; v++) {
if (s + ty[v] < nHot) {
R1[v] = rules[2 * (s + ty[v])];
R0[v] = rules[2 * (s + ty[v]) + 1];
}
dI[v] = 0;
if (s + ty[v] < nHot) {
R1[v] = rules[2 * (s + ty[v])];
R0[v] = rules[2 * (s + ty[v]) + 1];
}
dI[v] = 0;
}
__syncthreads();
// Read input and dOutput
#pragma unroll
for (int v = 0; v < V; v++)
if (s + ty[v] < nHot) {
I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx];
} else {
I[ty[v]][tx] = 0;
dO[ty[v]][tx] = 0;
}
if (s + ty[v] < nHot) {
I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx];
} else {
I[ty[v]][tx] = 0;
dO[ty[v]][tx] = 0;
}
__syncthreads();
#pragma unroll
for (int k = 0; k < K; k++)
#pragma unroll
for (int v = 0; v < V; v++) {
dI[v] += dO[ty[v]][k] * W[tx][k];
dW[v] += I[k][ty[v]] * dO[k][tx];
}
for (int v = 0; v < V; v++) {
dI[v] += dO[ty[v]][k] * W[tx][k];
dW[v] += I[k][ty[v]] * dO[k][tx];
}
#pragma unroll
for (int v = 0; v < V; v++)
if (s + ty[v] < nHot)
dI[v] += dInFeatures[R0[v] * input_stride + tx];
if (s + ty[v] < nHot)
dI[v] += dInFeatures[R0[v] * input_stride + tx];
#pragma unroll
for (int v = 0; v < V; v++)
if (s + ty[v] < nHot)
dInFeatures[R0[v] * input_stride + tx] = dI[v];
if (s + ty[v] < nHot)
dInFeatures[R0[v] * input_stride + tx] = dI[v];
__syncthreads();
}
#pragma unroll
......@@ -354,26 +356,26 @@ __global__ void dDeconvolution_KMxKN_backward_dW_B(
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
Int o = (nHot / K) * K; \
if (o >= K) \
dDeconvolution_KMxKN_backward_dW_A< \
T, K, V><<<dim3(std::min(o / K, (Int)512), input_nPlanes / K), \
dim3(K, K / V)>>>( \
inFeatures, dInFeatures, dOutFeatures, w, dw, rules, o, \
input_nPlanes, input_stride, output_nPlanes, output_stride); \
dDeconvolution_KMxKN_backward_dW_A< \
T, K, V><<<dim3(std::min(o / K, (Int)512), input_nPlanes / K), \
dim3(K, K / V)>>>( \
inFeatures, dInFeatures, dOutFeatures, w, dw, rules, o, \
input_nPlanes, input_stride, output_nPlanes, output_stride); \
if (nHot > o) \
dDeconvolution_KMxKN_backward_dW_B< \
T, K, V><<<dim3(1, input_nPlanes / K), dim3(K, K / V)>>>( \
inFeatures, dInFeatures, dOutFeatures, w, dw, rules + 2 * o, \
nHot - o, input_nPlanes, input_stride, output_nPlanes, \
output_stride); \
dDeconvolution_KMxKN_backward_dW_B< \
T, K, V><<<dim3(1, input_nPlanes / K), dim3(K, K / V)>>>( \
inFeatures, dInFeatures, dOutFeatures, w, dw, rules + 2 * o, \
nHot - o, input_nPlanes, input_stride, output_nPlanes, \
output_stride); \
return; \
} \
}
template <typename T>
void dDeconvolution_backward_dW(T *inFeatures, T *dInFeatures, T *dOutFeatures,
T *w, T *dw, Int *rules, Int nHot,
Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
T *w, T *dw, Int *rules, Int nHot,
Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
FOO(T, 32, 8)
FOO(T, 16, 4)
FOO(T, 8, 2)
......@@ -384,8 +386,8 @@ void dDeconvolution_backward_dW(T *inFeatures, T *dInFeatures, T *dOutFeatures,
template <typename T, Int K, Int V>
__global__ void
dDeconvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// K is a multiple of V,
......@@ -400,7 +402,7 @@ dDeconvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
w += n * K;
Int KO = min(K, output_nPlanes - K * n);
T O[V];
TACC O[V];
__shared__ T W[K][K];
__shared__ T I[K][K];
__shared__ Int R[K * 2];
......@@ -417,40 +419,40 @@ dDeconvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
#pragma unroll
for (int v = 0; v < V; v++)
if (ty[v] < KI and tx < KO)
W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
// Read rules for K input/output pairs
#pragma unroll
for (int v = 0; v < V; v++) {
if (ty[v] < 2) {
int q = ty[v] * K + tx;
if (s + q / 2 < nHot)
R[q] = rules[2 * s + q];
}
if (ty[v] < 2) {
int q = ty[v] * K + tx;
if (s + q / 2 < nHot)
R[q] = rules[2 * s + q];
}
}
__syncthreads();
// Read input, reset O[]
#pragma unroll
for (int v = 0; v < V; v++) {
if (tx < KI and s + ty[v] < nHot)
I[ty[v]][tx] = inFeatures[R[2 * ty[v] + 1] * input_stride + tx];
O[v] = 0;
if (tx < KI and s + ty[v] < nHot)
I[ty[v]][tx] = inFeatures[R[2 * ty[v] + 1] * input_stride + tx];
O[v] = 0;
}
__syncthreads();
#pragma unroll
for (int k = 0; k < KI; k++)
#pragma unroll
for (int v = 0; v < V; v++)
O[v] += I[ty[v]][k] * W[k][tx];
for (int v = 0; v < V; v++)
O[v] += I[ty[v]][k] * W[k][tx];
__syncthreads();
#pragma unroll
for (int v = 0; v < V; v++)
if (tx < KO and s + ty[v] < nHot)
outFeatures[R[2 * ty[v]] * output_stride + tx] += O[v];
if (tx < KO and s + ty[v] < nHot)
outFeatures[R[2 * ty[v]] * output_stride + tx] += O[v];
__syncthreads();
}
w += K * output_nPlanes;
......@@ -464,9 +466,9 @@ dDeconvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
template <typename T, Int K, Int V>
__global__ void
dDeconvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures,
T *dOutFeatures, T *w, T *dw, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
T *dOutFeatures, T *w, T *dw, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
// M = gridDim.y == input_nPlanes / K
Int N = (output_nPlanes + K - 1) / K;
Int m = blockIdx.y;
......@@ -476,8 +478,8 @@ dDeconvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures,
dw += m * K * output_nPlanes;
Int KI = min(K, input_nPlanes - K * m);
T dI[V];
T dW[V];
TACC dI[V];
TACC dW[V];
__shared__ T I[K][K];
__shared__ T dO[K][K];
__shared__ T W[K][K];
......@@ -495,7 +497,7 @@ dDeconvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures,
#pragma unroll
for (int v = 0; v < V; v++) {
if (ty[v] < KI and tx < KO)
W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
dW[v] = 0;
}
......@@ -503,48 +505,48 @@ dDeconvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures,
// Read rules for K input/output pairs, reset dI[]
#pragma unroll
for (int v = 0; v < V; v++) {
if (ty[v] < 2) {
int q = ty[v] * K + tx;
if (s + q / 2 < nHot)
R[q] = rules[2 * s + q];
}
dI[v] = 0;
if (ty[v] < 2) {
int q = ty[v] * K + tx;
if (s + q / 2 < nHot)
R[q] = rules[2 * s + q];
}
dI[v] = 0;
}
__syncthreads();
// Read input and dOutput
#pragma unroll
for (int v = 0; v < V; v++) {
if (tx < KI and s + ty[v] < nHot)
I[ty[v]][tx] = inFeatures[R[2 * ty[v] + 1] * input_stride + tx];
else
I[ty[v]][tx] = 0;
if (tx < KO and s + ty[v] < nHot)
dO[ty[v]][tx] = dOutFeatures[R[2 * ty[v]] * output_stride + tx];
else
dO[ty[v]][tx] = 0;
if (tx < KI and s + ty[v] < nHot)
I[ty[v]][tx] = inFeatures[R[2 * ty[v] + 1] * input_stride + tx];
else
I[ty[v]][tx] = 0;
if (tx < KO and s + ty[v] < nHot)
dO[ty[v]][tx] = dOutFeatures[R[2 * ty[v]] * output_stride + tx];
else
dO[ty[v]][tx] = 0;
}
__syncthreads();
#pragma unroll
for (int k = 0; k < KO; k++)
#pragma unroll
for (int v = 0; v < V; v++)
dI[v] += dO[ty[v]][k] * W[tx][k];
for (int v = 0; v < V; v++)
dI[v] += dO[ty[v]][k] * W[tx][k];
#pragma unroll
for (int k = 0; k < K; k++)
#pragma unroll
for (int v = 0; v < V; v++)
dW[v] += I[k][ty[v]] * dO[k][tx];
for (int v = 0; v < V; v++)
dW[v] += I[k][ty[v]] * dO[k][tx];
__syncthreads();
#pragma unroll
for (int v = 0; v < V; v++)
if (tx < KI and s + ty[v] < nHot)
dInFeatures[R[2 * ty[v] + 1] * input_stride + tx] += dI[v];
if (tx < KI and s + ty[v] < nHot)
dInFeatures[R[2 * ty[v] + 1] * input_stride + tx] += dI[v];
__syncthreads();
}
#pragma unroll
for (int v = 0; v < V; v++)
if (ty[v] < KI and tx < KO)
atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
w += K;
dw += K;
dOutFeatures += K;
......@@ -553,51 +555,53 @@ dDeconvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures,
template <typename T>
double dDeconvolution_forward2(T *inFeatures, T *outFeatures, T *w,
RuleBook _rules, Int input_nPlanes,
Int input_stride, Int output_nPlanes,
Int output_stride) {
RuleBook _rules, Int input_nPlanes,
Int input_stride, Int output_nPlanes,
Int output_stride) {
Int c = input_nPlanes * output_nPlanes;
double flops = 0;
if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) {
const int K = 16;
const int V = 4;
RULEBOOKITERATOR(
(dDeconvolution_KMxKN_forward2<
T, K,
V><<<dim3(128, (output_nPlanes + K - 1) / K), dim3(K, K / V)>>>(
inFeatures, outFeatures, w, rbB, nHotB, input_nPlanes, input_stride,
output_nPlanes, output_stride));
, w += c; flops += nHotB * c;)
(dDeconvolution_KMxKN_forward2<
T, K,
V><<<dim3(128, (output_nPlanes + K - 1) / K), dim3(K, K / V)>>>(
inFeatures, outFeatures, w, rbB, nHotB, input_nPlanes, input_stride,
output_nPlanes, output_stride));
, w += c; flops += nHotB * c;)
} else {
RULEBOOKITERATOR(dDeconvolution_forward(inFeatures, outFeatures, w, rbB,
nHotB, input_nPlanes, input_stride,
output_nPlanes, output_stride);
, w += c; flops += nHotB * c;)
nHotB, input_nPlanes, input_stride,
output_nPlanes, output_stride);
, w += c; flops += nHotB * c;)
}
return flops;
}
template <typename T>
void dDeconvolution_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
T *w, T *dw, RuleBook _rules,
Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
T *w, T *dw, RuleBook _rules,
Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
Int c = input_nPlanes * output_nPlanes;
if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) {
const int K = 16;
const int V = 4;
RULEBOOKITERATOR(
(dDeconvolution_KMxKN_backward_dW2<
T, K,
V><<<dim3(128, (input_nPlanes + K - 1) / K), dim3(K, K / V)>>>(
inFeatures, dInFeatures, dOutFeatures, w, dw, rbB, nHotB,
input_nPlanes, input_stride, output_nPlanes, output_stride));
, w += c; dw += c;)
(dDeconvolution_KMxKN_backward_dW2<
T, K,
V><<<dim3(128, (input_nPlanes + K - 1) / K), dim3(K, K / V)>>>(
inFeatures, dInFeatures, dOutFeatures, w, dw, rbB, nHotB,
input_nPlanes, input_stride, output_nPlanes, output_stride));
, w += c; dw += c;)
} else {
RULEBOOKITERATOR(dDeconvolution_backward_dW(inFeatures, dInFeatures,
dOutFeatures, w, dw, rbB, nHotB,
input_nPlanes, input_stride,
output_nPlanes, output_stride);
, w += c; dw += c;)
dOutFeatures, w, dw, rbB, nHotB,
input_nPlanes, input_stride,
output_nPlanes, output_stride);
, w += c; dw += c;)
}
}
#undef TACC
\ No newline at end of file
......@@ -15,15 +15,15 @@
#include "CUDA/UnPooling.cu"
template void ActivePooling_ForwardPass<float>(float *input_features,
float *output_features,
Int batchSize, Int maxActive,
Int nPlanes, Int *rules,
bool average);
float *output_features,
Int batchSize, Int maxActive,
Int nPlanes, Int *rules,
bool average);
template void ActivePooling_BackwardPass<float>(float *d_input_features,
float *d_output_features,
Int batchSize, Int maxActive,
Int nPlanes, Int *rules,
bool average);
float *d_output_features,
Int batchSize, Int maxActive,
Int nPlanes, Int *rules,
bool average);
template void dAffineReluTrivialConvolution_forward<float>(
float *inFeatures, float *outFeatures, float *affineWeight,
......@@ -43,10 +43,10 @@ template void cuda_AveragePooling_BackwardPass<float>(
float *d_input_features, float *d_output_features, Int nPlanes,
Int input_stride, Int output_stride, RuleBook _rules, Int filterVolume);
template void Convolution_fp_bias<float>(float *of, float *b, Int op,
Int nActive);
template void Convolution_bp_bias<float>(float *matrix, float *target,
Int nRows, Int nColumns, Int nCOLUMNS);
template void Convolution_fp_bias<float>(float *oF, float *b, Int nPlanes,
Int nActive);
template void Convolution_bp_bias<float>(float *d_oF, float *d_b,
Int nPlanes, Int nActive);
template double dConvolution_forward2<float>(
float *inFeatures, float *outFeatures, float *w, RuleBook _rules,
Int input_nPlanes, Int input_stride, Int output_nPlanes, Int output_stride);
......@@ -66,65 +66,65 @@ template void dDeconvolution_backward_dW2<float>(
Int output_nPlanes, Int output_stride);
template void InputLayer_fp<float>(float *input_features,
float *output_features, Int nRows,
Int maxActive, Int nPlanes, Int *rules_cpu,
Int *rules_gpu, bool average);
float *output_features, Int nRows,
Int maxActive, Int nPlanes, Int *rules_cpu,
Int *rules_gpu, bool average);
template void InputLayer_bp<float>(float *d_input_features,
float *d_output_features, Int nRows,
Int maxActive, Int nPlanes, Int *rules_cpu,
Int *rules_gpu, bool average);
float *d_output_features, Int nRows,
Int maxActive, Int nPlanes, Int *rules_cpu,
Int *rules_gpu, bool average);
template void LeakyReLU_fp<float>(float *input_features, float *output_features,
Int n, float alpha);
Int n, float alpha);
template void LeakyReLU_bp<float>(float *input_features,
float *d_input_features,
float *output_features, Int n, float alpha);
float *d_input_features,
float *output_features, Int n, float alpha);
template void cuda_MaxPooling_ForwardPass<float>(float *input_features,
float *output_features,
Int nPlanes, Int input_stride,
Int output_stride,
RuleBook _rules);
float *output_features,
Int nPlanes, Int input_stride,
Int output_stride,
RuleBook _rules);
template void cuda_MaxPooling_BackwardPass<float>(
float *input_features, float *d_input_features, float *output_features,
float *d_output_features, Int nPlanes, Int input_stride, Int output_stride,
RuleBook _rules);
template void cuda_SparseToDense_ForwardPass<float>(float *input_features,
float *output_features,
Int nPlanes,
Int spatialVolume,
RuleBook _rules);
float *output_features,
Int nPlanes,
Int spatialVolume,
RuleBook _rules);
template void cuda_SparseToDense_BackwardPass<float>(float *d_input_features,
float *d_output_features,
Int nPlanes,
Int spatialVolume,
RuleBook _rules);
float *d_output_features,
Int nPlanes,
Int spatialVolume,
RuleBook _rules);
template void cuda_UnPooling_ForwardPass<float>(float *input_features,
float *output_features,
Int nPlanes, Int input_stride,
Int output_stride,
RuleBook _rules);
float *output_features,
Int nPlanes, Int input_stride,
Int output_stride,
RuleBook _rules);
template void cuda_UnPooling_BackwardPass<float>(float *d_input_features,
float *d_output_features,
Int nPlanes, Int input_stride,
Int output_stride,
RuleBook _rules);
float *d_output_features,
Int nPlanes, Int input_stride,
Int output_stride,
RuleBook _rules);
template void bn_f<float>(float *iF, float *oF, Int nPlanes, Int input_stride,
Int output_stride, Int nActive, float *saveMean,
float *saveInvStd, float *runningMean,
float *runningVar, float *weight, float *bias,
float eps, float momentum, bool train,
float leakiness);
Int output_stride, Int nActive, float *saveMean,
float *saveInvStd, float *runningMean,
float *runningVar, float *weight, float *bias,
float eps, float momentum, bool train,
float leakiness);
template void bn_b<float>(float *input_features, float *d_input_features,
float *output_features, float *d_output_features,
Int nPlanes, Int input_stride, Int output_stride,
Int nActive, float *saveMean, float *saveInvStd,
float *runningMean, float *runningVar, float *weight,
float *bias, float *d_weight, float *d_bias,
float leakiness);
float *output_features, float *d_output_features,
Int nPlanes, Int input_stride, Int output_stride,
Int nActive, float *saveMean, float *saveInvStd,
float *runningMean, float *runningVar, float *weight,
float *bias, float *d_weight, float *d_bias,
float leakiness);
template void bmd_f<float>(float *input_features, float *output_features,
float *noise, Int nActive, Int nPlanes, float alpha);
float *noise, Int nActive, Int nPlanes, float alpha);
template void bmd_b<float>(float *input_features, float *d_input_features,
float *d_output_features, float *noise, Int nActive,
Int nPlanes, float alpha);
float *d_output_features, float *noise, Int nActive,
Int nPlanes, float alpha);
......@@ -6,7 +6,7 @@
forward_pass_multiplyAdd_count = 0
forward_pass_hidden_states = 0
from .activations import Tanh, Sigmoid, ReLU, ELU, SELU, BatchNormELU
from .activations import Tanh, Sigmoid, ReLU, LeakyReLU, ELU, SELU, BatchNormELU
from .averagePooling import AveragePooling
from .batchNormalization import BatchNormalization, BatchNormReLU, BatchNormLeakyReLU
from .classificationTrainValidate import ClassificationTrainValidate
......
......@@ -22,6 +22,18 @@ class Sigmoid(Module):
return output
class LeakyReLU(Module):
def __init__(self,leak=1/3):
Module.__init__(self)
self.leak=leak
def forward(self, input):
output = SparseConvNetTensor()
output.features = F.leaky_relu(input.features,self.leak)
output.metadata = input.metadata
output.spatial_size = input.spatial_size
return output
class Tanh(Module):
def forward(self, input):
output = SparseConvNetTensor()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment