"superbench/config/vscode:/vscode.git/clone" did not exist on "7f607e4f745b84fdac1c1b693b32bb65ca8a3c79"
Commit f9552033 authored by Benjamin Thomas Graham's avatar Benjamin Thomas Graham
Browse files

initial commit

parents
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE_
#define TH_GENERIC_FILE_ "generic/GPU/Deconvolution.cu"
#else
#include "Convolution.h"
#include "Deconvolution.h"
#include <algorithm>
// Deconvolution forward pass: output = input (*) weight (+ bias), driven by
// the rule book mapping output sites to contributing input sites.
// Returns the number of multiply-accumulate operations performed (flops).
// NOTE(review): getRuleBook is called with (outputSize, inputSize, ...) --
// swapped relative to convolution -- so deconvolution reuses the convolution
// rule book machinery with the input/output roles exchanged.
extern "C" double scn_DR_(Deconvolution_updateOutput)(
    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *filterSize,
    THLongTensor *filterStride, void **m, THCTensor *input_features,
    THCTensor *output_features, THCTensor *weight, THCTensor *bias,
    long filterVolume, THCITensor *rulesBuffer) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto _rules =
      _m.getRuleBook(outputSize, inputSize, filterSize, filterStride, true);
  uInt nActive = _m.getNActive(outputSize);
  // One output row per active output site; weight->size[1] == output planes.
  THCTensor_(resize2d)(state, output_features, nActive, weight->size[1]);
  // Without a bias the kernels accumulate with +=, so start from zero.
  // (With a bias, the Convolution_fp_bias launches below initialize output.)
  if (not bias)
    THCTensor_(zero)(state, output_features);
  auto iF = THCTensor_(data)(state, input_features);
  auto oF = THCTensor_(data)(state, output_features);
  auto ip = input_features->size[1];  // input planes
  auto op = output_features->size[1]; // output planes
  auto w = THCTensor_(data)(state, weight);
  double flops = 0;
  if (bias) {
    // Initialize each output row with the bias, 32 planes per launch.
    auto b = THCTensor_(data)(state, bias);
    for (uInt i = 0; i < op; i += 32) {
      uInt blockDim = min(32L, op - i);
      uInt gridDim = min(4096, nActive);
      Convolution_fp_bias
          << <gridDim, blockDim, 0, THCState_getCurrentStream(state)>>>
          (oF + i, b + i, op, op, nActive);
    }
  }
  // Weight holds filterVolume slices of ip x op; advance w by one slice
  // (c elements) per rule-book entry.
  uInt c = ip * op;
  RULEBOOKITERATOR(
      dDeconvolution_forward2<real>(iF, oF, w, rbB, nHotB, ip, ip, op, op,
                                    THCState_getCurrentStream(state));
      , w += c; flops += nHotB * c;)
  return flops;
}
// Deconvolution backward pass: d_output x W^T -> d_input, and
// input^T x d_output -> d_weight (accumulated). If d_bias is given it
// receives the column sums of d_output via Convolution_bp_bias.
extern "C" void scn_DR_(Deconvolution_backward)(
    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *filterSize,
    THLongTensor *filterStride, void **m, THCTensor *input_features,
    THCTensor *d_input_features, THCTensor *d_output_features,
    THCTensor *weight, THCTensor *d_weight, THCTensor *d_bias,
    long filterVolume, THCITensor *rulesBuffer) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  // Same swapped (outputSize, inputSize) rule book as the forward pass.
  auto _rules =
      _m.getRuleBook(outputSize, inputSize, filterSize, filterStride, true);
  uInt nActive = _m.getNActive(outputSize);
  THCTensor_(resizeAs)(state, d_input_features, input_features);
  THCTensor_(zero)(state, d_input_features);
  auto iF = THCTensor_(data)(state, input_features);
  auto diF = THCTensor_(data)(state, d_input_features);
  auto doF = THCTensor_(data)(state, d_output_features);
  auto ip = input_features->size[1];    // input planes
  auto op = d_output_features->size[1]; // output planes
  auto w = THCTensor_(data)(state, weight);
  auto dw = THCTensor_(data)(state, d_weight);
  // Advance w/dw by one ip x op weight slice per rule-book entry.
  uInt c = ip * op;
  RULEBOOKITERATOR(dDeconvolution_backward_dW2<real>(
                       iF, diF, doF, w, dw, rbB, nHotB, ip, ip, op, op,
                       THCState_getCurrentStream(state));
                   , w += c; dw += c;)
  if (d_bias) {
    auto db = THCTensor_(data)(state, d_bias);
    Convolution_bp_bias(doF, db, op, op, nActive,
                        THCState_getCurrentStream(state));
  }
}
#endif
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef GPU_DECONVOLUTION_H
#define GPU_DECONVOLUTION_H
#include "../SparseConvNet.h"
#include "Convolution.h"
// Deconvolution forward, "A" variant: assumes nHot is a multiple of K and
// both plane counts are multiples of K, so no bounds checks are needed
// (the remainder is handled by the B variant).
// Rule pairs are (output row, input row): R1 = rules[2i], R0 = rules[2i+1].
template <typename T, uInt K, uInt V>
__global__ void
dDeconvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, uInt *rules,
                              uInt nHot, uInt input_nPlanes, uInt input_stride,
                              uInt output_nPlanes, uInt output_stride) {
  // nHot must be a multiple of K!!
  // Input x Weight -> Output
  // blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
  // K is a multiple of V,
  // nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
  uInt M = input_nPlanes / K;
  // N = gridDim.y == output_nPlanes/K
  uInt n = blockIdx.y;
  outFeatures += n * K;
  w += n * K;
  T O[V];               // per-thread partial outputs (register blocking)
  __shared__ T W[K][K]; // K x K tile of the weight matrix
  __shared__ T I[K][K]; // K x K tile of gathered input rows
  uInt R0[V];           // input row indices from the rule book
  uInt R1[V];           // output row indices from the rule book
  const int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V); // each thread covers V tile rows
  for (int m = 0; m < M; m++) {
    // Read w
#pragma unroll
    for (int v = 0; v < V; v++)
      W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
    for (uInt s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
      for (int v = 0; v < V; v++) {
        R1[v] = rules[2 * (s + ty[v])];
        R0[v] = rules[2 * (s + ty[v]) + 1];
      }
      __syncthreads();
      // Read input, reset O[]
#pragma unroll
      for (int v = 0; v < V; v++) {
        I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
        O[v] = 0;
      }
      __syncthreads();
      // K x K tile multiply: O += I * W
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          O[v] += I[ty[v]][k] * W[k][tx];
      // Read-modify-write the output rows (accumulates across m tiles and
      // across rule-book slices).
#pragma unroll
      for (int v = 0; v < V; v++)
        O[v] += outFeatures[R1[v] * output_stride + tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        outFeatures[R1[v] * output_stride + tx] = O[v];
      __syncthreads();
    }
    w += K * output_nPlanes;
    inFeatures += K;
  }
}
// Deconvolution forward, "B" variant: bounds-checked kernel for the final
// nHot % K rule pairs (the FOO dispatch launches it with gridDim.x == 1).
// Rule pairs are (output row, input row): R1 = rules[2i], R0 = rules[2i+1].
template <typename T, uInt K, uInt V>
__global__ void
dDeconvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, uInt *rules,
                              uInt nHot, uInt input_nPlanes, uInt input_stride,
                              uInt output_nPlanes, uInt output_stride) {
  // Input x Weight -> Output
  // blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
  // K is a multiple of V,
  // nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
  uInt M = input_nPlanes / K;
  // N = gridDim.y == output_nPlanes/K
  uInt n = blockIdx.y;
  outFeatures += n * K;
  w += n * K;
  T O[V];
  __shared__ T W[K][K];
  __shared__ T I[K][K];
  uInt R0[V];
  uInt R1[V];
  const int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int m = 0; m < M; m++) {
    // Read w
#pragma unroll
    for (int v = 0; v < V; v++)
      W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
    for (uInt s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (s + ty[v] < nHot) {
          R1[v] = rules[2 * (s + ty[v])];
          R0[v] = rules[2 * (s + ty[v]) + 1];
        }
      }
      __syncthreads();
      // Read input, reset O[]
      // (rows with s + ty[v] >= nHot keep stale I data; each thread only
      // reads its own row ty[v], and those O[v] are never written back)
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (s + ty[v] < nHot)
          I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
        O[v] = 0;
      }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          O[v] += I[ty[v]][k] * W[k][tx];
      // Accumulate into the output rows, masked by validity.
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nHot)
          O[v] += outFeatures[R1[v] * output_stride + tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nHot)
          outFeatures[R1[v] * output_stride + tx] = O[v];
      __syncthreads();
    }
    w += K * output_nPlanes;
    inFeatures += K;
  }
}
// Launch helper for dDeconvolution_forward: when both plane counts are
// multiples of K, run the unchecked "A" kernel on the first
// o = (nHot / K) * K rule pairs and the bounds-checked "B" kernel on the
// remaining (< K) pairs, then return from the enclosing function.
// (Comment kept outside the macro: a // comment would swallow the trailing
// line-continuation backslashes.)
#define FOO(K, V)                                                              \
  {                                                                            \
    if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {                  \
      uInt o = (nHot / K) * K;                                                 \
      if (o >= K)                                                              \
        dDeconvolution_KMxKN_forwardA<T, K, V> << <                            \
            dim3(std::min(o / K, (uInt)512), output_nPlanes / K),              \
            dim3(K, K / V), 0, stream>>>                                       \
            (inFeatures, outFeatures, w, rules, o, input_nPlanes,              \
             input_stride, output_nPlanes, output_stride);                     \
      if (nHot > o)                                                            \
        dDeconvolution_KMxKN_forwardB<T, K, V> << <                            \
            dim3(1, output_nPlanes / K), dim3(K, K / V), 0, stream>>>          \
            (inFeatures, outFeatures, w, rules + 2 * o, nHot - o,              \
             input_nPlanes, input_stride, output_nPlanes, output_stride);      \
      return;                                                                  \
    }                                                                          \
  }
// Dispatch: pick the largest tile size K that divides both plane counts
// (each FOO returns on success). Reaching the assert means the plane counts
// are not both multiples of 8; callers with such shapes must use
// dDeconvolution_forward2, which falls back to a generic kernel.
template <typename T>
void dDeconvolution_forward(T *inFeatures, T *outFeatures, T *w, uInt *rules,
                            uInt nHot, uInt input_nPlanes, uInt input_stride,
                            uInt output_nPlanes, uInt output_stride,
                            cudaStream_t stream) {
  FOO(64, 16)
  FOO(32, 8)
  FOO(16, 4)
  FOO(8, 2)
  assert(false);
}
#undef FOO
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
// "A" variant: assumes nHot and both plane counts are multiples of K, so no
// bounds checks are needed (tail handled by the B variant).
template <typename T, uInt K, uInt V>
__global__ void dDeconvolution_KMxKN_backward_dW_A(
    T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw, uInt *rules,
    uInt nHot, uInt input_nPlanes, uInt input_stride, uInt output_nPlanes,
    uInt output_stride) {
  // M = gridDim.y == input_nPlanes / K
  uInt N = output_nPlanes / K;
  uInt m = blockIdx.y;
  inFeatures += m * K;
  dInFeatures += m * K;
  w += m * K * output_nPlanes;
  dw += m * K * output_nPlanes;
  T dI[V];               // per-thread partial input gradients
  T dW[V];               // per-thread partial weight gradients
  __shared__ T I[K][K];  // gathered input tile
  __shared__ T dO[K][K]; // gathered output-gradient tile
  __shared__ T W[K][K];  // weight tile
  uInt R0[V];            // input row indices from the rule book
  uInt R1[V];            // output row indices from the rule book
  const int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int n = 0; n < N; n++) {
    // Read w, reset dW
#pragma unroll
    for (int v = 0; v < V; v++) {
      W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
      dW[v] = 0;
    }
    for (uInt s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
      for (int v = 0; v < V; v++) {
        R1[v] = rules[2 * (s + ty[v])];
        R0[v] = rules[2 * (s + ty[v]) + 1];
        dI[v] = 0;
      }
      __syncthreads();
      // Read input and dOutput
#pragma unroll
      for (int v = 0; v < V; v++) {
        I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
        dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx];
      }
      __syncthreads();
      // dI += dO * W^T (note the transposed access W[tx][k]);
      // dW += I^T * dO.
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++) {
          dI[v] += dO[ty[v]][k] * W[tx][k];
          dW[v] += I[k][ty[v]] * dO[k][tx];
        }
      // Accumulate into dInFeatures (read-modify-write).
#pragma unroll
      for (int v = 0; v < V; v++)
        dI[v] += dInFeatures[R0[v] * input_stride + tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        dInFeatures[R0[v] * input_stride + tx] = dI[v];
      __syncthreads();
    }
    // Several blocks (gridDim.x of them) accumulate into the same weight
    // tile, so dw updates must be atomic.
#pragma unroll
    for (int v = 0; v < V; v++)
      atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
    w += K;
    dw += K;
    dOutFeatures += K;
  }
}
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
// "B" variant: bounds-checked kernel for the final nHot % K rule pairs.
// Rows past nHot are explicitly zeroed in I and dO so the dW accumulation
// over the full K range stays correct.
template <typename T, uInt K, uInt V>
__global__ void dDeconvolution_KMxKN_backward_dW_B(
    T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw, uInt *rules,
    uInt nHot, uInt input_nPlanes, uInt input_stride, uInt output_nPlanes,
    uInt output_stride) {
  // M = gridDim.y == input_nPlanes / K
  uInt N = output_nPlanes / K;
  uInt m = blockIdx.y;
  inFeatures += m * K;
  dInFeatures += m * K;
  w += m * K * output_nPlanes;
  dw += m * K * output_nPlanes;
  T dI[V];
  T dW[V];
  __shared__ T I[K][K];
  __shared__ T dO[K][K];
  __shared__ T W[K][K];
  uInt R0[V];
  uInt R1[V];
  const int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int n = 0; n < N; n++) {
    // Read w, reset dW
#pragma unroll
    for (int v = 0; v < V; v++) {
      W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
      dW[v] = 0;
    }
    for (uInt s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (s + ty[v] < nHot) {
          R1[v] = rules[2 * (s + ty[v])];
          R0[v] = rules[2 * (s + ty[v]) + 1];
        }
        dI[v] = 0;
      }
      __syncthreads();
      // Read input and dOutput (zero padding beyond nHot)
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nHot) {
          I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
          dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx];
        } else {
          I[ty[v]][tx] = 0;
          dO[ty[v]][tx] = 0;
        }
      __syncthreads();
      // dI += dO * W^T (transposed access W[tx][k]); dW += I^T * dO.
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++) {
          dI[v] += dO[ty[v]][k] * W[tx][k];
          dW[v] += I[k][ty[v]] * dO[k][tx];
        }
      // Accumulate into dInFeatures, masked by validity.
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nHot)
          dI[v] += dInFeatures[R0[v] * input_stride + tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nHot)
          dInFeatures[R0[v] * input_stride + tx] = dI[v];
      __syncthreads();
    }
    // Multiple blocks may accumulate into the same weight tile -> atomics.
#pragma unroll
    for (int v = 0; v < V; v++)
      atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
    w += K;
    dw += K;
    dOutFeatures += K;
  }
}
// Launch helper for dDeconvolution_backward_dW: same split as the forward
// FOO -- the unchecked "A" kernel covers the first (nHot / K) * K rule
// pairs, the bounds-checked "B" kernel covers the remainder -- returning
// from the enclosing function when both plane counts are multiples of K.
// (Comment kept outside the macro to preserve the line continuations.)
#define FOO(K, V)                                                              \
  {                                                                            \
    if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {                  \
      uInt o = (nHot / K) * K;                                                 \
      if (o >= K)                                                              \
        dDeconvolution_KMxKN_backward_dW_A<T, K, V> << <                       \
            dim3(std::min(o / K, (uInt)512), input_nPlanes / K),               \
            dim3(K, K / V), 0, stream>>>                                       \
            (inFeatures, dInFeatures, dOutFeatures, w, dw, rules, o,           \
             input_nPlanes, input_stride, output_nPlanes, output_stride);      \
      if (nHot > o)                                                            \
        dDeconvolution_KMxKN_backward_dW_B<T, K, V> << <                       \
            dim3(1, input_nPlanes / K), dim3(K, K / V), 0, stream>>>           \
            (inFeatures, dInFeatures, dOutFeatures, w, dw, rules + 2 * o,      \
             nHot - o, input_nPlanes, input_stride, output_nPlanes,            \
             output_stride);                                                   \
      return;                                                                  \
    }                                                                          \
  }
// Dispatch: pick the largest tile size K dividing both plane counts (each
// FOO returns on success). Reaching the assert means the plane counts are
// not both multiples of 8; such shapes must go through
// dDeconvolution_backward_dW2, which has a generic fallback.
template <typename T>
void dDeconvolution_backward_dW(T *inFeatures, T *dInFeatures, T *dOutFeatures,
                                T *w, T *dw, uInt *rules, uInt nHot,
                                uInt input_nPlanes, uInt input_stride,
                                uInt output_nPlanes, uInt output_stride,
                                cudaStream_t stream) {
  FOO(32, 8)
  FOO(16, 4)
  FOO(8, 2)
  assert(false);
}
#undef FOO
// Generic deconvolution forward kernel: plane counts need not be multiples
// of K. KI/KO clamp the active tile to the remaining planes, and the rule
// pairs are staged in shared memory: R[2t] = output row, R[2t+1] = input row.
template <typename T, uInt K, uInt V>
__global__ void
dDeconvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, uInt *rules,
                              uInt nHot, uInt input_nPlanes, uInt input_stride,
                              uInt output_nPlanes, uInt output_stride) {
  // Input x Weight -> Output
  // blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
  // K is a multiple of V,
  // nHot x input_nplanes<=KM -> nHot x output_nPlanes<=KN
  // - parallel over N,nHot - loop over M
  uInt M = (input_nPlanes + K - 1) / K;
  // N = gridDim.y ~ output_nPlanes/K
  uInt n = blockIdx.y;
  outFeatures += n * K;
  w += n * K;
  uInt KO = min(K, output_nPlanes - K * n); // valid output planes in tile
  T O[V];
  __shared__ T W[K][K];
  __shared__ T I[K][K];
  __shared__ uInt R[K * 2]; // staged rule pairs for the current K chunk
  const int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int m = 0; m < M; m++) {
    uInt KI = min(K, input_nPlanes - K * m); // valid input planes in tile
    // Read w
#pragma unroll
    for (int v = 0; v < V; v++)
      if (ty[v] < KI and tx < KO)
        W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
    for (uInt s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
      // Read rules for K input/output pairs
      // (threads with ty < 2 cooperatively load up to 2K entries)
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (ty[v] < 2) {
          int q = ty[v] * K + tx;
          if (s + q / 2 < nHot)
            R[q] = rules[2 * s + q];
        }
      }
      __syncthreads();
      // Read input, reset O[]
      // (stale I rows/columns only feed O[v] values masked on write-back)
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (tx < KI and s + ty[v] < nHot)
          I[ty[v]][tx] = inFeatures[R[2 * ty[v] + 1] * input_stride + tx];
        O[v] = 0;
      }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < KI; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          O[v] += I[ty[v]][k] * W[k][tx];
      __syncthreads();
      // NOTE(review): += assumes the output rows within one K chunk of the
      // rule book are distinct; otherwise this read-modify-write would race.
#pragma unroll
      for (int v = 0; v < V; v++)
        if (tx < KO and s + ty[v] < nHot)
          outFeatures[R[2 * ty[v]] * output_stride + tx] += O[v];
      __syncthreads();
    }
    w += K * output_nPlanes;
    inFeatures += K;
  }
}
// Forward deconvolution entry point. Plane counts divisible by 8 take the
// fast fixed-tile path (dDeconvolution_forward); anything else falls back
// to the generic bounds-checked kernel with a 16x16 tile (V = 4).
template <typename T>
void dDeconvolution_forward2(T *inFeatures, T *outFeatures, T *w, uInt *rules,
                             uInt nHot, uInt input_nPlanes, uInt input_stride,
                             uInt output_nPlanes, uInt output_stride,
                             cudaStream_t stream) {
  bool fastPath = (input_nPlanes % 8 == 0) and (output_nPlanes % 8 == 0);
  if (fastPath) {
    dDeconvolution_forward(inFeatures, outFeatures, w, rules, nHot,
                           input_nPlanes, input_stride, output_nPlanes,
                           output_stride, stream);
  } else {
    const int K = 16;
    const int V = 4;
    dim3 grid(128, (output_nPlanes + K - 1) / K);
    dim3 block(K, K / V);
    dDeconvolution_KMxKN_forward2<T, K, V><<<grid, block, 0, stream>>>(
        inFeatures, outFeatures, w, rules, nHot, input_nPlanes, input_stride,
        output_nPlanes, output_stride);
  }
}
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
// Generic variant: plane counts need not be multiples of K. KI/KO clamp the
// active tile; the I and dO tiles are zero-padded so the dW accumulation
// over the full K range stays correct (stale W entries only feed dI values
// that are masked on write-back). Rule pairs are staged in shared memory:
// R[2t] = output row, R[2t+1] = input row.
template <typename T, uInt K, uInt V>
__global__ void dDeconvolution_KMxKN_backward_dW2(
    T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw, uInt *rules,
    uInt nHot, uInt input_nPlanes, uInt input_stride, uInt output_nPlanes,
    uInt output_stride) {
  // M = gridDim.y == input_nPlanes / K
  uInt N = (output_nPlanes + K - 1) / K;
  uInt m = blockIdx.y;
  inFeatures += m * K;
  dInFeatures += m * K;
  w += m * K * output_nPlanes;
  dw += m * K * output_nPlanes;
  uInt KI = min(K, input_nPlanes - K * m); // valid input planes in tile
  T dI[V];
  T dW[V];
  __shared__ T I[K][K];
  __shared__ T dO[K][K];
  __shared__ T W[K][K];
  __shared__ uInt R[K * 2];
  const int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int n = 0; n < N; n++) {
    uInt KO = min(K, output_nPlanes - K * n); // valid output planes in tile
    // Read w, reset dW
#pragma unroll
    for (int v = 0; v < V; v++) {
      if (ty[v] < KI and tx < KO)
        W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
      dW[v] = 0;
    }
    for (uInt s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
      // Read rules for K input/output pairs, reset dI[]
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (ty[v] < 2) {
          int q = ty[v] * K + tx;
          if (s + q / 2 < nHot)
            R[q] = rules[2 * s + q];
        }
        dI[v] = 0;
      }
      __syncthreads();
      // Read input and dOutput (zero padding outside KI/KO and past nHot)
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (tx < KI and s + ty[v] < nHot)
          I[ty[v]][tx] = inFeatures[R[2 * ty[v] + 1] * input_stride + tx];
        else
          I[ty[v]][tx] = 0;
        if (tx < KO and s + ty[v] < nHot)
          dO[ty[v]][tx] = dOutFeatures[R[2 * ty[v]] * output_stride + tx];
        else
          dO[ty[v]][tx] = 0;
      }
      __syncthreads();
      // dI += dO * W^T (transposed access W[tx][k])
#pragma unroll
      for (int k = 0; k < KO; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          dI[v] += dO[ty[v]][k] * W[tx][k];
      // dW += I^T * dO (full K range is safe: padded rows are zero)
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          dW[v] += I[k][ty[v]] * dO[k][tx];
      __syncthreads();
#pragma unroll
      for (int v = 0; v < V; v++)
        if (tx < KI and s + ty[v] < nHot)
          dInFeatures[R[2 * ty[v] + 1] * input_stride + tx] += dI[v];
      __syncthreads();
    }
    // Multiple blocks accumulate into the same weight tile -> atomics.
#pragma unroll
    for (int v = 0; v < V; v++)
      if (ty[v] < KI and tx < KO)
        atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
    w += K;
    dw += K;
    dOutFeatures += K;
  }
}
// Backward deconvolution entry point. Plane counts divisible by 8 use the
// tiled fixed-size kernels (dDeconvolution_backward_dW); anything else
// falls back to the generic bounds-checked kernel with a 16x16 tile (V = 4).
template <typename T>
void dDeconvolution_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
                                 T *w, T *dw, uInt *rules, uInt nHot,
                                 uInt input_nPlanes, uInt input_stride,
                                 uInt output_nPlanes, uInt output_stride,
                                 cudaStream_t stream) {
  bool fastPath = (input_nPlanes % 8 == 0) and (output_nPlanes % 8 == 0);
  if (fastPath) {
    dDeconvolution_backward_dW(inFeatures, dInFeatures, dOutFeatures, w, dw,
                               rules, nHot, input_nPlanes, input_stride,
                               output_nPlanes, output_stride, stream);
  } else {
    const int K = 16;
    const int V = 4;
    dim3 grid(128, (input_nPlanes + K - 1) / K);
    dim3 block(K, K / V);
    dDeconvolution_KMxKN_backward_dW2<T, K, V><<<grid, block, 0, stream>>>(
        inFeatures, dInFeatures, dOutFeatures, w, dw, rules, nHot,
        input_nPlanes, input_stride, output_nPlanes, output_stride);
  }
}
#endif /* GPU_DECONVOLUTION_H */
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/GPU/LeakyReLU.cu"
#else
#include "LeakyReLU.h"
// Leaky ReLU forward over all elements; supports in-place operation when
// input_features == output_features (no resize in that case).
extern "C" void scn_R_(LeakyReLU_updateOutput)(THCTensor *input_features,
                                               THCTensor *output_features,
                                               float alpha) {
  if (input_features != output_features)
    THCTensor_(resizeAs)(state, output_features, input_features);
  auto n = THCTensor_(nElement)(state, input_features);
  // Fixed launch of 16 blocks x 1024 threads; the kernel strides over n.
  LeakyReLU_fp<real> << <16, 1024, 0, THCState_getCurrentStream(state)>>>
      (THCTensor_(data)(state, input_features),
       THCTensor_(data)(state, output_features), n, alpha);
}
// Leaky ReLU backward; supports in-place operation when
// d_input_features == d_output_features. The forward input decides which
// branch (identity or * alpha) each element's gradient takes.
extern "C" void scn_R_(LeakyReLU_updateGradInput)(THCTensor *input_features,
                                                  THCTensor *d_input_features,
                                                  THCTensor *d_output_features,
                                                  float alpha) {
  if (d_input_features != d_output_features)
    THCTensor_(resizeAs)(state, d_input_features, d_output_features);
  auto n = THCTensor_(nElement)(state, d_input_features);
  // Fixed launch of 16 blocks x 1024 threads; the kernel strides over n.
  LeakyReLU_bp<real> << <16, 1024, 0, THCState_getCurrentStream(state)>>>
      (THCTensor_(data)(state, input_features),
       THCTensor_(data)(state, d_input_features),
       THCTensor_(data)(state, d_output_features), n, alpha);
}
#endif
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef LEAKYRELU_H
#define LEAKYRELU_H
// Elementwise leaky ReLU forward: out[i] = in[i] > 0 ? in[i] : alpha*in[i].
// Fix: use a true grid-stride step (gridDim.x * blockDim.x) instead of the
// hard-coded 16 * 1024, so the kernel is correct for any launch
// configuration. The host launches <<<16, 1024>>>, for which the stride is
// numerically identical to the old constant.
template <typename T>
__global__ void LeakyReLU_fp(T *input_features, T *output_features, uInt n,
                             T alpha) {
  for (uInt i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += gridDim.x * blockDim.x)
    output_features[i] = (input_features[i] > 0) ? input_features[i]
                                                 : (input_features[i] * alpha);
}
// Elementwise leaky ReLU backward: pass the gradient through where the
// forward input was positive, otherwise scale it by alpha.
// Fix: use a true grid-stride step (gridDim.x * blockDim.x) instead of the
// hard-coded 16 * 1024; identical behavior for the host's <<<16, 1024>>>
// launch, correct for any other configuration.
template <typename T>
__global__ void LeakyReLU_bp(T *input_features, T *d_input_features,
                             T *d_output_features, uInt n, T alpha) {
  for (uInt i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += gridDim.x * blockDim.x)
    d_input_features[i] = (input_features[i] > 0)
                              ? d_output_features[i]
                              : (d_output_features[i] * alpha);
}
#endif
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE_
#define TH_GENERIC_FILE_ "generic/GPU/MaxPooling.cu"
#else
#include "MaxPooling.h"
#include "RuleBookIterator.h"
// Max pooling forward: each active output site takes the per-plane maximum
// over its input sites, as listed by the pooling rule book. The first
// nFeaturesToDrop input planes are excluded from pooling.
// NOTE(review): the output is zero-initialized and the kernel only raises
// values, so sites whose inputs are all negative pool to 0 rather than the
// true (negative) maximum -- confirm this is intended.
extern "C" void scn_DR_(MaxPooling_updateOutput)(
    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *poolSize,
    THLongTensor *poolStride, void **m, THCTensor *input_features,
    THCTensor *output_features, long nFeaturesToDrop, THCITensor *rulesBuffer) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  uInt nPlanes = input_features->size[1] - nFeaturesToDrop;
  auto _rules =
      _m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  uInt nActive = _m.getNActive(outputSize);
  THCTensor_(resize2d)(state, output_features, nActive, nPlanes);
  THCTensor_(zero)(state, output_features);
  // Offset past the dropped leading planes of the input.
  auto iF = THCTensor_(data)(state, input_features) + nFeaturesToDrop;
  auto oF = THCTensor_(data)(state, output_features);
  RULEBOOKITERATOR(
      MaxPooling_ForwardPass<real>(THCState_getCurrentStream(state), iF, oF,
                                   nPlanes, input_features->size[1],
                                   output_features->size[1], rbB, nHotB);
      , )
}
// Max pooling backward: route each output gradient to the input site(s)
// whose value equals the pooled maximum.
// Fix: the forward pass offsets the input pointer by nFeaturesToDrop
// (see MaxPooling_updateOutput), but the backward pass did not, so the
// max-equality test and the gradient writes were misaligned whenever
// nFeaturesToDrop != 0. Apply the same offset to iF and diF here.
// (No behavior change when nFeaturesToDrop == 0, the common case.)
extern "C" void scn_DR_(MaxPooling_updateGradInput)(
    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *poolSize,
    THLongTensor *poolStride, void **m, THCTensor *input_features,
    THCTensor *d_input_features, THCTensor *output_features,
    THCTensor *d_output_features, long nFeaturesToDrop,
    THCITensor *rulesBuffer) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  uInt nPlanes = input_features->size[1] - nFeaturesToDrop;
  auto _rules =
      _m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  // nActive kept for parity with the forward pass -- NOTE(review): confirm
  // whether getNActive has side effects (e.g. lazily building state).
  uInt nActive = _m.getNActive(outputSize);
  THCTensor_(resizeAs)(state, d_input_features, input_features);
  THCTensor_(zero)(state, d_input_features);
  auto iF = THCTensor_(data)(state, input_features) + nFeaturesToDrop;
  auto oF = THCTensor_(data)(state, output_features);
  auto diF = THCTensor_(data)(state, d_input_features) + nFeaturesToDrop;
  auto doF = THCTensor_(data)(state, d_output_features);
  RULEBOOKITERATOR(
      MaxPooling_BackwardPass<real>(THCState_getCurrentStream(state), iF, diF,
                                    oF, doF, nPlanes, input_features->size[1],
                                    d_output_features->size[1], rbB, nHotB);
      , )
}
#endif
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef GPU_MAXPOOLING_H
#define GPU_MAXPOOLING_H
// Max pooling forward kernel: for each rule pair, raise the output planes to
// the maximum of their current value and the input planes.
// NTX must be >=2 so r is filled properly
template <typename T, uInt NTX, uInt NTY>
__global__ void MaxPooling_fp(T *input_features, T *output_features,
                              uInt nPlanes, uInt input_stride,
                              uInt output_stride, uInt *rules, uInt nHot) {
  __shared__ uInt r[NTY * 2];
  for (uInt n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
    {
      // Stage up to NTY rule pairs into shared memory.
      // Bug fix: the guard used 2 * (n - nHot), which underflows in uInt
      // arithmetic (n < nHot inside the loop) and so allowed the tail
      // iteration to read past the end of rules; the number of remaining
      // rule entries is 2 * (nHot - n).
      uInt i = threadIdx.x + NTX * threadIdx.y;
      if (i < NTY * 2 and i < 2 * (nHot - n))
        r[i] = rules[2 * n + i];
    }
    __syncthreads();
    if (n + threadIdx.y < nHot) {
      uInt i = r[2 * threadIdx.y] * input_stride;
      uInt o = r[2 * threadIdx.y + 1] * output_stride;
      // Running per-plane max across all rule pairs hitting this output row.
      for (uInt plane = threadIdx.x; plane < nPlanes; plane += NTX) {
        T inp = input_features[i + plane];
        if (output_features[o + plane] < inp)
          output_features[o + plane] = inp;
      }
    }
    __syncthreads();
  }
}
// Launch MaxPooling_fp on 32 blocks of 32x32 threads.
template <typename T>
void MaxPooling_ForwardPass(cudaStream_t stream, T *input_features,
                            T *output_features, uInt nPlanes, uInt input_stride,
                            uInt output_stride, uInt *rules, uInt nHot) {
  dim3 threads(32, 32);
  MaxPooling_fp<T, 32, 32><<<32, threads, 0, stream>>>(
      input_features, output_features, nPlanes, input_stride, output_stride,
      rules, nHot);
}
// Max pooling backward kernel: for each rule pair, add the output gradient
// to every input plane whose value equals the pooled output value.
// NTX must be >=2 so r is filled properly.
template <typename T, uInt NTX, uInt NTY>
__global__ void MaxPooling_bp(T *input_features, T *d_input_features,
                              T *output_features, T *d_output_features,
                              uInt nPlanes, uInt input_stride,
                              uInt output_stride, uInt *rules, uInt nHot) {
  __shared__ uInt r[NTY * 2];
  for (uInt n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
    {
      // Stage up to NTY rule pairs into shared memory.
      // Bug fix: the guard used 2 * (n - nHot), which underflows in uInt
      // arithmetic (n < nHot inside the loop) and so allowed the tail
      // iteration to read past the end of rules; the number of remaining
      // rule entries is 2 * (nHot - n).
      uInt i = threadIdx.x + NTX * threadIdx.y;
      if (i < NTY * 2 and i < 2 * (nHot - n))
        r[i] = rules[2 * n + i];
    }
    __syncthreads();
    if (n + threadIdx.y < nHot) {
      uInt i = r[2 * threadIdx.y] * input_stride;
      uInt o = r[2 * threadIdx.y + 1] * output_stride;
      for (uInt plane = threadIdx.x; plane < nPlanes; plane += NTX)
        if (output_features[o + plane] == input_features[i + plane])
          d_input_features[i + plane] += d_output_features[o + plane];
    }
    __syncthreads();
  }
}
// Launch MaxPooling_bp on 32 blocks of 32x32 threads.
template <typename T>
void MaxPooling_BackwardPass(cudaStream_t stream, T *input_features,
                             T *d_input_features, T *output_features,
                             T *d_output_features, uInt nPlanes,
                             uInt input_stride, uInt output_stride, uInt *rules,
                             uInt nHot) {
  dim3 threads(32, 32);
  MaxPooling_bp<T, 32, 32><<<32, threads, 0, stream>>>(
      input_features, d_input_features, output_features, d_output_features,
      nPlanes, input_stride, output_stride, rules, nHot);
}
#endif /* GPU_MAXPOOLING_H */
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/GPU/NetworkInNetwork.cu"
#else
#include "Convolution.h"
#include <algorithm>
// NetworkInNetwork (1x1 linear layer) forward:
// output(l x r) = input(l x m) * weight(m x r) [+ bias], row-major.
// Returns the multiply-accumulate count as a double.
extern "C" double
scn_R_(NetworkInNetwork_updateOutput)(THCTensor *input_features_,
                                      THCTensor *output_features_,
                                      THCTensor *weight_, THCTensor *bias_) {
  auto nActive = input_features_->size[0];
  auto input_nPlanes = weight_->size[0];
  auto output_nPlanes = weight_->size[1];
  THCTensor_(resize2d)(state, output_features_, nActive, output_nPlanes);
  auto input_features = THCTensor_(data)(state, input_features_);
  auto output_features = THCTensor_(data)(state, output_features_);
  auto weight = THCTensor_(data)(state, weight_);
  if (bias_ != nullptr) {
    // Fill the output with the bias first; the gemm below (beta = 1) adds
    // the matrix product on top.
    auto bias = THCTensor_(data)(state, bias_);
    for (uInt i = 0; i < output_nPlanes; i += 32) {
      uInt blockDim = min(32L, output_nPlanes - i);
      uInt gridDim = min(4096L, nActive);
      Convolution_fp_bias<<<gridDim, blockDim, 0,
                            THCState_getCurrentStream(state)>>>(
          output_features + i, bias + i, output_nPlanes, output_nPlanes,
          nActive);
    }
    // Do GEMM (note: gemm assumes column-major matrices)
    // buffer is l*m (row-major)
    // weight is m*r (row-major)
    // output_features is l*r (row-major)
    // buffer * weights + bias -> output_features
    THBLAS_GEMM(state, 'n', 'n',
                output_nPlanes, // r
                nActive,        // l
                input_nPlanes,  // m
                1,              // alpha
                weight,
                output_nPlanes, // r
                input_features,
                input_nPlanes,  // m
                1,              // beta: keep the bias already in output
                output_features,
                output_nPlanes  // r
                );
  } else {
    // NOTE(review): zero() looks redundant given beta = 0 below; kept as-is.
    THCTensor_(zero)(state, output_features_);
    THBLAS_GEMM(state, 'n', 'n',
                output_nPlanes, // r
                nActive,        // l
                input_nPlanes,  // m
                1,              // alpha
                weight,
                output_nPlanes, // r
                input_features,
                input_nPlanes,  // m
                0,              // beta: overwrite output with the product
                output_features,
                output_nPlanes  // r
                );
  }
  return nActive * input_nPlanes * output_nPlanes;
}
// NetworkInNetwork backward (input gradients):
// d_input(l x input_nPlanes) = d_output(l x output_nPlanes) * weight^T.
extern "C" void
scn_R_(NetworkInNetwork_updateGradInput)(THCTensor *d_input_features_,
                                         THCTensor *d_output_features_,
                                         THCTensor *weight_) {
  auto nActive = d_output_features_->size[0];
  auto input_nPlanes = weight_->size[0];
  auto output_nPlanes = weight_->size[1];
  THCTensor_(resize2d)(state, d_input_features_, nActive, input_nPlanes);
  // NOTE(review): zero() looks redundant given beta = 0 below; kept as-is.
  THCTensor_(zero)(state, d_input_features_);
  auto d_input_features = THCTensor_(data)(state, d_input_features_);
  auto d_output_features = THCTensor_(data)(state, d_output_features_);
  auto weight = THCTensor_(data)(state, weight_);
  // Do GEMM (note: gemm assumes column-major matrices)
  // d_output_features is l*m (row-major)
  // weights is r*m (row-major)
  // d_buffer is l*r (row-major)
  // d_output_features * T(weight) -> d_buffer
  THBLAS_GEMM(state, 't', 'n',
              input_nPlanes,  // r
              nActive,        // l
              output_nPlanes, // m
              1,              // alpha
              weight,
              output_nPlanes, // m
              d_output_features,
              output_nPlanes, // m
              0,              // beta
              d_input_features,
              input_nPlanes   // r
              );
}
// NetworkInNetwork gradients w.r.t. parameters:
// d_weight += input^T * d_output (beta = 1 accumulates into d_weight);
// d_bias (if given) accumulates the column sums of d_output.
extern "C" void scn_R_(NetworkInNetwork_accGradParameters)(
    THCTensor *input_features_, THCTensor *d_output_features_,
    THCTensor *d_weight_, THCTensor *d_bias_) {
  auto nActive = input_features_->size[0];
  auto input_nPlanes = d_weight_->size[0];
  auto output_nPlanes = d_weight_->size[1];
  auto input_features = THCTensor_(data)(state, input_features_);
  auto d_output_features = THCTensor_(data)(state, d_output_features_);
  auto d_weight = THCTensor_(data)(state, d_weight_);
  // Do GEMM (note: gemm assumes column-major matrices)
  // buffer is m*l (row-major)
  // d_output_features is m*r (row-major)
  // weights is l*r (row-major)
  // T(buffer) * d_output_features -> d_weight
  THBLAS_GEMM(state, 'n', 't',
              output_nPlanes, // r
              input_nPlanes,  // l
              nActive,        // m
              1,              // alpha
              d_output_features,
              output_nPlanes, // r
              input_features,
              input_nPlanes,  // l
              1,              // beta: accumulate into existing d_weight
              d_weight,
              output_nPlanes  // r
              );
  if (d_bias_) {
    auto d_bias = THCTensor_(data)(state, d_bias_);
    Convolution_bp_bias(d_output_features, d_bias, output_nPlanes,
                        output_nPlanes, nActive,
                        THCState_getCurrentStream(state));
  }
}
#endif
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef GPU_RULEBOOKITERATOR_H
#define GPU_RULEBOOKITERATOR_H
// Macro to parallelize loading rulebook elements to GPU memory and operating
// on the elements of the rulebook.
// X is the function to apply; inside X the variables rbB (device-side copy
// of rulebook entry k) and nHotB (its number of rule pairs) are in scope.
// Y is a command to run after each entry (executed even when nHotB == 0).
// Requires _rules, state and rulesBuffer in the caller's scope; rulesBuffer
// is grown (never shrunk) to hold the largest rulebook entry.
// NOTE(review): cudaMemcpy here is synchronous, so each entry's upload
// serializes with the kernels X launches on the current stream.
#define RULEBOOKITERATOR(X, Y)                                                 \
  uInt ms = ruleBookMaxSize(_rules);                                           \
  if (THCITensor_nElement(state, rulesBuffer) < ms)                            \
    THCITensor_resize1d(state, rulesBuffer, ms);                               \
  uInt *rbB = (uInt *)THCITensor_data(state, rulesBuffer);                     \
  for (int k = 0; k < _rules.size(); ++k) {                                    \
    auto &r = _rules[k];                                                       \
    uInt nHotB = r.size() / 2;                                                 \
    if (nHotB) {                                                               \
      cudaMemcpy(rbB, &r[0], sizeof(uInt) * 2 * nHotB,                         \
                 cudaMemcpyHostToDevice);                                      \
    }                                                                          \
    if (nHotB) {                                                               \
      X                                                                        \
    }                                                                          \
    Y                                                                          \
  }
#endif /* GPU_RULEBOOKITERATOR_H */
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE_
#define TH_GENERIC_FILE_ "generic/GPU/SparseToDense.cu"
#else
#include "SparseToDense.h"
// Scatter sparse features into a dense tensor of shape
// (batch, nPlanes, spatial...). The dense output is zero-initialized and
// active sites are copied in via the sparse-to-dense rule book.
extern "C" void scn_DR_(SparseToDense_updateOutput)(THLongTensor *inputSize,
                                                    void **m,
                                                    THCTensor *input_features,
                                                    THCTensor *output_features,
                                                    THCITensor *rulesBuffer) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m) {
    // Build the dense shape: (batch, planes, spatial dims...).
    long sz[Dimension + 2];
    sz[0] = _m.inputSGs->size(); // number of samples in the batch
    sz[1] = input_features->size[1];
    for (int i = 0; i < Dimension; i++) {
      auto x = THLongTensor_data(inputSize)[i];
      sz[i + 2] = x;
    }
    THCTensor_(resizeNd)(state, output_features, Dimension + 2, sz, NULL);
    THCTensor_(zero)(state, output_features);
  }
  auto _rules = _m.getSparseToDenseRuleBook(inputSize, true);
  // NOTE(review): spatialVolume is taken from the rulebook entry count;
  // confirm this equals the product of the spatial dimensions.
  auto spatialVolume = _rules.size();
  uInt nPlanes = input_features->size[1];
  auto iF = THCTensor_(data)(state, input_features);
  auto oF = THCTensor_(data)(state, output_features);
  RULEBOOKITERATOR(
      SparseToDense_ForwardPass<real>(THCState_getCurrentStream(state), iF, oF,
                                      nPlanes, spatialVolume, rbB, nHotB);
      , oF++;) // todo check ++ or +=spatialVolume????zzz
}
// Dense-to-sparse gradient gather: copy each active site's gradient from
// the dense d_output tensor back into the sparse d_input rows.
extern "C" void scn_DR_(SparseToDense_updateGradInput)(
    THLongTensor *inputSize, void **m, THCTensor *input_features,
    THCTensor *d_input_features, THCTensor *d_output_features,
    THCITensor *rulesBuffer) {
  THCTensor_(resizeAs)(state, d_input_features, input_features);
  THCTensor_(zero)(state, d_input_features);
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto _rules = _m.getSparseToDenseRuleBook(inputSize, true);
  // NOTE(review): spatialVolume is taken from the rulebook entry count;
  // confirm this equals the product of the spatial dimensions.
  auto spatialVolume = _rules.size();
  uInt nPlanes = d_input_features->size[1];
  auto diF = THCTensor_(data)(state, d_input_features);
  auto doF = THCTensor_(data)(state, d_output_features);
  RULEBOOKITERATOR(
      SparseToDense_BackwardPass<real>(THCState_getCurrentStream(state), diF,
                                       doF, nPlanes, spatialVolume, rbB, nHotB);
      , doF++;) // NOTE(review): doF++ mirrors the forward's oF++ and its
                // open "++ or += spatialVolume" todo.
}
#endif
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef GPU_SPARSETODENSE_H
#define GPU_SPARSETODENSE_H
#include "../SparseConvNet.h"
//#include <THC/THCAtomics.cuh>
// NTX must be >=2 so r is filled properly
// Copy nHot sparse feature rows into a dense output.
// rules is an array of nHot (inputRow, outputOffset) pairs; each pair copies
// input_features[inputRow * nPlanes + p] to
// output_features[outputOffset * spatialVolume * nPlanes + p * spatialVolume]
// for every plane p (the caller positions output_features at the spatial
// cell, so planes are strided by spatialVolume).
// Launched with blockDim = (NTX, NTY); requires NTX >= 2 so the cooperative
// load below covers all NTY*2 shared entries.
template <typename T, uInt NTX, uInt NTY>
__global__ void SparseToDense_fp(T *input_features, T *output_features,
                                 uInt nPlanes, uInt spatialVolume, uInt *rules,
                                 uInt nHot) {
  // Staging area for the (inputRow, outputOffset) pairs of one iteration.
  __shared__ uInt r[NTY * 2];
  // n is uniform across the block (depends only on blockIdx), so all threads
  // make the same number of loop iterations and the barriers are safe.
  for (uInt n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
    {
      // Cooperatively load up to NTY rule pairs, bounds-checked for the
      // final partial iteration. BUGFIX: the guard was 2 * (n - nHot), which
      // underflows uInt (n < nHot here), so it never limited the read and
      // rules[] could be read out of bounds; 2 * (nHot - n) is the number of
      // valid entries remaining.
      uInt i = threadIdx.x + NTX * threadIdx.y;
      if (i < NTY * 2 and i < 2 * (nHot - n))
        r[i] = rules[2 * n + i];
    }
    __syncthreads();
    if (n + threadIdx.y < nHot) {
      T *i = &input_features[r[2 * threadIdx.y] * nPlanes];
      T *o = &output_features[r[2 * threadIdx.y + 1] * spatialVolume * nPlanes];
      for (uInt plane = threadIdx.x; plane < nPlanes; plane += NTX)
        o[plane * spatialVolume] = i[plane];
    }
    __syncthreads(); // protect r[] before the next iteration overwrites it
  }
}
// Host-side launcher for SparseToDense_fp on the given stream.
// Fixed launch shape: 32 blocks of 32x32 threads; the kernel loops over the
// rule book in block-sized strides, so this works for any nHot.
template <typename T>
void SparseToDense_ForwardPass(cudaStream_t stream, T *input_features,
                               T *output_features, uInt nPlanes,
                               uInt spatialVolume,
                               uInt *rules, uInt nHot) {
  const uInt tx = 32, ty = 32, nBlocks = 32;
  SparseToDense_fp<T, tx, ty><<<nBlocks, dim3(tx, ty), 0, stream>>>(
      input_features, output_features, nPlanes, spatialVolume, rules, nHot);
}
// NTX must be >=2 so r is filled properly
// Backward pass of SparseToDense_fp: gather gradients from the dense
// d_output back into the sparse feature-row gradients.
// rules is an array of nHot (inputRow, outputOffset) pairs; planes in the
// dense gradient are strided by spatialVolume (spatial axis innermost).
// Launched with blockDim = (NTX, NTY); requires NTX >= 2 so the cooperative
// load below covers all NTY*2 shared entries.
template <typename T, uInt NTX, uInt NTY>
__global__ void SparseToDense_bp(T *d_input_features, T *d_output_features,
                                 uInt nPlanes, uInt spatialVolume, uInt *rules,
                                 uInt nHot) {
  // Staging area for the (inputRow, outputOffset) pairs of one iteration.
  __shared__ uInt r[NTY * 2];
  // n is uniform across the block, so the barriers below are reached by all
  // threads the same number of times.
  for (uInt n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
    {
      // Cooperatively load up to NTY rule pairs, bounds-checked for the
      // final partial iteration. BUGFIX: the guard was 2 * (n - nHot), which
      // underflows uInt (n < nHot here) and never limited the read, allowing
      // an out-of-bounds read of rules[]; 2 * (nHot - n) is the number of
      // valid entries remaining.
      uInt i = threadIdx.x + NTX * threadIdx.y;
      if (i < NTY * 2 and i < 2 * (nHot - n))
        r[i] = rules[2 * n + i];
    }
    __syncthreads();
    if (n + threadIdx.y < nHot) {
      T *i = &d_input_features[r[2 * threadIdx.y] * nPlanes];
      T *o = &d_output_features[r[2 * threadIdx.y + 1] * spatialVolume * nPlanes];
      for (uInt plane = threadIdx.x; plane < nPlanes; plane += NTX)
        i[plane] = o[plane * spatialVolume];
    }
    __syncthreads(); // protect r[] before the next iteration overwrites it
  }
}
// Host-side launcher for SparseToDense_bp on the given stream.
// Fixed launch shape: 32 blocks of 32x32 threads; the kernel loops over the
// rule book in block-sized strides, so this works for any nHot.
template <typename T>
void SparseToDense_BackwardPass(cudaStream_t stream, T *d_input_features,
                                T *d_output_features, uInt nPlanes,
                                uInt spatialVolume,
                                uInt *rules, uInt nHot) {
  const uInt tx = 32, ty = 32, nBlocks = 32;
  SparseToDense_bp<T, tx, ty><<<nBlocks, dim3(tx, ty), 0, stream>>>(
      d_input_features, d_output_features, nPlanes, spatialVolume, rules, nHot);
}
#endif /* GPU_SPARSETODENSE_H */
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE
#error \
"You must define TH_GENERIC_FILE before including THGenerateCudaFloatTypes.h"
#endif
// float
#define real float
#define accreal double
#define Real Float
#define CReal Cuda
#define TH_REAL_IS_FLOAT
#define THBLAS_GEMM THCudaBlas_Sgemm
#line 1 TH_GENERIC_FILE
#include TH_GENERIC_FILE
#undef accreal
#undef real
#undef Real
#undef CReal
#undef TH_REAL_IS_FLOAT
#undef THBLAS_GEMM
#undef TH_GENERIC_FILE
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE_
#error "Define TH_GENERIC_FILE_ before including THGenerateDimCudaFloatTypes.h"
#endif
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#define Dimension 1
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateCudaFloatTypes.h"
#undef Dimension
#define Dimension 2
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateCudaFloatTypes.h"
#undef Dimension
#define Dimension 3
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateCudaFloatTypes.h"
#undef Dimension
#define Dimension 4
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateCudaFloatTypes.h"
#undef Dimension
#define Dimension 5
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateCudaFloatTypes.h"
#undef Dimension
#define Dimension 6
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateCudaFloatTypes.h"
#undef Dimension
#define Dimension 7
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateCudaFloatTypes.h"
#undef Dimension
#define Dimension 8
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateCudaFloatTypes.h"
#undef Dimension
#define Dimension 9
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateCudaFloatTypes.h"
#undef Dimension
#define Dimension 10
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateCudaFloatTypes.h"
#undef Dimension
#undef TH_GENERIC_FILE_
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef ACTIVEPOOLING_H
#define ACTIVEPOOLING_H
#include "../SparseConvNet.h"
// Build the rule book for active pooling (pool over all active sites).
// rules has size 2.
// rules[0] is a batchSize x (maxActive + 1) matrix, stored row-major:
// the first entry of each row is the number of active sites for that sample
// (<= maxActive); the remaining maxActive entries give the active sites,
// zero padded. rules[1] holds {batchSize, maxActive}.
// Build the active-pooling rule book for a batch of sparse grids.
// rules[0] becomes a row-major batchSize x (maxActive + 1) matrix: each row
// holds the sample's active-site count followed by its active site indices,
// zero padded to maxActive entries. rules[1] holds {batchSize, maxActive}.
template <uInt dimension>
void activePoolingRules(SparseGrids<dimension> &SGs, RuleBook &rules) {
  rules.clear();
  rules.resize(2);
  auto &r = rules[0];
  uInt maxActive = 0;
  // First pass: widest row needed across the batch.
  for (auto &sg : SGs)
    maxActive = std::max(maxActive, (uInt)sg.mp.size());
  // Second pass: one row per sample: count, sites, zero padding.
  for (auto &sg : SGs) {
    r.push_back(sg.mp.size());
    for (auto &iter : sg.mp)
      r.push_back(sg.ctr + iter.second);
    // BUGFIX: pad r (the matrix) to a multiple of maxActive + 1. The
    // original tested rules.size() (always 2), which never changes inside
    // this loop: an infinite loop whenever maxActive != 1, and the rows were
    // never actually padded to fixed width.
    while (r.size() % (maxActive + 1) != 0)
      r.push_back(0); // padding
  }
  rules[1].push_back(SGs.size());
  rules[1].push_back(maxActive);
}
#endif /* ACTIVEPOOLING_H */
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CONVOLUTIONRULES_H
#define CONVOLUTIONRULES_H
#include "RectangularRegions.h"
// Build convolution rules for one sample: for every active input site,
// enumerate the output sites whose filter window covers it, creating output
// sites on demand, and append the (input index, output index) pair to the
// rule list of the input's position within the filter. rules is resized to
// one list per filter offset (volume(size)).
template <uInt dimension>
void Convolution_InputSgToRulesAndOutputSg(SparseGrid<dimension> &inputGrid,
                                           SparseGrid<dimension> &outputGrid,
                                           RuleBook &rules, long *size,
                                           long *stride, long *inputSpatialSize,
                                           long *outputSpatialSize) {
  rules.resize(volume<dimension>(size));
  for (auto const &inIter : inputGrid.mp) {
    // All output sites receiving this input site.
    for (auto j : OutputRegionCalculator<dimension>(inIter.first, size, stride,
                                                    outputSpatialSize)) {
      // Position of the input site within output j's filter window selects
      // the rule list.
      auto inRegion = InputRegionCalculator<dimension>(j, size, stride);
      uInt rulesOffset = inRegion.offset(inIter.first);
      // Create the output site the first time it is hit; ctr numbers the
      // new sites sequentially from its initial value.
      auto outIter = outputGrid.mp.find(j);
      if (outIter == outputGrid.mp.end()) {
        outIter =
            outputGrid.mp.insert(std::make_pair(j, outputGrid.ctr++)).first;
      }
      rules[rulesOffset].push_back(inIter.second + inputGrid.ctr);
      rules[rulesOffset].push_back(outIter->second);
    }
  }
}
// Build convolution rules and output grids for a whole batch (serial path).
// Output sites are numbered globally across the batch: each sample's grid
// temporarily carries the running total in ctr while its rules are built
// (so newly created sites get globally unique indices), after which ctr is
// reset to 0 because the stored indices are already global.
// Returns the total number of active output sites.
template <uInt dimension>
uInt Convolution_InputSgsToRulesAndOutputSgs(SparseGrids<dimension> &input_SGs,
                                             SparseGrids<dimension> &output_SGs,
                                             RuleBook &rules, long *filterSize,
                                             long *filterStride,
                                             long *input_spatialSize,
                                             long *output_spatialSize) {
  rules.clear();
  output_SGs.clear();
  const uInt batchSize = input_SGs.size();
  output_SGs.resize(batchSize);
  uInt nActiveSoFar = 0;
  for (uInt sample = 0; sample < batchSize; sample++) {
    auto &in = input_SGs[sample];
    auto &out = output_SGs[sample];
    out.ctr = nActiveSoFar; // seed global numbering for new output sites
    Convolution_InputSgToRulesAndOutputSg<dimension>(
        in, out, rules, filterSize, filterStride, input_spatialSize,
        output_spatialSize);
    nActiveSoFar = out.ctr; // ctr advanced once per new output site
    out.ctr = 0;            // indices are global; per-grid offset is zero
  }
  return nActiveSoFar;
}
// OpenMP variant of Convolution_InputSgsToRulesAndOutputSgs. Each sample's
// rules are built independently with sample-local output numbering, then
// the per-sample books are merged with each sample's global offset added.
// Unlike the serial version, here output indices inside each grid stay
// sample-local and ctr retains the sample's global offset.
// Returns the total number of active output sites.
template <uInt dimension>
uInt Convolution_InputSgsToRulesAndOutputSgs_OMP(
    SparseGrids<dimension> &input_SGs, SparseGrids<dimension> &output_SGs,
    RuleBook &rules, long *filterSize, long *filterStride,
    long *input_spatialSize, long *output_spatialSize) {
  rules.clear();
  rules.resize(volume<dimension>(filterSize));
  output_SGs.clear();
  uInt batchSize = input_SGs.size();
  output_SGs.resize(batchSize);
  // One private rule book per sample so the parallel loop needs no locking.
  std::vector<RuleBook> rbs(batchSize);
  {
    uInt i;
#pragma omp parallel for private(i)
    for (i = 0; i < batchSize; i++)
      Convolution_InputSgToRulesAndOutputSg<dimension>(
          input_SGs[i], output_SGs[i], rbs[i], filterSize, filterStride,
          input_spatialSize, output_spatialSize);
  }
  // Exclusive prefix sum of per-sample active counts: afterwards each
  // output_SGs[i].ctr holds that sample's global offset.
  uInt output_nActive = 0;
  for (uInt i = 0; i < batchSize; i++) {
    // Parallel assignment:
    // output_nActive <- output_nActive+output_SGs[i].ctr
    // output_SGs[i].ctr <- output_nActive
    uInt tmp = output_nActive;
    output_nActive += output_SGs[i].ctr;
    output_SGs[i].ctr = tmp;
  }
  // Merge: concatenate per-sample rule lists per filter offset, shifting
  // output indices by each sample's global offset. Rule entries alternate
  // (input index, output index), hence the +offset on every second entry.
  {
    uInt i;
#pragma omp parallel for private(i)
    for (i = 0; i < rules.size(); i++) {
      auto &R = rules[i];
      for (uInt j = 0; j < batchSize; j++) {
        auto &r = rbs[j][i];
        auto offset = output_SGs[j].ctr;
        for (uInt k = 0; k < r.size();) {
          R.push_back(r[k++]);
          R.push_back(r[k++] + offset);
        }
      }
    }
  }
  return output_nActive;
}
// for each site in filterVolume, list of (inputFeatureNumber,batchIdx) pairs
// Build sparse-to-dense rules: a convolution whose filter covers the whole
// spatial extent with stride 1, so there is exactly one rule list per
// spatial location. Each sample's output grid ctr is set to its batch index
// so the rules pair (inputFeatureNumber, batchIdx) — see the comment above.
template <uInt dimension>
void SparseToDense_InputSgsToRulesAndOutputSgs(
    SparseGrids<dimension> &input_SGs, RuleBook &rules, long *spatialSize) {
  const uInt batchSize = input_SGs.size();
  SparseGrids<dimension> output_SGs(batchSize);
  std::vector<long> ones(dimension, 1); // stride 1 on every axis
  rules.clear();
  for (uInt b = 0; b < batchSize; b++) {
    output_SGs[b].ctr = b; // batchIdx
    Convolution_InputSgToRulesAndOutputSg<dimension>(
        input_SGs[b], output_SGs[b], rules, spatialSize, ones.data(),
        spatialSize, ones.data());
  }
}
// OpenMP variant of SparseToDense_InputSgsToRulesAndOutputSgs: each sample's
// rules are built into a private RuleBook, then the per-sample books are
// concatenated, per spatial location, into the shared book. No index offset
// is needed in the merge: the second rule entry is the batch index (ctr),
// already correct per sample.
template <uInt dimension>
void SparseToDense_InputSgsToRulesAndOutputSgs_OMP(
    SparseGrids<dimension> &input_SGs, RuleBook &rules, long *spatialSize) {
  uInt batchSize = input_SGs.size();
  SparseGrids<dimension> output_SGs(batchSize);
  std::vector<long> ones(dimension, 1); // stride 1 on every axis
  rules.clear();
  rules.resize(volume<dimension>(spatialSize)); // one list per location
  std::vector<RuleBook> rbs(batchSize);
  {
    uInt i;
#pragma omp parallel for private(i)
    for (i = 0; i < batchSize; i++) {
      output_SGs[i].ctr = i; // batchIdx
      Convolution_InputSgToRulesAndOutputSg<dimension>(
          input_SGs[i], output_SGs[i], rbs[i], spatialSize, &ones[0],
          spatialSize, &ones[0]);
    }
  }
  {
    uInt i;
#pragma omp parallel for private(i)
    for (i = 0; i < rules.size(); i++) {
      auto &R = rules[i];
      for (uInt j = 0; j < batchSize; j++) {
        auto &r = rbs[j][i];
        for (uInt k = 0; k < r.size();) {
          R.push_back(r[k++]);
          R.push_back(r[k++]);
        }
      }
    }
  }
}
#endif /* CONVOLUTIONRULES_H */
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/Geometry/Metadata.cpp"
#else
#include "Metadata.h"
#include <cstring>
extern "C" void scn_D_(setInputSpatialSize)(void **m,
THLongTensor *spatialSize) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
_m.setInputSpatialSize(spatialSize);
}
extern "C" void scn_D_(batchAddSample)(void **m) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
assert(_m.inputSGs && "Call setInputSpatialSize first, please!");
_m.inputSGs->resize(_m.inputSGs->size() + 1);
_m.inputSG = &_m.inputSGs->back();
}
extern "C" void scn_D_(setInputSpatialLocation)(void **m,
THFloatTensor *features,
THLongTensor *location,
THFloatTensor *vec,
bool overwrite) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto p = LongTensorToPoint<Dimension>(location);
auto &mp = _m.inputSG->mp;
auto &nActive = *_m.inputNActive;
auto iter = mp.find(p);
auto nPlanes = vec->size[0];
if (iter == mp.end()) {
iter = mp.insert(std::make_pair(p, nActive++)).first;
THFloatTensor_resize2d(features, nActive, nPlanes);
std::memcpy(THFloatTensor_data(features) + (nActive - 1) * nPlanes,
THFloatTensor_data(vec), sizeof(float) * nPlanes);
} else if (overwrite) {
std::memcpy(THFloatTensor_data(features) + iter->second * nPlanes,
THFloatTensor_data(vec), sizeof(float) * nPlanes);
}
}
extern "C" void
scn_D_(createMetadataForDenseToSparse)(void **m, THLongTensor *spatialSize_,
THLongTensor *pad_,
THLongTensor *nz_, long batchSize) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
_m.setInputSpatialSize(spatialSize_);
_m.inputSGs->resize(batchSize);
auto &nActive = *_m.inputNActive;
nActive = nz_->size[0];
auto nz = THLongTensor_data(nz_);
auto pad = THLongTensor_data(pad_);
auto spatialSize = THLongTensor_data(spatialSize_);
std::vector<uInt> br(batchSize + 1);
if (batchSize == 1) {
br[1] = nActive;
} else {
long b = 0;
for (uInt i = 0; i < nActive; i++) {
long B = nz[i * (Dimension + 1)];
for (; b < B;)
br[++b] = i;
}
for (; b < batchSize;)
br[++b] = nActive;
}
uInt b;
#pragma omp parallel for private(b)
for (b = 0; b < batchSize; b++) {
auto &sg = _m.inputSGs->at(b);
for (uInt i = br[b]; i < br[b + 1]; i++) {
Point<Dimension> x;
for (uInt j = 0; j < Dimension; j++) {
x[j] = nz[i * (Dimension + 1) + j + 1] +
pad[b * Dimension + j]; // 0-indexed
}
sg.mp[x] = i;
}
}
}
// tensor is size[0] x .. x size[Dimension-1] x size[Dimension]
// size[0] x .. x size[Dimension-1] == spatial volume
// size[Dimension] == #feature planes
extern "C" void scn_D_(addSampleFromThresholdedTensor)(
void **m, THFloatTensor *features_, THFloatTensor *tensor_,
THLongTensor *offset_, THLongTensor *spatialSize_, float threshold) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto &nActive = *_m.inputNActive;
auto &SGs = *_m.inputSGs;
SGs.resize(SGs.size() + 1);
auto &sg = SGs.back();
auto tensor = THFloatTensor_data(tensor_);
auto offset = THLongTensor_data(offset_);
auto spatialSize = THLongTensor_data(spatialSize_);
long *size = tensor_->size;
auto nPlanes = size[Dimension];
long volume = 1;
for (int i = 0; i < Dimension; ++i)
volume *= size[i];
THFloatTensor_resize2d(features_, nActive + volume, nPlanes);
// Increment pointers as we work through the data
auto features = THFloatTensor_data(features_) + nActive * nPlanes;
// Active locations
Point<Dimension> point;
for (uInt i = 0; i < Dimension; i++)
point[i] = offset[i];
for (uInt ctr = 0; ctr < volume; ctr++) {
bool active = false;
for (uInt i = 0; i < nPlanes; i++) {
if (fabs(tensor[i]) > threshold) {
active = true;
break;
}
}
for (uInt i = 0; i < Dimension; i++) {
if (point[i] < 0 or point[i] >= spatialSize[i]) {
active = false;
break;
}
}
if (active) {
sg.mp[point] = nActive++;
std::memcpy(features, tensor, sizeof(float) * nPlanes);
features += nPlanes;
}
tensor += nPlanes;
incrementPointInCube<Dimension>(point, size, offset);
}
THFloatTensor_resize2d(features_, nActive, nPlanes);
}
// 3x3 valid convolutions, 3x3/2x2 pooling or strided convolutions
extern "C" void scn_D_(generateRuleBooks3s2)(void **m) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
long sz[Dimension], str[Dimension], inS[Dimension], outS[Dimension];
Point<Dimension> p1;
Point<2 * Dimension> p2;
Point<3 * Dimension> p3;
for (int i = 0; i < Dimension; ++i) {
p1[i] = p2[i] = p3[i] = inS[i] = _m.inputSpatialSize[i];
p2[i + Dimension] = p3[i + Dimension] = sz[i] = 3;
p3[i + 2 * Dimension] = str[i] = 2;
}
while (true) {
auto &SGs = _m.grids[p1];
auto &rb = _m.validRuleBooks[p2];
if (rb.empty())
ValidConvolution_SgsToRules(SGs, rb, sz);
for (int i = 0; i < Dimension; ++i)
if (p1[i] < 3 or p1[i] % 2 != 1)
return;
else
p1[i] = outS[i] = (inS[i] - 1) / 2;
auto &SGs2 = _m.grids[p1];
auto &rb2 = _m.ruleBooks[p3];
if (rb2.empty())
_m.nActive[p1] = Convolution_InputSgsToRulesAndOutputSgs(
SGs, SGs2, rb2, sz, str, inS, outS);
for (int i = 0; i < Dimension; ++i)
p2[i] = p3[i] = inS[i] = outS[i];
}
}
// 3x3 valid convolutions, 2x2 pooling or strided convolutions
extern "C" void scn_D_(generateRuleBooks2s2)(void **m) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
long s2[Dimension], s3[Dimension], inS[Dimension], outS[Dimension];
Point<Dimension> p1;
Point<2 * Dimension> p2;
Point<3 * Dimension> p3;
for (int i = 0; i < Dimension; ++i) {
p1[i] = p2[i] = p3[i] = inS[i] = _m.inputSpatialSize[i];
p2[i + Dimension] = s3[i] = 3;
p3[i + Dimension] = p3[i + 2 * Dimension] = s2[i] = 2;
}
while (true) {
auto &SGs = _m.grids[p1];
auto &rb = _m.validRuleBooks[p2];
ValidConvolution_SgsToRules(SGs, rb, s3);
for (int i = 0; i < Dimension; ++i)
if (p1[i] < 2 or p1[i] % 2 != 0)
return;
else
p1[i] = outS[i] = inS[i] / 2;
auto &SGs2 = _m.grids[p1];
auto &rb2 = _m.ruleBooks[p3];
if (rb2.empty())
_m.nActive[p1] = Convolution_InputSgsToRulesAndOutputSgs(
SGs, SGs2, rb2, s2, s2, inS, outS);
for (int i = 0; i < Dimension; ++i)
p2[i] = p3[i] = inS[i] = outS[i];
}
}
extern "C" void scn_D_(freeMetadata)(void **m) {
SCN_DELETE(Metadata<Dimension>, m)
}
#endif
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef Metadata_H
#define Metadata_H
#include "../SparseConvNet.h"
#include "ActivePoolingRules.h"
#include "ConvolutionRules.h"
#include "ValidConvolutionRules.h"
#include <iostream>
#include <tuple>
#include <unordered_map>
// Per-network bookkeeping shared across layers: sparse grids, active-site
// counts and cached rule books, keyed by spatial size (and, for rule books,
// by filter size and stride) so each is computed once and reused.
template <uInt dimension> class Metadata {
public:
  // Number of active sites at each spatial size.
  std::unordered_map<Point<dimension>, uInt, IntArrayHash<dimension>> nActive;
  // One SparseGrid per batch sample, for each spatial size.
  std::unordered_map<Point<dimension>, SparseGrids<dimension>,
                     IntArrayHash<dimension>> grids;
  std::unordered_map<Point<dimension>, RuleBook, IntArrayHash<dimension>>
      activePoolingRuleBooks;
  // Keyed by (spatialSize, filterSize).
  std::unordered_map<Point<2 * dimension>, RuleBook,
                     IntArrayHash<2 * dimension>> validRuleBooks;
  // Keyed by (inputSpatialSize, filterSize, filterStride).
  std::unordered_map<Point<3 * dimension>, RuleBook,
                     IntArrayHash<3 * dimension>> ruleBooks;
  std::unordered_map<Point<dimension>, RuleBook, IntArrayHash<dimension>>
      sparseToDenseRuleBooks;
  Point<dimension> inputSpatialSize;
  // Convenience pointers into grids / nActive for the input layer;
  // set by setInputSpatialSize.
  SparseGrids<dimension> *inputSGs;
  SparseGrid<dimension> *inputSG;
  uInt *inputNActive;
  Metadata() {}
  // Record the input layer's spatial size and bind the input-layer pointers
  // to the map entries for that size (created on first access).
  void setInputSpatialSize(THLongTensor *spatialSize) {
    inputSpatialSize = LongTensorToPoint<dimension>(spatialSize);
    inputSGs = &grids[inputSpatialSize];
    inputNActive = &nActive[inputSpatialSize];
  }
  SparseGrids<dimension> &getSparseGrid(THLongTensor *spatialSize) {
    return grids[LongTensorToPoint<dimension>(spatialSize)];
  };
  uInt getNActive(THLongTensor *spatialSize) {
    return nActive[LongTensorToPoint<dimension>(spatialSize)];
  };
  // Lazily build and cache the valid-convolution rule book for this
  // (spatialSize, filterSize) pair. Without OpenMP only the serial branch
  // of the ternary below is compiled in.
  RuleBook &getValidRuleBook(THLongTensor *spatialSize, THLongTensor *size,
                             bool openMP) {
    auto p = TwoLongTensorsToPoint<dimension>(spatialSize, size);
    auto &rb = validRuleBooks[p];
    if (rb.empty()) {
      auto &SGs = grids[LongTensorToPoint<dimension>(spatialSize)];
#if defined(ENABLE_OPENMP)
      openMP ? ValidConvolution_SgsToRules_OMP(SGs, rb, THLongTensor_data(size))
             :
#endif
             ValidConvolution_SgsToRules(SGs, rb, THLongTensor_data(size));
    }
    return rb;
  }
  // Lazily build and cache the active-pooling rule book for this size.
  RuleBook &getActivePoolingRuleBook(THLongTensor *spatialSize) {
    auto spatialSz = LongTensorToPoint<dimension>(spatialSize);
    auto &SGs = grids[spatialSz];
    auto &rb = activePoolingRuleBooks[spatialSz];
    if (rb.empty())
      activePoolingRules(SGs, rb);
    return rb;
  }
  // Lazily build and cache the sparse-to-dense rule book for this size.
  RuleBook &getSparseToDenseRuleBook(THLongTensor *spatialSize, bool openMP) {
    auto ss = LongTensorToPoint<dimension>(spatialSize);
    auto &SGs = grids[ss];
    auto &rb = sparseToDenseRuleBooks[ss];
    if (rb.empty())
#if defined(ENABLE_OPENMP)
      openMP ? SparseToDense_InputSgsToRulesAndOutputSgs_OMP(
                   SGs, rb, THLongTensor_data(spatialSize))
             :
#endif
             SparseToDense_InputSgsToRulesAndOutputSgs(
                 SGs, rb, THLongTensor_data(spatialSize));
    return rb;
  }
  // Lazily build and cache the convolution rule book for this
  // (inputSpatialSize, filterSize, filterStride) triple; also records the
  // resulting active-site count for the output spatial size.
  RuleBook &getRuleBook(THLongTensor *inputSpatialSize,
                        THLongTensor *outputSpatialSize, THLongTensor *size,
                        THLongTensor *stride, bool openMP) {
    auto p = ThreeLongTensorsToPoint<dimension>(inputSpatialSize, size, stride);
    auto &rb = ruleBooks[p];
    if (rb.empty()) {
      auto iS = LongTensorToPoint<dimension>(inputSpatialSize);
      auto oS = LongTensorToPoint<dimension>(outputSpatialSize);
      auto &iSGs = grids[iS];
      auto &oSGs = grids[oS];
      nActive[oS] =
#if defined(ENABLE_OPENMP)
          openMP ? Convolution_InputSgsToRulesAndOutputSgs_OMP(
                       iSGs, oSGs, rb, THLongTensor_data(size),
                       THLongTensor_data(stride),
                       THLongTensor_data(inputSpatialSize),
                       THLongTensor_data(outputSpatialSize))
                 :
#endif
                 Convolution_InputSgsToRulesAndOutputSgs(
                     iSGs, oSGs, rb, THLongTensor_data(size),
                     THLongTensor_data(stride),
                     THLongTensor_data(inputSpatialSize),
                     THLongTensor_data(outputSpatialSize));
    }
    return rb;
  }
};
#endif
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef RECTANGULARREGIONS_H
#define RECTANGULARREGIONS_H
#include "../SparseConvNet.h"
// For iterating over the rectangular region with corners lb and ub.
// The .end() method and operator!= are designed to allow range based for
// loops of the region, but nothing else.
template <uInt dimension> class RectangularRegionIterator;
// An axis-aligned box of integer points with inclusive corners lb..ub.
// begin()/end() exist only to support range-based for (see operator!=).
template <uInt dimension> class RectangularRegion {
public:
  Point<dimension> lb;
  Point<dimension> ub;
  RectangularRegion(Point<dimension> &lb, Point<dimension> &ub)
      : lb(lb), ub(ub) {}
  RectangularRegionIterator<dimension> begin() {
    return RectangularRegionIterator<dimension>(*this, lb);
  }
  RectangularRegionIterator<dimension> end() {
    // Never actually inspected: the custom operator!= only checks the
    // left-hand iterator's stillLooping flag.
    return RectangularRegionIterator<dimension>(*this, ub);
  }
  // Row-major enumeration index of p inside the region (last axis fastest).
  uInt offset(const Point<dimension> &p) {
    uInt index = 0;
    uInt axisStride = 1;
    Int axis = dimension;
    while (axis-- > 0) {
      index += axisStride * (p[axis] - lb[axis]);
      axisStride *= ub[axis] - lb[axis] + 1;
    }
    return index;
  }
};
template <uInt dimension> class RectangularRegionIterator {
private:
  RectangularRegion<dimension> &region;
public:
  // False once iteration has walked past the last point; the asymmetric
  // operator!= below reads this flag to terminate range-based for loops.
  bool stillLooping;
  Point<dimension> point;
  RectangularRegionIterator(RectangularRegion<dimension> &region,
                            Point<dimension> &point)
      : region(region), point(point), stillLooping(true) {
    // If stride > size, we can have lb[i]>ub[i] meaning region_size == 0
    for (Int i = 0; i < dimension; i++)
      if (point[i] > region.ub[i])
        stillLooping = false;
  }
  // Odometer increment: bump the last axis; on overflow reset it to lb and
  // carry into the previous axis. Carrying off axis 0 ends the iteration.
  RectangularRegionIterator<dimension> &operator++() {
    for (Int i = dimension - 1;;) {
      point[i]++;
      if (point[i] <= region.ub[i])
        break;
      point[i] = region.lb[i];
      i--;
      if (i == -1) {
        stillLooping = false; // Signal to operator!= to end iteration
        break;
      }
    }
    return *this;
  }
  Point<dimension> &operator*() { return point; }
};
// Only to be used for checking the end point of range based for loops.
// Deliberately asymmetric: rhs (the end() sentinel) is ignored and only
// lhs's stillLooping flag decides termination, so `it != region.end()`
// means "lhs has not yet walked past the region".
template <uInt dimension>
inline bool operator!=(const RectangularRegionIterator<dimension> &lhs,
                       const RectangularRegionIterator<dimension> &rhs) {
  return lhs.stillLooping;
}
// Similar to above but for [ offset[0] ... offset[0]+size[0]-1 ] x ... x [..]
// Advance point one step through the cube
// [offset[d], offset[d]+size[d]-1] per axis, odometer-style with the last
// axis fastest: a full axis wraps back to offset[d] and carries into the
// next-slower axis.
template <uInt dimension>
void incrementPointInCube(Point<dimension> &point, long *size, long *offset) {
  Int axis = dimension;
  while (axis-- > 0) {
    if (++point[axis] < offset[axis] + size[axis])
      return;                   // no carry needed
    point[axis] = offset[axis]; // wrap this axis, carry into the previous
  }
}
// For a convolutional layer with given filter *size* and *stride*, find the
// subset of the input field corresponding to a point in the output.
// Input window feeding one output point of a convolution with the given
// filter *size* and *stride*: axis i spans
// [output[i]*stride[i], output[i]*stride[i] + size[i] - 1].
template <uInt dimension>
RectangularRegion<dimension>
InputRegionCalculator(const Point<dimension> &output, long *size,
                      long *stride) {
  Point<dimension> lb, ub;
  for (uInt axis = 0; axis < dimension; axis++) {
    long lo = output[axis] * stride[axis];
    lb[axis] = lo;
    ub[axis] = lo + size[axis] - 1;
  }
  return RectangularRegion<dimension>(lb, ub);
}
// For a convolutional layer with given filter *size* and *stride*, find the
// subset of the output field corresponding to a point in the input.
// Output region affected by one input point of a convolution with the given
// filter *size* and *stride*: the set of output sites whose input window
// contains the point, clamped to [0, outputSpatialSize - 1] per axis.
template <uInt dimension>
RectangularRegion<dimension>
OutputRegionCalculator(const Point<dimension> &input, long *size, long *stride,
                       long *outputSpatialSize) {
  Point<dimension> lb, ub;
  for (uInt axis = 0; axis < dimension; axis++) {
    long first = (input[axis] - size[axis] + stride[axis]) / stride[axis];
    lb[axis] = std::max(0L, first);
    ub[axis] =
        std::min(outputSpatialSize[axis] - 1, input[axis] / stride[axis]);
  }
  return RectangularRegion<dimension>(lb, ub);
}
#endif /* RECTANGULARREGIONS_H */
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE
#error "You must define TH_GENERIC_FILE before including THGenerateDimTypes.h"
#endif
#define Dimension 1
#line 1 TH_GENERIC_FILE
#include TH_GENERIC_FILE
#undef Dimension
#define Dimension 2
#line 1 TH_GENERIC_FILE
#include TH_GENERIC_FILE
#undef Dimension
#define Dimension 3
#line 1 TH_GENERIC_FILE
#include TH_GENERIC_FILE
#undef Dimension
#define Dimension 4
#line 1 TH_GENERIC_FILE
#include TH_GENERIC_FILE
#undef Dimension
#define Dimension 5
#line 1 TH_GENERIC_FILE
#include TH_GENERIC_FILE
#undef Dimension
#define Dimension 6
#line 1 TH_GENERIC_FILE
#include TH_GENERIC_FILE
#undef Dimension
#define Dimension 7
#line 1 TH_GENERIC_FILE
#include TH_GENERIC_FILE
#undef Dimension
#define Dimension 8
#line 1 TH_GENERIC_FILE
#include TH_GENERIC_FILE
#undef Dimension
#define Dimension 9
#line 1 TH_GENERIC_FILE
#include TH_GENERIC_FILE
#undef Dimension
#define Dimension 10
#line 1 TH_GENERIC_FILE
#include TH_GENERIC_FILE
#undef Dimension
#undef TH_GENERIC_FILE
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef VALIDCONVOLUTIONRULES_H
#define VALIDCONVOLUTIONRULES_H
#include<iostream>
// Full input region for an output point
// Input window of one output point for a 'valid' (submanifold) convolution:
// the filter window is shifted left by size/2 on each axis so it is centred
// on the output site.
template <uInt dimension>
RectangularRegion<dimension>
InputRegionCalculator_Valid(const Point<dimension> &output, long *size) {
  Point<dimension> lb, ub;
  for (uInt axis = 0; axis < dimension; axis++) {
    Int halo = size[axis] / 2;
    lb[axis] = output[axis] - halo;
    ub[axis] = output[axis] - halo + size[axis] - 1;
  }
  return RectangularRegion<dimension>(lb, ub);
}
// Call for each convolutional / max-pooling layer, once for each batch item.
// rules is used to carry out the "lowering" whilst carrying out the convolution
// Build 'valid' convolution rules for one sample: for every active output
// site, every active input site inside its filter window contributes an
// (input index, output index) pair, bucketed by the input's position within
// the filter. rules must already be sized to volume(size) (both callers
// resize it). Returns the number of active (input, output) pairs found.
// (Removed the unused local `sd = volume<dimension>(size)`.)
template <uInt dimension>
double ValidConvolution_SgToRules(SparseGrid<dimension> &grid,
                                  RuleBook &rules, long *size) {
  double countActiveInputs = 0;
  for (auto const &outputIter : grid.mp) {
    auto inRegion =
        InputRegionCalculator_Valid<dimension>(outputIter.first, size);
    uInt rulesOffset = 0; // position of inputPoint within the filter window
    for (auto inputPoint : inRegion) {
      auto inputIter = grid.mp.find(inputPoint);
      if (inputIter != grid.mp.end()) {
        rules[rulesOffset].push_back(inputIter->second + grid.ctr);
        rules[rulesOffset].push_back(outputIter.second + grid.ctr);
        countActiveInputs++;
      }
      rulesOffset++;
    }
  }
  return countActiveInputs;
}
// Build a valid-convolution rule book covering every sample in the batch:
// reset rules to one list per filter offset, then let the per-sample helper
// append into the shared book. Returns the total active-input count.
template <uInt dimension>
uInt ValidConvolution_SgsToRules(SparseGrids<dimension> &SGs,
                                 RuleBook &rules, long *size) {
  rules.clear();
  rules.resize(volume<dimension>(size));
  uInt total = 0;
  for (auto &sg : SGs)
    total += ValidConvolution_SgToRules<dimension>(sg, rules, size);
  return total;
}
// OpenMP variant of ValidConvolution_SgsToRules: each sample's rules go into
// a private RuleBook, then the per-sample books are concatenated per filter
// offset. Returns the total active-input count.
template <uInt dimension>
uInt ValidConvolution_SgsToRules_OMP(SparseGrids<dimension> &SGs,
                                     RuleBook &rules, long *size) {
  std::vector<RuleBook> rbs(SGs.size());
  std::vector<double> countActiveInputs(SGs.size());
  rules.clear();
  uInt sd = volume<dimension>(size); // number of filter offsets
  rules.resize(sd);
  {
    uInt i;
#pragma omp parallel for private(i)
    for (i = 0; i < SGs.size(); i++) {
      rbs[i].resize(sd);
      countActiveInputs[i] =
          ValidConvolution_SgToRules<dimension>(SGs[i], rbs[i], size);
    }
  }
  // Merge: per filter offset, append every sample's rules in batch order.
  {
    uInt i;
#pragma omp parallel for private(i)
    for (i = 0; i < sd; i++)
      for (auto const &rb : rbs)
        rules[i].insert(rules[i].end(), rb[i].begin(), rb[i].end());
  }
  uInt countActiveInputs_ = 0;
  for (auto &i : countActiveInputs)
    countActiveInputs_ += i;
  return countActiveInputs_;
}
#endif /* VALIDCONVOLUTIONRULES_H */
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef SPARSECONVNET_H
#define SPARSECONVNET_H
// To use 64 bits instead of 32, replace 32bits.h with 64bits.h
#include "32bits.h"
#include <array>
#include <cstdint>
#include <google/dense_hash_map>
#include <iostream>
#include <string>
#include <tuple>
#include <vector>
#if defined(ENABLE_OPENMP)
#include <omp.h>
#endif
// Submanifold Sparse Convolutional Networks
// A batch of samples, for each layer of a sparse convolutional network, is
// encoded as a matrix of nActive x nFeatures and a vector of
// hash tables identifying points in space with the rows of
// the matrix.
// SparseGridMap<dimension> - a hash table assigning integer labels to a sparse
// collection of 'Point<dimension>' points
// Maps each active Point<dimension> to the (int) row index of its feature
// vector; backed by google::dense_hash_map for speed.
template <uInt dimension>
using SparseGridMap =
    google::dense_hash_map<Point<dimension>, int, IntArrayHash<dimension>,
                           std::equal_to<Point<dimension>>>;
// One sample's sparse spatial grid: active points mapped to feature rows.
template <uInt dimension> class SparseGrid {
public:
  // During output-hash construction: running count of active sites.
  // Afterwards: this sample's offset within the batch.
  uInt ctr;
  SparseGridMap<dimension> mp;
  SparseGrid() : ctr(0) {
    // google::dense_hash_map needs a reserved key that is never inserted;
    // (Int_MAX, ..., Int_MAX) plays that role.
    Point<dimension> reservedKey;
    for (uInt d = 0; d < dimension; ++d)
      reservedKey[d] = Int_MAX;
    mp.set_empty_key(reservedKey);
  }
};
template <uInt dimension>
using SparseGrids = std::vector<SparseGrid<dimension>>;
// Each convolution/pooling operation requires the calculation of a 'rulebook'
// setting out how the output points depend on the points in the layer below
using RuleBook = std::vector<std::vector<uInt>>;
// Code relating to squares/cubes/rectangles/cuboids etc
// integer powers - ok for filter sizes, could overflow if we calculate
// inputSpatialSize^d
// 'inline' on the explicit specializations avoids multiple-definition (ODR)
// errors when this header is included from more than one translation unit
// (unlike the primary template, explicit function-template specializations
// are not implicitly inline).
template <uInt m> uInt ipow(uInt n) { return n * ipow<m - 1>(n); }
template <> inline uInt ipow<1>(uInt n) { return n; }
template <> inline uInt ipow<0>(uInt n) { return 1; }
// Product of the first `dimension` entries of point: the number of sites in
// a box with those side lengths.
template <uInt dimension> uInt volume(long *point) {
  uInt product = 1;
  for (uInt d = 0; d < dimension; d++)
    product *= point[d];
  return product;
}
// Macro to initialize arguments passed as void*[1] from Lua.
// This allows Lua to take ownership of arbitrary C++ objects.
// The macro:
// - takes a pointer to a pointer [allocated as ffi.new('void *[1]') in Lua]
// - if the pointer has not yet been initialized, create an object for it
// - create a reference "_VAR" to the object
#define SCN_INITIALIZE_AND_REFERENCE(TYPE, VAR) \
if (VAR[0] == NULL) \
VAR[0] = (void *)new TYPE; \
TYPE &_##VAR = *(TYPE *)VAR[0];
// Macro to free the memory allocated by SCN_INITIALIZE_AND_REFERENCE
#define SCN_DELETE(TYPE, VAR) \
if (VAR[0] != NULL) { \
delete (TYPE *) VAR[0]; \
VAR[0] = NULL; \
}
// Largest number of entries in any rule list of rb.
// 'inline' because this is defined in a header: without it, including the
// header from more than one translation unit causes multiple-definition
// link errors.
inline uInt ruleBookMaxSize(RuleBook &rb) {
  uInt m = 0;
  for (auto &r : rb)
    m = std::max(m, (uInt)r.size());
  return m;
}
// Total number of entries across all rule lists of rb.
// 'inline' because this is defined in a header: without it, including the
// header from more than one translation unit causes multiple-definition
// link errors.
inline uInt ruleBookTotalSize(RuleBook &rb) {
  uInt total = 0;
  for (auto &r : rb)
    total += (uInt)r.size();
  return total;
}
#endif /* SPARSECONVNET_H */
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment