Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
SparseConvNet
Commits
f9552033
Commit
f9552033
authored
Jul 16, 2017
by
Benjamin Thomas Graham
Browse files
initial commit
parents
Changes
168
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
2175 additions
and
0 deletions
+2175
-0
PyTorch/sparseconvnet/SCN/generic/GPU/Deconvolution.cu
PyTorch/sparseconvnet/SCN/generic/GPU/Deconvolution.cu
+87
-0
PyTorch/sparseconvnet/SCN/generic/GPU/Deconvolution.h
PyTorch/sparseconvnet/SCN/generic/GPU/Deconvolution.h
+591
-0
PyTorch/sparseconvnet/SCN/generic/GPU/LeakyReLU.cu
PyTorch/sparseconvnet/SCN/generic/GPU/LeakyReLU.cu
+36
-0
PyTorch/sparseconvnet/SCN/generic/GPU/LeakyReLU.h
PyTorch/sparseconvnet/SCN/generic/GPU/LeakyReLU.h
+25
-0
PyTorch/sparseconvnet/SCN/generic/GPU/MaxPooling.cu
PyTorch/sparseconvnet/SCN/generic/GPU/MaxPooling.cu
+59
-0
PyTorch/sparseconvnet/SCN/generic/GPU/MaxPooling.h
PyTorch/sparseconvnet/SCN/generic/GPU/MaxPooling.h
+78
-0
PyTorch/sparseconvnet/SCN/generic/GPU/NetworkInNetwork.cu
PyTorch/sparseconvnet/SCN/generic/GPU/NetworkInNetwork.cu
+141
-0
PyTorch/sparseconvnet/SCN/generic/GPU/RuleBookIterator.h
PyTorch/sparseconvnet/SCN/generic/GPU/RuleBookIterator.h
+33
-0
PyTorch/sparseconvnet/SCN/generic/GPU/SparseToDense.cu
PyTorch/sparseconvnet/SCN/generic/GPU/SparseToDense.cu
+58
-0
PyTorch/sparseconvnet/SCN/generic/GPU/SparseToDense.h
PyTorch/sparseconvnet/SCN/generic/GPU/SparseToDense.h
+72
-0
PyTorch/sparseconvnet/SCN/generic/GPU/THGenerateCudaFloatTypes.h
.../sparseconvnet/SCN/generic/GPU/THGenerateCudaFloatTypes.h
+30
-0
PyTorch/sparseconvnet/SCN/generic/GPU/THGenerateDimCudaFloatTypes.h
...arseconvnet/SCN/generic/GPU/THGenerateDimCudaFloatTypes.h
+63
-0
PyTorch/sparseconvnet/SCN/generic/Geometry/ActivePoolingRules.h
...h/sparseconvnet/SCN/generic/Geometry/ActivePoolingRules.h
+35
-0
PyTorch/sparseconvnet/SCN/generic/Geometry/ConvolutionRules.h
...rch/sparseconvnet/SCN/generic/Geometry/ConvolutionRules.h
+158
-0
PyTorch/sparseconvnet/SCN/generic/Geometry/Metadata.cpp
PyTorch/sparseconvnet/SCN/generic/Geometry/Metadata.cpp
+209
-0
PyTorch/sparseconvnet/SCN/generic/Geometry/Metadata.h
PyTorch/sparseconvnet/SCN/generic/Geometry/Metadata.h
+119
-0
PyTorch/sparseconvnet/SCN/generic/Geometry/RectangularRegions.h
...h/sparseconvnet/SCN/generic/Geometry/RectangularRegions.h
+121
-0
PyTorch/sparseconvnet/SCN/generic/Geometry/THGenerateDimTypes.h
...h/sparseconvnet/SCN/generic/Geometry/THGenerateDimTypes.h
+61
-0
PyTorch/sparseconvnet/SCN/generic/Geometry/ValidConvolutionRules.h
...parseconvnet/SCN/generic/Geometry/ValidConvolutionRules.h
+92
-0
PyTorch/sparseconvnet/SCN/generic/SparseConvNet.h
PyTorch/sparseconvnet/SCN/generic/SparseConvNet.h
+107
-0
No files found.
PyTorch/sparseconvnet/SCN/generic/GPU/Deconvolution.cu
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE_
#define TH_GENERIC_FILE_ "generic/GPU/Deconvolution.cu"
#else
#include "Convolution.h"
#include "Deconvolution.h"
#include <algorithm>

// Forward pass of a sparse deconvolution layer.
// Resizes output_features to (nActive, weight->size[1]), optionally applies
// the bias, then accumulates input x weight into the output for every
// rule-book bucket. Returns the number of multiply-accumulate flops performed.
// Note: the rule book is requested with (outputSize, inputSize) swapped
// relative to Convolution - deconvolution reuses convolution rules backwards.
extern "C" double scn_DR_(Deconvolution_updateOutput)(
    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *filterSize,
    THLongTensor *filterStride, void **m, THCTensor *input_features,
    THCTensor *output_features, THCTensor *weight, THCTensor *bias,
    long filterVolume, THCITensor *rulesBuffer) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto _rules =
      _m.getRuleBook(outputSize, inputSize, filterSize, filterStride, true);
  uInt nActive = _m.getNActive(outputSize);
  THCTensor_(resize2d)(state, output_features, nActive, weight->size[1]);
  // No bias: zero the output so the rule-book accumulation starts clean.
  // With a bias the Convolution_fp_bias kernel below fills every row instead.
  if (not bias)
    THCTensor_(zero)(state, output_features);
  auto iF = THCTensor_(data)(state, input_features);
  auto oF = THCTensor_(data)(state, output_features);
  auto ip = input_features->size[1];
  auto op = output_features->size[1];
  auto w = THCTensor_(data)(state, weight);
  double flops = 0;
  if (bias) {
    auto b = THCTensor_(data)(state, bias);
    // Broadcast the bias into the output, 32 planes at a time.
    for (uInt i = 0; i < op; i += 32) {
      uInt blockDim = min(32L, op - i);
      uInt gridDim = min(4096, nActive);
      Convolution_fp_bias<<<gridDim, blockDim, 0,
                            THCState_getCurrentStream(state)>>>(oF + i, b + i,
                                                                op, op,
                                                                nActive);
    }
  }
  // c = size of one ip x op weight slice; the weight pointer advances by c
  // for each spatial filter offset as the rule-book iterator steps buckets.
  uInt c = ip * op;
  RULEBOOKITERATOR(
      dDeconvolution_forward2<real>(iF, oF, w, rbB, nHotB, ip, ip, op, op,
                                    THCState_getCurrentStream(state));
      , w += c; flops += nHotB * c;)
  return flops;
}
// Backward pass of a sparse deconvolution layer: computes d_input_features
// and accumulates d_weight (and d_bias when present). d_input_features is
// resized to match input_features and zeroed before accumulation.
extern "C" void scn_DR_(Deconvolution_backward)(
    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *filterSize,
    THLongTensor *filterStride, void **m, THCTensor *input_features,
    THCTensor *d_input_features, THCTensor *d_output_features,
    THCTensor *weight, THCTensor *d_weight, THCTensor *d_bias,
    long filterVolume, THCITensor *rulesBuffer) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto _rules =
      _m.getRuleBook(outputSize, inputSize, filterSize, filterStride, true);
  uInt nActive = _m.getNActive(outputSize);
  THCTensor_(resizeAs)(state, d_input_features, input_features);
  THCTensor_(zero)(state, d_input_features);
  auto iF = THCTensor_(data)(state, input_features);
  auto diF = THCTensor_(data)(state, d_input_features);
  auto doF = THCTensor_(data)(state, d_output_features);
  auto ip = input_features->size[1];
  auto op = d_output_features->size[1];
  auto w = THCTensor_(data)(state, weight);
  auto dw = THCTensor_(data)(state, d_weight);
  // Advance both weight and weight-gradient pointers by one ip x op slice
  // per rule-book bucket, mirroring the forward pass.
  uInt c = ip * op;
  RULEBOOKITERATOR(
      dDeconvolution_backward_dW2<real>(iF, diF, doF, w, dw, rbB, nHotB, ip,
                                        ip, op, op,
                                        THCState_getCurrentStream(state));
      , w += c; dw += c;)
  if (d_bias) {
    auto db = THCTensor_(data)(state, d_bias);
    // Bias gradient: reduce d_output_features over the nActive rows.
    Convolution_bp_bias(doF, db, op, op, nActive,
                        THCState_getCurrentStream(state));
  }
}
#endif
PyTorch/sparseconvnet/SCN/generic/GPU/Deconvolution.h
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef GPU_DECONVOLUTION_H
#define GPU_DECONVOLUTION_H
#include "../SparseConvNet.h"
#include "Convolution.h"
// Tiled deconvolution forward kernel, "A" variant: assumes nHot is an exact
// multiple of K so no bounds checks are needed inside the tile.
template <typename T, uInt K, uInt V>
__global__ void
dDeconvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, uInt *rules,
                              uInt nHot, uInt input_nPlanes, uInt input_stride,
                              uInt output_nPlanes, uInt output_stride) {
  // nHot must be a multiple of K!!
  // Input x Weight -> Output
  // blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
  // K is a multiple of V,
  // nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
  uInt M = input_nPlanes / K;
  // N = gridDim.y == output_nPlanes/K
  uInt n = blockIdx.y;
  outFeatures += n * K;
  w += n * K;
  T O[V];                  // per-thread accumulator: V output values
  __shared__ T W[K][K];    // K x K weight tile
  __shared__ T I[K][K];    // K x K input tile
  uInt R0[V];              // input row indices from the rule book
  uInt R1[V];              // output row indices from the rule book
  const int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  // Each thread covers V rows of the tile, spaced K/V apart.
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int m = 0; m < M; m++) {
    // Read w
#pragma unroll
    for (int v = 0; v < V; v++)
      W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
    for (uInt s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
      // rules[] stores (output, input) index pairs per active site.
#pragma unroll
      for (int v = 0; v < V; v++) {
        R1[v] = rules[2 * (s + ty[v])];
        R0[v] = rules[2 * (s + ty[v]) + 1];
      }
      __syncthreads();
      // Read input, reset O[]
#pragma unroll
      for (int v = 0; v < V; v++) {
        I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
        O[v] = 0;
      }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          O[v] += I[ty[v]][k] * W[k][tx];
      // Read-modify-write into the output rows selected by the rule book.
      // NOTE(review): appears to rely on rule buckets not mapping two tile
      // rows to the same output row concurrently - confirm rule-book
      // construction guarantees this.
#pragma unroll
      for (int v = 0; v < V; v++)
        O[v] += outFeatures[R1[v] * output_stride + tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        outFeatures[R1[v] * output_stride + tx] = O[v];
      __syncthreads();
    }
    // Advance to the next K input planes.
    w += K * output_nPlanes;
    inFeatures += K;
  }
}
// Tiled deconvolution forward kernel, "B" variant: like forwardA but with
// per-row `s + ty[v] < nHot` guards, so it handles the ragged tail where
// nHot is not a multiple of K (launched with a single block in x).
template <typename T, uInt K, uInt V>
__global__ void
dDeconvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, uInt *rules,
                              uInt nHot, uInt input_nPlanes, uInt input_stride,
                              uInt output_nPlanes, uInt output_stride) {
  // Input x Weight -> Output
  // blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
  // K is a multiple of V,
  // nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
  uInt M = input_nPlanes / K;
  // N = gridDim.y == output_nPlanes/K
  uInt n = blockIdx.y;
  outFeatures += n * K;
  w += n * K;
  T O[V];
  __shared__ T W[K][K];
  __shared__ T I[K][K];
  uInt R0[V];
  uInt R1[V];
  const int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int m = 0; m < M; m++) {
    // Read w
#pragma unroll
    for (int v = 0; v < V; v++)
      W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
    for (uInt s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
      for (int v = 0; v < V; v++) {
        // Only rows that exist in the (possibly partial) last tile.
        if (s + ty[v] < nHot) {
          R1[v] = rules[2 * (s + ty[v])];
          R0[v] = rules[2 * (s + ty[v]) + 1];
        }
      }
      __syncthreads();
      // Read input, reset O[]
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (s + ty[v] < nHot)
          I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
        O[v] = 0;
      }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          O[v] += I[ty[v]][k] * W[k][tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nHot)
          O[v] += outFeatures[R1[v] * output_stride + tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nHot)
          outFeatures[R1[v] * output_stride + tx] = O[v];
      __syncthreads();
    }
    w += K * output_nPlanes;
    inFeatures += K;
  }
}
// Dispatch helper: if both plane counts divide by K, launch forwardA on the
// multiple-of-K prefix (o rows) and forwardB on the remaining tail, then
// return. Tried with decreasing tile sizes below.
#define FOO(K, V) \
  { \
    if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
      uInt o = (nHot / K) * K; \
      if (o >= K) \
        dDeconvolution_KMxKN_forwardA<T, K, V> << < \
            dim3(std::min(o / K, (uInt)512), output_nPlanes / K), \
            dim3(K, K / V), 0, stream>>> \
            (inFeatures, outFeatures, w, rules, o, input_nPlanes, \
             input_stride, output_nPlanes, output_stride); \
      if (nHot > o) \
        dDeconvolution_KMxKN_forwardB<T, K, V> << < \
            dim3(1, output_nPlanes / K), dim3(K, K / V), 0, stream>>> \
            (inFeatures, outFeatures, w, rules + 2 * o, nHot - o, \
             input_nPlanes, input_stride, output_nPlanes, output_stride); \
      return; \
    } \
  }
// Host-side launcher for the deconvolution forward kernels. Tries tile sizes
// 64/32/16/8; callers must ensure both plane counts are divisible by 8
// (dDeconvolution_forward2 routes other shapes elsewhere), else asserts.
template <typename T>
void dDeconvolution_forward(T *inFeatures, T *outFeatures, T *w, uInt *rules,
                            uInt nHot, uInt input_nPlanes, uInt input_stride,
                            uInt output_nPlanes, uInt output_stride,
                            cudaStream_t stream) {
  FOO(64, 16)
  FOO(32, 8)
  FOO(16, 4)
  FOO(8, 2)
  assert(false);
}
#undef FOO
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
// "A" variant: no bounds checks; nHot must be a multiple of K.
template <typename T, uInt K, uInt V>
__global__ void dDeconvolution_KMxKN_backward_dW_A(
    T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw, uInt *rules,
    uInt nHot, uInt input_nPlanes, uInt input_stride, uInt output_nPlanes,
    uInt output_stride) {
  // M = gridDim.y == input_nPlanes / K
  uInt N = output_nPlanes / K;
  uInt m = blockIdx.y;
  inFeatures += m * K;
  dInFeatures += m * K;
  w += m * K * output_nPlanes;
  dw += m * K * output_nPlanes;
  T dI[V];                 // per-thread input-gradient accumulator
  T dW[V];                 // per-thread weight-gradient accumulator
  __shared__ T I[K][K];    // input tile
  __shared__ T dO[K][K];   // output-gradient tile
  __shared__ T W[K][K];    // weight tile
  uInt R0[V];
  uInt R1[V];
  const int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int n = 0; n < N; n++) {
    // Read w, reset dW
#pragma unroll
    for (int v = 0; v < V; v++) {
      W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
      dW[v] = 0;
    }
    for (uInt s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
      for (int v = 0; v < V; v++) {
        R1[v] = rules[2 * (s + ty[v])];
        R0[v] = rules[2 * (s + ty[v]) + 1];
        dI[v] = 0;
      }
      __syncthreads();
      // Read input and dOutput
#pragma unroll
      for (int v = 0; v < V; v++) {
        I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
        dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx];
      }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++) {
          // dI = dO * W^T ; dW = I^T * dO (both within the shared tiles)
          dI[v] += dO[ty[v]][k] * W[tx][k];
          dW[v] += I[k][ty[v]] * dO[k][tx];
        }
#pragma unroll
      for (int v = 0; v < V; v++)
        dI[v] += dInFeatures[R0[v] * input_stride + tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        dInFeatures[R0[v] * input_stride + tx] = dI[v];
      __syncthreads();
    }
    // Blocks in x accumulate into the same dw slice, hence atomicAdd.
#pragma unroll
    for (int v = 0; v < V; v++)
      atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
    w += K;
    dw += K;
    dOutFeatures += K;
  }
}
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
// "B" variant: bounds-checked version of backward_dW_A for the ragged tail
// where nHot is not a multiple of K; out-of-range tile rows are zero-filled.
template <typename T, uInt K, uInt V>
__global__ void dDeconvolution_KMxKN_backward_dW_B(
    T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw, uInt *rules,
    uInt nHot, uInt input_nPlanes, uInt input_stride, uInt output_nPlanes,
    uInt output_stride) {
  // M = gridDim.y == input_nPlanes / K
  uInt N = output_nPlanes / K;
  uInt m = blockIdx.y;
  inFeatures += m * K;
  dInFeatures += m * K;
  w += m * K * output_nPlanes;
  dw += m * K * output_nPlanes;
  T dI[V];
  T dW[V];
  __shared__ T I[K][K];
  __shared__ T dO[K][K];
  __shared__ T W[K][K];
  uInt R0[V];
  uInt R1[V];
  const int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int n = 0; n < N; n++) {
    // Read w, reset dW
#pragma unroll
    for (int v = 0; v < V; v++) {
      W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
      dW[v] = 0;
    }
    for (uInt s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (s + ty[v] < nHot) {
          R1[v] = rules[2 * (s + ty[v])];
          R0[v] = rules[2 * (s + ty[v]) + 1];
        }
        dI[v] = 0;
      }
      __syncthreads();
      // Read input and dOutput
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nHot) {
          I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
          dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx];
        } else {
          // Zero padding keeps the unguarded tile multiply below correct.
          I[ty[v]][tx] = 0;
          dO[ty[v]][tx] = 0;
        }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++) {
          dI[v] += dO[ty[v]][k] * W[tx][k];
          dW[v] += I[k][ty[v]] * dO[k][tx];
        }
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nHot)
          dI[v] += dInFeatures[R0[v] * input_stride + tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nHot)
          dInFeatures[R0[v] * input_stride + tx] = dI[v];
      __syncthreads();
    }
#pragma unroll
    for (int v = 0; v < V; v++)
      atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
    w += K;
    dw += K;
    dOutFeatures += K;
  }
}
// Dispatch helper: launch the unchecked "A" backward kernel on the
// multiple-of-K prefix and the checked "B" kernel on the tail, then return.
#define FOO(K, V) \
  { \
    if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
      uInt o = (nHot / K) * K; \
      if (o >= K) \
        dDeconvolution_KMxKN_backward_dW_A<T, K, V> << < \
            dim3(std::min(o / K, (uInt)512), input_nPlanes / K), \
            dim3(K, K / V), 0, stream>>> \
            (inFeatures, dInFeatures, dOutFeatures, w, dw, rules, o, \
             input_nPlanes, input_stride, output_nPlanes, output_stride); \
      if (nHot > o) \
        dDeconvolution_KMxKN_backward_dW_B<T, K, V> << < \
            dim3(1, input_nPlanes / K), dim3(K, K / V), 0, stream>>> \
            (inFeatures, dInFeatures, dOutFeatures, w, dw, rules + 2 * o, \
             nHot - o, input_nPlanes, input_stride, output_nPlanes, \
             output_stride); \
      return; \
    } \
  }
// Host-side launcher for the deconvolution backward kernels; tries tile
// sizes 32/16/8 and asserts if no tile size divides both plane counts.
template <typename T>
void dDeconvolution_backward_dW(T *inFeatures, T *dInFeatures,
                                T *dOutFeatures, T *w, T *dw, uInt *rules,
                                uInt nHot, uInt input_nPlanes,
                                uInt input_stride, uInt output_nPlanes,
                                uInt output_stride, cudaStream_t stream) {
  FOO(32, 8)
  FOO(16, 4)
  FOO(8, 2)
  assert(false);
}
#undef FOO
// General deconvolution forward kernel for plane counts that are NOT
// multiples of the tile size: KI/KO clip the active part of each K x K tile,
// and rules are staged through shared memory. Accumulates with += (guarded by
// __syncthreads) instead of the read-modify-write of forwardA/B.
template <typename T, uInt K, uInt V>
__global__ void
dDeconvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, uInt *rules,
                              uInt nHot, uInt input_nPlanes, uInt input_stride,
                              uInt output_nPlanes, uInt output_stride) {
  // Input x Weight -> Output
  // blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
  // K is a multiple of V,
  // nHot x input_nplanes<=KM -> nHot x output_nPlanes<=KN
  // - parallel over N,nHot - loop over M
  uInt M = (input_nPlanes + K - 1) / K;
  // N = gridDim.y ~ output_nPlanes/K
  uInt n = blockIdx.y;
  outFeatures += n * K;
  w += n * K;
  uInt KO = min(K, output_nPlanes - K * n); // valid output cols in this tile
  T O[V];
  __shared__ T W[K][K];
  __shared__ T I[K][K];
  __shared__ uInt R[K * 2]; // rule pairs for the current K sites
  const int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int m = 0; m < M; m++) {
    uInt KI = min(K, input_nPlanes - K * m); // valid input cols in this tile
    // Read w
#pragma unroll
    for (int v = 0; v < V; v++)
      if (ty[v] < KI and tx < KO)
        W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
    for (uInt s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
      // Read rules for K input/output pairs
#pragma unroll
      for (int v = 0; v < V; v++) {
        // Rows 0 and 1 of the thread tile cooperatively load 2*K rule words.
        if (ty[v] < 2) {
          int q = ty[v] * K + tx;
          if (s + q / 2 < nHot)
            R[q] = rules[2 * s + q];
        }
      }
      __syncthreads();
      // Read input, reset O[]
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (tx < KI and s + ty[v] < nHot)
          I[ty[v]][tx] = inFeatures[R[2 * ty[v] + 1] * input_stride + tx];
        O[v] = 0;
      }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < KI; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          O[v] += I[ty[v]][k] * W[k][tx];
      __syncthreads();
#pragma unroll
      for (int v = 0; v < V; v++)
        if (tx < KO and s + ty[v] < nHot)
          outFeatures[R[2 * ty[v]] * output_stride + tx] += O[v];
      __syncthreads();
    }
    w += K * output_nPlanes;
    inFeatures += K;
  }
}
// Entry point for the deconvolution forward pass on the GPU.
// Shapes whose plane counts are not multiples of 8 go through the general
// (clipped-tile) kernel; nicely divisible shapes use the faster specialized
// launchers in dDeconvolution_forward.
template <typename T>
void dDeconvolution_forward2(T *inFeatures, T *outFeatures, T *w, uInt *rules,
                             uInt nHot, uInt input_nPlanes, uInt input_stride,
                             uInt output_nPlanes, uInt output_stride,
                             cudaStream_t stream) {
  if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) {
    const int K = 16;
    const int V = 4;
    dDeconvolution_KMxKN_forward2<T, K, V>
        <<<dim3(128, (output_nPlanes + K - 1) / K), dim3(K, K / V), 0,
           stream>>>(inFeatures, outFeatures, w, rules, nHot, input_nPlanes,
                     input_stride, output_nPlanes, output_stride);
    return;
  } else {
    dDeconvolution_forward(inFeatures, outFeatures, w, rules, nHot,
                           input_nPlanes, input_stride, output_nPlanes,
                           output_stride, stream);
  }
}
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
// General backward kernel for plane counts that are not tile multiples:
// KI/KO clip the valid part of each tile, out-of-range entries are zeroed,
// and both dInFeatures and dw updates are guarded.
template <typename T, uInt K, uInt V>
__global__ void dDeconvolution_KMxKN_backward_dW2(
    T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw, uInt *rules,
    uInt nHot, uInt input_nPlanes, uInt input_stride, uInt output_nPlanes,
    uInt output_stride) {
  // M = gridDim.y == input_nPlanes / K
  uInt N = (output_nPlanes + K - 1) / K;
  uInt m = blockIdx.y;
  inFeatures += m * K;
  dInFeatures += m * K;
  w += m * K * output_nPlanes;
  dw += m * K * output_nPlanes;
  uInt KI = min(K, input_nPlanes - K * m); // valid input cols for this block
  T dI[V];
  T dW[V];
  __shared__ T I[K][K];
  __shared__ T dO[K][K];
  __shared__ T W[K][K];
  __shared__ uInt R[K * 2]; // rule pairs for the current K sites
  const int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int n = 0; n < N; n++) {
    uInt KO = min(K, output_nPlanes - K * n); // valid output cols this pass
    // Read w, reset dW
#pragma unroll
    for (int v = 0; v < V; v++) {
      if (ty[v] < KI and tx < KO)
        W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
      dW[v] = 0;
    }
    for (uInt s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
      // Read rules for K input/output pairs, reset dI[]
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (ty[v] < 2) {
          int q = ty[v] * K + tx;
          if (s + q / 2 < nHot)
            R[q] = rules[2 * s + q];
        }
        dI[v] = 0;
      }
      __syncthreads();
      // Read input and dOutput
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (tx < KI and s + ty[v] < nHot)
          I[ty[v]][tx] = inFeatures[R[2 * ty[v] + 1] * input_stride + tx];
        else
          I[ty[v]][tx] = 0;
        if (tx < KO and s + ty[v] < nHot)
          dO[ty[v]][tx] = dOutFeatures[R[2 * ty[v]] * output_stride + tx];
        else
          dO[ty[v]][tx] = 0;
      }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < KO; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          dI[v] += dO[ty[v]][k] * W[tx][k];
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          dW[v] += I[k][ty[v]] * dO[k][tx];
      __syncthreads();
#pragma unroll
      for (int v = 0; v < V; v++)
        if (tx < KI and s + ty[v] < nHot)
          dInFeatures[R[2 * ty[v] + 1] * input_stride + tx] += dI[v];
      __syncthreads();
    }
#pragma unroll
    for (int v = 0; v < V; v++)
      if (ty[v] < KI and tx < KO)
        atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
    w += K;
    dw += K;
    dOutFeatures += K;
  }
}
// Entry point for the deconvolution backward pass on the GPU; mirrors
// dDeconvolution_forward2's routing: non-multiple-of-8 plane counts use the
// general clipped-tile kernel, otherwise the specialized launchers.
template <typename T>
void dDeconvolution_backward_dW2(T *inFeatures, T *dInFeatures,
                                 T *dOutFeatures, T *w, T *dw, uInt *rules,
                                 uInt nHot, uInt input_nPlanes,
                                 uInt input_stride, uInt output_nPlanes,
                                 uInt output_stride, cudaStream_t stream) {
  if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) {
    const int K = 16;
    const int V = 4;
    dDeconvolution_KMxKN_backward_dW2<T, K, V>
        <<<dim3(128, (input_nPlanes + K - 1) / K), dim3(K, K / V), 0,
           stream>>>(inFeatures, dInFeatures, dOutFeatures, w, dw, rules,
                     nHot, input_nPlanes, input_stride, output_nPlanes,
                     output_stride);
    return;
  } else {
    dDeconvolution_backward_dW(inFeatures, dInFeatures, dOutFeatures, w, dw,
                               rules, nHot, input_nPlanes, input_stride,
                               output_nPlanes, output_stride, stream);
  }
}
#endif
/* GPU_DECONVOLUTION_H */
PyTorch/sparseconvnet/SCN/generic/GPU/LeakyReLU.cu
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/GPU/LeakyReLU.cu"
#else
#include "LeakyReLU.h"

// Elementwise LeakyReLU forward. Supports in-place operation: when
// input_features == output_features no resize is performed and the kernel
// overwrites the input buffer.
extern "C" void scn_R_(LeakyReLU_updateOutput)(THCTensor *input_features,
                                               THCTensor *output_features,
                                               float alpha) {
  if (input_features != output_features)
    THCTensor_(resizeAs)(state, output_features, input_features);
  auto n = THCTensor_(nElement)(state, input_features);
  // Fixed <<<16, 1024>>> launch; the kernel grid-strides over all n elements.
  LeakyReLU_fp<real><<<16, 1024, 0, THCState_getCurrentStream(state)>>>(
      THCTensor_(data)(state, input_features),
      THCTensor_(data)(state, output_features), n, alpha);
}
// Elementwise LeakyReLU backward: routes d_output through the slope chosen
// by the sign of the (forward) input. Supports in-place operation when
// d_input_features == d_output_features.
extern "C" void scn_R_(LeakyReLU_updateGradInput)(THCTensor *input_features,
                                                  THCTensor *d_input_features,
                                                  THCTensor *d_output_features,
                                                  float alpha) {
  if (d_input_features != d_output_features)
    THCTensor_(resizeAs)(state, d_input_features, d_output_features);
  auto n = THCTensor_(nElement)(state, d_input_features);
  LeakyReLU_bp<real><<<16, 1024, 0, THCState_getCurrentStream(state)>>>(
      THCTensor_(data)(state, input_features),
      THCTensor_(data)(state, d_input_features),
      THCTensor_(data)(state, d_output_features), n, alpha);
}
#endif
PyTorch/sparseconvnet/SCN/generic/GPU/LeakyReLU.h
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef LEAKYRELU_H
#define LEAKYRELU_H

// Forward LeakyReLU applied elementwise over a flat array of n values:
//   out[i] = in[i]         if in[i] > 0
//   out[i] = in[i] * alpha otherwise
// Safe for in-place use (input_features == output_features).
//
// FIX: the loop stride was hard-coded as 16 * 1024, silently assuming the
// <<<16, 1024>>> launch used by the callers; any other launch shape would
// skip or double-process elements. A proper grid-stride loop
// (gridDim.x * blockDim.x) is identical under the current launch and correct
// for any launch configuration.
template <typename T>
__global__ void LeakyReLU_fp(T *input_features, T *output_features, uInt n,
                             T alpha) {
  for (uInt i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += gridDim.x * blockDim.x)
    output_features[i] = (input_features[i] > 0)
                             ? input_features[i]
                             : (input_features[i] * alpha);
}
// Backward LeakyReLU applied elementwise over a flat array of n values:
//   d_in[i] = d_out[i]         if in[i] > 0
//   d_in[i] = d_out[i] * alpha otherwise
// input_features must hold the values seen by the forward pass.
//
// FIX: as in LeakyReLU_fp, the stride was hard-coded as 16 * 1024 (the
// callers' launch size); replaced with the launch-independent grid stride
// gridDim.x * blockDim.x — identical behavior under the current launch.
template <typename T>
__global__ void LeakyReLU_bp(T *input_features, T *d_input_features,
                             T *d_output_features, uInt n, T alpha) {
  for (uInt i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += gridDim.x * blockDim.x)
    d_input_features[i] = (input_features[i] > 0)
                              ? d_output_features[i]
                              : (d_output_features[i] * alpha);
}
#endif
PyTorch/sparseconvnet/SCN/generic/GPU/MaxPooling.cu
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE_
#define TH_GENERIC_FILE_ "generic/GPU/MaxPooling.cu"
#else
#include "MaxPooling.h"
#include "RuleBookIterator.h"

// Sparse max-pooling forward. The first nFeaturesToDrop planes of the input
// are skipped (iF is offset past them and nPlanes reduced accordingly).
extern "C" void scn_DR_(MaxPooling_updateOutput)(
    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *poolSize,
    THLongTensor *poolStride, void **m, THCTensor *input_features,
    THCTensor *output_features, long nFeaturesToDrop,
    THCITensor *rulesBuffer) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  uInt nPlanes = input_features->size[1] - nFeaturesToDrop;
  auto _rules =
      _m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  uInt nActive = _m.getNActive(outputSize);
  THCTensor_(resize2d)(state, output_features, nActive, nPlanes);
  // NOTE(review): the max buffer is initialized to zero, so pooled outputs
  // never go below 0 - appears to assume non-negative (post-ReLU) features;
  // confirm against callers.
  THCTensor_(zero)(state, output_features);
  auto iF = THCTensor_(data)(state, input_features) + nFeaturesToDrop;
  auto oF = THCTensor_(data)(state, output_features);
  RULEBOOKITERATOR(
      MaxPooling_ForwardPass<real>(THCState_getCurrentStream(state), iF, oF,
                                   nPlanes, input_features->size[1],
                                   output_features->size[1], rbB, nHotB);
      , )
}
// Sparse max-pooling backward: d_input_features is resized/zeroed, then each
// rule-book bucket routes output gradients back to the input locations that
// achieved the max (comparison done against the saved forward tensors).
extern "C" void scn_DR_(MaxPooling_updateGradInput)(
    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *poolSize,
    THLongTensor *poolStride, void **m, THCTensor *input_features,
    THCTensor *d_input_features, THCTensor *output_features,
    THCTensor *d_output_features, long nFeaturesToDrop,
    THCITensor *rulesBuffer) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  uInt nPlanes = input_features->size[1] - nFeaturesToDrop;
  auto _rules =
      _m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  uInt nActive = _m.getNActive(outputSize);
  THCTensor_(resizeAs)(state, d_input_features, input_features);
  THCTensor_(zero)(state, d_input_features);
  auto iF = THCTensor_(data)(state, input_features);
  auto oF = THCTensor_(data)(state, output_features);
  auto diF = THCTensor_(data)(state, d_input_features);
  auto doF = THCTensor_(data)(state, d_output_features);
  RULEBOOKITERATOR(
      MaxPooling_BackwardPass<real>(THCState_getCurrentStream(state), iF, diF,
                                    oF, doF, nPlanes,
                                    input_features->size[1],
                                    d_output_features->size[1], rbB, nHotB);
      , )
}
#endif
PyTorch/sparseconvnet/SCN/generic/GPU/MaxPooling.h
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef GPU_MAXPOOLING_H
#define GPU_MAXPOOLING_H
// NTX must be >=2 so r is filled properly

// Max-pooling forward kernel. Each y-row of the block handles one rule-book
// entry (an input-row -> output-row pair); threads in x sweep the planes and
// take an elementwise running max into the output row.
template <typename T, uInt NTX, uInt NTY>
__global__ void MaxPooling_fp(T *input_features, T *output_features,
                              uInt nPlanes, uInt input_stride,
                              uInt output_stride, uInt *rules, uInt nHot) {
  __shared__ uInt r[NTY * 2];
  for (uInt n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
    {
      uInt i = threadIdx.x + NTX * threadIdx.y;
      // FIX: the guard was `i < 2 * (n - nHot)`. Since n < nHot here and the
      // operands are unsigned, n - nHot wraps to a huge value, so the bound
      // never limited the load and the final (partial) iteration could read
      // past the end of `rules`. The intended bound is the number of rule
      // words remaining: 2 * (nHot - n).
      if (i < NTY * 2 and i < 2 * (nHot - n))
        r[i] = rules[2 * n + i];
    }
    __syncthreads();
    if (n + threadIdx.y < nHot) {
      uInt i = r[2 * threadIdx.y] * input_stride;      // input row offset
      uInt o = r[2 * threadIdx.y + 1] * output_stride; // output row offset
      for (uInt plane = threadIdx.x; plane < nPlanes; plane += NTX) {
        T inp = input_features[i + plane];
        if (output_features[o + plane] < inp)
          output_features[o + plane] = inp;
      }
    }
    __syncthreads();
  }
}
// Host launcher for MaxPooling_fp with a fixed 32x32 block over 32 blocks;
// the kernel grid-strides over the nHot rule entries.
template <typename T>
void MaxPooling_ForwardPass(cudaStream_t stream, T *input_features,
                            T *output_features, uInt nPlanes,
                            uInt input_stride, uInt output_stride,
                            uInt *rules, uInt nHot) {
  MaxPooling_fp<T, 32, 32><<<32, dim3(32, 32), 0, stream>>>(
      input_features, output_features, nPlanes, input_stride, output_stride,
      rules, nHot);
}
// Max-pooling backward kernel: for each rule-book entry, gradients flow to
// the input positions whose forward value equals the pooled output (the
// argmax; ties receive the gradient at every tying position).
template <typename T, uInt NTX, uInt NTY>
__global__ void MaxPooling_bp(T *input_features, T *d_input_features,
                              T *output_features, T *d_output_features,
                              uInt nPlanes, uInt input_stride,
                              uInt output_stride, uInt *rules, uInt nHot) {
  __shared__ uInt r[NTY * 2];
  for (uInt n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
    {
      uInt i = threadIdx.x + NTX * threadIdx.y;
      // FIX: as in MaxPooling_fp, the guard was `i < 2 * (n - nHot)`, which
      // underflows (unsigned, n < nHot) and never bounds the load, allowing
      // an out-of-bounds read of `rules` on the last partial iteration.
      // Corrected to the remaining rule-word count, 2 * (nHot - n).
      if (i < NTY * 2 and i < 2 * (nHot - n))
        r[i] = rules[2 * n + i];
    }
    __syncthreads();
    if (n + threadIdx.y < nHot) {
      uInt i = r[2 * threadIdx.y] * input_stride;      // input row offset
      uInt o = r[2 * threadIdx.y + 1] * output_stride; // output row offset
      for (uInt plane = threadIdx.x; plane < nPlanes; plane += NTX)
        if (output_features[o + plane] == input_features[i + plane])
          d_input_features[i + plane] += d_output_features[o + plane];
    }
    __syncthreads();
  }
}
// Host launcher for MaxPooling_bp with a fixed 32x32 block over 32 blocks;
// the kernel grid-strides over the nHot rule entries.
template <typename T>
void MaxPooling_BackwardPass(cudaStream_t stream, T *input_features,
                             T *d_input_features, T *output_features,
                             T *d_output_features, uInt nPlanes,
                             uInt input_stride, uInt output_stride,
                             uInt *rules, uInt nHot) {
  MaxPooling_bp<T, 32, 32><<<32, dim3(32, 32), 0, stream>>>(
      input_features, d_input_features, output_features, d_output_features,
      nPlanes, input_stride, output_stride, rules, nHot);
}
#endif
/* GPU_MAXPOOLING_H */
PyTorch/sparseconvnet/SCN/generic/GPU/NetworkInNetwork.cu
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/GPU/NetworkInNetwork.cu"
#else
#include "Convolution.h"
#include <algorithm>

// 1x1 "network in network" convolution forward: a dense GEMM over the
// nActive feature rows (output = input x weight [+ bias]). Returns the
// multiply-accumulate flop count.
extern "C" double scn_R_(NetworkInNetwork_updateOutput)(
    THCTensor *input_features_, THCTensor *output_features_,
    THCTensor *weight_, THCTensor *bias_) {
  auto nActive = input_features_->size[0];
  auto input_nPlanes = weight_->size[0];
  auto output_nPlanes = weight_->size[1];
  THCTensor_(resize2d)(state, output_features_, nActive, output_nPlanes);
  auto input_features = THCTensor_(data)(state, input_features_);
  auto output_features = THCTensor_(data)(state, output_features_);
  auto weight = THCTensor_(data)(state, weight_);
  if (bias_ != nullptr) {
    auto bias = THCTensor_(data)(state, bias_);
    // Broadcast the bias into the output, then GEMM with beta=1 so the GEMM
    // accumulates on top of it.
    for (uInt i = 0; i < output_nPlanes; i += 32) {
      uInt blockDim = min(32L, output_nPlanes - i);
      uInt gridDim = min(4096L, nActive);
      Convolution_fp_bias<<<gridDim, blockDim, 0,
                            THCState_getCurrentStream(state)>>>(
          output_features + i, bias + i, output_nPlanes, output_nPlanes,
          nActive);
    }
    // Do GEMM (note: gemm assumes column-major matrices)
    // buffer is l*m (row-major)
    // weight is m*r (row-major)
    // output_features is l*r (row-major)
    // buffer * weights + bias -> output_features
    THBLAS_GEMM(state, 'n', 'n',
                output_nPlanes, // r
                nActive,        // l
                input_nPlanes,  // m
                1,              // alpha
                weight, output_nPlanes,        // r
                input_features, input_nPlanes, // m
                1,                             // beta
                output_features, output_nPlanes // r
                );
  } else {
    // No bias: zero the output and GEMM with beta=0.
    THCTensor_(zero)(state, output_features_);
    THBLAS_GEMM(state, 'n', 'n',
                output_nPlanes, // r
                nActive,        // l
                input_nPlanes,  // m
                1,              // alpha
                weight, output_nPlanes,        // r
                input_features, input_nPlanes, // m
                0,                             // beta
                output_features, output_nPlanes // r
                );
  }
  return nActive * input_nPlanes * output_nPlanes;
}
// Backward pass w.r.t. the input: d_input = d_output * W^T.
// d_input_features_ is resized to nActive x input_nPlanes and overwritten.
extern "C" void scn_R_(NetworkInNetwork_updateGradInput)(
    THCTensor *d_input_features_, THCTensor *d_output_features_,
    THCTensor *weight_) {
  auto nActive = d_output_features_->size[0];
  auto input_nPlanes = weight_->size[0];
  auto output_nPlanes = weight_->size[1];
  THCTensor_(resize2d)(state, d_input_features_, nActive, input_nPlanes);
  THCTensor_(zero)(state, d_input_features_);
  auto d_input_features = THCTensor_(data)(state, d_input_features_);
  auto d_output_features = THCTensor_(data)(state, d_output_features_);
  auto weight = THCTensor_(data)(state, weight_);
  // Do GEMM (note: gemm assumes column-major matrices)
  // d_output_features is l*m (row-major)
  // weights is r*m (row-major)
  // d_buffer is l*r (row-major)
  // d_output_features * T(weight) -> d_buffer
  THBLAS_GEMM(state, 't', 'n',
              input_nPlanes,  // r
              nActive,        // l
              output_nPlanes, // m
              1,              // alpha
              weight,
              output_nPlanes, // m
              d_output_features,
              output_nPlanes, // m
              0,              // beta: overwrite d_input_features
              d_input_features,
              input_nPlanes // r
              );
}
// Accumulate parameter gradients: d_weight += input^T * d_output (beta == 1),
// and, if d_bias_ is non-null, reduce d_output over rows into d_bias via
// Convolution_bp_bias (presumably accumulating — confirm in Convolution.h).
extern "C" void scn_R_(NetworkInNetwork_accGradParameters)(
    THCTensor *input_features_, THCTensor *d_output_features_,
    THCTensor *d_weight_, THCTensor *d_bias_) {
  auto nActive = input_features_->size[0];
  auto input_nPlanes = d_weight_->size[0];
  auto output_nPlanes = d_weight_->size[1];
  auto input_features = THCTensor_(data)(state, input_features_);
  auto d_output_features = THCTensor_(data)(state, d_output_features_);
  auto d_weight = THCTensor_(data)(state, d_weight_);
  // Do GEMM (note: gemm assumes column-major matrices)
  // buffer is m*l (row-major)
  // d_output_features is m*r (row-major)
  // weights is l*r (row-major)
  // T(buffer) * d_output_features -> d_weight
  THBLAS_GEMM(state, 'n', 't',
              output_nPlanes, // r
              input_nPlanes,  // l
              nActive,        // m
              1,              // alpha
              d_output_features,
              output_nPlanes, // r
              input_features,
              input_nPlanes, // l
              1,             // beta: accumulate into existing d_weight
              d_weight,
              output_nPlanes // r
              );
  if (d_bias_) {
    auto d_bias = THCTensor_(data)(state, d_bias_);
    Convolution_bp_bias(d_output_features, d_bias, output_nPlanes,
                        output_nPlanes, nActive,
                        THCState_getCurrentStream(state));
  }
}
#endif
PyTorch/sparseconvnet/SCN/generic/GPU/RuleBookIterator.h
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef GPU_RULEBOOKITERATOR_H
#define GPU_RULEBOOKITERATOR_H
// Macro to parallelize loading rulebook elements to GPU memory and operating
// on the elements of the rulebook.
// X is the function to apply.
// Y is a command to run
// Names bound inside the expansion (visible to X and Y):
//   ms    - upper bound on a single rulebook's element count; rulesBuffer is
//           grown once up front so every copy below fits.
//   rbB   - device pointer to the current rulebook's (input,output) pairs.
//   nHotB - number of pairs in rulebook k (r.size() / 2).
// X runs only when rulebook k is non-empty; Y runs after every k regardless
// (callers use Y to advance output pointers between filter offsets).
// NOTE(review): the copy is a synchronous cudaMemcpy on the default stream —
// presumably intentional so rbB is populated before the kernel X launches;
// confirm stream semantics before making it async.
#define RULEBOOKITERATOR(X, Y) \
  uInt ms = ruleBookMaxSize(_rules); \
  if (THCITensor_nElement(state, rulesBuffer) < ms) \
    THCITensor_resize1d(state, rulesBuffer, ms); \
  uInt *rbB = (uInt *)THCITensor_data(state, rulesBuffer); \
  for (int k = 0; k < _rules.size(); ++k) { \
    auto &r = _rules[k]; \
    uInt nHotB = r.size() / 2; \
    if (nHotB) { \
      cudaMemcpy(rbB, &r[0], sizeof(uInt) * 2 * nHotB, \
                 cudaMemcpyHostToDevice); \
    } \
    if (nHotB) { \
      X \
    } \
    Y \
  }
#endif
/* GPU_RULEBOOKITERATOR_H */
PyTorch/sparseconvnet/SCN/generic/GPU/SparseToDense.cu
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE_
#define TH_GENERIC_FILE_ "generic/GPU/SparseToDense.cu"
#else
#include "SparseToDense.h"
// Scatter the sparse feature matrix into a dense tensor of shape
// batchSize x nPlanes x spatial..., zero-filled at inactive sites.
extern "C" void scn_DR_(SparseToDense_updateOutput)(
    THLongTensor *inputSize, void **m, THCTensor *input_features,
    THCTensor *output_features, THCITensor *rulesBuffer) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  {
    // Size and zero the dense output: [batch, nPlanes, spatial dims...].
    long sz[Dimension + 2];
    sz[0] = _m.inputSGs->size();
    sz[1] = input_features->size[1];
    for (int i = 0; i < Dimension; i++) {
      auto x = THLongTensor_data(inputSize)[i];
      sz[i + 2] = x;
    }
    THCTensor_(resizeNd)(state, output_features, Dimension + 2, sz, NULL);
    THCTensor_(zero)(state, output_features);
  }
  // Fix: bind by reference. getSparseToDenseRuleBook returns a reference to a
  // cached RuleBook; plain `auto` deep-copied the whole vector-of-vectors on
  // every forward pass.
  auto &_rules = _m.getSparseToDenseRuleBook(inputSize, true);
  auto spatialVolume = _rules.size();
  uInt nPlanes = input_features->size[1];
  auto iF = THCTensor_(data)(state, input_features);
  auto oF = THCTensor_(data)(state, output_features);
  // Rulebook k holds the active sites at row-major spatial offset k. Within a
  // sample, element (plane, site) lives at plane*spatialVolume + site, and the
  // kernel applies the plane*spatialVolume stride itself, so advancing oF by
  // ONE per rulebook addresses successive spatial sites (resolves the old
  // "++ or +=spatialVolume?" todo: ++ is correct).
  RULEBOOKITERATOR(SparseToDense_ForwardPass<real>(
                       THCState_getCurrentStream(state), iF, oF, nPlanes,
                       spatialVolume, rbB, nHotB);
                   , oF++;)
}
// Backward pass of SparseToDense: gather gradients from the dense
// d_output_features back into the sparse d_input_features rows.
extern "C" void scn_DR_(SparseToDense_updateGradInput)(
    THLongTensor *inputSize, void **m, THCTensor *input_features,
    THCTensor *d_input_features, THCTensor *d_output_features,
    THCITensor *rulesBuffer) {
  THCTensor_(resizeAs)(state, d_input_features, input_features);
  THCTensor_(zero)(state, d_input_features);
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  // Fix: bind by reference. The getter returns a reference to a cached
  // RuleBook; plain `auto` deep-copied the whole vector-of-vectors per call.
  auto &_rules = _m.getSparseToDenseRuleBook(inputSize, true);
  auto spatialVolume = _rules.size();
  uInt nPlanes = d_input_features->size[1];
  auto diF = THCTensor_(data)(state, d_input_features);
  auto doF = THCTensor_(data)(state, d_output_features);
  // doF++ advances one spatial site per rulebook; planes are strided by
  // spatialVolume inside the kernel (mirrors updateOutput).
  RULEBOOKITERATOR(SparseToDense_BackwardPass<real>(
                       THCState_getCurrentStream(state), diF, doF, nPlanes,
                       spatialVolume, rbB, nHotB);
                   , doF++;)
}
#endif
PyTorch/sparseconvnet/SCN/generic/GPU/SparseToDense.h
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef GPU_SPARSETODENSE_H
#define GPU_SPARSETODENSE_H
#include "../SparseConvNet.h"
//#include <THC/THCAtomics.cuh>
// NTX must be >=2 so r is filled properly
// Forward kernel: copy each active site's nPlanes features into the dense
// output. rules holds nHot (inputRow, batchIdx) pairs for one spatial site;
// the thread block stages NTY pairs at a time in shared memory.
template <typename T, uInt NTX, uInt NTY>
__global__ void SparseToDense_fp(T *input_features, T *output_features,
                                 uInt nPlanes, uInt spatialVolume, uInt *rules,
                                 uInt nHot) {
  __shared__ uInt r[NTY * 2];
  for (uInt n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
    {
      uInt i = threadIdx.x + NTX * threadIdx.y;
      // Bug fix: the bound was "2 * (n - nHot)"; since n < nHot and uInt is
      // unsigned, that underflowed to a huge value, making the guard vacuous
      // and reading past the end of `rules` on the final partial chunk.
      if (i < NTY * 2 and i < 2 * (nHot - n))
        r[i] = rules[2 * n + i];
    }
    __syncthreads();
    if (n + threadIdx.y < nHot) {
      T *i = &input_features[r[2 * threadIdx.y] * nPlanes];
      T *o = &output_features[r[2 * threadIdx.y + 1] * spatialVolume * nPlanes];
      // Dense layout strides planes by spatialVolume within a sample.
      for (uInt plane = threadIdx.x; plane < nPlanes; plane += NTX)
        o[plane * spatialVolume] = i[plane];
    }
    __syncthreads(); // protect r[] before the next chunk overwrites it
  }
}
// Host-side launcher for SparseToDense_fp. All pointers are device pointers;
// the kernel is enqueued on `stream` with a fixed 32x(32,32) configuration.
template <typename T>
void SparseToDense_ForwardPass(cudaStream_t stream, T *input_features,
                               T *output_features, uInt nPlanes,
                               uInt spatialVolume, uInt *rules, uInt nHot) {
  const uInt gridSize = 32;
  const dim3 blockShape(32, 32); // x: planes, y: rule pairs
  SparseToDense_fp<T, 32, 32><<<gridSize, blockShape, 0, stream>>>(
      input_features, output_features, nPlanes, spatialVolume, rules, nHot);
}
// NTX must be >=2 so r is filled properly
// Backward kernel: gather each active site's gradient from the dense
// d_output_features into its row of d_input_features (mirror of the forward
// copy, with source and destination swapped).
template <typename T, uInt NTX, uInt NTY>
__global__ void SparseToDense_bp(T *d_input_features, T *d_output_features,
                                 uInt nPlanes, uInt spatialVolume, uInt *rules,
                                 uInt nHot) {
  __shared__ uInt r[NTY * 2];
  for (uInt n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
    {
      uInt i = threadIdx.x + NTX * threadIdx.y;
      // Bug fix: the bound was "2 * (n - nHot)", which underflows (uInt) and
      // made the guard vacuous, reading past the end of `rules` on the final
      // partial chunk. The remaining-pairs count is nHot - n.
      if (i < NTY * 2 and i < 2 * (nHot - n))
        r[i] = rules[2 * n + i];
    }
    __syncthreads();
    if (n + threadIdx.y < nHot) {
      T *i = &d_input_features[r[2 * threadIdx.y] * nPlanes];
      T *o = &d_output_features[r[2 * threadIdx.y + 1] * spatialVolume *
                                nPlanes];
      for (uInt plane = threadIdx.x; plane < nPlanes; plane += NTX)
        i[plane] = o[plane * spatialVolume];
    }
    __syncthreads(); // protect r[] before the next chunk overwrites it
  }
}
// Host-side launcher for SparseToDense_bp; configuration mirrors the
// forward launcher. All pointers are device pointers.
template <typename T>
void SparseToDense_BackwardPass(cudaStream_t stream, T *d_input_features,
                                T *d_output_features, uInt nPlanes,
                                uInt spatialVolume, uInt *rules, uInt nHot) {
  const uInt gridSize = 32;
  const dim3 blockShape(32, 32); // x: planes, y: rule pairs
  SparseToDense_bp<T, 32, 32><<<gridSize, blockShape, 0, stream>>>(
      d_input_features, d_output_features, nPlanes, spatialVolume, rules,
      nHot);
}
#endif
/* GPU_SPARSETODENSE_H */
PyTorch/sparseconvnet/SCN/generic/GPU/THGenerateCudaFloatTypes.h
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE
#error \
    "You must define TH_GENERIC_FILE before including THGenerateCudaFloatTypes.h"
#endif
// Expand TH_GENERIC_FILE once per supported CUDA floating-point type,
// following the TH "generic" convention (real/accreal/Real/CReal) so the
// included file can build type-suffixed names such as THCTensor_(...).
// Only float is generated at present; THBLAS_GEMM maps to the matching
// cuBLAS GEMM wrapper. TH_GENERIC_FILE is consumed (#undef'd) at the end,
// so callers must re-#define it before including this header again.
// float
#define real float
#define accreal double
#define Real Float
#define CReal Cuda
#define TH_REAL_IS_FLOAT
#define THBLAS_GEMM THCudaBlas_Sgemm
#line 1 TH_GENERIC_FILE
#include TH_GENERIC_FILE
#undef accreal
#undef real
#undef Real
#undef CReal
#undef TH_REAL_IS_FLOAT
#undef THBLAS_GEMM
#undef TH_GENERIC_FILE
PyTorch/sparseconvnet/SCN/generic/GPU/THGenerateDimCudaFloatTypes.h
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE_
#error "Define TH_GENERIC_FILE_ before including THGenerateDimCudaFloatTypes.h"
#endif
// Instantiate TH_GENERIC_FILE_ for spatial dimensions 1..10.
// THGenerateCudaFloatTypes.h consumes (#undef's) TH_GENERIC_FILE on each
// pass, so it is re-#define'd before every include.
// (Fix: removed a stray duplicate "#define TH_GENERIC_FILE TH_GENERIC_FILE_"
// that preceded the Dimension 1 block — it was immediately redefined with an
// identical body and had no effect.)
#define Dimension 1
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateCudaFloatTypes.h"
#undef Dimension
#define Dimension 2
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateCudaFloatTypes.h"
#undef Dimension
#define Dimension 3
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateCudaFloatTypes.h"
#undef Dimension
#define Dimension 4
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateCudaFloatTypes.h"
#undef Dimension
#define Dimension 5
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateCudaFloatTypes.h"
#undef Dimension
#define Dimension 6
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateCudaFloatTypes.h"
#undef Dimension
#define Dimension 7
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateCudaFloatTypes.h"
#undef Dimension
#define Dimension 8
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateCudaFloatTypes.h"
#undef Dimension
#define Dimension 9
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateCudaFloatTypes.h"
#undef Dimension
#define Dimension 10
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateCudaFloatTypes.h"
#undef Dimension
#undef TH_GENERIC_FILE_
PyTorch/sparseconvnet/SCN/generic/Geometry/ActivePoolingRules.h
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef ACTIVEPOOLING_H
#define ACTIVEPOOLING_H
#include "../SparseConvNet.h"
// Return the maximum number of active sites in the batch
// rules has size 1.
// rules[0] is a batchSize x (maxActive + 1) matrix.
// First column is number of active sites for that sample (<= maxActive)
// Remaining maxActive columns give the active sites, zero padded.
// Build the active-pooling rulebook (see header comment above for layout):
// rules[0] packs, per sample, [count, active site indices..., 0-padding] so
// every sample occupies exactly (maxActive + 1) entries; rules[1] holds
// [batchSize, maxActive].
template <uInt dimension>
void activePoolingRules(SparseGrids<dimension> &SGs, RuleBook &rules) {
  rules.clear();
  rules.resize(2);
  auto &r = rules[0];
  uInt maxActive = 0;
  for (auto &sg : SGs)
    maxActive = std::max(maxActive, (uInt)sg.mp.size());
  for (auto &sg : SGs) {
    r.push_back(sg.mp.size());
    for (auto &iter : sg.mp)
      r.push_back(sg.ctr + iter.second); // globalize the feature-row index
    // Bug fix: the padding test used rules.size() — a constant 2 that
    // push_back on r never changes — so it either skipped padding entirely
    // or spun forever whenever maxActive >= 2. The row data lives in r, so
    // pad r to a multiple of (maxActive + 1).
    while (r.size() % (maxActive + 1) != 0)
      r.push_back(0); // padding
  }
  rules[1].push_back(SGs.size());
  rules[1].push_back(maxActive);
}
#endif
/* ACTIVEPOOLING_H */
PyTorch/sparseconvnet/SCN/generic/Geometry/ConvolutionRules.h
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CONVOLUTIONRULES_H
#define CONVOLUTIONRULES_H
#include "RectangularRegions.h"
// For one sample: enumerate every (input site, output site) pair connected by
// the filter, creating output sites on demand.
// rules[f], for filter offset f in [0, volume(size)), collects flat pairs
// [in0, out0, in1, out1, ...]. Input indices are globalized by adding
// inputGrid.ctr; output indices are assigned from outputGrid.ctr, which the
// caller pre-seeds with the running global count.
// inputSpatialSize is accepted for signature symmetry but not used here.
template <uInt dimension>
void Convolution_InputSgToRulesAndOutputSg(
    SparseGrid<dimension> &inputGrid, SparseGrid<dimension> &outputGrid,
    RuleBook &rules, long *size, long *stride, long *inputSpatialSize,
    long *outputSpatialSize) {
  rules.resize(volume<dimension>(size)); // one pair list per filter offset
  for (auto const &inIter : inputGrid.mp) {
    // All output sites whose receptive field covers this input site.
    for (auto j : OutputRegionCalculator<dimension>(inIter.first, size, stride,
                                                    outputSpatialSize)) {
      auto inRegion = InputRegionCalculator<dimension>(j, size, stride);
      // Position of the input site within this output's filter footprint.
      uInt rulesOffset = inRegion.offset(inIter.first);
      auto outIter = outputGrid.mp.find(j);
      if (outIter == outputGrid.mp.end()) {
        // First hit on this output site: assign it the next output index.
        outIter =
            outputGrid.mp.insert(std::make_pair(j, outputGrid.ctr++)).first;
      }
      rules[rulesOffset].push_back(inIter.second + inputGrid.ctr);
      rules[rulesOffset].push_back(outIter->second);
    }
  }
}
// Sequential rulebook construction over a whole batch. Returns the total
// number of active output sites.
// ctr bookkeeping: each output grid's ctr is pre-seeded with the running
// global count so newly created output sites get globally unique indices;
// after the sample is processed, the grown ctr becomes the new running count
// and the grid's ctr is reset to 0 (its stored indices are already global).
template <uInt dimension>
uInt Convolution_InputSgsToRulesAndOutputSgs(
    SparseGrids<dimension> &input_SGs, SparseGrids<dimension> &output_SGs,
    RuleBook &rules, long *filterSize, long *filterStride,
    long *input_spatialSize, long *output_spatialSize) {
  rules.clear();
  output_SGs.clear();
  uInt batchSize = input_SGs.size();
  output_SGs.resize(batchSize);
  uInt output_nActive = 0;
  for (uInt i = 0; i < batchSize; i++) {
    auto &iSG = input_SGs[i];
    auto &oSG = output_SGs[i];
    oSG.ctr = output_nActive; // seed with running global count
    Convolution_InputSgToRulesAndOutputSg<dimension>(
        iSG, oSG, rules, filterSize, filterStride, input_spatialSize,
        output_spatialSize);
    output_nActive = oSG.ctr; // grown by the sites created above
    oSG.ctr = 0;              // indices in oSG.mp are already global
  }
  return output_nActive;
}
// OpenMP variant: build one private rulebook per sample in parallel (each
// with sample-local output indices starting at 0), then compute per-sample
// global offsets with a prefix sum and merge, offsetting output indices.
// Returns the total number of active output sites.
template <uInt dimension>
uInt Convolution_InputSgsToRulesAndOutputSgs_OMP(
    SparseGrids<dimension> &input_SGs, SparseGrids<dimension> &output_SGs,
    RuleBook &rules, long *filterSize, long *filterStride,
    long *input_spatialSize, long *output_spatialSize) {
  rules.clear();
  rules.resize(volume<dimension>(filterSize));
  output_SGs.clear();
  uInt batchSize = input_SGs.size();
  output_SGs.resize(batchSize);
  std::vector<RuleBook> rbs(batchSize); // one private rulebook per sample
  {
    uInt i;
#pragma omp parallel for private(i)
    for (i = 0; i < batchSize; i++)
      Convolution_InputSgToRulesAndOutputSg<dimension>(
          input_SGs[i], output_SGs[i], rbs[i], filterSize, filterStride,
          input_spatialSize, output_spatialSize);
  }
  // Exclusive prefix sum: after this, output_SGs[i].ctr is sample i's global
  // output-index offset and output_nActive is the grand total.
  uInt output_nActive = 0;
  for (uInt i = 0; i < batchSize; i++) {
    // Parallel assignment:
    // output_nActive <- output_nActive+output_SGs[i].ctr
    // output_SGs[i].ctr <- output_nActive
    uInt tmp = output_nActive;
    output_nActive += output_SGs[i].ctr;
    output_SGs[i].ctr = tmp;
  }
  // Merge: concatenate per-sample pairs per filter offset, shifting output
  // indices by each sample's global offset (input indices are already global
  // via inputGrid.ctr). Parallel over filter offsets — each R is private.
  {
    uInt i;
#pragma omp parallel for private(i)
    for (i = 0; i < rules.size(); i++) {
      auto &R = rules[i];
      for (uInt j = 0; j < batchSize; j++) {
        auto &r = rbs[j][i];
        auto offset = output_SGs[j].ctr;
        for (uInt k = 0; k < r.size();) {
          R.push_back(r[k++]);          // input index (already global)
          R.push_back(r[k++] + offset); // output index + sample offset
        }
      }
    }
  }
  return output_nActive;
}
// for each site in filterVolume, list of (inputFeatureNumber,batchIdx) pairs
// Reuses the convolution rule generator with a filter covering the whole
// spatial extent at stride 1: each sample then collapses to a single output
// site, its assigned index is the batch index, and the filter-offset slot
// doubles as the row-major spatial site index.
template <uInt dimension>
void SparseToDense_InputSgsToRulesAndOutputSgs(
    SparseGrids<dimension> &input_SGs, RuleBook &rules, long *spatialSize) {
  uInt batchSize = input_SGs.size();
  SparseGrids<dimension> output_SGs(batchSize); // scratch; discarded on exit
  std::vector<long> ones(dimension, 1);         // stride 1, output size 1
  rules.clear();
  for (uInt i = 0; i < batchSize; i++) {
    auto &iSG = input_SGs[i];
    auto &oSG = output_SGs[i];
    oSG.ctr = i;
    // batchIdx
    Convolution_InputSgToRulesAndOutputSg<dimension>(
        iSG, oSG, rules, spatialSize, &ones[0], spatialSize, &ones[0]);
  }
}
// OpenMP variant of the above: per-sample private rulebooks built in
// parallel, then concatenated per spatial site. No index offsetting is
// needed — output indices are batch indices (set via oSG.ctr = i) and input
// indices are already global.
template <uInt dimension>
void SparseToDense_InputSgsToRulesAndOutputSgs_OMP(
    SparseGrids<dimension> &input_SGs, RuleBook &rules, long *spatialSize) {
  uInt batchSize = input_SGs.size();
  SparseGrids<dimension> output_SGs(batchSize); // scratch; discarded on exit
  std::vector<long> ones(dimension, 1);         // stride 1, output size 1
  rules.clear();
  rules.resize(volume<dimension>(spatialSize)); // one list per spatial site
  std::vector<RuleBook> rbs(batchSize);
  {
    uInt i;
#pragma omp parallel for private(i)
    for (i = 0; i < batchSize; i++) {
      output_SGs[i].ctr = i;
      // batchIdx
      Convolution_InputSgToRulesAndOutputSg<dimension>(
          input_SGs[i], output_SGs[i], rbs[i], spatialSize, &ones[0],
          spatialSize, &ones[0]);
    }
  }
  // Merge per-sample rulebooks; parallel over spatial sites, each R private.
  {
    uInt i;
#pragma omp parallel for private(i)
    for (i = 0; i < rules.size(); i++) {
      auto &R = rules[i];
      for (uInt j = 0; j < batchSize; j++) {
        auto &r = rbs[j][i];
        for (uInt k = 0; k < r.size();) {
          R.push_back(r[k++]);
          R.push_back(r[k++]);
        }
      }
    }
  }
}
#endif
/* CONVOLUTIONRULES_H */
PyTorch/sparseconvnet/SCN/generic/Geometry/Metadata.cpp
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/Geometry/Metadata.cpp"
#else
#include "Metadata.h"
#include <cstring>
// C entry point: forward the input spatial size to the Metadata object,
// creating it first if *m is not yet initialized (macro behavior).
extern "C" void scn_D_(setInputSpatialSize)(void **m,
                                            THLongTensor *spatialSize) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  _m.setInputSpatialSize(spatialSize);
}
// Append an empty sample (sparse grid) to the batch and make it the current
// grid that subsequent setInputSpatialLocation calls write into.
extern "C" void scn_D_(batchAddSample)(void **m) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  assert(_m.inputSGs && "Call setInputSpatialSize first, please!");
  _m.inputSGs->resize(_m.inputSGs->size() + 1);
  _m.inputSG = &_m.inputSGs->back(); // current sample
}
// Set the feature vector `vec` at spatial `location` in the current sample.
// New locations get a fresh row appended to `features` (which is resized);
// existing locations are only overwritten when `overwrite` is true.
extern "C" void scn_D_(setInputSpatialLocation)(void **m,
                                                THFloatTensor *features,
                                                THLongTensor *location,
                                                THFloatTensor *vec,
                                                bool overwrite) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto p = LongTensorToPoint<Dimension>(location);
  auto &mp = _m.inputSG->mp;
  auto &nActive = *_m.inputNActive;
  auto iter = mp.find(p);
  auto nPlanes = vec->size[0];
  if (iter == mp.end()) {
    // New active site: register it with the next row index, grow the
    // feature matrix by one row, and copy vec into that row.
    iter = mp.insert(std::make_pair(p, nActive++)).first;
    THFloatTensor_resize2d(features, nActive, nPlanes);
    std::memcpy(THFloatTensor_data(features) + (nActive - 1) * nPlanes,
                THFloatTensor_data(vec), sizeof(float) * nPlanes);
  } else if (overwrite) {
    // Existing site: replace its row in place.
    std::memcpy(THFloatTensor_data(features) + iter->second * nPlanes,
                THFloatTensor_data(vec), sizeof(float) * nPlanes);
  }
}
// Build batch metadata directly from a dense tensor's nonzero coordinates.
// nz_  : nActive x (1 + Dimension); column 0 is the batch index (assumed
//        sorted ascending — TODO confirm against the caller), columns
//        1..Dimension are spatial coordinates.
// pad_ : per-sample, per-dimension offsets added to each coordinate.
extern "C" void scn_D_(createMetadataForDenseToSparse)(
    void **m, THLongTensor *spatialSize_, THLongTensor *pad_,
    THLongTensor *nz_, long batchSize) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  _m.setInputSpatialSize(spatialSize_);
  _m.inputSGs->resize(batchSize);
  auto &nActive = *_m.inputNActive;
  nActive = nz_->size[0];
  auto nz = THLongTensor_data(nz_);
  auto pad = THLongTensor_data(pad_);
  // (Fix: removed an unused local that fetched spatialSize_'s data pointer.)
  // br[b]..br[b+1] delimit sample b's rows in nz.
  std::vector<uInt> br(batchSize + 1);
  if (batchSize == 1) {
    br[1] = nActive;
  } else {
    long b = 0;
    for (uInt i = 0; i < nActive; i++) {
      long B = nz[i * (Dimension + 1)];
      for (; b < B;)
        br[++b] = i;
    }
    for (; b < batchSize;)
      br[++b] = nActive; // trailing samples with no active sites
  }
  // Populate each sample's sparse grid in parallel; samples are disjoint.
  uInt b;
#pragma omp parallel for private(b)
  for (b = 0; b < batchSize; b++) {
    auto &sg = _m.inputSGs->at(b);
    for (uInt i = br[b]; i < br[b + 1]; i++) {
      Point<Dimension> x;
      for (uInt j = 0; j < Dimension; j++) {
        x[j] = nz[i * (Dimension + 1) + j + 1] + pad[b * Dimension + j];
        // 0-indexed
      }
      sg.mp[x] = i; // feature row == row in nz_
    }
  }
}
// tensor is size[0] x .. x size[Dimension-1] x size[Dimension]
// size[0] x .. x size[Dimension-1] == spatial volume
// size[Dimension] == #feature planes
// Append one sample whose active sites are the spatial locations of tensor_
// where any plane exceeds `threshold` in magnitude (and which fall inside
// spatialSize_ after adding offset_). features_ is grown pessimistically to
// hold every location, filled densely, then shrunk to the true count.
extern "C" void scn_D_(addSampleFromThresholdedTensor)(
    void **m, THFloatTensor *features_, THFloatTensor *tensor_,
    THLongTensor *offset_, THLongTensor *spatialSize_, float threshold) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto &nActive = *_m.inputNActive;
  auto &SGs = *_m.inputSGs;
  SGs.resize(SGs.size() + 1); // new sample
  auto &sg = SGs.back();
  auto tensor = THFloatTensor_data(tensor_);
  auto offset = THLongTensor_data(offset_);
  auto spatialSize = THLongTensor_data(spatialSize_);
  long *size = tensor_->size;
  auto nPlanes = size[Dimension];
  long volume = 1;
  for (int i = 0; i < Dimension; ++i)
    volume *= size[i];
  // Worst case: every location is active.
  THFloatTensor_resize2d(features_, nActive + volume, nPlanes);
  // Increment pointers as we work through the data
  auto features = THFloatTensor_data(features_) + nActive * nPlanes;
  // Active locations
  Point<Dimension> point;
  for (uInt i = 0; i < Dimension; i++)
    point[i] = offset[i];
  for (uInt ctr = 0; ctr < volume; ctr++) {
    // Active if any plane's magnitude exceeds the threshold...
    bool active = false;
    for (uInt i = 0; i < nPlanes; i++) {
      if (fabs(tensor[i]) > threshold) {
        active = true;
        break;
      }
    }
    // ...and the (offset) location lies inside the spatial extent.
    for (uInt i = 0; i < Dimension; i++) {
      if (point[i] < 0 or point[i] >= spatialSize[i]) {
        active = false;
        break;
      }
    }
    if (active) {
      sg.mp[point] = nActive++;
      std::memcpy(features, tensor, sizeof(float) * nPlanes);
      features += nPlanes;
    }
    tensor += nPlanes; // next spatial location's plane vector
    incrementPointInCube<Dimension>(point, size, offset);
  }
  // Shrink back to the rows actually written.
  THFloatTensor_resize2d(features_, nActive, nPlanes);
}
// 3x3 valid convolutions, 3x3/2x2 pooling or strided convolutions
// Pre-generate rulebooks for a VGG-style tower: at each scale, a size-3
// valid convolution, then a size-3 stride-2 downsample, repeating while
// every spatial dimension is odd and >= 3.
extern "C" void scn_D_(generateRuleBooks3s2)(void **m) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  long sz[Dimension], str[Dimension], inS[Dimension], outS[Dimension];
  // Cache keys: p1 = spatial size; p2 = (spatial size, filter size);
  // p3 = (spatial size, filter size, stride).
  Point<Dimension> p1;
  Point<2 * Dimension> p2;
  Point<3 * Dimension> p3;
  for (int i = 0; i < Dimension; ++i) {
    p1[i] = p2[i] = p3[i] = inS[i] = _m.inputSpatialSize[i];
    p2[i + Dimension] = p3[i + Dimension] = sz[i] = 3;
    p3[i + 2 * Dimension] = str[i] = 2;
  }
  while (true) {
    auto &SGs = _m.grids[p1];
    auto &rb = _m.validRuleBooks[p2];
    if (rb.empty()) // only build if not already cached
      ValidConvolution_SgsToRules(SGs, rb, sz);
    // Stop unless every dimension can take another (odd, >=3) 2x downsample.
    for (int i = 0; i < Dimension; ++i)
      if (p1[i] < 3 or p1[i] % 2 != 1)
        return;
      else
        p1[i] = outS[i] = (inS[i] - 1) / 2;
    auto &SGs2 = _m.grids[p1];
    auto &rb2 = _m.ruleBooks[p3];
    if (rb2.empty())
      _m.nActive[p1] = Convolution_InputSgsToRulesAndOutputSgs(
          SGs, SGs2, rb2, sz, str, inS, outS);
    // Advance the cache keys/sizes to the downsampled scale.
    for (int i = 0; i < Dimension; ++i)
      p2[i] = p3[i] = inS[i] = outS[i];
  }
}
// 3x3 valid convolutions, 2x2 pooling or strided convolutions
// Pre-generate rulebooks for a tower of size-3 valid convolutions followed
// by size-2 stride-2 downsamples, repeating while every spatial dimension is
// even and >= 2.
extern "C" void scn_D_(generateRuleBooks2s2)(void **m) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  long s2[Dimension], s3[Dimension], inS[Dimension], outS[Dimension];
  // Cache keys: p1 = spatial size; p2 = (spatial size, filter size);
  // p3 = (spatial size, filter size, stride).
  Point<Dimension> p1;
  Point<2 * Dimension> p2;
  Point<3 * Dimension> p3;
  for (int i = 0; i < Dimension; ++i) {
    p1[i] = p2[i] = p3[i] = inS[i] = _m.inputSpatialSize[i];
    p2[i + Dimension] = s3[i] = 3;
    p3[i + Dimension] = p3[i + 2 * Dimension] = s2[i] = 2;
  }
  while (true) {
    auto &SGs = _m.grids[p1];
    auto &rb = _m.validRuleBooks[p2];
    // Fix: guard on the cache being empty, matching generateRuleBooks3s2 and
    // getValidRuleBook. Without it a repeated call appended duplicate rules
    // into the cached rulebook.
    if (rb.empty())
      ValidConvolution_SgsToRules(SGs, rb, s3);
    // Stop unless every dimension can take another (even, >=2) 2x downsample.
    for (int i = 0; i < Dimension; ++i)
      if (p1[i] < 2 or p1[i] % 2 != 0)
        return;
      else
        p1[i] = outS[i] = inS[i] / 2;
    auto &SGs2 = _m.grids[p1];
    auto &rb2 = _m.ruleBooks[p3];
    if (rb2.empty())
      _m.nActive[p1] = Convolution_InputSgsToRulesAndOutputSgs(
          SGs, SGs2, rb2, s2, s2, inS, outS);
    // Advance the cache keys/sizes to the downsampled scale.
    for (int i = 0; i < Dimension; ++i)
      p2[i] = p3[i] = inS[i] = outS[i];
  }
}
// Destroy the Metadata object held in *m (macro deletes and resets the slot).
extern "C" void scn_D_(freeMetadata)(void **m) {
  SCN_DELETE(Metadata<Dimension>, m)
}
#endif
PyTorch/sparseconvnet/SCN/generic/Geometry/Metadata.h
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef Metadata_H
#define Metadata_H
#include "../SparseConvNet.h"
#include "ActivePoolingRules.h"
#include "ConvolutionRules.h"
#include "ValidConvolutionRules.h"
#include <iostream>
#include <tuple>
#include <unordered_map>
// Per-network bookkeeping for one spatial dimensionality: sparse grids and
// lazily-built, cached rulebooks, all keyed by (combinations of) spatial
// size, filter size and stride. All get*RuleBook accessors build on first
// use and return the cached entry thereafter; an empty rulebook is treated
// as "not yet built".
template <uInt dimension> class Metadata {
public:
  // Active-site counts per spatial size.
  std::unordered_map<Point<dimension>, uInt, IntArrayHash<dimension>> nActive;
  // Sparse grids (one per batch sample) per spatial size.
  std::unordered_map<Point<dimension>, SparseGrids<dimension>,
                     IntArrayHash<dimension>>
      grids;
  // Rulebooks for active pooling, keyed by spatial size.
  std::unordered_map<Point<dimension>, RuleBook, IntArrayHash<dimension>>
      activePoolingRuleBooks;
  // Rulebooks for "valid" convolutions, keyed by (spatial size, filter size).
  std::unordered_map<Point<2 * dimension>, RuleBook,
                     IntArrayHash<2 * dimension>>
      validRuleBooks;
  // Rulebooks for strided convolutions, keyed by (size, filter, stride).
  std::unordered_map<Point<3 * dimension>, RuleBook,
                     IntArrayHash<3 * dimension>>
      ruleBooks;
  // Rulebooks for sparse-to-dense conversion, keyed by spatial size.
  std::unordered_map<Point<dimension>, RuleBook, IntArrayHash<dimension>>
      sparseToDenseRuleBooks;
  Point<dimension> inputSpatialSize;
  // Convenience pointers into the maps above for the network's input scale;
  // valid only after setInputSpatialSize has been called.
  SparseGrids<dimension> *inputSGs;
  SparseGrid<dimension> *inputSG; // current sample being populated
  uInt *inputNActive;
  Metadata() {}
  // Record the input spatial size and point the input aliases at the
  // corresponding (default-constructed on first use) map entries.
  void setInputSpatialSize(THLongTensor *spatialSize) {
    inputSpatialSize = LongTensorToPoint<dimension>(spatialSize);
    inputSGs = &grids[inputSpatialSize];
    inputNActive = &nActive[inputSpatialSize];
  }
  SparseGrids<dimension> &getSparseGrid(THLongTensor *spatialSize) {
    return grids[LongTensorToPoint<dimension>(spatialSize)];
  };
  uInt getNActive(THLongTensor *spatialSize) {
    return nActive[LongTensorToPoint<dimension>(spatialSize)];
  };
  // Valid (same-resolution) convolution rulebook; built on first use,
  // optionally with the OpenMP generator.
  RuleBook &getValidRuleBook(THLongTensor *spatialSize, THLongTensor *size,
                             bool openMP) {
    auto p = TwoLongTensorsToPoint<dimension>(spatialSize, size);
    auto &rb = validRuleBooks[p];
    if (rb.empty()) {
      auto &SGs = grids[LongTensorToPoint<dimension>(spatialSize)];
#if defined(ENABLE_OPENMP)
      openMP ? ValidConvolution_SgsToRules_OMP(SGs, rb,
                                               THLongTensor_data(size))
             :
#endif
             ValidConvolution_SgsToRules(SGs, rb, THLongTensor_data(size));
    }
    return rb;
  }
  RuleBook &getActivePoolingRuleBook(THLongTensor *spatialSize) {
    auto spatialSz = LongTensorToPoint<dimension>(spatialSize);
    auto &SGs = grids[spatialSz];
    auto &rb = activePoolingRuleBooks[spatialSz];
    if (rb.empty())
      activePoolingRules(SGs, rb);
    return rb;
  }
  RuleBook &getSparseToDenseRuleBook(THLongTensor *spatialSize, bool openMP) {
    auto ss = LongTensorToPoint<dimension>(spatialSize);
    auto &SGs = grids[ss];
    auto &rb = sparseToDenseRuleBooks[ss];
    if (rb.empty())
#if defined(ENABLE_OPENMP)
      openMP ? SparseToDense_InputSgsToRulesAndOutputSgs_OMP(
                   SGs, rb, THLongTensor_data(spatialSize))
             :
#endif
             SparseToDense_InputSgsToRulesAndOutputSgs(
                 SGs, rb, THLongTensor_data(spatialSize));
    return rb;
  }
  // Strided convolution rulebook; also records the output scale's active
  // count in nActive as a side effect of building.
  RuleBook &getRuleBook(THLongTensor *inputSpatialSize,
                        THLongTensor *outputSpatialSize, THLongTensor *size,
                        THLongTensor *stride, bool openMP) {
    auto p = ThreeLongTensorsToPoint<dimension>(inputSpatialSize, size, stride);
    auto &rb = ruleBooks[p];
    if (rb.empty()) {
      auto iS = LongTensorToPoint<dimension>(inputSpatialSize);
      auto oS = LongTensorToPoint<dimension>(outputSpatialSize);
      auto &iSGs = grids[iS];
      auto &oSGs = grids[oS];
      nActive[oS] =
#if defined(ENABLE_OPENMP)
          openMP ? Convolution_InputSgsToRulesAndOutputSgs_OMP(
                       iSGs, oSGs, rb, THLongTensor_data(size),
                       THLongTensor_data(stride),
                       THLongTensor_data(inputSpatialSize),
                       THLongTensor_data(outputSpatialSize))
                 :
#endif
                 Convolution_InputSgsToRulesAndOutputSgs(
                     iSGs, oSGs, rb, THLongTensor_data(size),
                     THLongTensor_data(stride),
                     THLongTensor_data(inputSpatialSize),
                     THLongTensor_data(outputSpatialSize));
    }
    return rb;
  }
};
#endif
PyTorch/sparseconvnet/SCN/generic/Geometry/RectangularRegions.h
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef RECTANGULARREGIONS_H
#define RECTANGULARREGIONS_H
#include "../SparseConvNet.h"
// For iterating over the rectangular region with corners lb and ub.
// The .end() method and operator!= are designed to allow range based for
// loops of the region, but nothing else.
template
<
uInt
dimension
>
class
RectangularRegionIterator
;
// Axis-aligned box of lattice points with inclusive corners lb and ub,
// iterable in row-major order via RectangularRegionIterator.
template <uInt dimension> class RectangularRegion {
public:
  Point<dimension> lb; // inclusive lower corner
  Point<dimension> ub; // inclusive upper corner
  RectangularRegion(Point<dimension> &lb, Point<dimension> &ub)
      : lb(lb), ub(ub) {}
  RectangularRegionIterator<dimension> begin() {
    return RectangularRegionIterator<dimension>(*this, lb);
  }
  RectangularRegionIterator<dimension> end() {
    // Not really used by the custom operator!= function
    // Otherwise it would need to represent a point just outside the region
    return RectangularRegionIterator<dimension>(*this, ub);
  }
  // Row-major rank of p among the region's points (0 for lb). p is assumed
  // to lie inside the region.
  uInt offset(const Point<dimension> &p) {
    // Enumerate the points inside the region
    uInt of = 0, m = 1;
    for (Int i = dimension - 1; i >= 0; i--) {
      of += m * (p[i] - lb[i]);
      m *= ub[i] - lb[i] + 1;
    }
    return of;
  }
};
// Minimal input-iterator over a RectangularRegion's points in row-major
// order. Termination is signalled via the stillLooping flag, which the
// region's custom operator!= inspects (only range-for is supported).
template <uInt dimension> class RectangularRegionIterator {
private:
  RectangularRegion<dimension> &region;

public:
  bool stillLooping;       // false once iteration has wrapped past ub
  Point<dimension> point;  // current position (copy, advanced in place)
  RectangularRegionIterator(RectangularRegion<dimension> &region,
                            Point<dimension> &point)
      : region(region), point(point), stillLooping(true) {
    // If stride > size, we can have lb[i]>ub[i] meaning region_size == 0
    for (Int i = 0; i < dimension; i++)
      if (point[i] > region.ub[i])
        stillLooping = false;
  }
  // Odometer increment: bump the last coordinate; on overflow reset it to
  // lb and carry into the previous coordinate, ending when the first
  // coordinate overflows.
  RectangularRegionIterator<dimension> &operator++() {
    for (Int i = dimension - 1;;) {
      point[i]++;
      if (point[i] <= region.ub[i])
        break;
      point[i] = region.lb[i];
      i--;
      if (i == -1) {
        stillLooping = false;
        // Signal to operator!= to end iteration
        break;
      }
    }
    return *this;
  }
  Point<dimension> &operator*() { return point; }
};
// Only to be used for checking the end point of range based for loops.
// Deliberately ignores rhs: the loop continues exactly while lhs has not
// yet wrapped past the region's upper corner.
template <uInt dimension>
inline bool operator!=(const RectangularRegionIterator<dimension> &lhs,
                       const RectangularRegionIterator<dimension> &rhs) {
  return lhs.stillLooping;
}
// Odometer-style step of `point` through the box
// [offset[d], offset[d]+size[d]-1] in each dimension d, row-major order:
// bump the last coordinate, carrying upward on overflow. Incrementing the
// final point wraps back to the box's first corner.
template <uInt dimension>
void incrementPointInCube(Point<dimension> &point, long *size, long *offset) {
  Int axis = dimension - 1;
  while (axis >= 0) {
    ++point[axis];
    if (point[axis] < offset[axis] + size[axis])
      return; // no carry needed
    point[axis] = offset[axis]; // wrap this coordinate, carry to the next
    --axis;
  }
}
// For a convolutional layer with given filter *size* and *stride*, compute
// the rectangular subset of the input field feeding one output site: the
// filter footprint anchored at output*stride.
template <uInt dimension>
RectangularRegion<dimension>
InputRegionCalculator(const Point<dimension> &output, long *size,
                      long *stride) {
  Point<dimension> lower, upper;
  for (uInt d = 0; d < dimension; d++) {
    const long anchor = output[d] * stride[d];
    lower[d] = anchor;
    upper[d] = anchor + size[d] - 1;
  }
  return RectangularRegion<dimension>(lower, upper);
}
// For a convolutional layer with the given filter *size* and *stride*:
// the rectangular subset of the output field affected by one input point.
// Inverse of InputRegionCalculator: output o sees input i iff
// o*stride <= i <= o*stride + size - 1, per dimension.
template <uInt dimension>
RectangularRegion<dimension>
OutputRegionCalculator(const Point<dimension> &input, long *size, long *stride,
                       long *outputSpatialSize) {
  Point<dimension> lb, ub;
  for (uInt d = 0; d < dimension; d++) {
    // First output whose window still reaches input[d], clamped at 0.
    lb[d] = std::max(0L, (input[d] - size[d] + stride[d]) / stride[d]);
    // Last output whose window starts at or before input[d], clamped to
    // the spatial extent of the output layer.
    ub[d] = std::min(outputSpatialSize[d] - 1, input[d] / stride[d]);
  }
  return RectangularRegion<dimension>(lb, ub);
}
#endif
/* RECTANGULARREGIONS_H */
PyTorch/sparseconvnet/SCN/generic/Geometry/THGenerateDimTypes.h
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE
#error "You must define TH_GENERIC_FILE before including THGenerateDimTypes.h"
#endif

// "X-macro" style generator: include the file named by TH_GENERIC_FILE once
// for each supported spatial dimension, with the macro `Dimension` set to
// 1, 2, ..., 10 in turn.  Each `#line 1` directive resets the line number
// reported by the compiler, so diagnostics point into TH_GENERIC_FILE
// itself rather than into this wrapper.
#define Dimension 1
#line 1 TH_GENERIC_FILE
#include TH_GENERIC_FILE
#undef Dimension
#define Dimension 2
#line 1 TH_GENERIC_FILE
#include TH_GENERIC_FILE
#undef Dimension
#define Dimension 3
#line 1 TH_GENERIC_FILE
#include TH_GENERIC_FILE
#undef Dimension
#define Dimension 4
#line 1 TH_GENERIC_FILE
#include TH_GENERIC_FILE
#undef Dimension
#define Dimension 5
#line 1 TH_GENERIC_FILE
#include TH_GENERIC_FILE
#undef Dimension
#define Dimension 6
#line 1 TH_GENERIC_FILE
#include TH_GENERIC_FILE
#undef Dimension
#define Dimension 7
#line 1 TH_GENERIC_FILE
#include TH_GENERIC_FILE
#undef Dimension
#define Dimension 8
#line 1 TH_GENERIC_FILE
#include TH_GENERIC_FILE
#undef Dimension
#define Dimension 9
#line 1 TH_GENERIC_FILE
#include TH_GENERIC_FILE
#undef Dimension
#define Dimension 10
#line 1 TH_GENERIC_FILE
#include TH_GENERIC_FILE
#undef Dimension
// Consume the macro so the next generator include starts clean.
#undef TH_GENERIC_FILE
PyTorch/sparseconvnet/SCN/generic/Geometry/ValidConvolutionRules.h
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef VALIDCONVOLUTIONRULES_H
#define VALIDCONVOLUTIONRULES_H
#include<iostream>
// Full input region for an output point of a "valid" (submanifold-style)
// convolution: a size[d]-wide window centred on the output point, shifted
// left by size[d]/2 (integer division), so input and output share the same
// spatial coordinates.
template <uInt dimension>
RectangularRegion<dimension>
InputRegionCalculator_Valid(const Point<dimension> &output, long *size) {
  Point<dimension> lb, ub;
  for (uInt d = 0; d < dimension; d++) {
    Int halfWidth = size[d] / 2;
    lb[d] = output[d] - halfWidth;
    ub[d] = output[d] + size[d] - 1 - halfWidth;
  }
  return RectangularRegion<dimension>(lb, ub);
}
// Call for each convolutional / max-pooling layer, once for each batch item.
// rules is used to carry out the "lowering" whilst carrying out the
// convolution.
//
// For every active output site in *grid*, scan its "valid"-convolution
// receptive field; for each active input found, append the pair
// (input row, output row) — both offset by grid.ctr, the grid's row offset
// within the batch — to the rule list for that filter offset.
//
// Precondition: rules must already hold volume<dimension>(size) entries,
// one per filter offset (the callers below clear/resize it first).
// Returns the number of (input, output) pairs recorded.
template <uInt dimension>
double ValidConvolution_SgToRules(SparseGrid<dimension> &grid, RuleBook &rules,
                                  long *size) {
  // (Removed an unused local that recomputed volume<dimension>(size);
  // the rulebook is already sized by the caller.)
  double countActiveInputs = 0;
  for (auto const &outputIter : grid.mp) {
    auto inRegion =
        InputRegionCalculator_Valid<dimension>(outputIter.first, size);
    // rulesOffset enumerates the filter offsets in the same row-major
    // order that the region iterator visits input points.
    uInt rulesOffset = 0;
    for (auto inputPoint : inRegion) {
      auto inputIter = grid.mp.find(inputPoint);
      if (inputIter != grid.mp.end()) {
        rules[rulesOffset].push_back(inputIter->second + grid.ctr);
        rules[rulesOffset].push_back(outputIter.second + grid.ctr);
        countActiveInputs++;
      }
      rulesOffset++;
    }
  }
  return countActiveInputs;
}
// Build the rulebook for a whole batch, sequentially: one rule list per
// filter offset, with every grid's (input, output) pairs appended in batch
// order.  Returns the total number of pairs recorded.
template <uInt dimension>
uInt ValidConvolution_SgsToRules(SparseGrids<dimension> &SGs, RuleBook &rules,
                                 long *size) {
  uInt filterVolume = volume<dimension>(size);
  rules.clear();
  rules.resize(filterVolume); // one rule list per filter offset
  uInt nActiveInputs = 0;
  for (auto &grid : SGs)
    nActiveInputs += ValidConvolution_SgToRules<dimension>(grid, rules, size);
  return nActiveInputs;
}
// OpenMP variant of ValidConvolution_SgsToRules: builds one private rulebook
// per grid in parallel, then merges them (also in parallel, one filter
// offset per iteration) so the final rulebook matches the sequential
// version's batch ordering.  Returns the total number of (input, output)
// pairs recorded.
template <uInt dimension>
uInt ValidConvolution_SgsToRules_OMP(SparseGrids<dimension> &SGs,
                                     RuleBook &rules, long *size) {
  // Per-grid scratch rulebooks and per-grid pair counts: each OMP thread
  // writes only to its own slots, so no locking is needed.
  std::vector<RuleBook> rbs(SGs.size());
  std::vector<double> countActiveInputs(SGs.size());
  rules.clear();
  uInt sd = volume<dimension>(size); // number of filter offsets
  rules.resize(sd);
  {
    uInt i;
#pragma omp parallel for private(i)
    for (i = 0; i < SGs.size(); i++) {
      rbs[i].resize(sd);
      countActiveInputs[i] =
          ValidConvolution_SgToRules<dimension>(SGs[i], rbs[i], size);
    }
  }
  {
    // Merge: rule list i of every per-grid rulebook is concatenated, in
    // grid order, into rules[i].  Parallel over filter offsets, so each
    // thread appends to a distinct destination vector.
    uInt i;
#pragma omp parallel for private(i)
    for (i = 0; i < sd; i++)
      for (auto const &rb : rbs)
        rules[i].insert(rules[i].end(), rb[i].begin(), rb[i].end());
  }
  // Reduce the per-grid counts (stored as double by SgToRules) to a uInt.
  uInt countActiveInputs_ = 0;
  for (auto &i : countActiveInputs)
    countActiveInputs_ += i;
  return countActiveInputs_;
}
#endif
/* VALIDCONVOLUTIONRULES_H */
PyTorch/sparseconvnet/SCN/generic/SparseConvNet.h
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef SPARSECONVNET_H
#define SPARSECONVNET_H
// To use 64 bits instead of 32, replace 32bits.h with 64bits.h
#include "32bits.h"
#include <array>
#include <cstdint>
#include <google/dense_hash_map>
#include <iostream>
#include <string>
#include <tuple>
#include <vector>
#if defined(ENABLE_OPENMP)
#include <omp.h>
#endif
// Submanifold Sparse Convolutional Networks
// A batch of samples, for each layer of a sparse convolutional network, is
// encoded as a matrix of nActive x nFeatures and a vector of
// hash tables identifying points in space with the rows of
// the matrix.
// SparseGridMap<dimension> - a hash table assigning integer labels (matrix
// row indices) to a sparse collection of 'Point<dimension>' points.
// Uses Google sparsehash's dense_hash_map; a reserved "empty key" is set
// by SparseGrid's constructor below, as the container requires.
template <uInt dimension>
using SparseGridMap =
    google::dense_hash_map<Point<dimension>, int, IntArrayHash<dimension>,
                           std::equal_to<Point<dimension>>>;
// One batch item's active spatial sites: a hash map from points to matrix
// row indices, plus the item's row offset within the batch.
template <uInt dimension> class SparseGrid {
public:
  // Counts active sites while the output hash map is being built; after
  // that, stores this grid's row offset within the batch.
  uInt ctr;
  SparseGridMap<dimension> mp;
  SparseGrid() : ctr(0) {
    // dense_hash_map reserves one key value that must never be inserted.
    // We sacrifice the point (Int_MAX, ..., Int_MAX).
    Point<dimension> sentinel;
    for (uInt d = 0; d < dimension; ++d)
      sentinel[d] = Int_MAX;
    mp.set_empty_key(sentinel);
  }
};
// One SparseGrid per batch item.
template <uInt dimension>
using SparseGrids = std::vector<SparseGrid<dimension>>;
// Each convolution/pooling operation requires the calculation of a 'rulebook'
// setting out how the output points depend on the points in the layer below:
// one vector per filter offset, holding flattened (input row, output row)
// index pairs.
using RuleBook = std::vector<std::vector<uInt>>;
// Code relating to squares/cubes/rectangles/cuboids etc
// Integer powers n^m by template recursion - ok for filter sizes, could
// overflow if we calculate inputSpatialSize^d.
template <uInt m> uInt ipow(uInt n) { return n * ipow<m - 1>(n); }
// Full specializations of a function template are ordinary functions, so
// when defined in a header they must be `inline` to avoid
// multiple-definition (ODR) errors across translation units.
template <> inline uInt ipow<1>(uInt n) { return n; }
template <> inline uInt ipow<0>(uInt n) { return 1; }
// Product of the first `dimension` entries of *point*: the number of grid
// sites in a box with those side lengths (e.g. a filter's spatial volume).
template <uInt dimension> uInt volume(long *point) {
  uInt product = 1;
  for (uInt d = 0; d < dimension; d++)
    product *= point[d];
  return product;
}
// Macro to initialize arguments passed as void*[1] from Lua.
// This allows Lua to take ownership of arbitrary C++ objects.
// The macro:
// - takes a pointer to a pointer [allocated as ffi.new('void *[1]') in Lua]
// - if the pointer has not yet been initialized, create an object for it
// - create a reference "_VAR" to the object
// (The raw new/delete pair is intentional: ownership lives on the Lua side
// of the FFI boundary, so RAII wrappers cannot be used here.)
#define SCN_INITIALIZE_AND_REFERENCE(TYPE, VAR)                               \
  if (VAR[0] == nullptr)                                                      \
    VAR[0] = (void *)new TYPE;                                                \
  TYPE &_##VAR = *(TYPE *)VAR[0];
// Macro to free the memory allocated by SCN_INITIALIZE_AND_REFERENCE.
// Nulls the slot afterwards so a double SCN_DELETE is harmless.
#define SCN_DELETE(TYPE, VAR)                                                 \
  if (VAR[0] != nullptr) {                                                    \
    delete (TYPE *)VAR[0];                                                    \
    VAR[0] = nullptr;                                                         \
  }
// Length of the longest rule list in a rulebook (0 for an empty rulebook).
// `inline` is required: this non-template function is defined in a header,
// so without it every translation unit including the header would emit a
// definition, violating the ODR at link time.
inline uInt ruleBookMaxSize(RuleBook &rb) {
  uInt m = 0;
  for (auto &r : rb)
    m = std::max(m, (uInt)r.size());
  return m;
}
// Total number of entries across all rule lists in a rulebook.
// `inline` is required: this non-template function is defined in a header,
// so without it every translation unit including the header would emit a
// definition, violating the ODR at link time.
inline uInt ruleBookTotalSize(RuleBook &rb) {
  uInt m = 0;
  for (auto &r : rb)
    m += (uInt)r.size();
  return m;
}
#endif
/* SPARSECONVNET_H */
Prev
1
2
3
4
5
6
7
…
9
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment