Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
SparseConvNet
Commits
f9552033
"superbench/config/vscode:/vscode.git/clone" did not exist on "7f607e4f745b84fdac1c1b693b32bb65ca8a3c79"
Commit
f9552033
authored
Jul 16, 2017
by
Benjamin Thomas Graham
Browse files
initial commit
parents
Changes
168
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
2175 additions
and
0 deletions
+2175
-0
PyTorch/sparseconvnet/SCN/generic/GPU/Deconvolution.cu
PyTorch/sparseconvnet/SCN/generic/GPU/Deconvolution.cu
+87
-0
PyTorch/sparseconvnet/SCN/generic/GPU/Deconvolution.h
PyTorch/sparseconvnet/SCN/generic/GPU/Deconvolution.h
+591
-0
PyTorch/sparseconvnet/SCN/generic/GPU/LeakyReLU.cu
PyTorch/sparseconvnet/SCN/generic/GPU/LeakyReLU.cu
+36
-0
PyTorch/sparseconvnet/SCN/generic/GPU/LeakyReLU.h
PyTorch/sparseconvnet/SCN/generic/GPU/LeakyReLU.h
+25
-0
PyTorch/sparseconvnet/SCN/generic/GPU/MaxPooling.cu
PyTorch/sparseconvnet/SCN/generic/GPU/MaxPooling.cu
+59
-0
PyTorch/sparseconvnet/SCN/generic/GPU/MaxPooling.h
PyTorch/sparseconvnet/SCN/generic/GPU/MaxPooling.h
+78
-0
PyTorch/sparseconvnet/SCN/generic/GPU/NetworkInNetwork.cu
PyTorch/sparseconvnet/SCN/generic/GPU/NetworkInNetwork.cu
+141
-0
PyTorch/sparseconvnet/SCN/generic/GPU/RuleBookIterator.h
PyTorch/sparseconvnet/SCN/generic/GPU/RuleBookIterator.h
+33
-0
PyTorch/sparseconvnet/SCN/generic/GPU/SparseToDense.cu
PyTorch/sparseconvnet/SCN/generic/GPU/SparseToDense.cu
+58
-0
PyTorch/sparseconvnet/SCN/generic/GPU/SparseToDense.h
PyTorch/sparseconvnet/SCN/generic/GPU/SparseToDense.h
+72
-0
PyTorch/sparseconvnet/SCN/generic/GPU/THGenerateCudaFloatTypes.h
.../sparseconvnet/SCN/generic/GPU/THGenerateCudaFloatTypes.h
+30
-0
PyTorch/sparseconvnet/SCN/generic/GPU/THGenerateDimCudaFloatTypes.h
...arseconvnet/SCN/generic/GPU/THGenerateDimCudaFloatTypes.h
+63
-0
PyTorch/sparseconvnet/SCN/generic/Geometry/ActivePoolingRules.h
...h/sparseconvnet/SCN/generic/Geometry/ActivePoolingRules.h
+35
-0
PyTorch/sparseconvnet/SCN/generic/Geometry/ConvolutionRules.h
...rch/sparseconvnet/SCN/generic/Geometry/ConvolutionRules.h
+158
-0
PyTorch/sparseconvnet/SCN/generic/Geometry/Metadata.cpp
PyTorch/sparseconvnet/SCN/generic/Geometry/Metadata.cpp
+209
-0
PyTorch/sparseconvnet/SCN/generic/Geometry/Metadata.h
PyTorch/sparseconvnet/SCN/generic/Geometry/Metadata.h
+119
-0
PyTorch/sparseconvnet/SCN/generic/Geometry/RectangularRegions.h
...h/sparseconvnet/SCN/generic/Geometry/RectangularRegions.h
+121
-0
PyTorch/sparseconvnet/SCN/generic/Geometry/THGenerateDimTypes.h
...h/sparseconvnet/SCN/generic/Geometry/THGenerateDimTypes.h
+61
-0
PyTorch/sparseconvnet/SCN/generic/Geometry/ValidConvolutionRules.h
...parseconvnet/SCN/generic/Geometry/ValidConvolutionRules.h
+92
-0
PyTorch/sparseconvnet/SCN/generic/SparseConvNet.h
PyTorch/sparseconvnet/SCN/generic/SparseConvNet.h
+107
-0
No files found.
PyTorch/sparseconvnet/SCN/generic/GPU/Deconvolution.cu
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE_
#define TH_GENERIC_FILE_ "generic/GPU/Deconvolution.cu"
#else
#include "Convolution.h"
#include "Deconvolution.h"
#include <algorithm>
// Forward pass of a sparse deconvolution layer.
// Builds/fetches the rulebook mapping output sites to input sites (note the
// argument order: outputSize first — deconvolution reuses the convolution
// rulebook machinery with input/output roles swapped), resizes
// output_features to (nActive, weight->size[1]), optionally seeds each output
// row with the bias, then accumulates input x weight per rulebook segment.
// Returns the multiply-accumulate count as a double (flops estimate).
extern "C" double scn_DR_(Deconvolution_updateOutput)(
    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *filterSize,
    THLongTensor *filterStride, void **m, THCTensor *input_features,
    THCTensor *output_features, THCTensor *weight, THCTensor *bias,
    long filterVolume, THCITensor *rulesBuffer) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  // outputSize/inputSize deliberately swapped relative to convolution.
  auto _rules =
      _m.getRuleBook(outputSize, inputSize, filterSize, filterStride, true);
  uInt nActive = _m.getNActive(outputSize);
  THCTensor_(resize2d)(state, output_features, nActive, weight->size[1]);
  // Without a bias the accumulation below needs a zeroed output; with a bias,
  // Convolution_fp_bias presumably writes every output row first —
  // NOTE(review): confirm against Convolution.h.
  if (not bias)
    THCTensor_(zero)(state, output_features);
  auto iF = THCTensor_(data)(state, input_features);
  auto oF = THCTensor_(data)(state, output_features);
  auto ip = input_features->size[1];   // input planes
  auto op = output_features->size[1];  // output planes
  auto w = THCTensor_(data)(state, weight);
  double flops = 0;
  if (bias) {
    auto b = THCTensor_(data)(state, bias);
    // Broadcast the bias over all nActive output rows, 32 planes at a time.
    for (uInt i = 0; i < op; i += 32) {
      uInt blockDim = min(32L, op - i);
      uInt gridDim = min(4096, nActive);
      Convolution_fp_bias<<<gridDim, blockDim, 0,
                            THCState_getCurrentStream(state)>>>(
          oF + i, b + i, op, op, nActive);
    }
  }
  uInt c = ip * op;  // weights per filter-volume element
  // RULEBOOKITERATOR runs the first statement per rulebook segment (rbB,
  // nHotB are provided by the macro) and the second statement between
  // segments to advance the weight pointer and tally flops.
  RULEBOOKITERATOR(dDeconvolution_forward2<real>(
                       iF, oF, w, rbB, nHotB, ip, ip, op, op,
                       THCState_getCurrentStream(state));
                   , w += c;
                   flops += nHotB * c;)
  return flops;
}
// Backward pass of a sparse deconvolution layer.
// Computes d_input_features (zero-initialized, then accumulated per rulebook
// segment) and accumulates the weight gradient into d_weight; if d_bias is
// non-null the bias gradient is reduced from d_output_features.
extern "C" void scn_DR_(Deconvolution_backward)(
    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *filterSize,
    THLongTensor *filterStride, void **m, THCTensor *input_features,
    THCTensor *d_input_features, THCTensor *d_output_features,
    THCTensor *weight, THCTensor *d_weight, THCTensor *d_bias,
    long filterVolume, THCITensor *rulesBuffer) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  // Same swapped-size rulebook as the forward pass.
  auto _rules =
      _m.getRuleBook(outputSize, inputSize, filterSize, filterStride, true);
  uInt nActive = _m.getNActive(outputSize);
  THCTensor_(resizeAs)(state, d_input_features, input_features);
  THCTensor_(zero)(state, d_input_features);
  auto iF = THCTensor_(data)(state, input_features);
  auto diF = THCTensor_(data)(state, d_input_features);
  auto doF = THCTensor_(data)(state, d_output_features);
  auto ip = input_features->size[1];     // input planes
  auto op = d_output_features->size[1];  // output planes
  auto w = THCTensor_(data)(state, weight);
  auto dw = THCTensor_(data)(state, d_weight);
  uInt c = ip * op;  // weights per filter-volume element
  // Per segment: propagate gradients and accumulate dW; between segments
  // advance both weight pointers to the next filter-volume element.
  RULEBOOKITERATOR(dDeconvolution_backward_dW2<real>(
                       iF, diF, doF, w, dw, rbB, nHotB, ip, ip, op, op,
                       THCState_getCurrentStream(state));
                   , w += c;
                   dw += c;)
  if (d_bias) {
    auto db = THCTensor_(data)(state, d_bias);
    Convolution_bp_bias(doF, db, op, op, nActive,
                        THCState_getCurrentStream(state));
  }
}
#endif
PyTorch/sparseconvnet/SCN/generic/GPU/Deconvolution.h
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef GPU_DECONVOLUTION_H
#define GPU_DECONVOLUTION_H
#include "../SparseConvNet.h"
#include "Convolution.h"
// Forward deconvolution kernel, "A" variant: no bounds checks, so nHot MUST
// be a multiple of K. Each block computes K output planes for strided groups
// of K rulebook entries, accumulating into outFeatures. W and I are staged
// through shared memory; each thread keeps V partial outputs in registers.
template <typename T, uInt K, uInt V>
__global__ void
dDeconvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, uInt *rules,
                              uInt nHot, uInt input_nPlanes, uInt input_stride,
                              uInt output_nPlanes, uInt output_stride) {
  // nHot must be a multiple of K!!
  // Input x Weight -> Output
  // blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
  // K is a multiple of V,
  // nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
  uInt M = input_nPlanes / K;
  // N = gridDim.y == output_nPlanes/K
  uInt n = blockIdx.y;
  outFeatures += n * K;  // this block's K-column slice of the output
  w += n * K;            // and the matching column slice of the weights
  T O[V];                // per-thread output accumulators
  __shared__ T W[K][K];  // KxK weight tile
  __shared__ T I[K][K];  // KxK input tile
  uInt R0[V];            // input row indices from the rulebook
  uInt R1[V];            // output row indices from the rulebook
  const int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);  // each thread covers V tile rows
  for (int m = 0; m < M; m++) {
    // Read w
#pragma unroll
    for (int v = 0; v < V; v++)
      W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
    // Grid-stride loop over rulebook entries, K at a time.
    for (uInt s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
      for (int v = 0; v < V; v++) {
        // rules stores (output, input) index pairs.
        R1[v] = rules[2 * (s + ty[v])];
        R0[v] = rules[2 * (s + ty[v]) + 1];
      }
      __syncthreads();
      // Read input, reset O[]
#pragma unroll
      for (int v = 0; v < V; v++) {
        I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
        O[v] = 0;
      }
      __syncthreads();
      // KxK tile multiply: O += I * W.
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          O[v] += I[ty[v]][k] * W[k][tx];
      // Read-modify-write accumulation into global output rows.
#pragma unroll
      for (int v = 0; v < V; v++)
        O[v] += outFeatures[R1[v] * output_stride + tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        outFeatures[R1[v] * output_stride + tx] = O[v];
      __syncthreads();
    }
    // Advance to the next K input planes.
    w += K * output_nPlanes;
    inFeatures += K;
  }
}
// Forward deconvolution kernel, "B" variant: identical to forwardA except
// every rulebook/shared-memory access is guarded by `s + ty[v] < nHot`, so
// it handles the trailing partial group when nHot is not a multiple of K.
template <typename T, uInt K, uInt V>
__global__ void
dDeconvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, uInt *rules,
                              uInt nHot, uInt input_nPlanes, uInt input_stride,
                              uInt output_nPlanes, uInt output_stride) {
  // Input x Weight -> Output
  // blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
  // K is a multiple of V,
  // nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
  uInt M = input_nPlanes / K;
  // N = gridDim.y == output_nPlanes/K
  uInt n = blockIdx.y;
  outFeatures += n * K;
  w += n * K;
  T O[V];                // per-thread output accumulators
  __shared__ T W[K][K];  // KxK weight tile
  __shared__ T I[K][K];  // KxK input tile
  uInt R0[V];            // input row indices
  uInt R1[V];            // output row indices
  const int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int m = 0; m < M; m++) {
    // Read w
#pragma unroll
    for (int v = 0; v < V; v++)
      W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
    for (uInt s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (s + ty[v] < nHot) {  // guard the tail group
          R1[v] = rules[2 * (s + ty[v])];
          R0[v] = rules[2 * (s + ty[v]) + 1];
        }
      }
      __syncthreads();
      // Read input, reset O[]
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (s + ty[v] < nHot)
          I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
        O[v] = 0;
      }
      __syncthreads();
      // Note: rows of I beyond nHot are stale/uninitialized, but their
      // products are never written back thanks to the guards below.
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          O[v] += I[ty[v]][k] * W[k][tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nHot)
          O[v] += outFeatures[R1[v] * output_stride + tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nHot)
          outFeatures[R1[v] * output_stride + tx] = O[v];
      __syncthreads();
    }
    w += K * output_nPlanes;
    inFeatures += K;
  }
}
// Dispatcher macro: if both plane counts are divisible by tile size K, run
// kernel A on the largest K-multiple prefix of the rulebook and kernel B on
// the remainder, then return from the enclosing function.
#define FOO(K, V)                                                              \
  {                                                                            \
    if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {                  \
      uInt o = (nHot / K) * K;                                                 \
      if (o >= K)                                                              \
        dDeconvolution_KMxKN_forwardA<T, K, V> << <                            \
            dim3(std::min(o / K, (uInt)512), output_nPlanes / K),              \
            dim3(K, K / V), 0, stream>>>                                       \
            (inFeatures, outFeatures, w, rules, o, input_nPlanes,              \
             input_stride, output_nPlanes, output_stride);                     \
      if (nHot > o)                                                            \
        dDeconvolution_KMxKN_forwardB<T, K, V> << <                            \
            dim3(1, output_nPlanes / K), dim3(K, K / V), 0, stream>>>          \
            (inFeatures, outFeatures, w, rules + 2 * o, nHot - o,              \
             input_nPlanes, input_stride, output_nPlanes, output_stride);      \
      return;                                                                  \
    }                                                                          \
  }
// Forward deconvolution host entry: try tile sizes from largest to smallest;
// the first K dividing both plane counts wins. Callers must guarantee
// divisibility by at least 8 (see dDeconvolution_forward2).
template <typename T>
void dDeconvolution_forward(T *inFeatures, T *outFeatures, T *w, uInt *rules,
                            uInt nHot, uInt input_nPlanes, uInt input_stride,
                            uInt output_nPlanes, uInt output_stride,
                            cudaStream_t stream) {
  FOO(64, 16)
  FOO(32, 8)
  FOO(16, 4)
  FOO(8, 2)
  assert(false);  // unreachable when plane counts are multiples of 8
}
#undef FOO
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
// "A" variant: no bounds checks — nHot must be a multiple of K.
// Each block owns K input planes; loops over output-plane tiles, computing
// input gradients (accumulated in global memory) and per-tile weight
// gradients (accumulated in registers, flushed with atomicAdd).
template <typename T, uInt K, uInt V>
__global__ void dDeconvolution_KMxKN_backward_dW_A(
    T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw, uInt *rules,
    uInt nHot, uInt input_nPlanes, uInt input_stride, uInt output_nPlanes,
    uInt output_stride) {
  // M = gridDim.y == input_nPlanes / K
  uInt N = output_nPlanes / K;
  uInt m = blockIdx.y;
  inFeatures += m * K;
  dInFeatures += m * K;
  w += m * K * output_nPlanes;   // row slice of the weight matrix
  dw += m * K * output_nPlanes;
  T dI[V];               // per-thread input-gradient accumulators
  T dW[V];               // per-thread weight-gradient accumulators
  __shared__ T I[K][K];  // input tile
  __shared__ T dO[K][K]; // output-gradient tile
  __shared__ T W[K][K];  // weight tile
  uInt R0[V];            // input row indices
  uInt R1[V];            // output row indices
  const int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int n = 0; n < N; n++) {
    // Read w, reset dW
#pragma unroll
    for (int v = 0; v < V; v++) {
      W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
      dW[v] = 0;
    }
    // Grid-stride loop over rulebook entries.
    for (uInt s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
      for (int v = 0; v < V; v++) {
        R1[v] = rules[2 * (s + ty[v])];
        R0[v] = rules[2 * (s + ty[v]) + 1];
        dI[v] = 0;
      }
      __syncthreads();
      // Read input and dOutput
#pragma unroll
      for (int v = 0; v < V; v++) {
        I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
        dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx];
      }
      __syncthreads();
      // dI += dO * W^T (note transposed W index) and dW += I^T * dO.
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++) {
          dI[v] += dO[ty[v]][k] * W[tx][k];
          dW[v] += I[k][ty[v]] * dO[k][tx];
        }
      // Read-modify-write accumulation of input gradients.
#pragma unroll
      for (int v = 0; v < V; v++)
        dI[v] += dInFeatures[R0[v] * input_stride + tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        dInFeatures[R0[v] * input_stride + tx] = dI[v];
      __syncthreads();
    }
    // Flush weight gradients; atomicAdd because blocks along gridDim.x
    // share the same dw entries.
#pragma unroll
    for (int v = 0; v < V; v++)
      atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
    w += K;
    dw += K;
    dOutFeatures += K;
  }
}
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
// "B" variant of backward_dW_A: bounds-checked for the trailing partial
// group (nHot not a multiple of K). Out-of-range tile rows are zero-filled
// so the unguarded multiply loops contribute nothing.
template <typename T, uInt K, uInt V>
__global__ void dDeconvolution_KMxKN_backward_dW_B(
    T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw, uInt *rules,
    uInt nHot, uInt input_nPlanes, uInt input_stride, uInt output_nPlanes,
    uInt output_stride) {
  // M = gridDim.y == input_nPlanes / K
  uInt N = output_nPlanes / K;
  uInt m = blockIdx.y;
  inFeatures += m * K;
  dInFeatures += m * K;
  w += m * K * output_nPlanes;
  dw += m * K * output_nPlanes;
  T dI[V];               // per-thread input-gradient accumulators
  T dW[V];               // per-thread weight-gradient accumulators
  __shared__ T I[K][K];
  __shared__ T dO[K][K];
  __shared__ T W[K][K];
  uInt R0[V];
  uInt R1[V];
  const int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int n = 0; n < N; n++) {
    // Read w, reset dW
#pragma unroll
    for (int v = 0; v < V; v++) {
      W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
      dW[v] = 0;
    }
    for (uInt s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (s + ty[v] < nHot) {  // guard the tail group
          R1[v] = rules[2 * (s + ty[v])];
          R0[v] = rules[2 * (s + ty[v]) + 1];
        }
        dI[v] = 0;
      }
      __syncthreads();
      // Read input and dOutput
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nHot) {
          I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
          dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx];
        } else {
          // Zero-fill so the unguarded inner product below is harmless.
          I[ty[v]][tx] = 0;
          dO[ty[v]][tx] = 0;
        }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++) {
          dI[v] += dO[ty[v]][k] * W[tx][k];
          dW[v] += I[k][ty[v]] * dO[k][tx];
        }
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nHot)
          dI[v] += dInFeatures[R0[v] * input_stride + tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nHot)
          dInFeatures[R0[v] * input_stride + tx] = dI[v];
      __syncthreads();
    }
#pragma unroll
    for (int v = 0; v < V; v++)
      atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
    w += K;
    dw += K;
    dOutFeatures += K;
  }
}
// Dispatcher macro (backward): when both plane counts divide by K, run the
// unchecked kernel A on the K-multiple prefix and the checked kernel B on
// the remainder, then return from the enclosing function.
#define FOO(K, V)                                                              \
  {                                                                            \
    if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {                  \
      uInt o = (nHot / K) * K;                                                 \
      if (o >= K)                                                              \
        dDeconvolution_KMxKN_backward_dW_A<T, K, V> << <                       \
            dim3(std::min(o / K, (uInt)512), input_nPlanes / K),               \
            dim3(K, K / V), 0, stream>>>                                       \
            (inFeatures, dInFeatures, dOutFeatures, w, dw, rules, o,           \
             input_nPlanes, input_stride, output_nPlanes, output_stride);      \
      if (nHot > o)                                                            \
        dDeconvolution_KMxKN_backward_dW_B<T, K, V> << <                       \
            dim3(1, input_nPlanes / K), dim3(K, K / V), 0, stream>>>           \
            (inFeatures, dInFeatures, dOutFeatures, w, dw, rules + 2 * o,      \
             nHot - o, input_nPlanes, input_stride, output_nPlanes,            \
             output_stride);                                                   \
      return;                                                                  \
    }                                                                          \
  }
// Backward deconvolution host entry: largest dividing tile size wins.
// Unlike the forward path there is no K=64 case.
template <typename T>
void dDeconvolution_backward_dW(T *inFeatures, T *dInFeatures,
                                T *dOutFeatures, T *w, T *dw, uInt *rules,
                                uInt nHot, uInt input_nPlanes,
                                uInt input_stride, uInt output_nPlanes,
                                uInt output_stride, cudaStream_t stream) {
  FOO(32, 8)
  FOO(16, 4)
  FOO(8, 2)
  assert(false);  // unreachable when plane counts are multiples of 8
}
#undef FOO
// Forward deconvolution kernel for arbitrary plane counts: tiles are padded,
// with KI/KO tracking the valid extent of the current input/output tile.
// Rules for K entries are staged into shared memory (R) by the first two
// rows of threads.
template <typename T, uInt K, uInt V>
__global__ void
dDeconvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, uInt *rules,
                              uInt nHot, uInt input_nPlanes, uInt input_stride,
                              uInt output_nPlanes, uInt output_stride) {
  // Input x Weight -> Output
  // blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
  // K is a multiple of V,
  // nHot x input_nplanes<=KM -> nHot x output_nPlanes<=KN
  // - parallel over N,nHot - loop over M
  uInt M = (input_nPlanes + K - 1) / K;  // ceil-divide: padded tile count
  // N = gridDim.y ~ output_nPlanes/K
  uInt n = blockIdx.y;
  outFeatures += n * K;
  w += n * K;
  uInt KO = min(K, output_nPlanes - K * n);  // valid output columns this tile
  T O[V];
  __shared__ T W[K][K];
  __shared__ T I[K][K];
  __shared__ uInt R[K * 2];  // (output, input) index pairs for K entries
  const int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int m = 0; m < M; m++) {
    uInt KI = min(K, input_nPlanes - K * m);  // valid input rows this tile
    // Read w
#pragma unroll
    for (int v = 0; v < V; v++)
      if (ty[v] < KI and tx < KO)
        W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
    for (uInt s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
      // Read rules for K input/output pairs
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (ty[v] < 2) {  // first two thread rows cooperatively load 2K uInts
          int q = ty[v] * K + tx;
          if (s + q / 2 < nHot)
            R[q] = rules[2 * s + q];
        }
      }
      __syncthreads();
      // Read input, reset O[]
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (tx < KI and s + ty[v] < nHot)
          I[ty[v]][tx] = inFeatures[R[2 * ty[v] + 1] * input_stride + tx];
        O[v] = 0;
      }
      __syncthreads();
      // Inner product over the KI valid input planes only.
#pragma unroll
      for (int k = 0; k < KI; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          O[v] += I[ty[v]][k] * W[k][tx];
      __syncthreads();
#pragma unroll
      for (int v = 0; v < V; v++)
        if (tx < KO and s + ty[v] < nHot)
          outFeatures[R[2 * ty[v]] * output_stride + tx] += O[v];
      __syncthreads();
    }
    w += K * output_nPlanes;
    inFeatures += K;
  }
}
template
<
typename
T
>
void
dDeconvolution_forward2
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
uInt
*
rules
,
uInt
nHot
,
uInt
input_nPlanes
,
uInt
input_stride
,
uInt
output_nPlanes
,
uInt
output_stride
,
cudaStream_t
stream
)
{
if
(
input_nPlanes
%
8
!=
0
or
output_nPlanes
%
8
!=
0
)
{
const
int
K
=
16
;
const
int
V
=
4
;
dDeconvolution_KMxKN_forward2
<
T
,
K
,
V
>
<<
<
dim3
(
128
,
(
output_nPlanes
+
K
-
1
)
/
K
),
dim3
(
K
,
K
/
V
),
0
,
stream
>>>
(
inFeatures
,
outFeatures
,
w
,
rules
,
nHot
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
);
return
;
}
else
{
dDeconvolution_forward
(
inFeatures
,
outFeatures
,
w
,
rules
,
nHot
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
,
stream
);
}
}
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
// Backward kernel for arbitrary plane counts: padded tiles with KI/KO valid
// extents, rules staged in shared memory, out-of-range entries zero-filled.
template <typename T, uInt K, uInt V>
__global__ void dDeconvolution_KMxKN_backward_dW2(
    T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw, uInt *rules,
    uInt nHot, uInt input_nPlanes, uInt input_stride, uInt output_nPlanes,
    uInt output_stride) {
  // M = gridDim.y == input_nPlanes / K
  uInt N = (output_nPlanes + K - 1) / K;  // ceil-divide: padded tile count
  uInt m = blockIdx.y;
  inFeatures += m * K;
  dInFeatures += m * K;
  w += m * K * output_nPlanes;
  dw += m * K * output_nPlanes;
  uInt KI = min(K, input_nPlanes - K * m);  // valid input planes this block
  T dI[V];
  T dW[V];
  __shared__ T I[K][K];
  __shared__ T dO[K][K];
  __shared__ T W[K][K];
  __shared__ uInt R[K * 2];  // (output, input) index pairs for K entries
  const int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int n = 0; n < N; n++) {
    uInt KO = min(K, output_nPlanes - K * n);  // valid output planes this tile
    // Read w, reset dW
#pragma unroll
    for (int v = 0; v < V; v++) {
      if (ty[v] < KI and tx < KO)
        W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
      dW[v] = 0;
    }
    for (uInt s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
      // Read rules for K input/output pairs, reset dI[]
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (ty[v] < 2) {  // first two thread rows load 2K rule entries
          int q = ty[v] * K + tx;
          if (s + q / 2 < nHot)
            R[q] = rules[2 * s + q];
        }
        dI[v] = 0;
      }
      __syncthreads();
      // Read input and dOutput
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (tx < KI and s + ty[v] < nHot)
          I[ty[v]][tx] = inFeatures[R[2 * ty[v] + 1] * input_stride + tx];
        else
          I[ty[v]][tx] = 0;  // padding contributes nothing to dW below
        if (tx < KO and s + ty[v] < nHot)
          dO[ty[v]][tx] = dOutFeatures[R[2 * ty[v]] * output_stride + tx];
        else
          dO[ty[v]][tx] = 0;
      }
      __syncthreads();
      // dI += dO * W^T over the KO valid output planes.
#pragma unroll
      for (int k = 0; k < KO; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          dI[v] += dO[ty[v]][k] * W[tx][k];
      // dW += I^T * dO (full K range; padded entries are zero).
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          dW[v] += I[k][ty[v]] * dO[k][tx];
      __syncthreads();
#pragma unroll
      for (int v = 0; v < V; v++)
        if (tx < KI and s + ty[v] < nHot)
          dInFeatures[R[2 * ty[v] + 1] * input_stride + tx] += dI[v];
      __syncthreads();
    }
    // Flush weight gradients; atomic because gridDim.x blocks share dw.
#pragma unroll
    for (int v = 0; v < V; v++)
      if (ty[v] < KI and tx < KO)
        atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
    w += K;
    dw += K;
    dOutFeatures += K;
  }
}
template
<
typename
T
>
void
dDeconvolution_backward_dW2
(
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
T
*
w
,
T
*
dw
,
uInt
*
rules
,
uInt
nHot
,
uInt
input_nPlanes
,
uInt
input_stride
,
uInt
output_nPlanes
,
uInt
output_stride
,
cudaStream_t
stream
)
{
if
(
input_nPlanes
%
8
!=
0
or
output_nPlanes
%
8
!=
0
)
{
const
int
K
=
16
;
const
int
V
=
4
;
dDeconvolution_KMxKN_backward_dW2
<
T
,
K
,
V
>
<<
<
dim3
(
128
,
(
input_nPlanes
+
K
-
1
)
/
K
),
dim3
(
K
,
K
/
V
),
0
,
stream
>>>
(
inFeatures
,
dInFeatures
,
dOutFeatures
,
w
,
dw
,
rules
,
nHot
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
);
return
;
}
else
{
dDeconvolution_backward_dW
(
inFeatures
,
dInFeatures
,
dOutFeatures
,
w
,
dw
,
rules
,
nHot
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
,
stream
);
}
}
#endif
/* GPU_DECONVOLUTION_H */
PyTorch/sparseconvnet/SCN/generic/GPU/LeakyReLU.cu
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/GPU/LeakyReLU.cu"
#else
#include "LeakyReLU.h"
// Elementwise leaky-ReLU forward: out[i] = in[i] > 0 ? in[i] : alpha*in[i].
// In-place operation is supported (input_features == output_features skips
// the resize).
extern "C" void scn_R_(LeakyReLU_updateOutput)(THCTensor *input_features,
                                               THCTensor *output_features,
                                               float alpha) {
  if (input_features != output_features)
    THCTensor_(resizeAs)(state, output_features, input_features);
  auto n = THCTensor_(nElement)(state, input_features);
  // Fixed launch of 16 blocks x 1024 threads; the kernel strides over all
  // n elements.
  LeakyReLU_fp<real><<<16, 1024, 0, THCState_getCurrentStream(state)>>>(
      THCTensor_(data)(state, input_features),
      THCTensor_(data)(state, output_features), n, alpha);
}
// Leaky-ReLU backward: d_in[i] = in[i] > 0 ? d_out[i] : alpha*d_out[i].
// In-place gradient (d_input_features == d_output_features) is supported.
extern "C" void scn_R_(LeakyReLU_updateGradInput)(THCTensor *input_features,
                                                  THCTensor *d_input_features,
                                                  THCTensor *d_output_features,
                                                  float alpha) {
  if (d_input_features != d_output_features)
    THCTensor_(resizeAs)(state, d_input_features, d_output_features);
  auto n = THCTensor_(nElement)(state, d_input_features);
  // Same fixed 16x1024 launch as the forward pass.
  LeakyReLU_bp<real><<<16, 1024, 0, THCState_getCurrentStream(state)>>>(
      THCTensor_(data)(state, input_features),
      THCTensor_(data)(state, d_input_features),
      THCTensor_(data)(state, d_output_features), n, alpha);
}
#endif
PyTorch/sparseconvnet/SCN/generic/GPU/LeakyReLU.h
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef LEAKYRELU_H
#define LEAKYRELU_H
// Elementwise leaky-ReLU forward kernel:
//   output[i] = input[i] > 0 ? input[i] : alpha * input[i]
// Grid-stride loop over all n elements.
// Generalized: the original advanced i by the hard-coded literal 16 * 1024,
// which matches the <<<16, 1024>>> launch in LeakyReLU.cu but silently skips
// elements under any other configuration. gridDim.x * blockDim.x is the same
// value for that launch and correct for every launch.
template <typename T>
__global__ void LeakyReLU_fp(T *input_features, T *output_features, uInt n,
                             T alpha) {
  for (uInt i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += gridDim.x * blockDim.x)
    output_features[i] = (input_features[i] > 0)
                             ? input_features[i]
                             : (input_features[i] * alpha);
}
// Leaky-ReLU backward kernel:
//   d_input[i] = input[i] > 0 ? d_output[i] : alpha * d_output[i]
// Grid-stride loop over all n elements.
// Generalized: the original advanced i by the hard-coded literal 16 * 1024
// (the launch size used in LeakyReLU.cu); gridDim.x * blockDim.x is
// identical for that launch and correct for any other.
template <typename T>
__global__ void LeakyReLU_bp(T *input_features, T *d_input_features,
                             T *d_output_features, uInt n, T alpha) {
  for (uInt i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += gridDim.x * blockDim.x)
    d_input_features[i] = (input_features[i] > 0)
                              ? d_output_features[i]
                              : (d_output_features[i] * alpha);
}
#endif
PyTorch/sparseconvnet/SCN/generic/GPU/MaxPooling.cu
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE_
#define TH_GENERIC_FILE_ "generic/GPU/MaxPooling.cu"
#else
#include "MaxPooling.h"
#include "RuleBookIterator.h"
// Sparse max-pooling forward. The first nFeaturesToDrop planes of the input
// are excluded from pooling (iF is offset past them); output has
// nPlanes = input planes - nFeaturesToDrop columns, zero-initialized before
// the running-max kernel.
extern "C" void scn_DR_(MaxPooling_updateOutput)(
    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *poolSize,
    THLongTensor *poolStride, void **m, THCTensor *input_features,
    THCTensor *output_features, long nFeaturesToDrop,
    THCITensor *rulesBuffer) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  uInt nPlanes = input_features->size[1] - nFeaturesToDrop;
  auto _rules =
      _m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  uInt nActive = _m.getNActive(outputSize);
  THCTensor_(resize2d)(state, output_features, nActive, nPlanes);
  // Zero init: MaxPooling_fp only overwrites when input beats the stored
  // value, so every output cell needs a defined starting value.
  // NOTE(review): zero is only a safe identity if pooled inputs are >= 0
  // (e.g. post-ReLU) — confirm intended usage.
  THCTensor_(zero)(state, output_features);
  auto iF = THCTensor_(data)(state, input_features) + nFeaturesToDrop;
  auto oF = THCTensor_(data)(state, output_features);
  // Strides are the full row widths of the 2-D tensors.
  RULEBOOKITERATOR(MaxPooling_ForwardPass<real>(
                       THCState_getCurrentStream(state), iF, oF, nPlanes,
                       input_features->size[1], output_features->size[1], rbB,
                       nHotB);
                   , )
}
// Sparse max-pooling backward: routes each output gradient to the input
// site(s) whose value equals the pooled maximum.
extern "C" void scn_DR_(MaxPooling_updateGradInput)(
    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *poolSize,
    THLongTensor *poolStride, void **m, THCTensor *input_features,
    THCTensor *d_input_features, THCTensor *output_features,
    THCTensor *d_output_features, long nFeaturesToDrop,
    THCITensor *rulesBuffer) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  uInt nPlanes = input_features->size[1] - nFeaturesToDrop;
  auto _rules =
      _m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  uInt nActive = _m.getNActive(outputSize);
  THCTensor_(resizeAs)(state, d_input_features, input_features);
  THCTensor_(zero)(state, d_input_features);
  // NOTE(review): unlike updateOutput, iF/diF are NOT offset by
  // nFeaturesToDrop here — verify this asymmetry is intentional.
  auto iF = THCTensor_(data)(state, input_features);
  auto oF = THCTensor_(data)(state, output_features);
  auto diF = THCTensor_(data)(state, d_input_features);
  auto doF = THCTensor_(data)(state, d_output_features);
  RULEBOOKITERATOR(MaxPooling_BackwardPass<real>(
                       THCState_getCurrentStream(state), iF, diF, oF, doF,
                       nPlanes, input_features->size[1],
                       d_output_features->size[1], rbB, nHotB);
                   , )
}
#endif
PyTorch/sparseconvnet/SCN/generic/GPU/MaxPooling.h
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef GPU_MAXPOOLING_H
#define GPU_MAXPOOLING_H
// NTX must be >=2 so r is filled properly
// Max-pooling forward kernel. Each block handles NTY rulebook entries per
// iteration: the (input, output) index pairs are staged into shared memory,
// then each thread row scans its entry's planes NTX at a time, keeping the
// running maximum in output_features.
template <typename T, uInt NTX, uInt NTY>
__global__ void MaxPooling_fp(T *input_features, T *output_features,
                              uInt nPlanes, uInt input_stride,
                              uInt output_stride, uInt *rules, uInt nHot) {
  __shared__ uInt r[NTY * 2];
  for (uInt n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
    {
      // Cooperatively load up to NTY rule pairs for this group.
      uInt i = threadIdx.x + NTX * threadIdx.y;
      // BUG FIX: the original bound was `i < 2 * (n - nHot)`. The operands
      // are unsigned and n < nHot inside this loop, so n - nHot wrapped to a
      // huge value, the bound was always true, and the last group read past
      // the end of `rules`. The intended bound is the number of remaining
      // pair elements, 2 * (nHot - n).
      if (i < NTY * 2 and i < 2 * (nHot - n))
        r[i] = rules[2 * n + i];
    }
    __syncthreads();
    if (n + threadIdx.y < nHot) {
      uInt i = r[2 * threadIdx.y] * input_stride;      // input row base
      uInt o = r[2 * threadIdx.y + 1] * output_stride; // output row base
      for (uInt plane = threadIdx.x; plane < nPlanes; plane += NTX) {
        T inp = input_features[i + plane];
        // Running max: only write when this input beats the stored value.
        if (output_features[o + plane] < inp)
          output_features[o + plane] = inp;
      }
    }
    __syncthreads();
  }
}
// Host-side launcher for the max-pooling forward kernel: 32 blocks of
// 32x32 threads (32 planes wide, 32 rulebook entries deep).
template <typename T>
void MaxPooling_ForwardPass(cudaStream_t stream, T *input_features,
                            T *output_features, uInt nPlanes,
                            uInt input_stride, uInt output_stride, uInt *rules,
                            uInt nHot) {
  const uInt threadsX = 32;  // planes processed per inner step
  const uInt threadsY = 32;  // rulebook entries per block iteration
  dim3 block(threadsX, threadsY);
  MaxPooling_fp<T, threadsX, threadsY><<<32, block, 0, stream>>>(
      input_features, output_features, nPlanes, input_stride, output_stride,
      rules, nHot);
}
// Max-pooling backward kernel. Mirrors MaxPooling_fp's staging of rule
// pairs; where the stored output equals the input (i.e. this input was the
// maximum), the output gradient is added to the input gradient.
template <typename T, uInt NTX, uInt NTY>
__global__ void MaxPooling_bp(T *input_features, T *d_input_features,
                              T *output_features, T *d_output_features,
                              uInt nPlanes, uInt input_stride,
                              uInt output_stride, uInt *rules, uInt nHot) {
  __shared__ uInt r[NTY * 2];
  for (uInt n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
    {
      // Cooperatively load up to NTY rule pairs for this group.
      uInt i = threadIdx.x + NTX * threadIdx.y;
      // BUG FIX: the original bound was `i < 2 * (n - nHot)`; unsigned
      // wrap-around (n < nHot) made it always true, so the final group read
      // past the end of `rules`. Corrected to the remaining pair count,
      // 2 * (nHot - n). Same fix as MaxPooling_fp.
      if (i < NTY * 2 and i < 2 * (nHot - n))
        r[i] = rules[2 * n + i];
    }
    __syncthreads();
    if (n + threadIdx.y < nHot) {
      uInt i = r[2 * threadIdx.y] * input_stride;      // input row base
      uInt o = r[2 * threadIdx.y + 1] * output_stride; // output row base
      for (uInt plane = threadIdx.x; plane < nPlanes; plane += NTX)
        // Route the gradient to the argmax input; ties send the gradient to
        // every tied input.
        if (output_features[o + plane] == input_features[i + plane])
          d_input_features[i + plane] += d_output_features[o + plane];
    }
    __syncthreads();
  }
}
// Host-side launcher for the max-pooling backward kernel: 32 blocks of
// 32x32 threads, matching the forward launcher.
template <typename T>
void MaxPooling_BackwardPass(cudaStream_t stream, T *input_features,
                             T *d_input_features, T *output_features,
                             T *d_output_features, uInt nPlanes,
                             uInt input_stride, uInt output_stride,
                             uInt *rules, uInt nHot) {
  const uInt threadsX = 32;  // planes processed per inner step
  const uInt threadsY = 32;  // rulebook entries per block iteration
  dim3 block(threadsX, threadsY);
  MaxPooling_bp<T, threadsX, threadsY><<<32, block, 0, stream>>>(
      input_features, d_input_features, output_features, d_output_features,
      nPlanes, input_stride, output_stride, rules, nHot);
}
#endif
/* GPU_MAXPOOLING_H */
PyTorch/sparseconvnet/SCN/generic/GPU/NetworkInNetwork.cu
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/GPU/NetworkInNetwork.cu"
#else
#include "Convolution.h"
#include <algorithm>
// 1x1 "network in network" convolution forward: a dense GEMM over all
// active sites — output = input x weight (+ bias). Returns the
// multiply-accumulate count as a flops estimate.
extern "C" double scn_R_(NetworkInNetwork_updateOutput)(
    THCTensor *input_features_, THCTensor *output_features_,
    THCTensor *weight_, THCTensor *bias_) {
  auto nActive = input_features_->size[0];
  auto input_nPlanes = weight_->size[0];
  auto output_nPlanes = weight_->size[1];
  THCTensor_(resize2d)(state, output_features_, nActive, output_nPlanes);
  auto input_features = THCTensor_(data)(state, input_features_);
  auto output_features = THCTensor_(data)(state, output_features_);
  auto weight = THCTensor_(data)(state, weight_);
  if (bias_ != nullptr) {
    auto bias = THCTensor_(data)(state, bias_);
    // Seed every output row with the bias, 32 planes at a time; the GEMM
    // below then accumulates on top (beta = 1).
    for (uInt i = 0; i < output_nPlanes; i += 32) {
      uInt blockDim = min(32L, output_nPlanes - i);
      uInt gridDim = min(4096L, nActive);
      Convolution_fp_bias<<<gridDim, blockDim, 0,
                            THCState_getCurrentStream(state)>>>(
          output_features + i, bias + i, output_nPlanes, output_nPlanes,
          nActive);
    }
    // Do GEMM (note: gemm assumes column-major matrices)
    // buffer is l*m (row-major)
    // weight is m*r (row-major)
    // output_features is l*r (row-major)
    // buffer * weights + bias -> output_features
    THBLAS_GEMM(state, 'n', 'n',
                output_nPlanes, // r
                nActive,        // l
                input_nPlanes,  // m
                1,              // alpha
                weight,
                output_nPlanes, // r
                input_features,
                input_nPlanes,  // m
                1,              // beta (accumulate onto the bias)
                output_features,
                output_nPlanes  // r
                );
  } else {
    // No bias: zero the output and overwrite via beta = 0.
    THCTensor_(zero)(state, output_features_);
    THBLAS_GEMM(state, 'n', 'n',
                output_nPlanes, // r
                nActive,        // l
                input_nPlanes,  // m
                1,              // alpha
                weight,
                output_nPlanes, // r
                input_features,
                input_nPlanes,  // m
                0,              // beta
                output_features,
                output_nPlanes  // r
                );
  }
  return nActive * input_nPlanes * output_nPlanes;
}
// 1x1 layer backward pass w.r.t. the input:
//   d_input_features = d_output_features * weight^T   (beta = 0: overwrite).
extern "C" void scn_R_(NetworkInNetwork_updateGradInput)(
    THCTensor *d_input_features_, THCTensor *d_output_features_,
    THCTensor *weight_) {
  auto nActive = d_output_features_->size[0];
  auto input_nPlanes = weight_->size[0];
  auto output_nPlanes = weight_->size[1];
  THCTensor_(resize2d)(state, d_input_features_, nActive, input_nPlanes);
  THCTensor_(zero)(state, d_input_features_);
  auto d_input_features = THCTensor_(data)(state, d_input_features_);
  auto d_output_features = THCTensor_(data)(state, d_output_features_);
  auto weight = THCTensor_(data)(state, weight_);
  // Do GEMM (note: gemm assumes column-major matrices)
  // d_output_features is l*m (row-major)
  // weights is r*m (row-major)
  // d_buffer is l*r (row-major)
  // d_output_features * T(weight) -> d_buffer
  THBLAS_GEMM(state, 't', 'n',
              input_nPlanes,  // r
              nActive,        // l
              output_nPlanes, // m
              1,              // alpha
              weight,
              output_nPlanes, // m
              d_output_features,
              output_nPlanes, // m
              0,              // beta
              d_input_features,
              input_nPlanes   // r
              );
}
// 1x1 layer parameter-gradient accumulation:
//   d_weight += input_features^T * d_output_features   (beta = 1),
// and, if d_bias_ is given, the bias gradient is accumulated by the
// Convolution_bp_bias kernel over the nActive rows of d_output_features.
extern "C" void scn_R_(NetworkInNetwork_accGradParameters)(
    THCTensor *input_features_, THCTensor *d_output_features_,
    THCTensor *d_weight_, THCTensor *d_bias_) {
  auto nActive = input_features_->size[0];
  auto input_nPlanes = d_weight_->size[0];
  auto output_nPlanes = d_weight_->size[1];
  auto input_features = THCTensor_(data)(state, input_features_);
  auto d_output_features = THCTensor_(data)(state, d_output_features_);
  auto d_weight = THCTensor_(data)(state, d_weight_);
  // Do GEMM (note: gemm assumes column-major matrices)
  // buffer is m*l (row-major)
  // d_output_features is m*r (row-major)
  // weights is l*r (row-major)
  // T(buffer) * d_output_features -> d_weight
  THBLAS_GEMM(state, 'n', 't',
              output_nPlanes, // r
              input_nPlanes,  // l
              nActive,        // m
              1,              // alpha
              d_output_features,
              output_nPlanes, // r
              input_features,
              input_nPlanes,  // l
              1,              // beta (accumulate into d_weight)
              d_weight,
              output_nPlanes  // r
              );
  if (d_bias_) {
    auto d_bias = THCTensor_(data)(state, d_bias_);
    Convolution_bp_bias(d_output_features, d_bias, output_nPlanes,
                        output_nPlanes, nActive,
                        THCState_getCurrentStream(state));
  }
}
#endif
PyTorch/sparseconvnet/SCN/generic/GPU/RuleBookIterator.h
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef GPU_RULEBOOKITERATOR_H
#define GPU_RULEBOOKITERATOR_H
// Macro to parallelize loading rulebook elements to GPU memory and operating
// on the elements of the rulebook.
// X is the function to apply to each non-empty rulebook entry; inside X the
// device pointer rbB holds the entry's nHotB (input,output) uInt index pairs.
// Y is a command to run after every entry, empty or not (e.g. advancing an
// output pointer).
// Requires in scope: _rules (the RuleBook), rulesBuffer (a THCITensor used
// as reusable device scratch, grown here if too small) and state.
// NOTE(review): uses a synchronous cudaMemcpy per entry — presumably
// intentional so X may launch on the current stream immediately; confirm
// before changing.
#define RULEBOOKITERATOR(X, Y)                                                 \
  uInt ms = ruleBookMaxSize(_rules);                                           \
  if (THCITensor_nElement(state, rulesBuffer) < ms)                            \
    THCITensor_resize1d(state, rulesBuffer, ms);                               \
  uInt *rbB = (uInt *)THCITensor_data(state, rulesBuffer);                     \
  for (int k = 0; k < _rules.size(); ++k) {                                    \
    auto &r = _rules[k];                                                       \
    uInt nHotB = r.size() / 2;                                                 \
    if (nHotB) {                                                               \
      cudaMemcpy(rbB, &r[0], sizeof(uInt) * 2 * nHotB,                         \
                 cudaMemcpyHostToDevice);                                      \
    }                                                                          \
    if (nHotB) {                                                               \
      X                                                                        \
    }                                                                          \
    Y                                                                          \
  }
#endif
/* GPU_RULEBOOKITERATOR_H */
PyTorch/sparseconvnet/SCN/generic/GPU/SparseToDense.cu
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE_
#define TH_GENERIC_FILE_ "generic/GPU/SparseToDense.cu"
#else
#include "SparseToDense.h"
// Scatter the active rows of input_features into a dense output tensor of
// shape batchSize x nPlanes x spatial dims (zero elsewhere).
extern "C" void scn_DR_(SparseToDense_updateOutput)(
    THLongTensor *inputSize, void **m, THCTensor *input_features,
    THCTensor *output_features, THCITensor *rulesBuffer) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  {
    // Allocate and zero the dense output: batch x planes x spatial.
    long sz[Dimension + 2];
    sz[0] = _m.inputSGs->size();     // batch size
    sz[1] = input_features->size[1]; // feature planes
    for (int i = 0; i < Dimension; i++) {
      auto x = THLongTensor_data(inputSize)[i];
      sz[i + 2] = x;
    }
    THCTensor_(resizeNd)(state, output_features, Dimension + 2, sz, NULL);
    THCTensor_(zero)(state, output_features);
  }
  // Fix: take the cached rulebook by reference; getSparseToDenseRuleBook
  // returns RuleBook& and plain `auto` deep-copied the whole rulebook.
  auto &_rules = _m.getSparseToDenseRuleBook(inputSize, true);
  auto spatialVolume = _rules.size(); // one rulebook entry per spatial site
  uInt nPlanes = input_features->size[1];
  auto iF = THCTensor_(data)(state, input_features);
  auto oF = THCTensor_(data)(state, output_features);
  // Copy each active site's feature row into the dense output. oF advances
  // by one element per rulebook entry (spatial site); inside the kernel the
  // plane index is scaled by spatialVolume, so consecutive entries address
  // consecutive spatial positions.
  // NOTE(review): original carried "todo check ++ or +=spatialVolume" — the
  // layout above suggests ++ is right, but confirm against the kernel.
  RULEBOOKITERATOR(SparseToDense_ForwardPass<real>(
                       THCState_getCurrentStream(state), iF, oF, nPlanes,
                       spatialVolume, rbB, nHotB);
                   , oF++;)
}
// Backward pass of SparseToDense: gather the gradients of the active sites
// out of the dense d_output_features into d_input_features.
extern "C" void scn_DR_(SparseToDense_updateGradInput)(
    THLongTensor *inputSize, void **m, THCTensor *input_features,
    THCTensor *d_input_features, THCTensor *d_output_features,
    THCITensor *rulesBuffer) {
  THCTensor_(resizeAs)(state, d_input_features, input_features);
  THCTensor_(zero)(state, d_input_features);
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  // Fix: take the cached rulebook by reference; getSparseToDenseRuleBook
  // returns RuleBook& and plain `auto` deep-copied the whole rulebook.
  auto &_rules = _m.getSparseToDenseRuleBook(inputSize, true);
  auto spatialVolume = _rules.size(); // one rulebook entry per spatial site
  uInt nPlanes = d_input_features->size[1];
  auto diF = THCTensor_(data)(state, d_input_features);
  auto doF = THCTensor_(data)(state, d_output_features);
  // doF advances by one per spatial site, mirroring the forward pass.
  RULEBOOKITERATOR(SparseToDense_BackwardPass<real>(
                       THCState_getCurrentStream(state), diF, doF, nPlanes,
                       spatialVolume, rbB, nHotB);
                   , doF++;)
}
#endif
PyTorch/sparseconvnet/SCN/generic/GPU/SparseToDense.h
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef GPU_SPARSETODENSE_H
#define GPU_SPARSETODENSE_H
#include "../SparseConvNet.h"
//#include <THC/THCAtomics.cuh>
// NTX must be >=2 so r is filled properly
// Forward kernel: copy each active input row into its slot of the dense
// output. rules holds nHot (input index, output offset) pairs.
template <typename T, uInt NTX, uInt NTY>
__global__ void SparseToDense_fp(T *input_features, T *output_features,
                                 uInt nPlanes, uInt spatialVolume, uInt *rules,
                                 uInt nHot) {
  // Stage up to NTY rule pairs in shared memory per outer iteration.
  __shared__ uInt r[NTY * 2];
  for (uInt n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
    {
      uInt i = threadIdx.x + NTX * threadIdx.y;
      // Fix: the bound was `i < 2 * (n - nHot)`; with unsigned uInt and
      // n < nHot the subtraction wraps to a huge value, so the guard was
      // vacuous and the final iteration could read past the 2*nHot valid
      // entries of `rules`. The number of remaining entries is 2*(nHot-n).
      // (The out-of-range r[] values were never consumed — see the
      // n + threadIdx.y < nHot guard below — so valid results are
      // unchanged.)
      if (i < NTY * 2 and i < 2 * (nHot - n))
        r[i] = rules[2 * n + i];
    }
    __syncthreads();
    if (n + threadIdx.y < nHot) {
      T *i = &input_features[r[2 * threadIdx.y] * nPlanes];
      T *o = &output_features[r[2 * threadIdx.y + 1] * spatialVolume *
                              nPlanes];
      // Dense layout is planes-major: plane p of this site lives at
      // o[p * spatialVolume].
      for (uInt plane = threadIdx.x; plane < nPlanes; plane += NTX)
        o[plane * spatialVolume] = i[plane];
    }
    __syncthreads();
  }
}
// Host-side launcher for SparseToDense_fp.
template <typename T>
void SparseToDense_ForwardPass(cudaStream_t stream, T *input_features,
                               T *output_features, uInt nPlanes,
                               uInt spatialVolume, uInt *rules, uInt nHot) {
  // The 32x32 tile must match the kernel's NTX/NTY template parameters;
  // the kernel loops over the nHot rules with a stride of gridDim.x * NTY.
  dim3 threads(32, 32);
  SparseToDense_fp<T, 32, 32><<<32, threads, 0, stream>>>(
      input_features, output_features, nPlanes, spatialVolume, rules, nHot);
}
// NTX must be >=2 so r is filled properly
// Backward kernel: gather each active site's gradient row out of the dense
// d_output_features. rules holds nHot (input index, output offset) pairs.
template <typename T, uInt NTX, uInt NTY>
__global__ void SparseToDense_bp(T *d_input_features, T *d_output_features,
                                 uInt nPlanes, uInt spatialVolume, uInt *rules,
                                 uInt nHot) {
  // Stage up to NTY rule pairs in shared memory per outer iteration.
  __shared__ uInt r[NTY * 2];
  for (uInt n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
    {
      uInt i = threadIdx.x + NTX * threadIdx.y;
      // Fix: the bound was `i < 2 * (n - nHot)`; with unsigned uInt and
      // n < nHot the subtraction wraps to a huge value, so the guard was
      // vacuous and the final iteration could read past the 2*nHot valid
      // entries of `rules`. The number of remaining entries is 2*(nHot-n).
      if (i < NTY * 2 and i < 2 * (nHot - n))
        r[i] = rules[2 * n + i];
    }
    __syncthreads();
    if (n + threadIdx.y < nHot) {
      T *i = &d_input_features[r[2 * threadIdx.y] * nPlanes];
      T *o = &d_output_features[r[2 * threadIdx.y + 1] * spatialVolume *
                                nPlanes];
      // Inverse of the forward copy: plane p comes from o[p * spatialVolume].
      for (uInt plane = threadIdx.x; plane < nPlanes; plane += NTX)
        i[plane] = o[plane * spatialVolume];
    }
    __syncthreads();
  }
}
// Host-side launcher for SparseToDense_bp.
template <typename T>
void SparseToDense_BackwardPass(cudaStream_t stream, T *d_input_features,
                                T *d_output_features, uInt nPlanes,
                                uInt spatialVolume, uInt *rules, uInt nHot) {
  // The 32x32 tile must match the kernel's NTX/NTY template parameters;
  // the kernel loops over the nHot rules with a stride of gridDim.x * NTY.
  dim3 threads(32, 32);
  SparseToDense_bp<T, 32, 32><<<32, threads, 0, stream>>>(
      d_input_features, d_output_features, nPlanes, spatialVolume, rules,
      nHot);
}
#endif
/* GPU_SPARSETODENSE_H */
PyTorch/sparseconvnet/SCN/generic/GPU/THGenerateCudaFloatTypes.h
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE
#error \
    "You must define TH_GENERIC_FILE before including THGenerateCudaFloatTypes.h"
#endif
// float
// Bind the TH "generic" type macros to CUDA single precision, include the
// generic source once with them in force, then clean everything up
// (including TH_GENERIC_FILE itself, so callers must redefine it per use).
#define real float
#define accreal double
#define Real Float
#define CReal Cuda
#define TH_REAL_IS_FLOAT
#define THBLAS_GEMM THCudaBlas_Sgemm
#line 1 TH_GENERIC_FILE
#include TH_GENERIC_FILE
#undef accreal
#undef real
#undef Real
#undef CReal
#undef TH_REAL_IS_FLOAT
#undef THBLAS_GEMM
#undef TH_GENERIC_FILE
PyTorch/sparseconvnet/SCN/generic/GPU/THGenerateDimCudaFloatTypes.h
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE_
#error "Define TH_GENERIC_FILE_ before including THGenerateDimCudaFloatTypes.h"
#endif
// Instantiate TH_GENERIC_FILE_ for Dimension = 1..10.
// THGenerateCudaFloatTypes.h consumes and #undefs TH_GENERIC_FILE, so it is
// re-defined before every include. (A redundant duplicate definition that
// preceded the first "#define Dimension 1" has been removed.)
#define Dimension 1
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateCudaFloatTypes.h"
#undef Dimension
#define Dimension 2
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateCudaFloatTypes.h"
#undef Dimension
#define Dimension 3
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateCudaFloatTypes.h"
#undef Dimension
#define Dimension 4
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateCudaFloatTypes.h"
#undef Dimension
#define Dimension 5
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateCudaFloatTypes.h"
#undef Dimension
#define Dimension 6
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateCudaFloatTypes.h"
#undef Dimension
#define Dimension 7
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateCudaFloatTypes.h"
#undef Dimension
#define Dimension 8
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateCudaFloatTypes.h"
#undef Dimension
#define Dimension 9
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateCudaFloatTypes.h"
#undef Dimension
#define Dimension 10
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateCudaFloatTypes.h"
#undef Dimension
#undef TH_GENERIC_FILE_
PyTorch/sparseconvnet/SCN/generic/Geometry/ActivePoolingRules.h
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef ACTIVEPOOLING_H
#define ACTIVEPOOLING_H
#include "../SparseConvNet.h"
// Build the rulebook for pooling over every active site of each sample.
// rules has size 2:
// rules[0] is a flattened batchSize x (maxActive + 1) matrix.
// First column is number of active sites for that sample (<= maxActive)
// Remaining maxActive columns give the active sites, zero padded.
// rules[1] holds {batchSize, maxActive}.
template <uInt dimension>
void activePoolingRules(SparseGrids<dimension> &SGs, RuleBook &rules) {
  rules.clear();
  rules.resize(2);
  auto &r = rules[0];
  // Row width is set by the sample with the most active sites.
  uInt maxActive = 0;
  for (auto &sg : SGs)
    maxActive = std::max(maxActive, (uInt)sg.mp.size());
  for (auto &sg : SGs) {
    r.push_back(sg.mp.size());
    for (auto &iter : sg.mp)
      r.push_back(sg.ctr + iter.second); // global input-feature index
    // Zero-pad this row up to maxActive + 1 entries.
    // Fix: this loop previously tested rules.size() (always 2), which spins
    // forever whenever maxActive >= 2; the vector being padded is r.
    while (r.size() % (maxActive + 1) != 0)
      r.push_back(0); // padding
  }
  rules[1].push_back(SGs.size());
  rules[1].push_back(maxActive);
}
#endif
/* ACTIVEPOOLING_H */
PyTorch/sparseconvnet/SCN/generic/Geometry/ConvolutionRules.h
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CONVOLUTIONRULES_H
#define CONVOLUTIONRULES_H
#include "RectangularRegions.h"
// For one sample: for every active input site, enumerate the output sites it
// contributes to (allocating output sites on first use) and append an
// (input index, output index) pair to the rulebook entry of the filter
// element connecting them. Indices are offset by each grid's ctr.
template <uInt dimension>
void Convolution_InputSgToRulesAndOutputSg(SparseGrid<dimension> &inputGrid,
                                           SparseGrid<dimension> &outputGrid,
                                           RuleBook &rules, long *size,
                                           long *stride,
                                           long *inputSpatialSize,
                                           long *outputSpatialSize) {
  // One rulebook entry per site of the filter volume.
  rules.resize(volume<dimension>(size));
  for (auto const &inIter : inputGrid.mp) {
    for (auto j : OutputRegionCalculator<dimension>(inIter.first, size, stride,
                                                    outputSpatialSize)) {
      auto inRegion = InputRegionCalculator<dimension>(j, size, stride);
      // Which filter element links this input site to output site j.
      uInt rulesOffset = inRegion.offset(inIter.first);
      auto outIter = outputGrid.mp.find(j);
      if (outIter == outputGrid.mp.end()) {
        // First time output site j is touched: give it the next index.
        outIter =
            outputGrid.mp.insert(std::make_pair(j, outputGrid.ctr++)).first;
      }
      rules[rulesOffset].push_back(inIter.second + inputGrid.ctr);
      rules[rulesOffset].push_back(outIter->second);
    }
  }
}
// Serial batch version: builds output grids and a shared rulebook for the
// whole batch. Returns the total number of active output sites.
template <uInt dimension>
uInt Convolution_InputSgsToRulesAndOutputSgs(
    SparseGrids<dimension> &input_SGs, SparseGrids<dimension> &output_SGs,
    RuleBook &rules, long *filterSize, long *filterStride,
    long *input_spatialSize, long *output_spatialSize) {
  rules.clear();
  output_SGs.clear();
  uInt batchSize = input_SGs.size();
  output_SGs.resize(batchSize);
  uInt output_nActive = 0;
  for (uInt i = 0; i < batchSize; i++) {
    auto &iSG = input_SGs[i];
    auto &oSG = output_SGs[i];
    // Start this sample's output numbering at the running total, so the
    // indices written into the rulebook and the grid are already global.
    oSG.ctr = output_nActive;
    Convolution_InputSgToRulesAndOutputSg<dimension>(
        iSG, oSG, rules, filterSize, filterStride, input_spatialSize,
        output_spatialSize);
    output_nActive = oSG.ctr;
    // The grid's stored indices are already global, so its offset is zero.
    oSG.ctr = 0;
  }
  return output_nActive;
}
// OpenMP batch version: each sample builds a private rulebook with
// sample-local output indices; the merge pass then offsets them to global
// indices. Equivalent output to the serial version above.
template <uInt dimension>
uInt Convolution_InputSgsToRulesAndOutputSgs_OMP(
    SparseGrids<dimension> &input_SGs, SparseGrids<dimension> &output_SGs,
    RuleBook &rules, long *filterSize, long *filterStride,
    long *input_spatialSize, long *output_spatialSize) {
  rules.clear();
  rules.resize(volume<dimension>(filterSize));
  output_SGs.clear();
  uInt batchSize = input_SGs.size();
  output_SGs.resize(batchSize);
  // Per-sample rulebooks, built in parallel with local (0-based) indices.
  std::vector<RuleBook> rbs(batchSize);
  {
    uInt i;
#pragma omp parallel for private(i)
    for (i = 0; i < batchSize; i++)
      Convolution_InputSgToRulesAndOutputSg<dimension>(
          input_SGs[i], output_SGs[i], rbs[i], filterSize, filterStride,
          input_spatialSize, output_spatialSize);
  }
  // Exclusive prefix sum: each sample's ctr becomes its global offset.
  uInt output_nActive = 0;
  for (uInt i = 0; i < batchSize; i++) {
    // Parallel assignment:
    // output_nActive <- output_nActive+output_SGs[i].ctr
    // output_SGs[i].ctr <- output_nActive
    uInt tmp = output_nActive;
    output_nActive += output_SGs[i].ctr;
    output_SGs[i].ctr = tmp;
  }
  // Merge the per-sample rulebooks, shifting output indices to global.
  {
    uInt i;
#pragma omp parallel for private(i)
    for (i = 0; i < rules.size(); i++) {
      auto &R = rules[i];
      for (uInt j = 0; j < batchSize; j++) {
        auto &r = rbs[j][i];
        auto offset = output_SGs[j].ctr;
        for (uInt k = 0; k < r.size();) {
          R.push_back(r[k++]);          // input index (already global)
          R.push_back(r[k++] + offset); // output index, made global
        }
      }
    }
  }
  return output_nActive;
}
// for each site in filterVolume, list of (inputFeatureNumber,batchIdx) pairs
// Implemented as a degenerate convolution: filter size == spatial size,
// stride 1, so each rulebook entry corresponds to one spatial location.
template <uInt dimension>
void SparseToDense_InputSgsToRulesAndOutputSgs(
    SparseGrids<dimension> &input_SGs, RuleBook &rules, long *spatialSize) {
  uInt batchSize = input_SGs.size();
  // Throwaway output grids; only their ctr (the batch index) matters.
  SparseGrids<dimension> output_SGs(batchSize);
  std::vector<long> ones(dimension, 1);
  rules.clear();
  for (uInt i = 0; i < batchSize; i++) {
    auto &iSG = input_SGs[i];
    auto &oSG = output_SGs[i];
    oSG.ctr = i;
    // batchIdx
    Convolution_InputSgToRulesAndOutputSg<dimension>(
        iSG, oSG, rules, spatialSize, &ones[0], spatialSize, &ones[0]);
  }
}
// OpenMP version of the above: per-sample rulebooks are built in parallel
// and then concatenated per spatial location. No index offsetting is needed
// because the input indices are already global and the second element is
// the batch index.
template <uInt dimension>
void SparseToDense_InputSgsToRulesAndOutputSgs_OMP(
    SparseGrids<dimension> &input_SGs, RuleBook &rules, long *spatialSize) {
  uInt batchSize = input_SGs.size();
  // Throwaway output grids; only their ctr (the batch index) matters.
  SparseGrids<dimension> output_SGs(batchSize);
  std::vector<long> ones(dimension, 1);
  rules.clear();
  rules.resize(volume<dimension>(spatialSize));
  std::vector<RuleBook> rbs(batchSize);
  {
    uInt i;
#pragma omp parallel for private(i)
    for (i = 0; i < batchSize; i++) {
      output_SGs[i].ctr = i;
      // batchIdx
      Convolution_InputSgToRulesAndOutputSg<dimension>(
          input_SGs[i], output_SGs[i], rbs[i], spatialSize, &ones[0],
          spatialSize, &ones[0]);
    }
  }
  // Concatenate the per-sample rulebooks, entry by entry.
  {
    uInt i;
#pragma omp parallel for private(i)
    for (i = 0; i < rules.size(); i++) {
      auto &R = rules[i];
      for (uInt j = 0; j < batchSize; j++) {
        auto &r = rbs[j][i];
        for (uInt k = 0; k < r.size();) {
          R.push_back(r[k++]);
          R.push_back(r[k++]);
        }
      }
    }
  }
}
#endif
/* CONVOLUTIONRULES_H */
PyTorch/sparseconvnet/SCN/generic/Geometry/Metadata.cpp
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/Geometry/Metadata.cpp"
#else
#include "Metadata.h"
#include <cstring>
// Set the spatial size of the network's input layer; binds the metadata's
// input-layer convenience pointers (inputSGs/inputNActive).
extern "C" void scn_D_(setInputSpatialSize)(void **m,
                                            THLongTensor *spatialSize) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  _m.setInputSpatialSize(spatialSize);
}
// Append a fresh, empty sparse grid to the input batch and make it the
// current sample (the one setInputSpatialLocation writes into).
extern "C" void scn_D_(batchAddSample)(void **m) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  assert(_m.inputSGs && "Call setInputSpatialSize first, please!");
  _m.inputSGs->resize(_m.inputSGs->size() + 1);
  _m.inputSG = &_m.inputSGs->back();
}
// Set the feature vector `vec` at `location` in the current input sample.
// A new location appends a row to `features` (which is resized to fit); an
// existing location is overwritten only when `overwrite` is true.
extern "C" void scn_D_(setInputSpatialLocation)(void **m,
                                                THFloatTensor *features,
                                                THLongTensor *location,
                                                THFloatTensor *vec,
                                                bool overwrite) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto p = LongTensorToPoint<Dimension>(location);
  auto &mp = _m.inputSG->mp;
  auto &nActive = *_m.inputNActive;
  auto iter = mp.find(p);
  auto nPlanes = vec->size[0];
  if (iter == mp.end()) {
    // New location: assign it the next row index, grow the features matrix
    // and copy vec into the new last row.
    iter = mp.insert(std::make_pair(p, nActive++)).first;
    THFloatTensor_resize2d(features, nActive, nPlanes);
    std::memcpy(THFloatTensor_data(features) + (nActive - 1) * nPlanes,
                THFloatTensor_data(vec), sizeof(float) * nPlanes);
  } else if (overwrite) {
    // Known location: replace its existing row in place.
    std::memcpy(THFloatTensor_data(features) + iter->second * nPlanes,
                THFloatTensor_data(vec), sizeof(float) * nPlanes);
  }
}
// Build input metadata from a dense-to-sparse conversion.
// nz_ is an nActive x (Dimension+1) matrix of coordinates: column 0 is the
// batch index, columns 1..Dimension the spatial location; rows are assumed
// grouped/sorted by batch index. pad_ holds a per-sample spatial offset
// added to each coordinate.
extern "C" void scn_D_(createMetadataForDenseToSparse)(
    void **m, THLongTensor *spatialSize_, THLongTensor *pad_,
    THLongTensor *nz_, long batchSize) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  _m.setInputSpatialSize(spatialSize_);
  _m.inputSGs->resize(batchSize);
  auto &nActive = *_m.inputNActive;
  nActive = nz_->size[0];
  auto nz = THLongTensor_data(nz_);
  auto pad = THLongTensor_data(pad_);
  auto spatialSize = THLongTensor_data(spatialSize_);
  // br[b]..br[b+1] delimits the rows of nz_ belonging to sample b.
  std::vector<uInt> br(batchSize + 1);
  if (batchSize == 1) {
    br[1] = nActive;
  } else {
    long b = 0;
    for (uInt i = 0; i < nActive; i++) {
      long B = nz[i * (Dimension + 1)]; // batch index of row i
      for (; b < B;)
        br[++b] = i;
    }
    // Samples with no active sites get empty ranges ending at nActive.
    for (; b < batchSize;)
      br[++b] = nActive;
  }
  // Fill each sample's sparse grid in parallel (samples are independent).
  uInt b;
#pragma omp parallel for private(b)
  for (b = 0; b < batchSize; b++) {
    auto &sg = _m.inputSGs->at(b);
    for (uInt i = br[b]; i < br[b + 1]; i++) {
      Point<Dimension> x;
      for (uInt j = 0; j < Dimension; j++) {
        x[j] = nz[i * (Dimension + 1) + j + 1] + pad[b * Dimension + j];
        // 0-indexed
      }
      sg.mp[x] = i; // row i of the feature matrix
    }
  }
}
// tensor is size[0] x .. x size[Dimension-1] x size[Dimension]
// size[0] x .. x size[Dimension-1] == spatial volume
// size[Dimension] == #feature planes
// Append one sample whose active sites are the tensor positions where some
// feature exceeds `threshold` in absolute value; their feature vectors are
// appended to features_.
extern "C" void scn_D_(addSampleFromThresholdedTensor)(
    void **m, THFloatTensor *features_, THFloatTensor *tensor_,
    THLongTensor *offset_, THLongTensor *spatialSize_, float threshold) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto &nActive = *_m.inputNActive;
  auto &SGs = *_m.inputSGs;
  SGs.resize(SGs.size() + 1); // the new sample's grid
  auto &sg = SGs.back();
  auto tensor = THFloatTensor_data(tensor_);
  auto offset = THLongTensor_data(offset_);
  auto spatialSize = THLongTensor_data(spatialSize_);
  long *size = tensor_->size;
  auto nPlanes = size[Dimension];
  long volume = 1;
  for (int i = 0; i < Dimension; ++i)
    volume *= size[i];
  // Reserve worst case (all sites active); shrunk to fit at the end.
  THFloatTensor_resize2d(features_, nActive + volume, nPlanes);
  // Increment pointers as we work through the data
  auto features = THFloatTensor_data(features_) + nActive * nPlanes;
  // Active locations
  Point<Dimension> point;
  for (uInt i = 0; i < Dimension; i++)
    point[i] = offset[i];
  for (uInt ctr = 0; ctr < volume; ctr++) {
    // Active if any feature plane exceeds the threshold in magnitude.
    bool active = false;
    for (uInt i = 0; i < nPlanes; i++) {
      if (fabs(tensor[i]) > threshold) {
        active = true;
        break;
      }
    }
    // Sites shifted outside the valid spatial range are never active.
    for (uInt i = 0; i < Dimension; i++) {
      if (point[i] < 0 or point[i] >= spatialSize[i]) {
        active = false;
        break;
      }
    }
    if (active) {
      sg.mp[point] = nActive++;
      std::memcpy(features, tensor, sizeof(float) * nPlanes);
      features += nPlanes;
    }
    tensor += nPlanes;
    incrementPointInCube<Dimension>(point, size, offset);
  }
  // Trim the over-allocation down to the rows actually written.
  THFloatTensor_resize2d(features_, nActive, nPlanes);
}
// 3x3 valid convolutions, 3x3/2x2 pooling or strided convolutions
// Pre-builds, level by level, the valid-convolution (size 3) and strided
// (size 3, stride 2) rulebooks for a downsampling pyramid, halving the
// spatial size while every dimension is odd and >= 3.
extern "C" void scn_D_(generateRuleBooks3s2)(void **m) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  long sz[Dimension], str[Dimension], inS[Dimension], outS[Dimension];
  Point<Dimension> p1;     // current spatial size (grid key)
  Point<2 * Dimension> p2; // key: (spatial size, filter size 3)
  Point<3 * Dimension> p3; // key: (spatial size, size 3, stride 2)
  for (int i = 0; i < Dimension; ++i) {
    p1[i] = p2[i] = p3[i] = inS[i] = _m.inputSpatialSize[i];
    p2[i + Dimension] = p3[i + Dimension] = sz[i] = 3;
    p3[i + 2 * Dimension] = str[i] = 2;
  }
  while (true) {
    auto &SGs = _m.grids[p1];
    auto &rb = _m.validRuleBooks[p2];
    // Build (if not cached) the size-3 valid-convolution rulebook.
    if (rb.empty())
      ValidConvolution_SgsToRules(SGs, rb, sz);
    // Stop when any dimension can no longer be downsampled (< 3 or even).
    for (int i = 0; i < Dimension; ++i)
      if (p1[i] < 3 or p1[i] % 2 != 1)
        return;
      else
        p1[i] = outS[i] = (inS[i] - 1) / 2;
    auto &SGs2 = _m.grids[p1];
    auto &rb2 = _m.ruleBooks[p3];
    // Build (if not cached) the size-3/stride-2 rulebook and record the
    // active-site count of the downsampled level.
    if (rb2.empty())
      _m.nActive[p1] = Convolution_InputSgsToRulesAndOutputSgs(
          SGs, SGs2, rb2, sz, str, inS, outS);
    for (int i = 0; i < Dimension; ++i)
      p2[i] = p3[i] = inS[i] = outS[i];
  }
}
// 3x3 valid convolutions, 2x2 pooling or strided convolutions
// Pre-builds, level by level, the valid-convolution (size 3) and strided
// (size 2, stride 2) rulebooks for a downsampling pyramid, halving the
// spatial size while every dimension is even and >= 2.
extern "C" void scn_D_(generateRuleBooks2s2)(void **m) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  long s2[Dimension], s3[Dimension], inS[Dimension], outS[Dimension];
  Point<Dimension> p1;     // current spatial size (grid key)
  Point<2 * Dimension> p2; // key: (spatial size, filter size 3)
  Point<3 * Dimension> p3; // key: (spatial size, size 2, stride 2)
  for (int i = 0; i < Dimension; ++i) {
    p1[i] = p2[i] = p3[i] = inS[i] = _m.inputSpatialSize[i];
    p2[i + Dimension] = s3[i] = 3;
    p3[i + Dimension] = p3[i + 2 * Dimension] = s2[i] = 2;
  }
  while (true) {
    auto &SGs = _m.grids[p1];
    auto &rb = _m.validRuleBooks[p2];
    // Consistency fix: build the valid-convolution rulebook only when it is
    // not already cached. generateRuleBooks3s2 and getValidRuleBook guard
    // the same way; rebuilding unconditionally would append duplicate rules
    // when this is called more than once.
    if (rb.empty())
      ValidConvolution_SgsToRules(SGs, rb, s3);
    // Stop when any dimension can no longer be halved (< 2 or odd).
    for (int i = 0; i < Dimension; ++i)
      if (p1[i] < 2 or p1[i] % 2 != 0)
        return;
      else
        p1[i] = outS[i] = inS[i] / 2;
    auto &SGs2 = _m.grids[p1];
    auto &rb2 = _m.ruleBooks[p3];
    // Build (if not cached) the size-2/stride-2 rulebook and record the
    // active-site count of the downsampled level.
    if (rb2.empty())
      _m.nActive[p1] = Convolution_InputSgsToRulesAndOutputSgs(
          SGs, SGs2, rb2, s2, s2, inS, outS);
    for (int i = 0; i < Dimension; ++i)
      p2[i] = p3[i] = inS[i] = outS[i];
  }
}
// Destroy the Metadata object held behind the opaque handle.
extern "C" void scn_D_(freeMetadata)(void **m) {
  SCN_DELETE(Metadata<Dimension>, m)
}
#endif
PyTorch/sparseconvnet/SCN/generic/Geometry/Metadata.h
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef Metadata_H
#define Metadata_H
#include "../SparseConvNet.h"
#include "ActivePoolingRules.h"
#include "ConvolutionRules.h"
#include "ValidConvolutionRules.h"
#include <iostream>
#include <tuple>
#include <unordered_map>
// Per-dimension bookkeeping shared by the layers of a network: sparse grids,
// active-site counts, and lazily built, cached rulebooks, each keyed by
// spatial size (plus filter size / stride where relevant).
template <uInt dimension> class Metadata {
public:
  // Number of active sites, per spatial size.
  std::unordered_map<Point<dimension>, uInt, IntArrayHash<dimension>> nActive;
  // Batch of sparse grids, per spatial size.
  std::unordered_map<Point<dimension>, SparseGrids<dimension>,
                     IntArrayHash<dimension>>
      grids;
  // Rulebook caches. Keys:
  //   activePoolingRuleBooks: spatial size
  //   validRuleBooks:         (spatial size, filter size)
  //   ruleBooks:              (input spatial size, filter size, stride)
  //   sparseToDenseRuleBooks: spatial size
  std::unordered_map<Point<dimension>, RuleBook, IntArrayHash<dimension>>
      activePoolingRuleBooks;
  std::unordered_map<Point<2 * dimension>, RuleBook,
                     IntArrayHash<2 * dimension>>
      validRuleBooks;
  std::unordered_map<Point<3 * dimension>, RuleBook,
                     IntArrayHash<3 * dimension>>
      ruleBooks;
  std::unordered_map<Point<dimension>, RuleBook, IntArrayHash<dimension>>
      sparseToDenseRuleBooks;
  Point<dimension> inputSpatialSize;
  // Convenience pointers into the maps above for the input layer; set by
  // setInputSpatialSize. (unordered_map references stay valid on insert.)
  SparseGrids<dimension> *inputSGs;
  SparseGrid<dimension> *inputSG; // current sample being built
  uInt *inputNActive;
  Metadata() {}
  // Record the input layer's spatial size and bind the pointers above.
  void setInputSpatialSize(THLongTensor *spatialSize) {
    inputSpatialSize = LongTensorToPoint<dimension>(spatialSize);
    inputSGs = &grids[inputSpatialSize];
    inputNActive = &nActive[inputSpatialSize];
  }
  SparseGrids<dimension> &getSparseGrid(THLongTensor *spatialSize) {
    return grids[LongTensorToPoint<dimension>(spatialSize)];
  };
  uInt getNActive(THLongTensor *spatialSize) {
    return nActive[LongTensorToPoint<dimension>(spatialSize)];
  };
  // All getters below build their rulebook on first request and cache it;
  // an empty rulebook is treated as "not yet built".
  RuleBook &getValidRuleBook(THLongTensor *spatialSize, THLongTensor *size,
                             bool openMP) {
    auto p = TwoLongTensorsToPoint<dimension>(spatialSize, size);
    auto &rb = validRuleBooks[p];
    if (rb.empty()) {
      auto &SGs = grids[LongTensorToPoint<dimension>(spatialSize)];
#if defined(ENABLE_OPENMP)
      openMP
          ? ValidConvolution_SgsToRules_OMP(SGs, rb, THLongTensor_data(size))
          :
#endif
            ValidConvolution_SgsToRules(SGs, rb, THLongTensor_data(size));
    }
    return rb;
  }
  RuleBook &getActivePoolingRuleBook(THLongTensor *spatialSize) {
    auto spatialSz = LongTensorToPoint<dimension>(spatialSize);
    auto &SGs = grids[spatialSz];
    auto &rb = activePoolingRuleBooks[spatialSz];
    if (rb.empty())
      activePoolingRules(SGs, rb);
    return rb;
  }
  RuleBook &getSparseToDenseRuleBook(THLongTensor *spatialSize, bool openMP) {
    auto ss = LongTensorToPoint<dimension>(spatialSize);
    auto &SGs = grids[ss];
    auto &rb = sparseToDenseRuleBooks[ss];
    if (rb.empty())
#if defined(ENABLE_OPENMP)
      openMP ? SparseToDense_InputSgsToRulesAndOutputSgs_OMP(
                   SGs, rb, THLongTensor_data(spatialSize))
             :
#endif
             SparseToDense_InputSgsToRulesAndOutputSgs(
                 SGs, rb, THLongTensor_data(spatialSize));
    return rb;
  }
  // Also populates the output spatial size's grids and nActive entries.
  RuleBook &getRuleBook(THLongTensor *inputSpatialSize,
                        THLongTensor *outputSpatialSize, THLongTensor *size,
                        THLongTensor *stride, bool openMP) {
    auto p = ThreeLongTensorsToPoint<dimension>(inputSpatialSize, size, stride);
    auto &rb = ruleBooks[p];
    if (rb.empty()) {
      auto iS = LongTensorToPoint<dimension>(inputSpatialSize);
      auto oS = LongTensorToPoint<dimension>(outputSpatialSize);
      auto &iSGs = grids[iS];
      auto &oSGs = grids[oS];
      nActive[oS] =
#if defined(ENABLE_OPENMP)
          openMP ? Convolution_InputSgsToRulesAndOutputSgs_OMP(
                       iSGs, oSGs, rb, THLongTensor_data(size),
                       THLongTensor_data(stride),
                       THLongTensor_data(inputSpatialSize),
                       THLongTensor_data(outputSpatialSize))
                 :
#endif
                   Convolution_InputSgsToRulesAndOutputSgs(
                       iSGs, oSGs, rb, THLongTensor_data(size),
                       THLongTensor_data(stride),
                       THLongTensor_data(inputSpatialSize),
                       THLongTensor_data(outputSpatialSize));
    }
    return rb;
  }
};
#endif
PyTorch/sparseconvnet/SCN/generic/Geometry/RectangularRegions.h
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef RECTANGULARREGIONS_H
#define RECTANGULARREGIONS_H
#include "../SparseConvNet.h"
// For iterating over the rectangular region with corners lb and ub.
// The .end() method and operator!= are designed to allow range based for
// loops of the region, but nothing else.
template <uInt dimension> class RectangularRegionIterator;
// Closed axis-aligned box [lb[0],ub[0]] x ... x [lb[d-1],ub[d-1]].
template <uInt dimension> class RectangularRegion {
public:
  Point<dimension> lb; // lower corner (inclusive)
  Point<dimension> ub; // upper corner (inclusive)
  RectangularRegion(Point<dimension> &lb, Point<dimension> &ub)
      : lb(lb), ub(ub) {}
  RectangularRegionIterator<dimension> begin() {
    return RectangularRegionIterator<dimension>(*this, lb);
  }
  RectangularRegionIterator<dimension> end() {
    // Not really used by the custom operator!= function
    // Otherwise it would need to represent a point just outside the region
    return RectangularRegionIterator<dimension>(*this, ub);
  }
  // Index of p within the region, last dimension varying fastest.
  uInt offset(const Point<dimension> &p) {
    // Enumerate the points inside the region
    uInt of = 0, m = 1;
    for (Int i = dimension - 1; i >= 0; i--) {
      of += m * (p[i] - lb[i]);
      m *= ub[i] - lb[i] + 1;
    }
    return of;
  }
};
// Iterator over a RectangularRegion, for range-based for loops only (see
// the custom operator!= below, which ignores its right-hand side).
template <uInt dimension> class RectangularRegionIterator {
private:
  RectangularRegion<dimension> &region;

public:
  bool stillLooping; // cleared once iteration has exhausted the region
  Point<dimension> point;
  RectangularRegionIterator(RectangularRegion<dimension> &region,
                            Point<dimension> &point)
      : region(region), point(point), stillLooping(true) {
    // If stride > size, we can have lb[i]>ub[i] meaning region_size == 0
    for (Int i = 0; i < dimension; i++)
      if (point[i] > region.ub[i])
        stillLooping = false;
  }
  // Odometer-style increment: bump the last dimension, wrapping to lb and
  // carrying into earlier dimensions; a carry out of dimension 0 ends the
  // iteration.
  RectangularRegionIterator<dimension> &operator++() {
    for (Int i = dimension - 1;;) {
      point[i]++;
      if (point[i] <= region.ub[i])
        break;
      point[i] = region.lb[i];
      i--;
      if (i == -1) {
        stillLooping = false;
        // Signal to operator!= to end iteration
        break;
      }
    }
    return *this;
  }
  Point<dimension> &operator*() { return point; }
};
// Only to be used for checking the end point of range based for loops.
// Ignores rhs entirely: the loop ends when lhs has finished iterating.
template <uInt dimension>
inline bool operator!=(const RectangularRegionIterator<dimension> &lhs,
                       const RectangularRegionIterator<dimension> &rhs) {
  return lhs.stillLooping;
}
// Similar to above but for [ offset[0] ... offset[0]+size[0]-1 ] x ... x [..]
// Odometer increment: bump the last coordinate, wrapping back to its offset
// and carrying into the previous dimension when it passes offset+size-1.
template <uInt dimension>
void incrementPointInCube(Point<dimension> &point, long *size, long *offset) {
  for (Int i = dimension - 1; i >= 0; i--) {
    point[i]++;
    if (point[i] < offset[i] + size[i])
      break;
    point[i] = offset[i];
  }
}
// For a convolutional layer with given filter *size* and *stride*, find the
// subset of the input field corresponding to a point in the output.
template <uInt dimension>
RectangularRegion<dimension>
InputRegionCalculator(const Point<dimension> &output, long *size,
                      long *stride) {
  Point<dimension> lb, ub;
  for (uInt i = 0; i < dimension; i++) {
    lb[i] = output[i] * stride[i];               // first input site covered
    ub[i] = output[i] * stride[i] + size[i] - 1; // last input site covered
  }
  return RectangularRegion<dimension>(lb, ub);
}
// For a convolutional layer with the given filter *size* and *stride*,
// compute the rectangular subset of the output field whose receptive fields
// contain the given input point.
template <uInt dimension>
RectangularRegion<dimension>
OutputRegionCalculator(const Point<dimension> &input, long *size, long *stride,
                       long *outputSpatialSize) {
  Point<dimension> lb, ub;
  for (uInt d = 0; d < dimension; d++) {
    // First output whose window still covers input[d], clamped at zero.
    lb[d] = std::max(0L, (input[d] - size[d] + stride[d]) / stride[d]);
    // Last output whose window starts at or before input[d], clamped to the
    // output's spatial extent.
    ub[d] = std::min(outputSpatialSize[d] - 1, input[d] / stride[d]);
  }
  return RectangularRegion<dimension>(lb, ub);
}
#endif
/* RECTANGULARREGIONS_H */
PyTorch/sparseconvnet/SCN/generic/Geometry/THGenerateDimTypes.h
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE
#error "You must define TH_GENERIC_FILE before including THGenerateDimTypes.h"
#endif
// "X-macro" expansion: include the generic implementation file once for each
// supported spatial dimension (1..10), with the macro `Dimension` defined for
// each pass. Each `#line 1 TH_GENERIC_FILE` resets the compiler's reported
// file/line so diagnostics point into the generic file, not this one.
#define Dimension 1
#line 1 TH_GENERIC_FILE
#include TH_GENERIC_FILE
#undef Dimension
#define Dimension 2
#line 1 TH_GENERIC_FILE
#include TH_GENERIC_FILE
#undef Dimension
#define Dimension 3
#line 1 TH_GENERIC_FILE
#include TH_GENERIC_FILE
#undef Dimension
#define Dimension 4
#line 1 TH_GENERIC_FILE
#include TH_GENERIC_FILE
#undef Dimension
#define Dimension 5
#line 1 TH_GENERIC_FILE
#include TH_GENERIC_FILE
#undef Dimension
#define Dimension 6
#line 1 TH_GENERIC_FILE
#include TH_GENERIC_FILE
#undef Dimension
#define Dimension 7
#line 1 TH_GENERIC_FILE
#include TH_GENERIC_FILE
#undef Dimension
#define Dimension 8
#line 1 TH_GENERIC_FILE
#include TH_GENERIC_FILE
#undef Dimension
#define Dimension 9
#line 1 TH_GENERIC_FILE
#include TH_GENERIC_FILE
#undef Dimension
#define Dimension 10
#line 1 TH_GENERIC_FILE
#include TH_GENERIC_FILE
#undef Dimension
// Leave TH_GENERIC_FILE undefined so the next generator can set its own.
#undef TH_GENERIC_FILE
PyTorch/sparseconvnet/SCN/generic/Geometry/ValidConvolutionRules.h
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef VALIDCONVOLUTIONRULES_H
#define VALIDCONVOLUTIONRULES_H
#include<iostream>
// Input region for one output point of a "valid" (submanifold) convolution:
// a window of side size[i] centred on the output point, shifted left by
// size[i]/2 in each dimension.
template <uInt dimension>
RectangularRegion<dimension>
InputRegionCalculator_Valid(const Point<dimension> &output, long *size) {
  Point<dimension> lb, ub;
  for (uInt d = 0; d < dimension; d++) {
    Int pad = size[d] / 2;
    lb[d] = output[d] - pad;
    ub[d] = output[d] + size[d] - 1 - pad;
  }
  return RectangularRegion<dimension>(lb, ub);
}
// Call once per batch item for each valid convolutional / max-pooling layer.
// For every active output site, scan its input region; for each active input
// site append an (input row, output row) pair to the rulebook entry of the
// corresponding filter offset. The rulebook carries out the "lowering" while
// performing the convolution.
// Returns the number of active (input, output) pairs found (as a double, so
// accumulation across a large batch cannot overflow uInt).
// NOTE: `rules` must already be sized to volume<dimension>(size) by the
// caller (see ValidConvolution_SgsToRules / _OMP).
template <uInt dimension>
double ValidConvolution_SgToRules(SparseGrid<dimension> &grid, RuleBook &rules,
                                  long *size) {
  // (Fix: dropped an unused `uInt sd = volume<dimension>(size);` that was
  // recomputed on every call and never read.)
  double countActiveInputs = 0;
  for (auto const &outputIter : grid.mp) {
    auto inRegion =
        InputRegionCalculator_Valid<dimension>(outputIter.first, size);
    uInt rulesOffset = 0; // index of the filter offset within the region
    for (auto inputPoint : inRegion) {
      auto inputIter = grid.mp.find(inputPoint);
      if (inputIter != grid.mp.end()) {
        // Rows are shifted by grid.ctr: the sample's offset within the batch.
        rules[rulesOffset].push_back(inputIter->second + grid.ctr);
        rules[rulesOffset].push_back(outputIter.second + grid.ctr);
        countActiveInputs++;
      }
      rulesOffset++;
    }
  }
  return countActiveInputs;
}
// Build the rulebook for a whole batch, serially: reset `rules` to one entry
// per spatial location of the filter, then accumulate each sample's pairs.
// Returns the total number of active (input, output) pairs.
template <uInt dimension>
uInt ValidConvolution_SgsToRules(SparseGrids<dimension> &SGs, RuleBook &rules,
                                 long *size) {
  rules.clear();
  rules.resize(volume<dimension>(size));
  uInt countActiveInputs = 0;
  for (auto &sg : SGs)
    countActiveInputs += ValidConvolution_SgToRules<dimension>(sg, rules, size);
  return countActiveInputs;
}
// OpenMP variant of ValidConvolution_SgsToRules: build one private rulebook
// per sample in parallel, then merge them per filter offset in parallel.
// Returns the total number of active (input, output) pairs.
template <uInt dimension>
uInt ValidConvolution_SgsToRules_OMP(SparseGrids<dimension> &SGs,
                                     RuleBook &rules, long *size) {
  // One scratch rulebook and one pair-count per sample, so the first
  // parallel loop needs no synchronization.
  std::vector<RuleBook> rbs(SGs.size());
  std::vector<double> countActiveInputs(SGs.size());
  rules.clear();
  uInt sd = volume<dimension>(size); // entries per rulebook: filter volume
  rules.resize(sd);
  {
    uInt i;
#pragma omp parallel for private(i)
    for (i = 0; i < SGs.size(); i++) {
      rbs[i].resize(sd);
      // Each sample writes only rbs[i] / countActiveInputs[i].
      countActiveInputs[i] =
          ValidConvolution_SgToRules<dimension>(SGs[i], rbs[i], size);
    }
  }
  {
    uInt i;
#pragma omp parallel for private(i)
    for (i = 0; i < sd; i++)
      // Merge: each thread owns one filter offset i, concatenating that
      // offset's pairs from every per-sample rulebook (in sample order, so
      // the result matches the serial version).
      for (auto const &rb : rbs)
        rules[i].insert(rules[i].end(), rb[i].begin(), rb[i].end());
  }
  // Reduce the per-sample counts (doubles) into the returned total.
  uInt countActiveInputs_ = 0;
  for (auto &i : countActiveInputs)
    countActiveInputs_ += i;
  return countActiveInputs_;
}
#endif
/* VALIDCONVOLUTIONRULES_H */
PyTorch/sparseconvnet/SCN/generic/SparseConvNet.h
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef SPARSECONVNET_H
#define SPARSECONVNET_H
// To use 64 bits instead of 32, replace 32bits.h with 64bits.h
#include "32bits.h"
#include <algorithm>
#include <array>
#include <cstdint>
#include <google/dense_hash_map>
#include <iostream>
#include <string>
#include <tuple>
#include <vector>
#if defined(ENABLE_OPENMP)
#include <omp.h>
#endif
// Submanifold Sparse Convolutional Networks
// A batch of samples is encoded, for each layer of a sparse convolutional
// network, as a matrix of nActive x nFeatures together with a vector of hash
// tables that identify points in space with the rows of the matrix.
// SparseGridMap<dimension>: hash table assigning integer row labels to a
// sparse collection of 'Point<dimension>' points. dense_hash_map is used for
// lookup speed; it requires one reserved "empty key" value that is never
// inserted (set in the SparseGrid constructor below).
template <uInt dimension>
using SparseGridMap =
    google::dense_hash_map<Point<dimension>, int, IntArrayHash<dimension>,
                           std::equal_to<Point<dimension>>>;
// One sample's sparse spatial layout: the map from active points to matrix
// rows, plus a counter/offset used while assembling a batch.
template <uInt dimension> class SparseGrid {
public:
  // Counts active sites during output-hash construction; afterwards holds
  // this sample's row offset within the batch.
  uInt ctr;
  SparseGridMap<dimension> mp;
  SparseGrid() : ctr(0) {
    // sparsehash reserves one key value that must never be inserted; use
    // (Int_MAX, ..., Int_MAX) as that sentinel.
    Point<dimension> empty_key;
    for (uInt d = 0; d < dimension; d++)
      empty_key[d] = Int_MAX;
    mp.set_empty_key(empty_key);
  }
};
// SparseGrids: one SparseGrid per sample in the minibatch.
template <uInt dimension>
using SparseGrids = std::vector<SparseGrid<dimension>>;
// Each convolution/pooling operation requires the calculation of a 'rulebook'
// setting out how the output points depend on the points in the layer below:
// one vector per filter offset, holding interleaved (input row, output row)
// index pairs.
using RuleBook = std::vector<std::vector<uInt>>;
// Code relating to squares/cubes/rectangles/cuboids etc
// Integer powers n^m, recursively unrolled at compile time. Fine for filter
// sizes; could overflow if we were to calculate inputSpatialSize^dimension.
// The full specializations are marked `inline`: an explicit specialization of
// a function template is an ordinary function, so without `inline` defining
// it in this header would violate the ODR (multiple-definition link errors)
// once the header is included in more than one translation unit.
template <uInt m> inline uInt ipow(uInt n) { return n * ipow<m - 1>(n); }
template <> inline uInt ipow<1>(uInt n) { return n; }
template <> inline uInt ipow<0>(uInt n) { return 1; }
// Product of the `dimension` entries of `point`: the number of sites in a
// box with those side lengths (e.g. a filter's spatial volume).
template <uInt dimension> uInt volume(long *point) {
  uInt product = 1;
  for (uInt d = 0; d < dimension; d++)
    product *= point[d];
  return product;
}
// Macro to initialize arguments passed as void*[1] from Lua.
// This allows Lua to take ownership of arbitrary C++ objects.
// The macro:
// - takes a pointer to a pointer [allocated as ffi.new('void *[1]') in Lua]
// - if the pointer has not yet been initialized, create an object for it
// - create a reference "_VAR" to the object
#define SCN_INITIALIZE_AND_REFERENCE(TYPE, VAR) \
if (VAR[0] == NULL) \
VAR[0] = (void *)new TYPE; \
TYPE &_##VAR = *(TYPE *)VAR[0];
// Macro to free the memory allocated by SCN_INITIALIZE_AND_REFERENCE
#define SCN_DELETE(TYPE, VAR) \
if (VAR[0] != NULL) { \
delete (TYPE *) VAR[0]; \
VAR[0] = NULL; \
}
// Largest number of entries in any one rulebook row (each (input, output)
// pair contributes two entries). Marked `inline`: this is a non-template
// function defined in a header, so without `inline` including the header in
// more than one translation unit would cause multiple-definition link errors.
inline uInt ruleBookMaxSize(RuleBook &rb) {
  uInt m = 0;
  for (auto &r : rb)
    m = std::max(m, (uInt)r.size());
  return m;
}
// Total number of entries across all rulebook rows (twice the number of
// (input, output) pairs). Marked `inline` for the same ODR reason as
// ruleBookMaxSize: a non-template definition in a shared header.
inline uInt ruleBookTotalSize(RuleBook &rb) {
  uInt m = 0;
  for (auto &r : rb)
    m += (uInt)r.size();
  return m;
}
#endif
/* SPARSECONVNET_H */
Prev
1
2
3
4
5
6
7
…
9
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment