Commit 2c4ed608, authored Jun 20, 2018 by Benjamin Thomas Graham

    Goodbye THNN. Hello ATen!

Parent: 6d4475db

Showing 20 changed files with 1648 additions and 307 deletions (+1648, -307)
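The same migration pattern repeats across every file in the commit: THC's macro-generated C entry points become templated C++ functions over at::Tensor, the unsigned uInt index type becomes the signed Int, and the explicit cudaStream_t / THCState plumbing disappears in favour of the default stream. A condensed before/after sketch (illustrative only: Foo, kernel, grid and block are placeholders, not names from this commit; the real signatures are in the diffs below):

// Before (THNN/THC): macro-generated C entry point, explicit THCState.
extern "C" void scn_R_(Foo_updateOutput)(THCTensor *input_features,
                                         THCTensor *output_features) {
  THCTensor_(resizeAs)(state, output_features, input_features);
  long nActive = input_features->size[0];
  kernel<<<grid, block, 0, THCState_getCurrentStream(state)>>>(
      THCTensor_(data)(state, input_features),
      THCTensor_(data)(state, output_features), nActive);
}

// After (ATen): templated C++ entry point over at::Tensor, default stream.
template <typename T>
void cuda_Foo_updateOutput(/*cuda float*/ at::Tensor input_features,
                           /*cuda float*/ at::Tensor output_features) {
  output_features.resize_as_(input_features);
  auto nActive = input_features.size(0);
  kernel<<<grid, block>>>(input_features.data<T>(),
                          output_features.data<T>(), nActive);
}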
sparseconvnet/SCN/CUDA/BatchNormalization.cu              +94    -0
sparseconvnet/SCN/CUDA/BatchNormalization.h               +32   -35
sparseconvnet/SCN/CUDA/BatchwiseMultiplicativeDropout.cu  +69    -0
sparseconvnet/SCN/CUDA/BatchwiseMultiplicativeDropout.h   +19   -19
sparseconvnet/SCN/CUDA/Convolution.cu                    +318    -0
sparseconvnet/SCN/CUDA/Convolution.h                     +122  -125
sparseconvnet/SCN/CUDA/Deconvolution.cu                   +83    -0
sparseconvnet/SCN/CUDA/Deconvolution.h                   +108  -111
sparseconvnet/SCN/CUDA/IOLayers.cu                       +244    -0
sparseconvnet/SCN/CUDA/IOLayers.h                         +13   -13
sparseconvnet/SCN/CUDA/LeakyReLU.cu                       +29    -0
sparseconvnet/SCN/CUDA/LeakyReLU.h                         +4    -4
sparseconvnet/SCN/CUDA/MaxPooling.cu                     +103    -0
sparseconvnet/SCN/CUDA/MaxPooling.h                       +77    -0
sparseconvnet/SCN/CUDA/NetworkInNetwork.cu                +47    -0
sparseconvnet/SCN/CUDA/RuleBookIterator.h                 +34    -0
sparseconvnet/SCN/CUDA/SparseToDense.cu                   +56    -0
sparseconvnet/SCN/CUDA/SparseToDense.h                    +71    -0
sparseconvnet/SCN/CUDA/UnPooling.cu                       +54    -0
sparseconvnet/SCN/CUDA/UnPooling.h                        +71    -0
sparseconvnet/SCN/CUDA/BatchNormalization.cu  (new file, 0 → 100644)
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "BatchNormalization.h"
#define BN_F_MACRO(N) \
if (nPlanes % N == 0) { \
BatchNormalization_ForwardPass<T, N, 64>( \
input_features.data<T>(), output_features.data<T>(), nPlanes, \
input_stride, output_stride, nActive, saveMean.data<T>(), \
saveInvStd.data<T>(), runningMean.data<T>(), runningVar.data<T>(), \
OptionalTensorData<T>(weight), OptionalTensorData<T>(bias), eps, momentum, \
train, leakiness); \
}
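How the macro is used (an illustration, not part of the commit): BN_F_MACRO(N) expands to a bare if (nPlanes % N == 0) { ... }, so the call sites below chain invocations with else to build an if/else-if ladder that picks the widest thread-block width NTX dividing nPlanes. For example, with nPlanes = 24 the ladder reads:

if (nPlanes % 16 == 0) { /* <T, 16, 64> */ }       // 24 % 16 == 8, skipped
else if (nPlanes % 12 == 0) { /* <T, 12, 64> */ }  // taken: NTX = 12
else if (nPlanes % 8 == 0) { /* <T, 8, 64> */ }
else if (nPlanes % 4 == 0) { /* <T, 4, 64> */ }
else if (nPlanes % 1 == 0) { /* <T, 1, 64> */ }    // always-true fallback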
template <typename T>
void cuda_BatchNormalization_updateOutput(
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features,
    /*cuda float*/ at::Tensor saveMean, /*cuda float*/ at::Tensor saveInvStd,
    /*cuda float*/ at::Tensor runningMean,
    /*cuda float*/ at::Tensor runningVar, /*cuda float*/ at::Tensor weight,
    /*cuda float*/ at::Tensor bias, T eps, T momentum, bool train,
    T leakiness) {
  output_features.resize_as_(input_features);
  if (input_features.ndimension() == 2) {
    auto nActive = input_features.size(0);
    auto nPlanes = input_features.size(1);
    auto input_stride = input_features.stride(0);
    auto output_stride = output_features.stride(0);
    BN_F_MACRO(16)
    else BN_F_MACRO(12)
    else BN_F_MACRO(8)
    else BN_F_MACRO(4)
    else BN_F_MACRO(1)
  }
}
template <typename T>
void cuda_BatchNormalizationInTensor_updateOutput(
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features,
    /*cuda float*/ at::Tensor saveMean, /*cuda float*/ at::Tensor saveInvStd,
    /*cuda float*/ at::Tensor runningMean,
    /*cuda float*/ at::Tensor runningVar, /*cuda float*/ at::Tensor weight,
    /*cuda float*/ at::Tensor bias, T eps, T momentum, bool train,
    T leakiness) {
  if (input_features.ndimension() == 2) {
    auto nActive = input_features.size(0);
    auto nPlanes = input_features.size(1);
    auto input_stride = input_features.stride(0);
    auto output_stride = output_features.stride(0);
    BN_F_MACRO(16)
    else BN_F_MACRO(12)
    else BN_F_MACRO(8)
    else BN_F_MACRO(4)
    else BN_F_MACRO(1)
  }
}
#undef BN_F_MACRO
#define BN_B_MACRO(N) \
if (nPlanes % N == 0) { \
BatchNormalization_BackwardPass<T, N, 64>( \
input_features.data<T>(), d_input_features.data<T>(), \
output_features.data<T>(), d_output_features.data<T>(), nPlanes, \
input_stride, output_stride, nActive, saveMean.data<T>(), \
saveInvStd.data<T>(), runningMean.data<T>(), runningVar.data<T>(), \
OptionalTensorData<T>(weight), OptionalTensorData<T>(bias), \
OptionalTensorData<T>(d_weight), OptionalTensorData<T>(d_bias), leakiness); \
}
template <typename T>
void cuda_BatchNormalization_backward(
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor output_features,
    /*cuda float*/ at::Tensor d_output_features,
    /*cuda float*/ at::Tensor saveMean, /*cuda float*/ at::Tensor saveInvStd,
    /*cuda float*/ at::Tensor runningMean,
    /*cuda float*/ at::Tensor runningVar, /*cuda float*/ at::Tensor weight,
    /*cuda float*/ at::Tensor bias, /*cuda float*/ at::Tensor d_weight,
    /*cuda float*/ at::Tensor d_bias, T leakiness) {
  d_input_features.resize_as_(d_output_features);
  if (input_features.ndimension() == 2) {
    auto nActive = input_features.size(0);
    auto nPlanes = input_features.size(1);
    auto input_stride = input_features.stride(0);
    auto output_stride = output_features.stride(0);
    BN_B_MACRO(16)
    else BN_B_MACRO(12)
    else BN_B_MACRO(8)
    else BN_B_MACRO(4)
    else BN_B_MACRO(1)
  }
}
sparseconvnet/SCN/generic/GPU/BatchNormalization.h → sparseconvnet/SCN/CUDA/BatchNormalization.h  (renamed)
@@ -4,9 +4,9 @@
 // This source code is licensed under the license found in the
 // LICENSE file in the root directory of this source tree.
-#ifndef GPU_BATCHNORMALIZATION_H
-#define GPU_BATCHNORMALIZATION_H
-#include "../SparseConvNet.h"
+#ifndef CUDA_BATCHNORMALIZATION_H
+#define CUDA_BATCHNORMALIZATION_H
+#include <cassert>
 // input_stride and output_stride are normally the same as nPlanes; allow larger
@@ -14,22 +14,22 @@
 // NTX ~ 16 - nPlanes must be a multiple of this
 // NTY ~ 64 - at least 4
-template <typename T, uInt NTX, uInt NTY>
+template <typename T, Int NTX, Int NTY>
 __global__ void
-BatchNormalization_f_train(T *input_features, T *output_features, uInt nPlanes,
-                           uInt input_stride, uInt output_stride, uInt nActive,
+BatchNormalization_f_train(T *input_features, T *output_features, Int nPlanes,
+                           Int input_stride, Int output_stride, Int nActive,
                            T *saveMean, T *saveInvStd, T *runningMean,
                            T *runningVar, T *weight, T *bias, T eps,
                            T momentum, T leakiness) {
   __shared__ T t[NTY][NTX];
   __shared__ T t2[NTY][NTX];
-  for (uInt plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
+  for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
        plane += gridDim.x * NTX) {
     t[threadIdx.y][threadIdx.x] = 0;
     t2[threadIdx.y][threadIdx.x] = 0;
-    for (uInt row = threadIdx.y, c = plane + threadIdx.y * input_stride;
+    for (Int row = threadIdx.y, c = plane + threadIdx.y * input_stride;
          row < nActive; row += NTY, c += input_stride * NTY) {
       T i = input_features[c];
       t[threadIdx.y][threadIdx.x] += i;
@@ -38,7 +38,7 @@ BatchNormalization_f_train(T *input_features, T *output_features, uInt nPlanes,
   __syncthreads();
   T _saveMean = 0;
   T _saveInvStd = 0;
-  for (uInt row = 0; row < NTY; row++) {
+  for (Int row = 0; row < NTY; row++) {
     _saveMean += t[row][threadIdx.x];
     _saveInvStd += t2[row][threadIdx.x];
   }
@@ -65,7 +65,7 @@ BatchNormalization_f_train(T *input_features, T *output_features, uInt nPlanes,
   T W = t[0][threadIdx.x];
   T B = t[1][threadIdx.x];
-  for (uInt row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
+  for (Int row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
             co = plane + threadIdx.y * output_stride;
        row < nActive;
        row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
@@ -75,16 +75,16 @@ BatchNormalization_f_train(T *input_features, T *output_features, uInt nPlanes,
     __syncthreads();
   }
 }
-template <typename T, uInt NTX, uInt NTY>
+template <typename T, Int NTX, Int NTY>
 __global__ void
-BatchNormalization_f_test(T *input_features, T *output_features, uInt nPlanes,
-                          uInt input_stride, uInt output_stride, uInt nActive,
+BatchNormalization_f_test(T *input_features, T *output_features, Int nPlanes,
+                          Int input_stride, Int output_stride, Int nActive,
                           T *saveMean, T *saveInvStd, T *runningMean,
                           T *runningVar, T *weight, T *bias, T eps,
                           T momentum, T leakiness) {
   __shared__ T W[NTX];
   __shared__ T B[NTX];
-  for (uInt plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
+  for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
        plane += gridDim.x * NTX) {
     if (threadIdx.y == 0) {
       W[threadIdx.x] =
@@ -95,7 +95,7 @@ BatchNormalization_f_test(T *input_features, T *output_features, uInt nPlanes,
     __syncthreads();
     float w = W[threadIdx.x], b = B[threadIdx.x];
-    for (uInt row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
+    for (Int row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
              co = plane + threadIdx.y * output_stride;
         row < nActive;
         row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
@@ -106,40 +106,38 @@ BatchNormalization_f_test(T *input_features, T *output_features, uInt nPlanes,
   }
 }
-template <typename T, uInt NTX, uInt NTY>
+template <typename T, Int NTX, Int NTY>
 void BatchNormalization_ForwardPass(T *input_features, T *output_features,
-                                    uInt nPlanes, uInt input_stride,
-                                    uInt output_stride, uInt nActive,
+                                    Int nPlanes, Int input_stride,
+                                    Int output_stride, Int nActive,
                                     T *saveMean, T *saveInvStd, T *runningMean,
                                     T *runningVar, T *weight, T *bias, T eps,
                                     T momentum, bool train, T leakiness) {
   if (train) {
-    BatchNormalization_f_train<T, NTX, NTY><<<std::min((uInt)16, nPlanes / NTX),
-                                              dim3(NTX, NTY), 0,
-                                              THCState_getCurrentStream(state)>>>(
+    BatchNormalization_f_train<T, NTX, NTY><<<std::min((Int)16, nPlanes / NTX),
+                                              dim3(NTX, NTY)>>>(
        input_features, output_features, nPlanes, input_stride, output_stride,
        nActive, saveMean, saveInvStd, runningMean, runningVar, weight, bias,
        eps, momentum, leakiness);
   } else {
-    BatchNormalization_f_test<T, NTX, NTY><<<std::min((uInt)16, nPlanes / NTX),
-                                             dim3(NTX, NTY), 0,
-                                             THCState_getCurrentStream(state)>>>(
+    BatchNormalization_f_test<T, NTX, NTY><<<std::min((Int)16, nPlanes / NTX),
+                                             dim3(NTX, NTY)>>>(
        input_features, output_features, nPlanes, input_stride, output_stride,
        nActive, saveMean, saveInvStd, runningMean, runningVar, weight, bias,
        eps, momentum, leakiness);
   }
 }
-template <typename T, uInt NTX, uInt NTY>
+template <typename T, Int NTX, Int NTY>
 __global__ void BatchNormalization_b(
     T *input_features, T *d_input_features, T *output_features,
-    T *d_output_features, uInt nPlanes, uInt input_stride, uInt output_stride,
-    uInt nActive, T *saveMean,
+    T *d_output_features, Int nPlanes, Int input_stride, Int output_stride,
+    Int nActive, T *saveMean,
     T *saveInvStd, T *runningMean, T *runningVar, T *weight, T *bias,
     T *d_weight, T *d_bias, T leakiness) {
   __shared__ T t[NTY][NTX];
   __shared__ T t2[NTY][NTX];
-  for (uInt plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
+  for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
        plane += gridDim.x * NTX) {
     if (threadIdx.y == 0) {
       t[0][threadIdx.x] = saveMean[plane];
@@ -153,7 +151,7 @@ BatchNormalization_b(T *input_features, T *d_input_features, T *output_features,
     __syncthreads();
     t[threadIdx.y][threadIdx.x] = 0;
     t2[threadIdx.y][threadIdx.x] = 0;
-    for (uInt row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
+    for (Int row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
              co = plane + threadIdx.y * output_stride;
         row < nActive;
         row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
@@ -180,7 +178,7 @@ BatchNormalization_b(T *input_features, T *d_input_features, T *output_features,
     T k = dotp * _saveInvStd * _saveInvStd / nActive;
-    for (uInt row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
+    for (Int row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
             co = plane + threadIdx.y * output_stride;
        row < nActive;
        row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
@@ -192,17 +190,16 @@ BatchNormalization_b(T *input_features, T *d_input_features, T *output_features,
   }
 }
-template <typename T, uInt NTX, uInt NTY>
+template <typename T, Int NTX, Int NTY>
 void BatchNormalization_BackwardPass(T *input_features, T *d_input_features,
                                      T *output_features, T *d_output_features,
-                                     uInt nPlanes, uInt input_stride,
-                                     uInt output_stride, uInt nActive,
+                                     Int nPlanes, Int input_stride,
+                                     Int output_stride, Int nActive,
                                      T *saveMean, T *saveInvStd,
                                      T *runningMean, T *runningVar, T *weight,
                                      T *bias, T *d_weight, T *d_bias,
                                      T leakiness) {
-  BatchNormalization_b<T, NTX, NTY><<<std::min((uInt)16, nPlanes / NTX),
-                                      dim3(NTX, NTY), 0,
-                                      THCState_getCurrentStream(state)>>>(
+  BatchNormalization_b<T, NTX, NTY><<<std::min((Int)16, nPlanes / NTX),
+                                      dim3(NTX, NTY)>>>(
      input_features, d_input_features, output_features, d_output_features,
      nPlanes, input_stride, output_stride, nActive, saveMean, saveInvStd,
      runningMean, runningVar, weight, bias, d_weight, d_bias, leakiness);
@@ -210,4 +207,4 @@ void BatchNormalization_BackwardPass(T *input_features, T *d_input_features,
 #undef NTX
 #undef NTY
-#endif /* GPU_BATCHNORMALIZATION_H */
+#endif /* CUDA_BATCHNORMALIZATION_H */
sparseconvnet/SCN/generic/GPU/BatchwiseMultiplicativeDropout.cu → sparseconvnet/SCN/CUDA/BatchwiseMultiplicativeDropout.cu  (renamed)
@@ -4,32 +4,28 @@
 // This source code is licensed under the license found in the
 // LICENSE file in the root directory of this source tree.
-#ifndef TH_GENERIC_FILE
-#define TH_GENERIC_FILE "generic/GPU/BatchwiseMultiplicativeDropout.cu"
-#else
+#include "BatchwiseMultiplicativeDropout.h"
 #define SPARSECONVNET_FOO(NTX, NTY)                                           \
   {                                                                           \
     if (nPlanes % NTX == 0) {                                                 \
-      BatchwiseMultiplicativeDropout_fp<real, NTX, NTY> << <                  \
-          dim3(std::min(16L, nPlanes / NTX), 16), dim3(NTX, NTY), 0,          \
-          THCState_getCurrentStream(state)>>>                                 \
-          (THCTensor_(data)(state, input_features),                           \
-           THCTensor_(data)(state, output_features),                          \
-           THCTensor_(data)(state, noise), nActive, nPlanes, nPlanes,         \
-           nPlanes, alpha);                                                   \
+      BatchwiseMultiplicativeDropout_fp<                                      \
+          T, NTX,                                                             \
+          NTY><<<dim3(std::min(16L, nPlanes / NTX), 16), dim3(NTX, NTY)>>>(   \
+          input_features.data<T>(), output_features.data<T>(),                \
+          noise.data<T>(), nActive, nPlanes, nPlanes, nPlanes, alpha);        \
       return;                                                                 \
     }                                                                         \
   }
-extern "C" void scn_R_(BatchwiseMultiplicativeDropout_updateOutput)(
-    THCTensor *input_features, THCTensor *output_features, THCTensor *noise,
-    float alpha) {
-  if (input_features != output_features)
-    THCTensor_(resizeAs)(state, output_features, input_features);
-  auto nActive = input_features->size[0];
-  auto nPlanes = input_features->size[1];
+template <typename T>
+void cuda_BatchwiseMultiplicativeDropout_updateOutput(
+    /*cuda float*/ at::Tensor input_features,
+    /*cuda float*/ at::Tensor output_features,
+    /*cuda float*/ at::Tensor noise, float alpha) {
+  output_features.resize_as_(input_features);
+  auto nActive = input_features.size(0);
+  auto nPlanes = input_features.size(1);
   SPARSECONVNET_FOO(32, 32)
   SPARSECONVNET_FOO(24, 32)
   SPARSECONVNET_FOO(16, 64)
@@ -43,24 +39,24 @@ extern "C" void scn_R_(BatchwiseMultiplicativeDropout_updateGradInput)(
 #define SPARSECONVNET_FOO(NTX, NTY)                                           \
   {                                                                           \
     if (nPlanes % NTX == 0) {                                                 \
-      BatchwiseMultiplicativeDropout_bp<real, NTX, NTY> << <                  \
-          dim3(std::min(16L, nPlanes / NTX), 16), dim3(NTX, NTY), 0,          \
-          THCState_getCurrentStream(state)>>>                                 \
-          (THCTensor_(data)(state, input_features),                           \
-           THCTensor_(data)(state, d_input_features),                         \
-           THCTensor_(data)(state, d_output_features),                        \
-           THCTensor_(data)(state, noise), nActive, nPlanes, nPlanes,         \
-           nPlanes, alpha);                                                   \
+      BatchwiseMultiplicativeDropout_bp<                                      \
+          T, NTX,                                                             \
+          NTY><<<dim3(std::min(16L, nPlanes / NTX), 16), dim3(NTX, NTY)>>>(   \
+          input_features.data<T>(), d_input_features.data<T>(),               \
+          d_output_features.data<T>(), noise.data<T>(), nActive, nPlanes,     \
+          nPlanes, nPlanes, alpha);                                           \
      return;                                                                  \
    }                                                                          \
  }
-extern "C" void scn_R_(BatchwiseMultiplicativeDropout_updateGradInput)(
-    THCTensor *input_features, THCTensor *d_input_features,
-    THCTensor *d_output_features, THCTensor *noise, float alpha) {
-  if (d_input_features != d_output_features)
-    THCTensor_(resizeAs)(state, d_input_features, d_output_features);
-  auto nActive = input_features->size[0];
-  auto nPlanes = input_features->size[1];
+template <typename T>
+void cuda_BatchwiseMultiplicativeDropout_updateGradInput(
+    /*cuda float*/ at::Tensor input_features,
+    /*cuda float*/ at::Tensor d_input_features,
+    /*cuda float*/ at::Tensor d_output_features,
+    /*cuda float*/ at::Tensor noise, float alpha) {
+  d_input_features.resize_as_(d_output_features);
+  auto nActive = input_features.size(0);
+  auto nPlanes = input_features.size(1);
   SPARSECONVNET_FOO(32, 32)
   SPARSECONVNET_FOO(24, 32)
@@ -71,5 +67,3 @@ extern "C" void scn_R_(BatchwiseMultiplicativeDropout_updateGradInput)(
   SPARSECONVNET_FOO(1, 64)
 }
 #undef SPARSECONVNET_FOO
-#endif
sparseconvnet/SCN/generic/GPU/BatchwiseMultiplicativeDropout.h → sparseconvnet/SCN/CUDA/BatchwiseMultiplicativeDropout.h  (renamed)
@@ -4,50 +4,50 @@
 // This source code is licensed under the license found in the
 // LICENSE file in the root directory of this source tree.
-#ifndef GPU_BATCHWISEMULTIPLICATIVEDROPOUT_H
-#define GPU_BATCHWISEMULTIPLICATIVEDROPOUT_H
-template <typename T, uInt NTX, uInt NTY>
+#ifndef CUDA_BATCHWISEMULTIPLICATIVEDROPOUT_H
+#define CUDA_BATCHWISEMULTIPLICATIVEDROPOUT_H
+template <typename T, Int NTX, Int NTY>
 __global__ void BatchwiseMultiplicativeDropout_fp(T *input_features,
                                                   T *output_features, T *noise,
-                                                  uInt nActive, uInt nPlanes,
-                                                  uInt input_stride,
-                                                  uInt output_stride, T alpha) {
+                                                  Int nActive, Int nPlanes,
+                                                  Int input_stride,
+                                                  Int output_stride, T alpha) {
   __shared__ T nz[NTX];
-  for (uInt plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
+  for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
        plane += gridDim.x * NTX) {
     if (threadIdx.y == 0)
       nz[threadIdx.x] = noise[plane];
     __syncthreads();
-    for (uInt row = threadIdx.y + blockIdx.y * NTY; row < nActive;
+    for (Int row = threadIdx.y + blockIdx.y * NTY; row < nActive;
          row += gridDim.y * NTY) {
-      uInt i = row * input_stride + plane;
-      uInt o = row * output_stride + plane;
+      Int i = row * input_stride + plane;
+      Int o = row * output_stride + plane;
       output_features[o] = input_features[i] * nz[threadIdx.x] *
                            ((input_features[i] > 0) ? 1 : alpha);
     }
    __syncthreads();
  }
 }
-template <typename T, uInt NTX, uInt NTY>
+template <typename T, Int NTX, Int NTY>
 __global__ void BatchwiseMultiplicativeDropout_bp(
     T *input_features, T *d_input_features,
-    T *d_output_features, T *noise, uInt nActive, uInt nPlanes,
-    uInt input_stride, uInt output_stride, T alpha) {
+    T *d_output_features, T *noise, Int nActive, Int nPlanes,
+    Int input_stride, Int output_stride, T alpha) {
   __shared__ T nz[NTX];
-  for (uInt plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
+  for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
        plane += gridDim.x * NTX) {
     if (threadIdx.y == 0)
       nz[threadIdx.x] = noise[plane];
     __syncthreads();
-    for (uInt row = threadIdx.y + blockIdx.y * NTY; row < nActive;
+    for (Int row = threadIdx.y + blockIdx.y * NTY; row < nActive;
         row += gridDim.y * NTY) {
-      uInt i = row * input_stride + plane;
-      uInt o = row * output_stride + plane;
+      Int i = row * input_stride + plane;
+      Int o = row * output_stride + plane;
       d_input_features[i] = d_output_features[o] * nz[threadIdx.x] *
                             ((input_features[i] > 0) ? 1 : alpha);
     }
     __syncthreads();
   }
 }
-#endif /* GPU_BATCHWISEMULTIPLICATIVEDROPOUT_H */
+#endif /* CUDA_BATCHWISEMULTIPLICATIVEDROPOUT_H */
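What the two kernels compute, stated compactly (a reading of the code above, not text from the commit):

// Forward:  out[r][p]  = in[r][p]   * noise[p] * (in[r][p] > 0 ? 1 : alpha)
// Backward: d_in[r][p] = d_out[r][p] * noise[p] * (in[r][p] > 0 ? 1 : alpha)
// i.e. a leaky ReLU (negative-side slope alpha) fused with a per-plane
// multiplicative noise factor; each block caches its NTX-wide slice of
// noise[] in shared memory (nz) before sweeping the active rows.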
sparseconvnet/SCN/CUDA/Convolution.cu  (new file, 0 → 100644)
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "Convolution.h"
#include "RuleBookIterator.h"
template <typename T, Int Dimension>
double cuda_Convolution_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize, /*long*/ at::Tensor filterStride,
    Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features,
    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor bias) {
  auto _rules =
      m.getRuleBook(inputSize, outputSize, filterSize, filterStride, true);
  Int nActive = m.getNActive(outputSize);
  output_features.resize_({nActive, weight.size(2)});
  if (not bias.numel())
    output_features.zero_();
  double flops = 0;
  if (nActive) {
    auto iF = input_features.data<T>();
    auto oF = output_features.data<T>();
    Int ip = input_features.size(1);
    Int op = output_features.size(1);
    auto w = weight.data<T>();
    if (bias.numel()) {
      auto b = bias.data<T>();
      for (Int i = 0; i < op; i += 32) {
        Int blockDim = min((Int)32, op - i);
        Int gridDim = min((Int)4096, nActive);
        Convolution_fp_bias<<<gridDim, blockDim>>>(oF + i, b + i, op, op,
                                                   nActive);
      }
    }
    Int c = ip * op;
    RULEBOOKITERATOR(dConvolution_forward2<T>(iF, oF, w, rbB, nHotB, ip, ip,
                                              op, op);
                     , w += c; flops += nHotB * c;)
  }
  return flops;
}
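RULEBOOKITERATOR comes from the new RuleBookIterator.h (+34 lines, not displayed on this page), so the following is only an inferred sketch of the iteration pattern the call sites imply; every name beyond _rules, rbB and nHotB is an assumption:

#include <cstdint>
#include <vector>
using Int = int32_t; // assumption about SCN's Int typedef

// Hypothetical host-side illustration of RULEBOOKITERATOR(body, increment):
// visit the rulebook one filter offset at a time, run the body on that
// offset's flattened (input,output) index pairs, then advance per-offset
// state such as the weight pointer.
template <typename Body>
void iterate_rulebook_sketch(const std::vector<std::vector<Int>> &_rules,
                             float *&w, Int c, double &flops, Body body) {
  for (const auto &r : _rules) {
    Int nHotB = (Int)r.size() / 2;  // number of (input,output) pairs
    if (nHotB > 0) {
      const Int *rbB = r.data();    // the real code stages this on the GPU
      body(rbB, nHotB);             // e.g. launch dConvolution_forward2<T>
    }
    w += c;                         // next offset's ip*op weight block
    flops += (double)nHotB * c;     // the increment fragment at the call site
  }
}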
template <typename T, Int Dimension>
void cuda_Convolution_backward(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize, /*long*/ at::Tensor filterStride,
    Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features,
    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor d_weight,
    /*cuda float*/ at::Tensor d_bias) {
  auto _rules =
      m.getRuleBook(inputSize, outputSize, filterSize, filterStride, true);
  Int nActive = m.getNActive(outputSize);
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();
  if (nActive) {
    auto iF = input_features.data<T>();
    auto diF = d_input_features.data<T>();
    auto doF = d_output_features.data<T>();
    Int ip = input_features.size(1);
    Int op = d_output_features.size(1);
    auto w = weight.data<T>();
    auto dw = d_weight.data<T>();
    Int c = ip * op;
    RULEBOOKITERATOR(dConvolution_backward_dW2<T>(iF, diF, doF, w, dw, rbB,
                                                  nHotB, ip, ip, op, op);
                     , w += c; dw += c;)
    if (d_bias.numel()) {
      auto db = d_bias.data<T>();
      Convolution_bp_bias(doF, db, op, op, nActive);
    }
  }
}
template <typename T, Int Dimension>
double cuda_SubmanifoldConvolution_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor filterSize,
    Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features,
    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor bias) {
  auto _rules = m.getSubmanifoldRuleBook(inputSize, filterSize, true);
  Int nActive = m.getNActive(inputSize);
  output_features.resize_({nActive, weight.size(2)});
  if (bias.numel() and nActive)
    output_features.copy_(bias);
  else
    output_features.zero_();
  double flops = 0;
  if (nActive) {
    auto iF = input_features.data<T>();
    auto oF = output_features.data<T>();
    Int ip = input_features.size(1);
    Int op = output_features.size(1);
    auto w = weight.data<T>();
    // if (bias.numel()) {
    //   auto b = bias.data<T>();
    //   for (Int i = 0; i < op; i += 32) {
    //     Int blockDim = min((Int)32, op - i);
    //     Int gridDim = min((Int)4096, nActive);
    //     Convolution_fp_bias<<<gridDim, blockDim>>>(oF + i, b + i, op, op,
    //     nActive);
    //   }
    // }
    Int c = ip * op;
    RULEBOOKITERATOR(dConvolution_forward2<T>(iF, oF, w, rbB, nHotB, ip, ip,
                                              op, op);
                     , w += c; flops += nHotB * c;)
  }
  return flops;
}
template <typename T, Int Dimension>
void cuda_SubmanifoldConvolution_backward(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor filterSize,
    Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features,
    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor d_weight,
    /*cuda float*/ at::Tensor d_bias) {
  auto _rules = m.getSubmanifoldRuleBook(inputSize, filterSize, true);
  Int nActive = m.getNActive(inputSize);
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();
  if (nActive) {
    auto iF = input_features.data<T>();
    auto diF = d_input_features.data<T>();
    auto doF = d_output_features.data<T>();
    Int ip = input_features.size(1);
    Int op = d_output_features.size(1);
    auto w = weight.data<T>();
    auto dw = d_weight.data<T>();
    Int c = ip * op;
    RULEBOOKITERATOR(dConvolution_backward_dW2<T>(iF, diF, doF, w, dw, rbB,
                                                  nHotB, ip, ip, op, op);
                     , w += c; dw += c;)
    if (d_bias.numel()) {
      auto db = d_bias.data<T>();
      Convolution_bp_bias(doF, db, op, op, nActive);
    }
  }
}
template <typename T, Int Dimension>
double cuda_FullConvolution_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize, /*long*/ at::Tensor filterStride,
    Metadata<Dimension> &mIn, Metadata<Dimension> &mOut,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features,
    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor bias) {
  auto _rules = mIn.getFullConvolutionRuleBook(inputSize, outputSize,
                                               filterSize, filterStride, mOut);
  Int nActive = mOut.getNActive(outputSize);
  output_features.resize_({nActive, weight.size(2)});
  if (not bias.numel())
    output_features.zero_();
  double flops = 0;
  if (nActive) {
    auto iF = input_features.data<T>();
    auto oF = output_features.data<T>();
    Int ip = input_features.size(1);
    Int op = output_features.size(1);
    auto w = weight.data<T>();
    if (bias.numel()) {
      auto b = bias.data<T>();
      for (Int i = 0; i < op; i += 32) {
        Int blockDim = min((Int)32, op - i);
        Int gridDim = min((Int)4096, nActive);
        Convolution_fp_bias<<<gridDim, blockDim>>>(oF + i, b + i, op, op,
                                                   nActive);
      }
    }
    Int c = ip * op;
    RULEBOOKITERATOR(dConvolution_forward2<T>(iF, oF, w, rbB, nHotB, ip, ip,
                                              op, op);
                     , w += c; flops += nHotB * c;)
  }
  return flops;
}
template <typename T, Int Dimension>
void cuda_FullConvolution_backward(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize, /*long*/ at::Tensor filterStride,
    Metadata<Dimension> &mIn, Metadata<Dimension> &mOut,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features,
    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor d_weight,
    /*cuda float*/ at::Tensor d_bias) {
  auto _rules = mIn.getFullConvolutionRuleBook(inputSize, outputSize,
                                               filterSize, filterStride, mOut);
  Int nActive = mOut.getNActive(outputSize);
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();
  if (nActive) {
    auto iF = input_features.data<T>();
    auto diF = d_input_features.data<T>();
    auto doF = d_output_features.data<T>();
    Int ip = input_features.size(1);
    Int op = d_output_features.size(1);
    auto w = weight.data<T>();
    auto dw = d_weight.data<T>();
    Int c = ip * op;
    RULEBOOKITERATOR(dConvolution_backward_dW2<T>(iF, diF, doF, w, dw, rbB,
                                                  nHotB, ip, ip, op, op);
                     , w += c; dw += c;)
    if (d_bias.numel()) {
      auto db = d_bias.data<T>();
      Convolution_bp_bias(doF, db, op, op, nActive);
    }
  }
}
template <typename T, Int Dimension>
double cuda_RandomizedStrideConvolution_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize, /*long*/ at::Tensor filterStride,
    Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features,
    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor bias) {
  auto _rules = m.getRandomizedStrideRuleBook(inputSize, outputSize,
                                              filterSize, filterStride, true);
  Int nActive = m.getNActive(outputSize);
  output_features.resize_({nActive, weight.size(2)});
  if (not bias.numel())
    output_features.zero_();
  double flops = 0;
  if (nActive) {
    auto iF = input_features.data<T>();
    auto oF = output_features.data<T>();
    Int ip = input_features.size(1);
    Int op = output_features.size(1);
    auto w = weight.data<T>();
    if (bias.numel()) {
      auto b = bias.data<T>();
      for (Int i = 0; i < op; i += 32) {
        Int blockDim = min((Int)32, op - i);
        Int gridDim = min((Int)4096, nActive);
        Convolution_fp_bias<<<gridDim, blockDim>>>(oF + i, b + i, op, op,
                                                   nActive);
      }
    }
    Int c = ip * op;
    RULEBOOKITERATOR(dConvolution_forward2<T>(iF, oF, w, rbB, nHotB, ip, ip,
                                              op, op);
                     , w += c; flops += nHotB * c;)
  }
  return flops;
}
template <typename T, Int Dimension>
void cuda_RandomizedStrideConvolution_backward(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize, /*long*/ at::Tensor filterStride,
    Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features,
    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor d_weight,
    /*cuda float*/ at::Tensor d_bias) {
  auto _rules = m.getRandomizedStrideRuleBook(inputSize, outputSize,
                                              filterSize, filterStride, true);
  Int nActive = m.getNActive(outputSize);
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();
  if (nActive) {
    auto iF = input_features.data<T>();
    auto diF = d_input_features.data<T>();
    auto doF = d_output_features.data<T>();
    Int ip = input_features.size(1);
    Int op = d_output_features.size(1);
    auto w = weight.data<T>();
    auto dw = d_weight.data<T>();
    Int c = ip * op;
    RULEBOOKITERATOR(dConvolution_backward_dW2<T>(iF, diF, doF, w, dw, rbB,
                                                  nHotB, ip, ip, op, op);
                     , w += c; dw += c;)
    if (d_bias.numel()) {
      auto db = d_bias.data<T>();
      Convolution_bp_bias(doF, db, op, op, nActive);
    }
  }
}
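For orientation (a summary of the file above, not part of the diff): the six entry points differ essentially only in which rulebook they fetch before running the same bias + RULEBOOKITERATOR machinery:

cuda_Convolution_*                  m.getRuleBook(inputSize, outputSize, filterSize, filterStride, true)
cuda_SubmanifoldConvolution_*       m.getSubmanifoldRuleBook(inputSize, filterSize, true)  (nActive from inputSize)
cuda_FullConvolution_*              mIn.getFullConvolutionRuleBook(inputSize, outputSize, filterSize, filterStride, mOut)
cuda_RandomizedStrideConvolution_*  m.getRandomizedStrideRuleBook(inputSize, outputSize, filterSize, filterStride, true)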
sparseconvnet/SCN/generic/GPU/Convolution.h → sparseconvnet/SCN/CUDA/Convolution.h  (renamed)
@@ -4,47 +4,47 @@
 // This source code is licensed under the license found in the
 // LICENSE file in the root directory of this source tree.
-#ifndef GPU_CONVOLUTION_H
-#define GPU_CONVOLUTION_H
-#include "../SparseConvNet.h"
+#ifndef CUDA_CONVOLUTION_H
+#define CUDA_CONVOLUTION_H
 template <typename T>
-__global__ void Convolution_fp_bias(T *output_features, T *bias, uInt nPlanes,
-                                    uInt output_stride, uInt nActive) {
+__global__ void Convolution_fp_bias(T *output_features, T *bias, Int nPlanes,
+                                    Int output_stride, Int nActive) {
   __shared__ T b[32];
   b[threadIdx.x] = bias[threadIdx.x];
-  for (uInt row = blockIdx.x; row < nActive; row += 1 << 12) {
+  for (Int row = blockIdx.x; row < nActive; row += 1 << 12) {
     output_features[row * output_stride + threadIdx.x] = b[threadIdx.x];
   }
 }
 template <typename T>
-__global__ void dColumnSum(T *matrix, T *target, uInt nRows, uInt nColumns,
-                           uInt nCOLUMNS) {
-  uInt i = blockIdx.x * 32 + threadIdx.x;
+__global__ void dColumnSum(T *matrix, T *target, Int nRows, Int nColumns,
+                           Int nCOLUMNS) {
+  Int i = blockIdx.x * 32 + threadIdx.x;
   T t = 0;
-  for (uInt j = blockIdx.y; j < nRows; j += 32)
+  for (Int j = blockIdx.y; j < nRows; j += 32)
     t += matrix[j * nCOLUMNS + i];
   atomicAdd(&target[i], t);
 }
 template <typename T>
-void Convolution_bp_bias(T *matrix, T *target, uInt nRows, uInt nColumns,
-                         uInt nCOLUMNS, cudaStream_t stream) {
+void Convolution_bp_bias(T *matrix, T *target, Int nRows, Int nColumns,
+                         Int nCOLUMNS) {
   if (nColumns / 32 > 0)
-    dColumnSum << <dim3(nColumns / 32, 32), 32, 0, stream>>>
-        (matrix, target, nRows, nColumns, nCOLUMNS);
+    dColumnSum<<<dim3(nColumns / 32, 32), 32>>>(matrix, target, nRows,
+                                                nColumns, nCOLUMNS);
   if (nColumns % 32 > 0) {
-    uInt o = nColumns / 32 * 32;
-    dColumnSum << <dim3(1, 32), nColumns - o, 0, stream>>>
-        (matrix + o, target + o, nRows, nColumns, nCOLUMNS);
+    Int o = nColumns / 32 * 32;
+    dColumnSum<<<dim3(1, 32), nColumns - o>>>(matrix + o, target + o, nRows,
+                                              nColumns, nCOLUMNS);
   }
 }
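A worked example of the column split in Convolution_bp_bias (illustration, not from the commit):

// nColumns = 70:
//   70 / 32 = 2 -> dColumnSum<<<dim3(2, 32), 32>>> covers columns 0..63
//   70 % 32 = 6 -> dColumnSum<<<dim3(1, 32), 6>>> on matrix + 64, target + 64,
//                  covering the remaining columns 64..69
// In each launch the 32 blocks along gridDim.y sum rows j, j+32, j+64, ...
// and merge their partial sums into target[i] with atomicAdd, so the
// function accumulates into target rather than overwriting it.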
-template <typename T, uInt K, uInt V>
+template <typename T, Int K, Int V>
 __global__ void
-dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, uInt *rules,
-                            uInt nHot, uInt input_nPlanes, uInt input_stride,
-                            uInt output_nPlanes, uInt output_stride) {
+dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules,
+                            Int nHot, Int input_nPlanes, Int input_stride,
+                            Int output_nPlanes, Int output_stride) {
   // nHot must be a multiple of K!!
   // Input x Weight -> Output
@@ -53,17 +53,17 @@ dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, uInt *rules,
   // nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
-  uInt M = input_nPlanes / K;
+  Int M = input_nPlanes / K;
   // N = gridDim.y == output_nPlanes/K
-  uInt n = blockIdx.y;
+  Int n = blockIdx.y;
   outFeatures += n * K;
   w += n * K;
   T O[V];
   __shared__ T W[K][K];
   __shared__ T I[K][K];
-  uInt R0[V];
-  uInt R1[V];
+  Int R0[V];
+  Int R1[V];
   const int tx = threadIdx.x;
   int ty[V];
 #pragma unroll
@@ -76,7 +76,7 @@ dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, uInt *rules,
   for (int v = 0; v < V; v++)
     W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
-  for (uInt s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
+  for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
 #pragma unroll
     for (int v = 0; v < V; v++) {
       R0[v] = rules[2 * (s + ty[v])];
@@ -110,28 +110,28 @@ dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, uInt *rules,
     inFeatures += K;
   }
 }
-template <typename T, uInt K, uInt V>
+template <typename T, Int K, Int V>
 __global__ void
-dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, uInt *rules,
-                            uInt nHot, uInt input_nPlanes, uInt input_stride,
-                            uInt output_nPlanes, uInt output_stride) {
+dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules,
+                            Int nHot, Int input_nPlanes, Int input_stride,
+                            Int output_nPlanes, Int output_stride) {
   // Input x Weight -> Output
   // blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
   // K is a multiple of V,
   // nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
-  uInt M = input_nPlanes / K;
+  Int M = input_nPlanes / K;
   // N = gridDim.y == output_nPlanes/K
-  uInt n = blockIdx.y;
+  Int n = blockIdx.y;
   outFeatures += n * K;
   w += n * K;
   T O[V];
   __shared__ T W[K][K];
   __shared__ T I[K][K];
-  uInt R0[V];
-  uInt R1[V];
+  Int R0[V];
+  Int R1[V];
   const int tx = threadIdx.x;
   int ty[V];
 #pragma unroll
@@ -144,7 +144,7 @@ dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, uInt *rules,
   for (int v = 0; v < V; v++)
     W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
-  for (uInt s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
+  for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
 #pragma unroll
     for (int v = 0; v < V; v++) {
       if (s + ty[v] < nHot) {
@@ -187,27 +187,27 @@ dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, uInt *rules,
 #define FOO(T, K, V)                                                          \
   {                                                                           \
     if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {                 \
-      uInt o = (nHot / K) * K;                                                \
+      Int o = (nHot / K) * K;                                                 \
       if (o >= K)                                                             \
-        dConvolution_KMxKN_forwardA<T, K, V> << <                             \
-            dim3(std::min(o / K, (uInt)512), output_nPlanes / K),             \
-            dim3(K, K / V), 0, stream>>>                                      \
-            (inFeatures, outFeatures, w, rules, o, input_nPlanes,             \
-             input_stride, output_nPlanes, output_stride);                    \
+        dConvolution_KMxKN_forwardA<                                          \
+            T, K, V><<<dim3(std::min(o / K, (Int)512), output_nPlanes / K),   \
+                       dim3(K, K / V)>>>(                                     \
+            inFeatures, outFeatures, w, rules, o, input_nPlanes,              \
+            input_stride, output_nPlanes, output_stride);                     \
       if (nHot > o)                                                           \
-        dConvolution_KMxKN_forwardB<T, K, V> << <dim3(1, output_nPlanes / K), \
-                                                 dim3(K, K / V), 0, stream>>> \
-            (inFeatures, outFeatures, w, rules + 2 * o, nHot - o,             \
-             input_nPlanes, input_stride, output_nPlanes, output_stride);     \
+        dConvolution_KMxKN_forwardB<                                          \
+            T, K,                                                             \
+            V><<<dim3(1, output_nPlanes / K), dim3(K, K / V)>>>(              \
+            inFeatures, outFeatures, w, rules + 2 * o, nHot - o,              \
+            input_nPlanes, input_stride, output_nPlanes, output_stride);      \
       return;                                                                 \
     }                                                                         \
   }
 template <typename T>
-void dConvolution_forward(T *inFeatures, T *outFeatures, T *w, uInt *rules,
-                          uInt nHot, uInt input_nPlanes, uInt input_stride,
-                          uInt output_nPlanes, uInt output_stride,
-                          cudaStream_t stream) {
+void dConvolution_forward(T *inFeatures, T *outFeatures, T *w, Int *rules,
+                          Int nHot, Int input_nPlanes, Int input_stride,
+                          Int output_nPlanes, Int output_stride) {
   FOO(T, 64, 16)
   FOO(T, 32, 8)
   FOO(T, 16, 4)
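How the FOO ladder picks a tile (worked example, not from the commit), for dConvolution_forward with input_nPlanes = 128, output_nPlanes = 96, nHot = 1000:

// FOO(T, 64, 16): 96 % 64 != 0                -> no match, fall through
// FOO(T, 32, 8):  128 % 32 == 0, 96 % 32 == 0 -> K = 32, V = 8
//   o = (1000 / 32) * 32 = 992 -> forwardA handles rows 0..991 (it assumes
//                                 nHot is a multiple of K, hence the rounding)
//   nHot > o                   -> forwardB mops up the 8 leftover rows with
//                                 its per-row bounds checks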
@@ -216,10 +216,9 @@ void dConvolution_forward(T *inFeatures, T *outFeatures, T *w, uInt *rules,
 }
 template <>
 void dConvolution_forward<double>(double *inFeatures, double *outFeatures,
-                                  double *w, uInt *rules, uInt nHot,
-                                  uInt input_nPlanes, uInt input_stride,
-                                  uInt output_nPlanes, uInt output_stride,
-                                  cudaStream_t stream) {
+                                  double *w, Int *rules, Int nHot,
+                                  Int input_nPlanes, Int input_stride,
+                                  Int output_nPlanes, Int output_stride) {
   FOO(double, 32, 8)
   FOO(double, 16, 4)
   FOO(double, 8, 2)
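A plausible reason the double specialization starts at K = 32 rather than 64 (an inference; the commit does not say): the kernels stage K×K tiles in shared memory, and at K = 64 a single tile of doubles is 64 × 64 × 8 B = 32 KiB, so the two tiles forwardA keeps (W and I) would already exceed the 48 KiB of shared memory per block typically available by default; at K = 32 each tile is only 8 KiB.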
@@ -230,15 +229,15 @@ void dConvolution_forward<double>(double *inFeatures, double *outFeatures,
 // dOutput x W^T -> dInput and
 // Input^T x dOutput -> dW
 // blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
-template <typename T, uInt K, uInt V>
+template <typename T, Int K, Int V>
 __global__ void dConvolution_KMxKN_backward_dW_A(
     T *inFeatures, T *dInFeatures, T *dOutFeatures,
-    T *w, T *dw, uInt *rules, uInt nHot, uInt input_nPlanes,
-    uInt input_stride, uInt output_nPlanes, uInt output_stride) {
+    T *w, T *dw, Int *rules, Int nHot, Int input_nPlanes,
+    Int input_stride, Int output_nPlanes, Int output_stride) {
   // M = gridDim.y == input_nPlanes / K
-  uInt N = output_nPlanes / K;
-  uInt m = blockIdx.y;
+  Int N = output_nPlanes / K;
+  Int m = blockIdx.y;
   inFeatures += m * K;
   dInFeatures += m * K;
   w += m * K * output_nPlanes;
@@ -249,8 +248,8 @@ dConvolution_KMxKN_backward_dW_A(T *inFeatures, T *dInFeatures, T *dOutFeatures,
   __shared__ T I[K][K];
   __shared__ T dO[K][K];
   __shared__ T W[K][K];
-  uInt R0[V];
-  uInt R1[V];
+  Int R0[V];
+  Int R1[V];
   const int tx = threadIdx.x;
   int ty[V];
 #pragma unroll
@@ -265,7 +264,7 @@ dConvolution_KMxKN_backward_dW_A(T *inFeatures, T *dInFeatures, T *dOutFeatures,
     dW[v] = 0;
   }
-  for (uInt s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
+  for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
 #pragma unroll
     for (int v = 0; v < V; v++) {
       R0[v] = rules[2 * (s + ty[v])];
@@ -307,15 +306,15 @@ dConvolution_KMxKN_backward_dW_A(T *inFeatures, T *dInFeatures, T *dOutFeatures,
 // dOutput x W^T -> dInput and
 // Input^T x dOutput -> dW
 // blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
-template <typename T, uInt K, uInt V>
+template <typename T, Int K, Int V>
 __global__ void dConvolution_KMxKN_backward_dW_B(
     T *inFeatures, T *dInFeatures, T *dOutFeatures,
-    T *w, T *dw, uInt *rules, uInt nHot, uInt input_nPlanes,
-    uInt input_stride, uInt output_nPlanes, uInt output_stride) {
+    T *w, T *dw, Int *rules, Int nHot, Int input_nPlanes,
+    Int input_stride, Int output_nPlanes, Int output_stride) {
   // M = gridDim.y == input_nPlanes / K
-  uInt N = output_nPlanes / K;
-  uInt m = blockIdx.y;
+  Int N = output_nPlanes / K;
+  Int m = blockIdx.y;
   inFeatures += m * K;
   dInFeatures += m * K;
   w += m * K * output_nPlanes;
@@ -326,8 +325,8 @@ dConvolution_KMxKN_backward_dW_B(T *inFeatures, T *dInFeatures, T *dOutFeatures,
   __shared__ T I[K][K];
   __shared__ T dO[K][K];
   __shared__ T W[K][K];
-  uInt R0[V];
-  uInt R1[V];
+  Int R0[V];
+  Int R1[V];
   const int tx = threadIdx.x;
   int ty[V];
 #pragma unroll
@@ -342,7 +341,7 @@ dConvolution_KMxKN_backward_dW_B(T *inFeatures, T *dInFeatures, T *dOutFeatures,
     dW[v] = 0;
   }
-  for (uInt s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
+  for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
 #pragma unroll
     for (int v = 0; v < V; v++) {
      if (s + ty[v] < nHot) {
@@ -392,29 +391,29 @@ dConvolution_KMxKN_backward_dW_B(T *inFeatures, T *dInFeatures, T *dOutFeatures,
 #define FOO(T, K, V)                                                          \
   {                                                                           \
     if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {                 \
-      uInt o = (nHot / K) * K;                                                \
+      Int o = (nHot / K) * K;                                                 \
       if (o >= K)                                                             \
-        dConvolution_KMxKN_backward_dW_A<T, K, V> << <                        \
-            dim3(std::min(o / K, (uInt)512), input_nPlanes / K),              \
-            dim3(K, K / V), 0, stream>>>                                      \
-            (inFeatures, dInFeatures, dOutFeatures, w, dw, rules, o,          \
-             input_nPlanes, input_stride, output_nPlanes, output_stride);     \
+        dConvolution_KMxKN_backward_dW_A<                                     \
+            T, K, V><<<dim3(std::min(o / K, (Int)512), input_nPlanes / K),    \
+                       dim3(K, K / V)>>>(                                     \
+            inFeatures, dInFeatures, dOutFeatures, w, dw, rules, o,           \
+            input_nPlanes, input_stride, output_nPlanes, output_stride);      \
       if (nHot > o)                                                           \
-        dConvolution_KMxKN_backward_dW_B<T, K, V> << <                        \
-            dim3(1, input_nPlanes / K), dim3(K, K / V), 0, stream>>>          \
-            (inFeatures, dInFeatures, dOutFeatures, w, dw, rules + 2 * o,     \
-             nHot - o, input_nPlanes, input_stride, output_nPlanes,           \
-             output_stride);                                                  \
+        dConvolution_KMxKN_backward_dW_B<                                     \
+            T, K,                                                             \
+            V><<<dim3(1, input_nPlanes / K), dim3(K, K / V)>>>(               \
+            inFeatures, dInFeatures, dOutFeatures, w, dw, rules + 2 * o,      \
+            nHot - o, input_nPlanes, input_stride, output_nPlanes,            \
+            output_stride);                                                   \
       return;                                                                 \
     }                                                                         \
   }
 template <typename T>
 void dConvolution_backward_dW(T *inFeatures, T *dInFeatures, T *dOutFeatures,
-                              T *w, T *dw, uInt *rules, uInt nHot,
-                              uInt input_nPlanes, uInt input_stride,
-                              uInt output_nPlanes, uInt output_stride,
-                              cudaStream_t stream) {
+                              T *w, T *dw, Int *rules, Int nHot,
+                              Int input_nPlanes, Int input_stride,
+                              Int output_nPlanes, Int output_stride) {
   FOO(T, 32, 8)
   FOO(T, 16, 4)
   FOO(T, 8, 2)
@@ -422,11 +421,11 @@ void dConvolution_backward_dW(T *inFeatures, T *dInFeatures, T *dOutFeatures,
 }
 #undef FOO
-template <typename T, uInt K, uInt V>
+template <typename T, Int K, Int V>
 __global__ void
-dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, uInt *rules,
-                            uInt nHot, uInt input_nPlanes, uInt input_stride,
-                            uInt output_nPlanes, uInt output_stride) {
+dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
+                            Int nHot, Int input_nPlanes, Int input_stride,
+                            Int output_nPlanes, Int output_stride) {
   // Input x Weight -> Output
   // blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
   // K is a multiple of V,
@@ -434,17 +433,17 @@ dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, uInt *rules,
   // nHot x input_nplanes<=KM -> nHot x output_nPlanes<=KN
   // - parallel over N,nHot - loop over M
-  uInt M = (input_nPlanes + K - 1) / K;
+  Int M = (input_nPlanes + K - 1) / K;
   // N = gridDim.y ~ output_nPlanes/K
-  uInt n = blockIdx.y;
+  Int n = blockIdx.y;
   outFeatures += n * K;
   w += n * K;
-  uInt KO = min(K, output_nPlanes - K * n);
+  Int KO = min(K, output_nPlanes - K * n);
   T O[V];
   __shared__ T W[K][K];
   __shared__ T I[K][K];
-  __shared__ uInt R[K * 2];
+  __shared__ Int R[K * 2];
   const int tx = threadIdx.x;
   int ty[V];
 #pragma unroll
@@ -452,7 +451,7 @@ dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, uInt *rules,
     ty[v] = threadIdx.y + v * (K / V);
   for (int m = 0; m < M; m++) {
-    uInt KI = min(K, input_nPlanes - K * m);
+    Int KI = min(K, input_nPlanes - K * m);
 // Read w
 #pragma unroll
if
(
ty
[
v
]
<
KI
and
tx
<
KO
)
W
[
ty
[
v
]][
tx
]
=
w
[
ty
[
v
]
*
output_nPlanes
+
tx
];
for
(
u
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
for
(
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
// Read rules for K input/output pairs
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
...
...
@@ -500,48 +499,47 @@ dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, uInt *rules,
 }
 template <typename T>
-void dConvolution_forward2(T *inFeatures, T *outFeatures, T *w, uInt *rules,
-                           uInt nHot, uInt input_nPlanes, uInt input_stride,
-                           uInt output_nPlanes, uInt output_stride,
-                           cudaStream_t stream) {
+void dConvolution_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
+                           Int nHot, Int input_nPlanes, Int input_stride,
+                           Int output_nPlanes, Int output_stride) {
   if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) {
     const int K = 16;
     const int V = 4;
-    dConvolution_KMxKN_forward2<T, K, V>
-        << <dim3(128, (output_nPlanes + K - 1) / K), dim3(K, K / V), 0,
-            stream>>> (inFeatures, outFeatures, w, rules, nHot, input_nPlanes,
-                       input_stride, output_nPlanes, output_stride);
+    dConvolution_KMxKN_forward2<T, K,
+                                V><<<dim3(128, (output_nPlanes + K - 1) / K),
+                                     dim3(K, K / V)>>>(
+        inFeatures, outFeatures, w, rules, nHot, input_nPlanes, input_stride,
+        output_nPlanes, output_stride);
     return;
   } else {
     dConvolution_forward(inFeatures, outFeatures, w, rules, nHot,
-                         input_nPlanes, input_stride, output_nPlanes,
-                         output_stride, stream);
+                         input_nPlanes, input_stride, output_nPlanes,
+                         output_stride);
   }
 }
 // dOutput x W^T -> dInput and
 // Input^T x dOutput -> dW
 // blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
-template <typename T, uInt K, uInt V>
+template <typename T, Int K, Int V>
 __global__ void dConvolution_KMxKN_backward_dW2(
     T *inFeatures, T *dInFeatures, T *dOutFeatures,
-    T *w, T *dw, uInt *rules, uInt nHot, uInt input_nPlanes,
-    uInt input_stride, uInt output_nPlanes, uInt output_stride) {
+    T *w, T *dw, Int *rules, Int nHot, Int input_nPlanes,
+    Int input_stride, Int output_nPlanes, Int output_stride) {
   // M = gridDim.y == input_nPlanes / K
-  uInt N = (output_nPlanes + K - 1) / K;
-  uInt m = blockIdx.y;
+  Int N = (output_nPlanes + K - 1) / K;
+  Int m = blockIdx.y;
   inFeatures += m * K;
   dInFeatures += m * K;
   w += m * K * output_nPlanes;
   dw += m * K * output_nPlanes;
-  uInt KI = min(K, input_nPlanes - K * m);
+  Int KI = min(K, input_nPlanes - K * m);
   T dI[V];
   T dW[V];
   __shared__ T I[K][K];
   __shared__ T dO[K][K];
   __shared__ T W[K][K];
-  __shared__ uInt R[K * 2];
+  __shared__ Int R[K * 2];
   const int tx = threadIdx.x;
   int ty[V];
 #pragma unroll
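The % 8 test above routes awkward plane counts to the bounds-checked kernel (worked example, not from the commit): with input_nPlanes = 100, 100 % 8 = 4, so dConvolution_forward2 launches dConvolution_KMxKN_forward2<T, 16, 4> directly; inside that kernel M = (100 + 16 - 1) / 16 = 7 input tiles, and the last tile gets KI = min(16, 100 - 96) = 4, so partial tiles are masked rather than assumed full. When both plane counts are multiples of 8, the unguarded dConvolution_forward fast path is taken instead.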
@@ -549,7 +547,7 @@ dConvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
     ty[v] = threadIdx.y + v * (K / V);
   for (int n = 0; n < N; n++) {
-    uInt KO = min(K, output_nPlanes - K * n);
+    Int KO = min(K, output_nPlanes - K * n);
 // Read w, reset dW
 #pragma unroll
@@ -559,7 +557,7 @@ dConvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
       dW[v] = 0;
     }
-    for (uInt s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
+    for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
       // Read rules for K input/output pairs, reset dI[]
 #pragma unroll
       for (int v = 0; v < V; v++) {
@@ -613,23 +611,22 @@ dConvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
 template <typename T>
 void dConvolution_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
-                               T *w, T *dw, uInt *rules, uInt nHot,
-                               uInt input_nPlanes, uInt input_stride,
-                               uInt output_nPlanes, uInt output_stride,
-                               cudaStream_t stream) {
+                               T *w, T *dw, Int *rules, Int nHot,
+                               Int input_nPlanes, Int input_stride,
+                               Int output_nPlanes, Int output_stride) {
   if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) {
     const int K = 16;
     const int V = 4;
-    dConvolution_KMxKN_backward_dW2<T, K, V>
-        << <dim3(128, (input_nPlanes + K - 1) / K), dim3(K, K / V), 0,
-            stream>>> (inFeatures, dInFeatures, dOutFeatures, w, dw, rules,
-                       nHot, input_nPlanes, input_stride, output_nPlanes,
-                       output_stride);
+    dConvolution_KMxKN_backward_dW2<T, K,
+                                    V><<<dim3(128, (input_nPlanes + K - 1) / K),
+                                         dim3(K, K / V)>>>(
+        inFeatures, dInFeatures, dOutFeatures, w, dw, rules, nHot,
+        input_nPlanes, input_stride, output_nPlanes, output_stride);
     return;
   } else {
     dConvolution_backward_dW(inFeatures, dInFeatures, dOutFeatures, w, dw,
                              rules, nHot, input_nPlanes, input_stride,
-                             output_nPlanes, output_stride, stream);
+                             output_nPlanes, output_stride);
   }
 }
-#endif /* GPU_CONVOLUTION_H */
+#endif /* CUDA_CONVOLUTION_H */
sparseconvnet/SCN/CUDA/Deconvolution.cu  (new file, 0 → 100644)
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "Convolution.h"
#include "Deconvolution.h"
template <typename T, Int Dimension>
double cuda_Deconvolution_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize, /*long*/ at::Tensor filterStride,
    Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features,
    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor bias) {
  auto _rules =
      m.getRuleBook(outputSize, inputSize, filterSize, filterStride, true);
  Int nActive = m.getNActive(outputSize);
  output_features.resize_({nActive, weight.size(2)});
  if (not bias.numel())
    output_features.zero_();
  auto iF = input_features.data<T>();
  auto oF = output_features.data<T>();
  Int ip = input_features.size(1);
  Int op = output_features.size(1);
  auto w = weight.data<T>();
  double flops = 0;
  if (bias.numel()) {
    auto b = bias.data<T>();
    for (Int i = 0; i < op; i += 32) {
      Int blockDim = min((Int)32, op - i);
      Int gridDim = min((Int)4096, nActive);
      Convolution_fp_bias<<<gridDim, blockDim>>>(oF + i, b + i, op, op,
                                                 nActive);
    }
  }
  Int c = ip * op;
  RULEBOOKITERATOR(dDeconvolution_forward2<T>(iF, oF, w, rbB, nHotB, ip, ip,
                                              op, op);
                   , w += c; flops += nHotB * c;)
  return flops;
}
template <typename T, Int Dimension>
void cuda_Deconvolution_backward(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize, /*long*/ at::Tensor filterStride,
    Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features,
    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor d_weight,
    /*cuda float*/ at::Tensor d_bias) {
  auto _rules =
      m.getRuleBook(outputSize, inputSize, filterSize, filterStride, true);
  Int nActive = m.getNActive(outputSize);
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();
  auto iF = input_features.data<T>();
  auto diF = d_input_features.data<T>();
  auto doF = d_output_features.data<T>();
  Int ip = input_features.size(1);
  Int op = d_output_features.size(1);
  auto w = weight.data<T>();
  auto dw = d_weight.data<T>();
  Int c = ip * op;
  RULEBOOKITERATOR(dDeconvolution_backward_dW2<T>(iF, diF, doF, w, dw, rbB,
                                                  nHotB, ip, ip, op, op);
                   , w += c; dw += c;)
  if (d_bias.numel()) {
    auto db = d_bias.data<T>();
    Convolution_bp_bias(doF, db, op, op, nActive);
  }
}
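Note (a reading of the code above): both deconvolution entry points call m.getRuleBook(outputSize, inputSize, ...) with the size arguments swapped relative to cuda_Convolution_updateOutput, i.e. deconvolution reuses the convolution rulebook machinery with the input and output roles transposed, and then drives the dDeconvolution_* kernels with it.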
sparseconvnet/SCN/generic/GPU/Deconvolution.h → sparseconvnet/SCN/CUDA/Deconvolution.h  (renamed)
@@ -4,16 +4,16 @@
 // This source code is licensed under the license found in the
 // LICENSE file in the root directory of this source tree.
-#ifndef GPU_DECONVOLUTION_H
-#define GPU_DECONVOLUTION_H
-#include "../SparseConvNet.h"
+#ifndef CUDA_DECONVOLUTION_H
+#define CUDA_DECONVOLUTION_H
 #include "Convolution.h"
-template <typename T, uInt K, uInt V>
+template <typename T, Int K, Int V>
 __global__ void
-dDeconvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, uInt *rules,
-                              uInt nHot, uInt input_nPlanes, uInt input_stride,
-                              uInt output_nPlanes, uInt output_stride) {
+dDeconvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules,
+                              Int nHot, Int input_nPlanes, Int input_stride,
+                              Int output_nPlanes, Int output_stride) {
   // nHot must be a multiple of K!!
   // Input x Weight -> Output
@@ -22,17 +22,17 @@ dDeconvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, uInt *rules,
   // nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
-  uInt M = input_nPlanes / K;
+  Int M = input_nPlanes / K;
   // N = gridDim.y == output_nPlanes/K
-  uInt n = blockIdx.y;
+  Int n = blockIdx.y;
   outFeatures += n * K;
   w += n * K;
   T O[V];
   __shared__ T W[K][K];
   __shared__ T I[K][K];
-  uInt R0[V];
-  uInt R1[V];
+  Int R0[V];
+  Int R1[V];
   const int tx = threadIdx.x;
   int ty[V];
 #pragma unroll
@@ -45,7 +45,7 @@ dDeconvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, uInt *rules,
   for (int v = 0; v < V; v++)
     W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
-  for (uInt s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
+  for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
 #pragma unroll
     for (int v = 0; v < V; v++) {
       R1[v] = rules[2 * (s + ty[v])];
@@ -79,28 +79,28 @@ dDeconvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, uInt *rules,
     inFeatures += K;
   }
 }
-template <typename T, uInt K, uInt V>
+template <typename T, Int K, Int V>
 __global__ void
-dDeconvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, uInt *rules,
-                              uInt nHot, uInt input_nPlanes, uInt input_stride,
-                              uInt output_nPlanes, uInt output_stride) {
+dDeconvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules,
+                              Int nHot, Int input_nPlanes, Int input_stride,
+                              Int output_nPlanes, Int output_stride) {
   // Input x Weight -> Output
   // blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
   // K is a multiple of V,
   // nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
-  uInt M = input_nPlanes / K;
+  Int M = input_nPlanes / K;
   // N = gridDim.y == output_nPlanes/K
-  uInt n = blockIdx.y;
+  Int n = blockIdx.y;
   outFeatures += n * K;
   w += n * K;
   T O[V];
   __shared__ T W[K][K];
   __shared__ T I[K][K];
-  uInt R0[V];
-  uInt R1[V];
+  Int R0[V];
+  Int R1[V];
   const int tx = threadIdx.x;
   int ty[V];
 #pragma unroll
@@ -113,7 +113,7 @@ dDeconvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, uInt *rules,
for
(
int
v
=
0
;
v
<
V
;
v
++
)
W
[
ty
[
v
]][
tx
]
=
w
[
ty
[
v
]
*
output_nPlanes
+
tx
];
for
(
u
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
for
(
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
if
(
s
+
ty
[
v
]
<
nHot
)
{
...
...
@@ -156,27 +156,27 @@ dDeconvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, uInt *rules,
#define FOO(T, K, V) \
{ \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
u
Int o = (nHot / K) * K; \
Int o = (nHot / K) * K; \
if (o >= K) \
dDeconvolution_KMxKN_forwardA<
T, K, V> << <
\
dim3(std::min(o / K, (
u
Int)512), output_nPlanes / K),
\
dim3(K, K / V)
, 0, stream>>>
\
(
inFeatures, outFeatures, w, rules, o, input_nPlanes,
\
input_stride,
output_nPlanes, output_stride); \
dDeconvolution_KMxKN_forwardA<
\
T, K, V><<<
dim3(std::min(o / K, (Int)512), output_nPlanes / K), \
dim3(K, K / V)
>>>(
\
inFeatures, outFeatures, w, rules, o, input_nPlanes,
input_stride,
\
output_nPlanes, output_stride);
\
if (nHot > o) \
dDeconvolution_KMxKN_forwardB<T, K, V> << < \
dim3(1, output_nPlanes / K), dim3(K, K / V), 0, stream>>> \
(inFeatures, outFeatures, w, rules + 2 * o, nHot - o, \
input_nPlanes, input_stride, output_nPlanes, output_stride); \
dDeconvolution_KMxKN_forwardB< \
T, K, \
V><<<dim3(1, output_nPlanes / K), dim3(K, K / V)>>>( \
inFeatures, outFeatures, w, rules + 2 * o, nHot - o, \
input_nPlanes, input_stride, output_nPlanes, output_stride); \
return; \
} \
}
template
<
typename
T
>
void
dDeconvolution_forward
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
uInt
*
rules
,
uInt
nHot
,
uInt
input_nPlanes
,
uInt
input_stride
,
uInt
output_nPlanes
,
uInt
output_stride
,
cudaStream_t
stream
)
{
void
dDeconvolution_forward
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
FOO
(
T
,
64
,
16
)
FOO
(
T
,
32
,
8
)
FOO
(
T
,
16
,
4
)
...
...
@@ -185,10 +185,9 @@ void dDeconvolution_forward(T *inFeatures, T *outFeatures, T *w, uInt *rules,
}
template
<
>
void
dDeconvolution_forward
<
double
>
(
double
*
inFeatures
,
double
*
outFeatures
,
double
*
w
,
uInt
*
rules
,
uInt
nHot
,
uInt
input_nPlanes
,
uInt
input_stride
,
uInt
output_nPlanes
,
uInt
output_stride
,
cudaStream_t
stream
)
{
double
*
w
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
FOO
(
double
,
32
,
8
)
FOO
(
double
,
16
,
4
)
FOO
(
double
,
8
,
2
)
...
...
@@ -199,14 +198,14 @@ void dDeconvolution_forward<double>(double *inFeatures, double *outFeatures,
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template
<
typename
T
,
u
Int
K
,
u
Int
V
>
template
<
typename
T
,
Int
K
,
Int
V
>
__global__
void
dDeconvolution_KMxKN_backward_dW_A
(
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
T
*
w
,
T
*
dw
,
u
Int
*
rules
,
u
Int
nHot
,
u
Int
input_nPlanes
,
u
Int
input_stride
,
u
Int
output_nPlanes
,
u
Int
output_stride
)
{
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
T
*
w
,
T
*
dw
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
// M = gridDim.y == input_nPlanes / K
u
Int
N
=
output_nPlanes
/
K
;
u
Int
m
=
blockIdx
.
y
;
Int
N
=
output_nPlanes
/
K
;
Int
m
=
blockIdx
.
y
;
inFeatures
+=
m
*
K
;
dInFeatures
+=
m
*
K
;
w
+=
m
*
K
*
output_nPlanes
;
...
...
@@ -217,8 +216,8 @@ __global__ void dDeconvolution_KMxKN_backward_dW_A(
__shared__
T
I
[
K
][
K
];
__shared__
T
dO
[
K
][
K
];
__shared__
T
W
[
K
][
K
];
u
Int
R0
[
V
];
u
Int
R1
[
V
];
Int
R0
[
V
];
Int
R1
[
V
];
const
int
tx
=
threadIdx
.
x
;
int
ty
[
V
];
#pragma unroll
...
...
@@ -233,7 +232,7 @@ __global__ void dDeconvolution_KMxKN_backward_dW_A(
dW
[
v
]
=
0
;
}
for
(
u
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
for
(
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
R1
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])];
...
...
@@ -275,14 +274,14 @@ __global__ void dDeconvolution_KMxKN_backward_dW_A(
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template
<
typename
T
,
u
Int
K
,
u
Int
V
>
template
<
typename
T
,
Int
K
,
Int
V
>
__global__
void
dDeconvolution_KMxKN_backward_dW_B
(
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
T
*
w
,
T
*
dw
,
u
Int
*
rules
,
u
Int
nHot
,
u
Int
input_nPlanes
,
u
Int
input_stride
,
u
Int
output_nPlanes
,
u
Int
output_stride
)
{
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
T
*
w
,
T
*
dw
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
// M = gridDim.y == input_nPlanes / K
u
Int
N
=
output_nPlanes
/
K
;
u
Int
m
=
blockIdx
.
y
;
Int
N
=
output_nPlanes
/
K
;
Int
m
=
blockIdx
.
y
;
inFeatures
+=
m
*
K
;
dInFeatures
+=
m
*
K
;
w
+=
m
*
K
*
output_nPlanes
;
...
...
@@ -293,8 +292,8 @@ __global__ void dDeconvolution_KMxKN_backward_dW_B(
__shared__
T
I
[
K
][
K
];
__shared__
T
dO
[
K
][
K
];
__shared__
T
W
[
K
][
K
];
u
Int
R0
[
V
];
u
Int
R1
[
V
];
Int
R0
[
V
];
Int
R1
[
V
];
const
int
tx
=
threadIdx
.
x
;
int
ty
[
V
];
#pragma unroll
...
...
@@ -309,7 +308,7 @@ __global__ void dDeconvolution_KMxKN_backward_dW_B(
dW
[
v
]
=
0
;
}
for
(
u
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
for
(
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
if
(
s
+
ty
[
v
]
<
nHot
)
{
...
...
@@ -359,29 +358,29 @@ __global__ void dDeconvolution_KMxKN_backward_dW_B(
#define FOO(T, K, V) \
{ \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
u
Int o = (nHot / K) * K; \
Int o = (nHot / K) * K; \
if (o >= K) \
dDeconvolution_KMxKN_backward_dW_A<
T, K, V> << <
\
dim3(std::min(o / K, (
u
Int)512), input_nPlanes / K),
\
dim3(K, K / V)
, 0, stream>>>
\
(
inFeatures, dInFeatures, dOutFeatures, w, dw, rules, o, \
input_nPlanes, input_stride, output_nPlanes, output_stride); \
dDeconvolution_KMxKN_backward_dW_A<
\
T, K, V><<<
dim3(std::min(o / K, (Int)512), input_nPlanes / K), \
dim3(K, K / V)
>>>(
\
inFeatures, dInFeatures, dOutFeatures, w, dw, rules, o,
\
input_nPlanes, input_stride, output_nPlanes, output_stride);
\
if (nHot > o) \
dDeconvolution_KMxKN_backward_dW_B<T, K, V> << < \
dim3(1, input_nPlanes / K), dim3(K, K / V), 0, stream>>> \
(inFeatures, dInFeatures, dOutFeatures, w, dw, rules + 2 * o, \
nHot - o, input_nPlanes, input_stride, output_nPlanes, \
output_stride); \
dDeconvolution_KMxKN_backward_dW_B< \
T, K, \
V><<<dim3(1, input_nPlanes / K), dim3(K, K / V)>>>( \
inFeatures, dInFeatures, dOutFeatures, w, dw, rules + 2 * o, \
nHot - o, input_nPlanes, input_stride, output_nPlanes, \
output_stride); \
return; \
} \
}
template
<
typename
T
>
void
dDeconvolution_backward_dW
(
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
T
*
w
,
T
*
dw
,
uInt
*
rules
,
uInt
nHot
,
uInt
input_nPlanes
,
uInt
input_stride
,
uInt
output_nPlanes
,
uInt
output_stride
,
cudaStream_t
stream
)
{
T
*
w
,
T
*
dw
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
FOO
(
T
,
32
,
8
)
FOO
(
T
,
16
,
4
)
FOO
(
T
,
8
,
2
)
...
...
@@ -389,11 +388,11 @@ void dDeconvolution_backward_dW(T *inFeatures, T *dInFeatures, T *dOutFeatures,
}
#undef FOO
template
<
typename
T
,
u
Int
K
,
u
Int
V
>
template
<
typename
T
,
Int
K
,
Int
V
>
__global__
void
dDeconvolution_KMxKN_forward2
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
u
Int
*
rules
,
u
Int
nHot
,
u
Int
input_nPlanes
,
u
Int
input_stride
,
u
Int
output_nPlanes
,
u
Int
output_stride
)
{
dDeconvolution_KMxKN_forward2
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// K is a multiple of V,
...
...
@@ -401,17 +400,17 @@ dDeconvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, uInt *rules,
// nHot x input_nplanes<=KM -> nHot x output_nPlanes<=KN
// - parallel over N,nHot - loop over M
u
Int
M
=
(
input_nPlanes
+
K
-
1
)
/
K
;
Int
M
=
(
input_nPlanes
+
K
-
1
)
/
K
;
// N = gridDim.y ~ output_nPlanes/K
u
Int
n
=
blockIdx
.
y
;
Int
n
=
blockIdx
.
y
;
outFeatures
+=
n
*
K
;
w
+=
n
*
K
;
u
Int
KO
=
min
(
K
,
output_nPlanes
-
K
*
n
);
Int
KO
=
min
(
K
,
output_nPlanes
-
K
*
n
);
T
O
[
V
];
__shared__
T
W
[
K
][
K
];
__shared__
T
I
[
K
][
K
];
__shared__
u
Int
R
[
K
*
2
];
__shared__
Int
R
[
K
*
2
];
const
int
tx
=
threadIdx
.
x
;
int
ty
[
V
];
#pragma unroll
...
...
@@ -419,7 +418,7 @@ dDeconvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, uInt *rules,
ty
[
v
]
=
threadIdx
.
y
+
v
*
(
K
/
V
);
for
(
int
m
=
0
;
m
<
M
;
m
++
)
{
u
Int
KI
=
min
(
K
,
input_nPlanes
-
K
*
m
);
Int
KI
=
min
(
K
,
input_nPlanes
-
K
*
m
);
// Read w
#pragma unroll
...
...
@@ -427,7 +426,7 @@ dDeconvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, uInt *rules,
if
(
ty
[
v
]
<
KI
and
tx
<
KO
)
W
[
ty
[
v
]][
tx
]
=
w
[
ty
[
v
]
*
output_nPlanes
+
tx
];
for
(
u
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
for
(
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
// Read rules for K input/output pairs
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
...
...
@@ -467,48 +466,47 @@ dDeconvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, uInt *rules,
}
template
<
typename
T
>
void
dDeconvolution_forward2
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
uInt
*
rules
,
uInt
nHot
,
uInt
input_nPlanes
,
uInt
input_stride
,
uInt
output_nPlanes
,
uInt
output_stride
,
cudaStream_t
stream
)
{
void
dDeconvolution_forward2
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
if
(
input_nPlanes
%
8
!=
0
or
output_nPlanes
%
8
!=
0
)
{
const
int
K
=
16
;
const
int
V
=
4
;
dDeconvolution_KMxKN_forward2
<
T
,
K
,
V
>
<<
<
dim3
(
128
,
(
output_nPlanes
+
K
-
1
)
/
K
),
dim3
(
K
,
K
/
V
)
,
0
,
stream
>>>
(
inFeatures
,
outFeatures
,
w
,
rules
,
nHot
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
);
dDeconvolution_KMxKN_forward2
<
T
,
K
,
V
><<<
dim3
(
128
,
(
output_nPlanes
+
K
-
1
)
/
K
),
dim3
(
K
,
K
/
V
)
>>>
(
inFeatures
,
outFeatures
,
w
,
rules
,
nHot
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
);
return
;
}
else
{
dDeconvolution_forward
(
inFeatures
,
outFeatures
,
w
,
rules
,
nHot
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
,
stream
);
output_stride
);
}
}
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template
<
typename
T
,
u
Int
K
,
u
Int
V
>
template
<
typename
T
,
Int
K
,
Int
V
>
__global__
void
dDeconvolution_KMxKN_backward_dW2
(
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
T
*
w
,
T
*
dw
,
u
Int
*
rules
,
u
Int
nHot
,
u
Int
input_nPlanes
,
u
Int
input_stride
,
u
Int
output_nPlanes
,
u
Int
output_stride
)
{
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
T
*
w
,
T
*
dw
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
// M = gridDim.y == input_nPlanes / K
u
Int
N
=
(
output_nPlanes
+
K
-
1
)
/
K
;
u
Int
m
=
blockIdx
.
y
;
Int
N
=
(
output_nPlanes
+
K
-
1
)
/
K
;
Int
m
=
blockIdx
.
y
;
inFeatures
+=
m
*
K
;
dInFeatures
+=
m
*
K
;
w
+=
m
*
K
*
output_nPlanes
;
dw
+=
m
*
K
*
output_nPlanes
;
u
Int
KI
=
min
(
K
,
input_nPlanes
-
K
*
m
);
Int
KI
=
min
(
K
,
input_nPlanes
-
K
*
m
);
T
dI
[
V
];
T
dW
[
V
];
__shared__
T
I
[
K
][
K
];
__shared__
T
dO
[
K
][
K
];
__shared__
T
W
[
K
][
K
];
__shared__
u
Int
R
[
K
*
2
];
__shared__
Int
R
[
K
*
2
];
const
int
tx
=
threadIdx
.
x
;
int
ty
[
V
];
#pragma unroll
...
...
@@ -516,7 +514,7 @@ __global__ void dDeconvolution_KMxKN_backward_dW2(
ty
[
v
]
=
threadIdx
.
y
+
v
*
(
K
/
V
);
for
(
int
n
=
0
;
n
<
N
;
n
++
)
{
u
Int
KO
=
min
(
K
,
output_nPlanes
-
K
*
n
);
Int
KO
=
min
(
K
,
output_nPlanes
-
K
*
n
);
// Read w, reset dW
#pragma unroll
...
...
@@ -526,7 +524,7 @@ __global__ void dDeconvolution_KMxKN_backward_dW2(
dW
[
v
]
=
0
;
}
for
(
u
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
for
(
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
// Read rules for K input/output pairs, reset dI[]
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
...
...
@@ -580,23 +578,22 @@ __global__ void dDeconvolution_KMxKN_backward_dW2(
template
<
typename
T
>
void
dDeconvolution_backward_dW2
(
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
T
*
w
,
T
*
dw
,
uInt
*
rules
,
uInt
nHot
,
uInt
input_nPlanes
,
uInt
input_stride
,
uInt
output_nPlanes
,
uInt
output_stride
,
cudaStream_t
stream
)
{
T
*
w
,
T
*
dw
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
if
(
input_nPlanes
%
8
!=
0
or
output_nPlanes
%
8
!=
0
)
{
const
int
K
=
16
;
const
int
V
=
4
;
dDeconvolution_KMxKN_backward_dW2
<
T
,
K
,
V
>
<<
<
dim3
(
128
,
(
input_nPlanes
+
K
-
1
)
/
K
),
dim3
(
K
,
K
/
V
)
,
0
,
stream
>>>
(
inFeatures
,
dInFeatures
,
dOutFeatures
,
w
,
dw
,
rules
,
nHot
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
);
dDeconvolution_KMxKN_backward_dW2
<
T
,
K
,
V
><<<
dim3
(
128
,
(
input_nPlanes
+
K
-
1
)
/
K
),
dim3
(
K
,
K
/
V
)
>>>
(
inFeatures
,
dInFeatures
,
dOutFeatures
,
w
,
dw
,
rules
,
nHot
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
);
return
;
}
else
{
dDeconvolution_backward_dW
(
inFeatures
,
dInFeatures
,
dOutFeatures
,
w
,
dw
,
rules
,
nHot
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
,
stream
);
output_nPlanes
,
output_stride
);
}
}
#endif
/*
GPU
_DECONVOLUTION_H */
#endif
/*
CUDA
_DECONVOLUTION_H */
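
Note on the FOO dispatcher above: it tries tile widths K from largest to smallest and launches the tiled kernel for the first K that divides both plane counts; dDeconvolution_forward2 / backward_dW2 pick up the ragged case. A minimal standalone sketch of the same pattern (kernel_for_K and dispatch are illustrative names, not part of the library):

#include <cstdio>

// Stand-in for the templated kernels the macro instantiates.
template <int K> void kernel_for_K(int nHot) {
  std::printf("launch tiled kernel: K=%d, nHot=%d\n", K, nHot);
}

// Same shape as FOO: largest tile first, first divisor wins.
void dispatch(int inPlanes, int outPlanes, int nHot) {
  if (inPlanes % 64 == 0 && outPlanes % 64 == 0) return kernel_for_K<64>(nHot);
  if (inPlanes % 32 == 0 && outPlanes % 32 == 0) return kernel_for_K<32>(nHot);
  if (inPlanes % 16 == 0 && outPlanes % 16 == 0) return kernel_for_K<16>(nHot);
  if (inPlanes % 8 == 0 && outPlanes % 8 == 0) return kernel_for_K<8>(nHot);
  kernel_for_K<16>(nHot); // ragged fallback, cf. dDeconvolution_forward2
}

Trying the largest tile first maximizes arithmetic intensity whenever the plane counts allow it.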
sparseconvnet/SCN/CUDA/IOLayers.cu
0 → 100644
View file @ 2c4ed608

// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#include "IOLayers.h"

template <typename T, Int Dimension>
void cuda_InputLayer_updateOutput(Metadata<Dimension> &m,
                                  /*long*/ at::Tensor spatialSize,
                                  /*long*/ at::Tensor input_coords,
                                  /*cuda float*/ at::Tensor input_features,
                                  /*cuda float*/ at::Tensor output_features,
                                  long batchSize, long mode) {
  m.inputLayer(spatialSize, input_coords, batchSize, mode);
  Int nPlanes = input_features.size(1);
  auto &rules = m.inputLayerRuleBook;
  Int maxActive = rules[0][1];
  Int nRows = rules[0][3];
  if (mode == 0) {
    output_features.resize_as_(input_features);
    output_features.copy_(input_features);
  } else {
    output_features.resize_({*m.inputNActive, nPlanes});
    output_features.zero_();
    auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
    auto iF = input_features.data<T>();
    auto oF = output_features.data<T>();
    Int *rb = rulesBuffer.data<Int>();
    cudaMemcpy(rb, &rules[1][0], sizeof(Int) * rules[1].size(),
               cudaMemcpyHostToDevice);
    InputLayer_fp<T><<<std::min(nRows, (Int)32768),
                       std::min(nPlanes, (Int)32)>>>(
        iF, oF, nRows, maxActive, nPlanes, rb, mode == 4);
  }
}

template <typename T, Int Dimension>
void cuda_InputLayer_updateGradInput(
    Metadata<Dimension> &m, /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features) {
  auto &rules = m.inputLayerRuleBook;
  Int nPlanes = d_output_features.size(1);
  auto mode = rules[0][0];
  Int maxActive = rules[0][1];
  Int nRows = rules[0][3];
  if (mode == 0) {
    d_input_features.resize_as_(d_output_features);
    d_input_features.copy_(d_output_features);
  } else {
    d_input_features.resize_({rules[0][2], nPlanes});
    d_input_features.zero_();
    auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
    auto diF = d_input_features.data<T>();
    auto doF = d_output_features.data<T>();
    Int *rb = rulesBuffer.data<Int>();
    cudaMemcpy(rb, &rules[1][0], sizeof(Int) * rules[1].size(),
               cudaMemcpyHostToDevice);
    InputLayer_bp<T><<<std::min(nRows, (Int)32768),
                       std::min(nPlanes, (Int)32)>>>(
        diF, doF, nRows, maxActive, nPlanes, rb, mode == 4);
  }
}

template <typename T, Int Dimension>
void cuda_OutputLayer_updateOutput(Metadata<Dimension> &m,
                                   /*cuda float*/ at::Tensor input_features,
                                   /*cuda float*/ at::Tensor output_features) {
  auto &rules = m.inputLayerRuleBook;
  Int nPlanes = input_features.size(1);
  auto mode = rules[0][0];
  auto maxActive = rules[0][1];
  auto nRows = rules[0][3];
  if (mode == 0) {
    output_features.resize_as_(input_features);
    output_features.copy_(input_features);
  } else {
    output_features.resize_({rules[0][2], nPlanes});
    output_features.zero_();
    auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
    auto iF = input_features.data<T>();
    auto oF = output_features.data<T>();
    Int *rb = rulesBuffer.data<Int>();
    cudaMemcpy(rb, &rules[1][0], sizeof(Int) * rules[1].size(),
               cudaMemcpyHostToDevice);
    InputLayer_bp<T><<<std::min(nRows, (Int)32768),
                       std::min(nPlanes, (Int)32)>>>(
        oF, iF, nRows, maxActive, nPlanes, rb, false);
  }
}

template <typename T, Int Dimension>
void cuda_OutputLayer_updateGradInput(
    Metadata<Dimension> &m, /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features) {
  auto &rules = m.inputLayerRuleBook;
  Int nPlanes = d_output_features.size(1);
  auto mode = rules[0][0];
  auto maxActive = rules[0][1];
  auto nRows = rules[0][3];
  if (mode == 0) {
    d_input_features.resize_as_(d_output_features);
    d_input_features.copy_(d_output_features);
  } else {
    d_input_features.resize_({nRows, nPlanes});
    d_input_features.zero_();
    auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
    auto diF = d_input_features.data<T>();
    auto doF = d_output_features.data<T>();
    Int *rb = rulesBuffer.data<Int>();
    cudaMemcpy(rb, &rules[1][0], sizeof(Int) * rules[1].size(),
               cudaMemcpyHostToDevice);
    InputLayer_fp<T><<<std::min(nRows, (Int)32768),
                       std::min(nPlanes, (Int)32)>>>(
        doF, diF, nRows, maxActive, nPlanes, rb, false);
  }
}

template <typename T, Int Dimension>
void cuda_BLInputLayer_updateOutput(Metadata<Dimension> &m,
                                    /*long*/ at::Tensor spatialSize,
                                    /*long*/ at::Tensor input_coords,
                                    /*cuda float*/ at::Tensor input_features,
                                    /*cuda float*/ at::Tensor output_features,
                                    long mode) {
  m.blLayer(spatialSize, input_coords, mode);
  Int nPlanes = input_features.size(2);
  output_features.resize_({*m.inputNActive, nPlanes});
  output_features.zero_();
  auto &rules = m.blLayerRuleBook;
  Int maxActive = rules[0][1];
  Int nRows = rules[0][4];
  if (mode == 0) {
    output_features.resize_as_(input_features);
    output_features.copy_(input_features);
    output_features.resize_({*m.inputNActive, nPlanes});
  } else {
    auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
    auto iF = input_features.data<T>();
    auto oF = output_features.data<T>();
    Int *rb = rulesBuffer.data<Int>();
    cudaMemcpy(rb, &rules[1][0], sizeof(Int) * rules[1].size(),
               cudaMemcpyHostToDevice);
    InputLayer_fp<T><<<std::min(nRows, (Int)32768),
                       std::min(nPlanes, (Int)32)>>>(
        iF, oF, nRows, maxActive, nPlanes, rb, mode == 4);
  }
}

template <typename T, Int Dimension>
void cuda_BLInputLayer_updateGradInput(
    Metadata<Dimension> &m, /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features) {
  auto &rules = m.blLayerRuleBook;
  Int nPlanes = d_output_features.size(1);
  Int mode = rules[0][0];
  Int maxActive = rules[0][1];
  Int nRows = rules[0][4];
  if (mode == 0) {
    d_input_features.resize_as_(d_output_features);
    d_input_features.copy_(d_output_features);
    d_input_features.resize_({rules[0][2], rules[0][3], nPlanes});
  } else {
    d_input_features.resize_({rules[0][2], rules[0][3], nPlanes});
    d_input_features.zero_();
    auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
    auto diF = d_input_features.data<T>();
    auto doF = d_output_features.data<T>();
    Int *rb = rulesBuffer.data<Int>();
    cudaMemcpy(rb, &rules[1][0], sizeof(Int) * rules[1].size(),
               cudaMemcpyHostToDevice);
    InputLayer_bp<T><<<std::min(nRows, (Int)32768),
                       std::min(nPlanes, (Int)32)>>>(
        diF, doF, nRows, maxActive, nPlanes, rb, mode == 4);
  }
}

template <typename T, Int Dimension>
void cuda_BLOutputLayer_updateOutput(Metadata<Dimension> &m,
                                     /*cuda float*/ at::Tensor input_features,
                                     /*cuda float*/ at::Tensor output_features) {
  auto &rules = m.blLayerRuleBook;
  Int nPlanes = input_features.size(1);
  auto mode = rules[0][0];
  Int maxActive = rules[0][1];
  Int nRows = rules[0][4];
  if (mode == 0) {
    output_features.resize_as_(input_features);
    output_features.copy_(input_features);
    output_features.resize_({rules[0][2], rules[0][3], nPlanes});
  } else {
    output_features.resize_({rules[0][2], rules[0][3], nPlanes});
    output_features.zero_();
    auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
    auto iF = input_features.data<T>();
    auto oF = output_features.data<T>();
    Int *rb = rulesBuffer.data<Int>();
    cudaMemcpy(rb, &rules[1][0], sizeof(Int) * rules[1].size(),
               cudaMemcpyHostToDevice);
    InputLayer_bp<T><<<std::min(nRows, (Int)32768),
                       std::min(nPlanes, (Int)32)>>>(
        oF, iF, nRows, maxActive, nPlanes, rb, false);
  }
}

template <typename T, Int Dimension>
void cuda_BLOutputLayer_updateGradInput(
    Metadata<Dimension> &m, /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features) {
  auto &rules = m.blLayerRuleBook;
  Int nPlanes = d_output_features.size(2);
  Int mode = rules[0][0];
  Int maxActive = rules[0][1];
  Int nRows = rules[0][4];
  if (mode == 0) {
    d_input_features.resize_as_(d_output_features);
    d_input_features.copy_(d_output_features);
    d_input_features.resize_({nRows, nPlanes});
  } else {
    d_input_features.resize_({nRows, nPlanes});
    d_input_features.zero_();
    auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
    auto diF = d_input_features.data<T>();
    auto doF = d_output_features.data<T>();
    Int *rb = rulesBuffer.data<Int>();
    cudaMemcpy(rb, &rules[1][0], sizeof(Int) * rules[1].size(),
               cudaMemcpyHostToDevice);
    InputLayer_fp<T><<<std::min(nRows, (Int)32768),
                       std::min(nPlanes, (Int)32)>>>(
        doF, diF, nRows, maxActive, nPlanes, rb, false);
  }
}
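
For reference, each row of the input-layer rulebook consumed above is packed as [nActive, idx_1, ..., idx_maxActive]: the kernel accumulates the listed input rows into one output row, dividing by the count when mode == 4 (averaging). A CPU sketch of InputLayer_fp's semantics under that layout, for checking the kernel against:

#include <vector>

// out must be zero-initialized, shape [nRows, nPlanes];
// rules holds nRows packed rows of length (1 + maxActive).
void inputLayerForwardCPU(const float *in, float *out, int nRows,
                          int maxActive, int nPlanes, const int *rules,
                          bool average) {
  for (int row = 0; row < nRows; ++row) {
    const int *r = rules + row * (1 + maxActive);
    int nActive = r[0];
    float mult = (average && nActive > 0) ? 1.0f / nActive : 1.0f;
    for (int i = 1; i <= nActive; ++i)
      for (int p = 0; p < nPlanes; ++p)
        out[row * nPlanes + p] += mult * in[r[i] * nPlanes + p];
  }
}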
sparseconvnet/SCN/generic/GPU/IOLayers.h → sparseconvnet/SCN/CUDA/IOLayers.h
View file @ 2c4ed608

@@ -4,21 +4,21 @@
 // This source code is licensed under the license found in the
 // LICENSE file in the root directory of this source tree.
-#ifndef GPU_IOLAYERS_H
-#define GPU_IOLAYERS_H
+#ifndef CUDA_IOLAYERS_H
+#define CUDA_IOLAYERS_H
 template <typename T>
 __global__ void InputLayer_fp(T *input_features, T *output_features,
-                              uInt nRows, uInt maxActive, uInt nPlanes,
-                              uInt *rules, bool average) {
+                              Int nRows, Int maxActive, Int nPlanes,
+                              Int *rules, bool average) {
   for (int row = blockIdx.x; row < nRows; row += gridDim.x) {
     T *out = output_features + row * nPlanes;
-    uInt *r = rules + row * (1 + maxActive);
-    uInt nActive = r[0];
+    Int *r = rules + row * (1 + maxActive);
+    Int nActive = r[0];
     T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
     for (int i = 1; i <= nActive; i++) {
       T *inp = input_features + r[i] * nPlanes;
-      for (uInt plane = threadIdx.x; plane < nPlanes; plane += blockDim.x)
+      for (Int plane = threadIdx.x; plane < nPlanes; plane += blockDim.x)
         out[plane] += multiplier * inp[plane];
     }
   }
@@ -26,18 +26,18 @@ __global__ void InputLayer_fp(T *input_features, T *output_features,
 template <typename T>
 __global__ void InputLayer_bp(T *d_input_features, T *d_output_features,
-                              uInt nRows, uInt maxActive, uInt nPlanes,
-                              uInt *rules, bool average) {
+                              Int nRows, Int maxActive, Int nPlanes,
+                              Int *rules, bool average) {
   for (int row = blockIdx.x; row < nRows; row += gridDim.x) {
     T *out = d_output_features + row * nPlanes;
-    uInt *r = rules + row * (1 + maxActive);
-    uInt nActive = r[0];
+    Int *r = rules + row * (1 + maxActive);
+    Int nActive = r[0];
     T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
     for (int i = 1; i <= nActive; i++) {
       T *inp = d_input_features + r[i] * nPlanes;
-      for (uInt plane = threadIdx.x; plane < nPlanes; plane += blockDim.x)
+      for (Int plane = threadIdx.x; plane < nPlanes; plane += blockDim.x)
         atomicAdd(&inp[plane], multiplier * out[plane]);
     }
   }
 }
-#endif /* GPU_IOLAYERS_H */
+#endif /* CUDA_IOLAYERS_H */
sparseconvnet/SCN/CUDA/LeakyReLU.cu
0 → 100644
View file @ 2c4ed608

// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#include "LeakyReLU.h"

template <typename T>
void cuda_LeakyReLU_updateOutput(/*cuda float*/ at::Tensor input_features,
                                 /*cuda float*/ at::Tensor output_features,
                                 float alpha) {
  output_features.resize_as_(input_features);
  auto n = input_features.numel();
  LeakyReLU_fp<T><<<16, 1024>>>(input_features.data<T>(),
                                output_features.data<T>(), n, alpha);
}

template <typename T>
void cuda_LeakyReLU_updateGradInput(
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features, float alpha) {
  d_input_features.resize_as_(d_output_features);
  auto n = d_input_features.numel();
  LeakyReLU_bp<T><<<16, 1024>>>(input_features.data<T>(),
                                d_input_features.data<T>(),
                                d_output_features.data<T>(), n, alpha);
}
sparseconvnet/SCN/generic/GPU/LeakyReLU.h → sparseconvnet/SCN/CUDA/LeakyReLU.h
View file @ 2c4ed608

@@ -8,16 +8,16 @@
 #define LEAKYRELU_H
 template <typename T>
-__global__ void LeakyReLU_fp(T *input_features, T *output_features, uInt n,
+__global__ void LeakyReLU_fp(T *input_features, T *output_features, Int n,
                              T alpha) {
-  for (uInt i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += 16 * 1024)
+  for (Int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += 16 * 1024)
     output_features[i] =
         (input_features[i] > 0) ? input_features[i]
                                 : (input_features[i] * alpha);
 }
 template <typename T>
 __global__ void LeakyReLU_bp(T *input_features, T *d_input_features,
-                             T *d_output_features, uInt n, T alpha) {
-  for (uInt i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += 16 * 1024)
+                             T *d_output_features, Int n, T alpha) {
+  for (Int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += 16 * 1024)
     d_input_features[i] =
         (input_features[i] > 0) ? d_output_features[i]
                                 : (d_output_features[i] * alpha);
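
The stride 16 * 1024 inside these loops is hard-wired to the <<<16, 1024>>> launch in LeakyReLU.cu. A sketch of the same grid-stride loop with the stride derived from the launch shape instead (illustrative, not the committed code):

// Grid-stride LeakyReLU forward; works for any gridDim/blockDim because the
// stride is computed from the actual launch configuration.
template <typename T>
__global__ void leakyReLU_fp_gridstride(const T *in, T *out, long n, T alpha) {
  long stride = (long)gridDim.x * blockDim.x;
  for (long i = (long)blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += stride)
    out[i] = (in[i] > 0) ? in[i] : in[i] * alpha;
}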
sparseconvnet/SCN/CUDA/MaxPooling.cu
0 → 100644
View file @ 2c4ed608

// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#include "MaxPooling.h"
#include "RuleBookIterator.h"

template <typename T, Int Dimension>
void cuda_MaxPooling_updateOutput(/*long*/ at::Tensor inputSize,
                                  /*long*/ at::Tensor outputSize,
                                  /*long*/ at::Tensor poolSize,
                                  /*long*/ at::Tensor poolStride,
                                  Metadata<Dimension> &m,
                                  /*cuda float*/ at::Tensor input_features,
                                  /*cuda float*/ at::Tensor output_features,
                                  long nFeaturesToDrop) {
  Int nPlanes = input_features.size(1) - nFeaturesToDrop;
  auto _rules =
      m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  Int nActive = m.getNActive(outputSize);
  output_features.resize_({nActive, nPlanes});
  output_features.zero_();
  auto iF = input_features.data<T>() + nFeaturesToDrop;
  auto oF = output_features.data<T>();
  RULEBOOKITERATOR(cuda_MaxPooling_ForwardPass<T>(
                       iF, oF, nPlanes, input_features.size(1),
                       output_features.size(1), rbB, nHotB);
                   , )
}

template <typename T, Int Dimension>
void cuda_MaxPooling_updateGradInput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor poolSize, /*long*/ at::Tensor poolStride,
    Metadata<Dimension> &m, /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor output_features,
    /*cuda float*/ at::Tensor d_output_features, long nFeaturesToDrop) {
  Int nPlanes = input_features.size(1) - nFeaturesToDrop;
  auto _rules =
      m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();
  auto iF = input_features.data<T>();
  auto oF = output_features.data<T>();
  auto diF = d_input_features.data<T>();
  auto doF = d_output_features.data<T>();
  RULEBOOKITERATOR(cuda_MaxPooling_BackwardPass<T>(
                       iF, diF, oF, doF, nPlanes, input_features.size(1),
                       d_output_features.size(1), rbB, nHotB);
                   , )
}

template <typename T, Int Dimension>
void cuda_RandomizedStrideMaxPooling_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor poolSize, /*long*/ at::Tensor poolStride,
    Metadata<Dimension> &m, /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features, long nFeaturesToDrop) {
  Int nPlanes = input_features.size(1) - nFeaturesToDrop;
  auto _rules = m.getRandomizedStrideRuleBook(inputSize, outputSize, poolSize,
                                              poolStride, true);
  Int nActive = m.getNActive(outputSize);
  output_features.resize_({nActive, nPlanes});
  output_features.zero_();
  auto iF = input_features.data<T>() + nFeaturesToDrop;
  auto oF = output_features.data<T>();
  RULEBOOKITERATOR(cuda_MaxPooling_ForwardPass<T>(
                       iF, oF, nPlanes, input_features.size(1),
                       output_features.size(1), rbB, nHotB);
                   , )
}

template <typename T, Int Dimension>
void cuda_RandomizedStrideMaxPooling_updateGradInput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor poolSize, /*long*/ at::Tensor poolStride,
    Metadata<Dimension> &m, /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor output_features,
    /*cuda float*/ at::Tensor d_output_features, long nFeaturesToDrop) {
  Int nPlanes = input_features.size(1) - nFeaturesToDrop;
  auto _rules = m.getRandomizedStrideRuleBook(inputSize, outputSize, poolSize,
                                              poolStride, true);
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();
  auto iF = input_features.data<T>();
  auto oF = output_features.data<T>();
  auto diF = d_input_features.data<T>();
  auto doF = d_output_features.data<T>();
  RULEBOOKITERATOR(cuda_MaxPooling_BackwardPass<T>(
                       iF, diF, oF, doF, nPlanes, input_features.size(1),
                       d_output_features.size(1), rbB, nHotB);
                   , )
}
sparseconvnet/SCN/generic/GPU/MaxPooling.h → sparseconvnet/SCN/CUDA/MaxPooling.h
View file @ 2c4ed608

@@ -4,26 +4,26 @@
 // This source code is licensed under the license found in the
 // LICENSE file in the root directory of this source tree.
-#ifndef GPU_MAXPOOLING_H
-#define GPU_MAXPOOLING_H
+#ifndef CUDA_MAXPOOLING_H
+#define CUDA_MAXPOOLING_H
 // NTX must be >=2 so r is filled properly
-template <typename T, uInt NTX, uInt NTY>
+template <typename T, Int NTX, Int NTY>
 __global__ void MaxPooling_fp(T *input_features, T *output_features,
-                              uInt nPlanes, uInt input_stride,
-                              uInt output_stride, uInt *rules, uInt nHot) {
-  __shared__ uInt r[NTY * 2];
-  for (uInt n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
+                              Int nPlanes, Int input_stride,
+                              Int output_stride, Int *rules, Int nHot) {
+  __shared__ Int r[NTY * 2];
+  for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
     {
-      uInt i = threadIdx.x + NTX * threadIdx.y;
-      if (i < NTY * 2 and i < 2 * (n - nHot))
+      Int i = threadIdx.x + NTX * threadIdx.y;
+      if (i < NTY * 2 and i < 2 * (nHot - n))
        r[i] = rules[2 * n + i];
    }
    __syncthreads();
    if (n + threadIdx.y < nHot) {
-      uInt i = r[2 * threadIdx.y] * input_stride;
-      uInt o = r[2 * threadIdx.y + 1] * output_stride;
-      for (uInt plane = threadIdx.x; plane < nPlanes; plane += NTX) {
+      Int i = r[2 * threadIdx.y] * input_stride;
+      Int o = r[2 * threadIdx.y + 1] * output_stride;
+      for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX) {
        T inp = input_features[i + plane];
        if (output_features[o + plane] < inp)
          output_features[o + plane] = inp;
@@ -34,30 +34,30 @@ __global__ void MaxPooling_fp(T *input_features, T *output_features,
 }
 template <typename T>
-void MaxPooling_ForwardPass(cudaStream_t stream, T *input_features,
-                            T *output_features, uInt nPlanes,
-                            uInt input_stride, uInt output_stride,
-                            uInt *rules, uInt nHot) {
-  MaxPooling_fp<T, 32, 32> << <32, dim3(32, 32), 0, stream>>>
-      (input_features, output_features, nPlanes, input_stride, output_stride,
-       rules, nHot);
+void cuda_MaxPooling_ForwardPass(T *input_features, T *output_features,
+                                 Int nPlanes, Int input_stride,
+                                 Int output_stride, Int *rules, Int nHot) {
+  MaxPooling_fp<T, 32, 32><<<32, dim3(32, 32)>>>(
+      input_features, output_features, nPlanes, input_stride, output_stride,
+      rules, nHot);
 }
-template <typename T, uInt NTX, uInt NTY>
+template <typename T, Int NTX, Int NTY>
 __global__ void MaxPooling_bp(T *input_features, T *d_input_features,
                               T *output_features, T *d_output_features,
-                              uInt nPlanes, uInt input_stride,
-                              uInt output_stride, uInt *rules, uInt nHot) {
-  __shared__ uInt r[NTY * 2];
-  for (uInt n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
+                              Int nPlanes, Int input_stride,
+                              Int output_stride, Int *rules, Int nHot) {
+  __shared__ Int r[NTY * 2];
+  for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
     {
-      uInt i = threadIdx.x + NTX * threadIdx.y;
-      if (i < NTY * 2 and i < 2 * (n - nHot))
+      Int i = threadIdx.x + NTX * threadIdx.y;
+      if (i < NTY * 2 and i < 2 * (nHot - n))
        r[i] = rules[2 * n + i];
    }
    __syncthreads();
    if (n + threadIdx.y < nHot) {
-      uInt i = r[2 * threadIdx.y] * input_stride;
-      uInt o = r[2 * threadIdx.y + 1] * output_stride;
-      for (uInt plane = threadIdx.x; plane < nPlanes; plane += NTX)
+      Int i = r[2 * threadIdx.y] * input_stride;
+      Int o = r[2 * threadIdx.y + 1] * output_stride;
+      for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX)
        if (output_features[o + plane] == input_features[i + plane])
          d_input_features[i + plane] += d_output_features[o + plane];
    }
@@ -66,13 +66,12 @@ __global__ void MaxPooling_bp(T *input_features, T *d_input_features,
 }
 template <typename T>
-void MaxPooling_BackwardPass(cudaStream_t stream, T *input_features,
-                             T *d_input_features, T *output_features,
-                             T *d_output_features, uInt nPlanes,
-                             uInt input_stride, uInt output_stride,
-                             uInt *rules, uInt nHot) {
-  MaxPooling_bp<T, 32, 32> << <32, dim3(32, 32), 0, stream>>>
-      (input_features, d_input_features, output_features, d_output_features,
-       nPlanes, input_stride, output_stride, rules, nHot);
+void cuda_MaxPooling_BackwardPass(T *input_features, T *d_input_features,
+                                  T *output_features, T *d_output_features,
+                                  Int nPlanes, Int input_stride,
+                                  Int output_stride, Int *rules, Int nHot) {
+  MaxPooling_bp<T, 32, 32><<<32, dim3(32, 32)>>>(
+      input_features, d_input_features, output_features, d_output_features,
+      nPlanes, input_stride, output_stride, rules, nHot);
 }
-#endif /* GPU_MAXPOOLING_H */
+#endif /* CUDA_MAXPOOLING_H */
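
The backward pass above is argmax-free: it re-compares each input value against the pooled output and routes the gradient wherever they match, so tied maxima each receive the gradient. A CPU sketch of that rule, under the simplifying assumption input_stride == output_stride == nPlanes:

// rules holds nHot (input, output) row pairs; d_in must be zero-initialized.
void maxPoolBackwardCPU(const float *in, float *d_in, const float *out,
                        const float *d_out, int nPlanes, const int *rules,
                        int nHot) {
  for (int n = 0; n < nHot; ++n) {
    int i = rules[2 * n];     // input row
    int o = rules[2 * n + 1]; // output row
    for (int p = 0; p < nPlanes; ++p)
      if (out[o * nPlanes + p] == in[i * nPlanes + p])
        d_in[i * nPlanes + p] += d_out[o * nPlanes + p];
  }
}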
sparseconvnet/SCN/CUDA/NetworkInNetwork.cu
0 → 100644
View file @ 2c4ed608

// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#include "Convolution.h"
#include <algorithm>

template <typename T>
double cuda_NetworkInNetwork_updateOutput(
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features,
    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor bias) {
  auto nActive = input_features.size(0);
  auto input_nPlanes = weight.size(0);
  auto output_nPlanes = weight.size(1);
  output_features.resize_({nActive, input_nPlanes});
  if (bias.numel())
    output_features.copy_(bias);
  else
    output_features.zero_();
  output_features.addmm(input_features, weight);
  return nActive * input_nPlanes * output_nPlanes;
}

template <typename T>
void cuda_NetworkInNetwork_updateGradInput(
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features,
    /*cuda float*/ at::Tensor weight) {
  d_input_features.resize_({(int)d_output_features.size(0), weight.size(0)});
  d_input_features.zero_();
  at::mm_out(d_input_features, d_output_features, weight.t());
}

template <typename T>
void cuda_NetworkInNetwork_accGradParameters(
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_output_features,
    /*cuda float*/ at::Tensor d_weight, /*cuda float*/ at::Tensor d_bias) {
  auto nActive = input_features.size(0);
  if (nActive and d_bias.numel())
    at::sum_out(d_bias, d_output_features, {0}, false);
  at::mm_out(d_weight, input_features.t(), d_output_features);
}
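
NetworkInNetwork is a 1x1 convolution over the active sites, i.e. a dense GEMM on the feature matrix. A sketch of the same forward computation against the present-day at:: API (the commit itself targets 2018-era ATen, where the calls differ slightly; nin_forward is an illustrative name):

#include <ATen/ATen.h>

// out[nActive, outPlanes] = bias (broadcast per row) + in @ W.
at::Tensor nin_forward(const at::Tensor &input,   // [nActive, inPlanes]
                       const at::Tensor &weight,  // [inPlanes, outPlanes]
                       const at::Tensor &bias) {  // [outPlanes] or empty
  auto out = bias.numel()
                 ? bias.expand({input.size(0), weight.size(1)}).clone()
                 : at::zeros({input.size(0), weight.size(1)}, input.options());
  out.addmm_(input, weight); // in-place: out += input @ weight
  return out;
}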
sparseconvnet/SCN/generic/GPU/RuleBookIterator.h → sparseconvnet/SCN/CUDA/RuleBookIterator.h
View file @ 2c4ed608

@@ -4,32 +4,31 @@
 // This source code is licensed under the license found in the
 // LICENSE file in the root directory of this source tree.
-#ifndef GPU_RULEBOOKITERATOR_H
-#define GPU_RULEBOOKITERATOR_H
+#ifndef CUDA_RULEBOOKITERATOR_H
+#define CUDA_RULEBOOKITERATOR_H
-// Macro to parallelize loading rulebook elements to GPU memory and operating
+// Macro to parallelize loading rulebook elements to CUDA memory and operating
 // on the elements of the rulebook.
 // X is the function to apply.
 // Y is a command to run
-#define RULEBOOKITERATOR(X, Y)                                    \
-  uInt ms = ruleBookMaxSize(_rules);                              \
-  auto rulesBuffer = THCITensor_(new)(state);                     \
-  if (THCITensor_(nElement)(state, rulesBuffer) < ms)             \
-    THCITensor_(resize1d)(state, rulesBuffer, ms);                \
-  uInt *rbB = (uInt *)THCITensor_(data)(state, rulesBuffer);      \
-  for (int k = 0; k < _rules.size(); ++k) {                       \
-    auto &r = _rules[k];                                          \
-    uInt nHotB = r.size() / 2;                                    \
-    if (nHotB) {                                                  \
-      cudaMemcpy(rbB, &r[0], sizeof(uInt) * 2 * nHotB,            \
-                 cudaMemcpyHostToDevice);                         \
-      X                                                           \
-    }                                                             \
-    Y                                                             \
-  }                                                               \
-  THCITensor_(free)(state, rulesBuffer);
+#define RULEBOOKITERATOR(X, Y)                                    \
+  {                                                               \
+    Int rbMaxSize = 0;                                            \
+    for (auto &r : _rules)                                        \
+      rbMaxSize = std::max(rbMaxSize, (Int)r.size());             \
+    at::Tensor rulesBuffer = at::CUDA(at_kINT).tensor({rbMaxSize}); \
+    Int *rbB = rulesBuffer.data<Int>();                           \
+    for (int k = 0; k < _rules.size(); ++k) {                     \
+      auto &r = _rules[k];                                        \
+      Int nHotB = r.size() / 2;                                   \
+      if (nHotB) {                                                \
+        cudaMemcpy(rbB, &r[0], sizeof(Int) * 2 * nHotB,           \
+                   cudaMemcpyHostToDevice);                       \
+        X                                                         \
+      }                                                           \
+      Y                                                           \
+    }                                                             \
+  }
-#endif /* GPU_RULEBOOKITERATOR_H */
+#endif /* CUDA_RULEBOOKITERATOR_H */
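
In the new macro, X is pasted inside the per-rule if (nHotB) guard with rbB (the device copy of that rule's pairs) and nHotB in scope, and Y runs after every rulebook entry. Conceptually, the RULEBOOKITERATOR call at the top of this commit's Deconvolution.cu expands to roughly this loop (macro plumbing inlined for illustration):

// Rough expansion of
//   RULEBOOKITERATOR(dDeconvolution_backward_dW2<T>(...); , w += c; dw += c;)
// _rules is a vector of per-offset rule vectors of (in, out) index pairs.
for (size_t k = 0; k < _rules.size(); ++k) {
  auto &r = _rules[k];
  Int nHotB = r.size() / 2;
  if (nHotB) {
    cudaMemcpy(rbB, &r[0], sizeof(Int) * 2 * nHotB, cudaMemcpyHostToDevice);
    dDeconvolution_backward_dW2<T>(iF, diF, doF, w, dw, rbB, nHotB, ip, ip,
                                   op, op); // X: run for this filter offset
  }
  w += c;  // Y: advance the weight pointers to the
  dw += c; //    next filter-offset slice
}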
sparseconvnet/SCN/CUDA/SparseToDense.cu
0 → 100644
View file @ 2c4ed608

// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#include "SparseToDense.h"

template <typename T, Int Dimension>
void cuda_SparseToDense_updateOutput(/*long*/ at::Tensor inputSize,
                                     Metadata<Dimension> &m,
                                     /*cuda float*/ at::Tensor input_features,
                                     /*cuda float*/ at::Tensor output_features,
                                     long nPlanes) {
  {
    std::array<long, Dimension + 2> sz;
    sz[0] = m.grids.begin()->second.size(); // batch size
    sz[1] = nPlanes;
    long *in_sz = inputSize.data<long>();
    for (Int i = 0; i < Dimension; ++i)
      sz[i + 2] = in_sz[i];
    output_features.resize_(sz);
    output_features.zero_();
  }
  if (input_features.ndimension() == 2) {
    auto _rules = m.getSparseToDenseRuleBook(inputSize, true);
    Int _nPlanes = input_features.size(1);
    auto iF = input_features.data<T>();
    auto oF = output_features.data<T>();
    long spatialVolume = inputSize.prod().data<long>()[0];
    RULEBOOKITERATOR(SparseToDense_ForwardPass<T>(iF, oF, _nPlanes,
                                                  spatialVolume, rbB, nHotB);
                     , oF += _nPlanes * spatialVolume;)
  }
}

template <typename T, Int Dimension>
void cuda_SparseToDense_updateGradInput(
    /*long*/ at::Tensor inputSize, Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features) {
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();
  if (input_features.ndimension() == 2) {
    auto _rules = m.getSparseToDenseRuleBook(inputSize, true);
    long spatialVolume = inputSize.prod().data<long>()[0];
    Int _nPlanes = d_input_features.size(1);
    auto diF = d_input_features.data<T>();
    auto doF = d_output_features.data<T>();
    RULEBOOKITERATOR(SparseToDense_BackwardPass<T>(diF, doF, _nPlanes,
                                                   spatialVolume, rbB, nHotB);
                     , doF += _nPlanes * spatialVolume;)
  }
}
sparseconvnet/SCN/generic/GPU/SparseToDense.h → sparseconvnet/SCN/CUDA/SparseToDense.h
View file @ 2c4ed608

@@ -4,28 +4,27 @@
 // This source code is licensed under the license found in the
 // LICENSE file in the root directory of this source tree.
-#ifndef GPU_SPARSETODENSE_H
-#define GPU_SPARSETODENSE_H
-#include "../SparseConvNet.h"
-//#include <THC/THCAtomics.cuh>
+#ifndef CUDA_SPARSETODENSE_H
+#define CUDA_SPARSETODENSE_H
 // NTX must be >=2 so r is filled properly
-template <typename T, uInt NTX, uInt NTY>
+template <typename T, Int NTX, Int NTY>
 __global__ void SparseToDense_fp(T *input_features, T *output_features,
-                                 uInt nPlanes, uInt spatialVolume,
-                                 uInt *rules, uInt nHot) {
-  __shared__ uInt r[NTY * 2];
-  for (uInt n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
+                                 Int nPlanes, Int spatialVolume,
+                                 Int *rules, Int nHot) {
+  __shared__ Int r[NTY * 2];
+  for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
     {
-      uInt i = threadIdx.x + NTX * threadIdx.y;
-      if (i < NTY * 2 and i < 2 * (n - nHot))
+      Int i = threadIdx.x + NTX * threadIdx.y;
+      if (i < NTY * 2 and i < 2 * (nHot - n))
        r[i] = rules[2 * n + i];
    }
    __syncthreads();
    if (n + threadIdx.y < nHot) {
      T *i = input_features + r[2 * threadIdx.y] * nPlanes;
      T *o = output_features + r[2 * threadIdx.y + 1];
-      for (uInt plane = threadIdx.x; plane < nPlanes; plane += NTX)
+      for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX)
        o[plane * spatialVolume] = i[plane];
    }
    __syncthreads();
@@ -33,29 +32,29 @@ __global__ void SparseToDense_fp(T *input_features, T *output_features,
 }
 template <typename T>
-void SparseToDense_ForwardPass(cudaStream_t stream, T *input_features,
-                               T *output_features, uInt nPlanes,
-                               uInt spatialVolume, uInt *rules, uInt nHot) {
-  SparseToDense_fp<T, 32, 32> << <32, dim3(32, 32), 0, stream>>>
-      (input_features, output_features, nPlanes, spatialVolume, rules, nHot);
+void SparseToDense_ForwardPass(T *input_features, T *output_features,
+                               Int nPlanes, Int spatialVolume, Int *rules,
+                               Int nHot) {
+  SparseToDense_fp<T, 32, 32><<<32, dim3(32, 32)>>>(
+      input_features, output_features, nPlanes, spatialVolume, rules, nHot);
 }
 // NTX must be >=2 so r is filled properly
-template <typename T, uInt NTX, uInt NTY>
+template <typename T, Int NTX, Int NTY>
 __global__ void SparseToDense_bp(T *d_input_features, T *d_output_features,
-                                 uInt nPlanes, uInt spatialVolume,
-                                 uInt *rules, uInt nHot) {
-  __shared__ uInt r[NTY * 2];
-  for (uInt n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
+                                 Int nPlanes, Int spatialVolume,
+                                 Int *rules, Int nHot) {
+  __shared__ Int r[NTY * 2];
+  for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
     {
-      uInt i = threadIdx.x + NTX * threadIdx.y;
-      if (i < NTY * 2 and i < 2 * (n - nHot))
+      Int i = threadIdx.x + NTX * threadIdx.y;
+      if (i < NTY * 2 and i < 2 * (nHot - n))
        r[i] = rules[2 * n + i];
    }
    __syncthreads();
    if (n + threadIdx.y < nHot) {
      T *d_i = d_input_features + r[2 * threadIdx.y] * nPlanes;
      T *d_o = d_output_features + r[2 * threadIdx.y + 1];
-      for (uInt plane = threadIdx.x; plane < nPlanes; plane += NTX)
+      for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX)
        d_i[plane] = d_o[plane * spatialVolume];
    }
    __syncthreads();
@@ -63,11 +62,10 @@ __global__ void SparseToDense_bp(T *d_input_features, T *d_output_features,
 }
 template <typename T>
-void SparseToDense_BackwardPass(cudaStream_t stream, T *d_input_features,
-                                T *d_output_features, uInt nPlanes,
-                                uInt spatialVolume, uInt *rules, uInt nHot) {
-  SparseToDense_bp<T, 32, 32> << <32, dim3(32, 32), 0, stream>>>
-      (d_input_features, d_output_features, nPlanes, spatialVolume, rules,
-       nHot);
+void SparseToDense_BackwardPass(T *d_input_features, T *d_output_features,
+                                Int nPlanes, Int spatialVolume, Int *rules,
+                                Int nHot) {
+  SparseToDense_bp<T, 32, 32><<<32, dim3(32, 32)>>>(
+      d_input_features, d_output_features, nPlanes, spatialVolume, rules,
+      nHot);
 }
-#endif /* GPU_SPARSETODENSE_H */
+#endif /* CUDA_SPARSETODENSE_H */
sparseconvnet/SCN/CUDA/UnPooling.cu
0 → 100644
View file @ 2c4ed608

// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#include "RuleBookIterator.h"
#include "UnPooling.h"

template <typename T, Int Dimension>
void cuda_UnPooling_updateOutput(/*long*/ at::Tensor inputSize,
                                 /*long*/ at::Tensor outputSize,
                                 /*long*/ at::Tensor poolSize,
                                 /*long*/ at::Tensor poolStride,
                                 Metadata<Dimension> &m,
                                 /*cuda float*/ at::Tensor input_features,
                                 /*cuda float*/ at::Tensor output_features,
                                 long nFeaturesToDrop) {
  Int nPlanes = input_features.size(1) - nFeaturesToDrop;
  auto _rules =
      m.getRuleBook(outputSize, inputSize, poolSize, poolStride, true);
  Int nActive = m.getNActive(outputSize);
  output_features.resize_(
      {nActive, input_features.size(1) - nFeaturesToDrop});
  output_features.zero_();
  auto iF = input_features.data<T>() + nFeaturesToDrop;
  auto oF = output_features.data<T>();
  RULEBOOKITERATOR(cuda_UnPooling_ForwardPass<T>(
                       iF, oF, nPlanes, input_features.size(1),
                       output_features.size(1), rbB, nHotB);
                   , )
}

template <typename T, Int Dimension>
void cuda_UnPooling_updateGradInput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor poolSize, /*long*/ at::Tensor poolStride,
    Metadata<Dimension> &m, /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features, long nFeaturesToDrop) {
  Int nPlanes = input_features.size(1) - nFeaturesToDrop;
  auto _rules =
      m.getRuleBook(outputSize, inputSize, poolSize, poolStride, true);
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();
  auto diF = d_input_features.data<T>() + nFeaturesToDrop;
  auto doF = d_output_features.data<T>();
  RULEBOOKITERATOR(cuda_UnPooling_BackwardPass<T>(
                       diF, doF, nPlanes, input_features.size(1),
                       d_output_features.size(1), rbB, nHotB);
                   , )
}
sparseconvnet/SCN/CUDA/UnPooling.h
0 → 100644
View file @ 2c4ed608

// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#ifndef CUDA_UNPOOLING_H
#define CUDA_UNPOOLING_H

// NTX must be >=2 so r is filled properly
template <typename T, Int NTX, Int NTY>
__global__ void UnPooling_fp(T *input_features, T *output_features,
                             Int nPlanes, Int input_stride, Int output_stride,
                             Int *rules, Int nHot) {
  __shared__ Int r[NTY * 2];
  for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
    {
      Int i = threadIdx.x + NTX * threadIdx.y;
      if (i < NTY * 2 and i < 2 * (nHot - n))
        r[i] = rules[2 * n + i];
    }
    __syncthreads();
    if (n + threadIdx.y < nHot) {
      Int i = r[2 * threadIdx.y + 1] * input_stride;
      Int o = r[2 * threadIdx.y] * output_stride;
      for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX)
        output_features[o + plane] += input_features[i + plane];
    }
    __syncthreads();
  }
}

template <typename T>
void cuda_UnPooling_ForwardPass(T *input_features, T *output_features,
                                Int nPlanes, Int input_stride,
                                Int output_stride, Int *rules, Int nHot) {
  UnPooling_fp<T, 32, 32><<<32, dim3(32, 32)>>>(
      input_features, output_features, nPlanes, input_stride, output_stride,
      rules, nHot);
}

template <typename T, Int NTX, Int NTY>
__global__ void UnPooling_bp(T *d_input_features, T *d_output_features,
                             Int nPlanes, Int input_stride, Int output_stride,
                             Int *rules, Int nHot) {
  __shared__ Int r[NTY * 2];
  for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
    {
      Int i = threadIdx.x + NTX * threadIdx.y;
      if (i < NTY * 2 and i < 2 * (nHot - n))
        r[i] = rules[2 * n + i];
    }
    __syncthreads();
    if (n + threadIdx.y < nHot) {
      Int i = r[2 * threadIdx.y + 1] * input_stride;
      Int o = r[2 * threadIdx.y] * output_stride;
      for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX)
        d_input_features[i + plane] += d_output_features[o + plane];
    }
    __syncthreads();
  }
}

template <typename T>
void cuda_UnPooling_BackwardPass(T *d_input_features, T *d_output_features,
                                 Int nPlanes, Int input_stride,
                                 Int output_stride, Int *rules, Int nHot) {
  UnPooling_bp<T, 32, 32><<<32, dim3(32, 32)>>>(
      d_input_features, d_output_features, nPlanes, input_stride,
      output_stride, rules, nHot);
}
#endif /* CUDA_UNPOOLING_H */
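
UnPooling runs a pooling rulebook in reverse: UnPooling.cu requests getRuleBook(outputSize, inputSize, ...) with the roles swapped, and the kernels read each (a, b) pair with b indexing the small pooled input and a the large unpooled output, accumulating instead of maximizing. A CPU sketch of the forward pass, assuming both strides equal nPlanes:

// rules holds nHot (output, input) row pairs; out must be zero-initialized.
// Several output rows may read the same pooled input row.
void unpoolForwardCPU(const float *in, float *out, int nPlanes,
                      const int *rules, int nHot) {
  for (int n = 0; n < nHot; ++n) {
    int i = rules[2 * n + 1] * nPlanes; // input row (pooled features)
    int o = rules[2 * n] * nPlanes;     // output row (unpooled location)
    for (int p = 0; p < nPlanes; ++p)
      out[o + p] += in[i + p];
  }
}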