OpenDAS / SparseConvNet · Commit 2c4ed608

Authored Jun 20, 2018 by Benjamin Thomas Graham

    Goodbye THNN. Hello ATen!

Parent: 6d4475db
145 changed files in the commit; this page shows 20 of them, with 1648 additions and 307 deletions (+1648 / -307).
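The same mechanical translation recurs in every file below: `extern "C"` THNN entry points taking `THCTensor *` plus an implicit `THCState` become templated C++ functions taking `at::Tensor`, and the explicit `THCState_getCurrentStream(state)` launch argument is dropped. The sketch below is not part of the commit; `example_updateOutput` and `scale_kernel` are invented names, and it uses the 2018-era `.data<T>()` accessor that the diff itself uses.

// Before/after shape of the THNN -> ATen migration (illustration only).
#include <ATen/ATen.h>

__global__ void scale_kernel(float *out, const float *in, long n) {
  long i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    out[i] = 2 * in[i]; // arbitrary elementwise work
}

// Old THNN shape (for comparison):
//   extern "C" void scn_R_(example_updateOutput)(THCTensor *in, THCTensor *out) {
//     THCTensor_(resizeAs)(state, out, in);               // explicit THCState
//     long n = in->size[0];                               // raw struct access
//     scale_kernel<<<(n + 255) / 256, 256, 0,
//                    THCState_getCurrentStream(state)>>>( // explicit stream
//         THCTensor_(data)(state, out), THCTensor_(data)(state, in), n);
//   }

// New ATen shape, as used throughout this commit:
void example_updateOutput(at::Tensor in, at::Tensor out) {
  out.resize_as_(in);                      // replaces THCTensor_(resizeAs)
  long n = in.size(0);                     // replaces in->size[0]
  scale_kernel<<<(n + 255) / 256, 256>>>(  // stream argument dropped
      out.data<float>(), in.data<float>(), n);
}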
sparseconvnet/SCN/CUDA/BatchNormalization.cu               +94    -0
sparseconvnet/SCN/CUDA/BatchNormalization.h                +32   -35
sparseconvnet/SCN/CUDA/BatchwiseMultiplicativeDropout.cu   +69    -0
sparseconvnet/SCN/CUDA/BatchwiseMultiplicativeDropout.h    +19   -19
sparseconvnet/SCN/CUDA/Convolution.cu                      +318   -0
sparseconvnet/SCN/CUDA/Convolution.h                       +122 -125
sparseconvnet/SCN/CUDA/Deconvolution.cu                    +83    -0
sparseconvnet/SCN/CUDA/Deconvolution.h                     +108 -111
sparseconvnet/SCN/CUDA/IOLayers.cu                         +244   -0
sparseconvnet/SCN/CUDA/IOLayers.h                          +13   -13
sparseconvnet/SCN/CUDA/LeakyReLU.cu                        +29    -0
sparseconvnet/SCN/CUDA/LeakyReLU.h                         +4     -4
sparseconvnet/SCN/CUDA/MaxPooling.cu                       +103   -0
sparseconvnet/SCN/CUDA/MaxPooling.h                        +77    -0
sparseconvnet/SCN/CUDA/NetworkInNetwork.cu                 +47    -0
sparseconvnet/SCN/CUDA/RuleBookIterator.h                  +34    -0
sparseconvnet/SCN/CUDA/SparseToDense.cu                    +56    -0
sparseconvnet/SCN/CUDA/SparseToDense.h                     +71    -0
sparseconvnet/SCN/CUDA/UnPooling.cu                        +54    -0
sparseconvnet/SCN/CUDA/UnPooling.h                         +71    -0
sparseconvnet/SCN/CUDA/BatchNormalization.cu  (new file, mode 100644)
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "BatchNormalization.h"
#define BN_F_MACRO(N) \
if (nPlanes % N == 0) { \
BatchNormalization_ForwardPass<T, N, 64>( \
input_features.data<T>(), output_features.data<T>(), nPlanes, \
input_stride, output_stride, nActive, saveMean.data<T>(), \
saveInvStd.data<T>(), runningMean.data<T>(), runningVar.data<T>(), \
OptionalTensorData<T>(weight), OptionalTensorData<T>(bias), eps, momentum, \
train, leakiness); \
}
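The call sites just below chain this macro with bare `else`, so the dispatch it produces is a divisibility test on nPlanes that picks the NTX template parameter. Expanded by hand (illustration only, arguments elided):

// BN_F_MACRO(16) else BN_F_MACRO(12) ... else BN_F_MACRO(1) expands to:
if (nPlanes % 16 == 0) {
  BatchNormalization_ForwardPass<T, 16, 64>(/* ... */);
} else if (nPlanes % 12 == 0) {
  BatchNormalization_ForwardPass<T, 12, 64>(/* ... */);
} /* ... 8, 4 ... */ else if (nPlanes % 1 == 0) { // always true: fallback
  BatchNormalization_ForwardPass<T, 1, 64>(/* ... */);
}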
template <typename T>
void cuda_BatchNormalization_updateOutput(
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features,
    /*cuda float*/ at::Tensor saveMean, /*cuda float*/ at::Tensor saveInvStd,
    /*cuda float*/ at::Tensor runningMean,
    /*cuda float*/ at::Tensor runningVar, /*cuda float*/ at::Tensor weight,
    /*cuda float*/ at::Tensor bias, T eps, T momentum, bool train,
    T leakiness) {
  output_features.resize_as_(input_features);
  if (input_features.ndimension() == 2) {
    auto nActive = input_features.size(0);
    auto nPlanes = input_features.size(1);
    auto input_stride = input_features.stride(0);
    auto output_stride = output_features.stride(0);
    BN_F_MACRO(16)
    else BN_F_MACRO(12) else BN_F_MACRO(8) else BN_F_MACRO(4) else
        BN_F_MACRO(1)
  }
}
template <typename T>
void cuda_BatchNormalizationInTensor_updateOutput(
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features,
    /*cuda float*/ at::Tensor saveMean, /*cuda float*/ at::Tensor saveInvStd,
    /*cuda float*/ at::Tensor runningMean,
    /*cuda float*/ at::Tensor runningVar, /*cuda float*/ at::Tensor weight,
    /*cuda float*/ at::Tensor bias, T eps, T momentum, bool train,
    T leakiness) {
  if (input_features.ndimension() == 2) {
    auto nActive = input_features.size(0);
    auto nPlanes = input_features.size(1);
    auto input_stride = input_features.stride(0);
    auto output_stride = output_features.stride(0);
    BN_F_MACRO(16)
    else BN_F_MACRO(12) else BN_F_MACRO(8) else BN_F_MACRO(4) else
        BN_F_MACRO(1)
  }
}
#undef BN_F_MACRO
#define BN_B_MACRO(N) \
if (nPlanes % N == 0) { \
BatchNormalization_BackwardPass<T, N, 64>( \
input_features.data<T>(), d_input_features.data<T>(), \
output_features.data<T>(), d_output_features.data<T>(), nPlanes, \
input_stride, output_stride, nActive, saveMean.data<T>(), \
saveInvStd.data<T>(), runningMean.data<T>(), runningVar.data<T>(), \
OptionalTensorData<T>(weight), OptionalTensorData<T>(bias), \
OptionalTensorData<T>(d_weight), OptionalTensorData<T>(d_bias), leakiness); \
}
template <typename T>
void cuda_BatchNormalization_backward(
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor output_features,
    /*cuda float*/ at::Tensor d_output_features,
    /*cuda float*/ at::Tensor saveMean, /*cuda float*/ at::Tensor saveInvStd,
    /*cuda float*/ at::Tensor runningMean,
    /*cuda float*/ at::Tensor runningVar, /*cuda float*/ at::Tensor weight,
    /*cuda float*/ at::Tensor bias, /*cuda float*/ at::Tensor d_weight,
    /*cuda float*/ at::Tensor d_bias, T leakiness) {
  d_input_features.resize_as_(d_output_features);
  if (input_features.ndimension() == 2) {
    auto nActive = input_features.size(0);
    auto nPlanes = input_features.size(1);
    auto input_stride = input_features.stride(0);
    auto output_stride = output_features.stride(0);
    BN_B_MACRO(16)
    else BN_B_MACRO(12) else BN_B_MACRO(8) else BN_B_MACRO(4) else
        BN_B_MACRO(1)
  }
}
sparseconvnet/SCN/generic/GPU/BatchNormalization.h → sparseconvnet/SCN/CUDA/BatchNormalization.h
@@ -4,9 +4,9 @@
 // This source code is licensed under the license found in the
 // LICENSE file in the root directory of this source tree.
-#ifndef GPU_BATCHNORMALIZATION_H
-#define GPU_BATCHNORMALIZATION_H
-#include "../SparseConvNet.h"
+#ifndef CUDA_BATCHNORMALIZATION_H
+#define CUDA_BATCHNORMALIZATION_H
+#include <cassert>
 // input_stride and output_stride are normally the same as nPlanes; allow larger
@@ -14,22 +14,22 @@
 // NTX ~ 16 - nPlanes must be a multiple of this
 // NTY ~ 64 - at least 4
-template <typename T, uInt NTX, uInt NTY>
+template <typename T, Int NTX, Int NTY>
 __global__ void
-BatchNormalization_f_train(T *input_features, T *output_features, uInt nPlanes,
-                           uInt input_stride, uInt output_stride, uInt nActive,
+BatchNormalization_f_train(T *input_features, T *output_features, Int nPlanes,
+                           Int input_stride, Int output_stride, Int nActive,
                            T *saveMean, T *saveInvStd, T *runningMean,
                            T *runningVar, T *weight, T *bias, T eps,
                            T momentum, T leakiness) {
   __shared__ T t[NTY][NTX];
   __shared__ T t2[NTY][NTX];
-  for (uInt plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
+  for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
        plane += gridDim.x * NTX) {
     t[threadIdx.y][threadIdx.x] = 0;
     t2[threadIdx.y][threadIdx.x] = 0;
-    for (uInt row = threadIdx.y, c = plane + threadIdx.y * input_stride;
+    for (Int row = threadIdx.y, c = plane + threadIdx.y * input_stride;
          row < nActive; row += NTY, c += input_stride * NTY) {
       T i = input_features[c];
       t[threadIdx.y][threadIdx.x] += i;
@@ -38,7 +38,7 @@ BatchNormalization_f_train(T *input_features, T *output_features, uInt nPlanes,
   __syncthreads();
   T _saveMean = 0;
   T _saveInvStd = 0;
-  for (uInt row = 0; row < NTY; row++) {
+  for (Int row = 0; row < NTY; row++) {
     _saveMean += t[row][threadIdx.x];
     _saveInvStd += t2[row][threadIdx.x];
   }
@@ -65,7 +65,7 @@ BatchNormalization_f_train(T *input_features, T *output_features, uInt nPlanes,
   T W = t[0][threadIdx.x];
   T B = t[1][threadIdx.x];
-  for (uInt row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
+  for (Int row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
            co = plane + threadIdx.y * output_stride;
        row < nActive;
        row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
@@ -75,16 +75,16 @@ BatchNormalization_f_train(T *input_features, T *output_features, uInt nPlanes,
     __syncthreads();
   }
 }
-template <typename T, uInt NTX, uInt NTY>
+template <typename T, Int NTX, Int NTY>
 __global__ void
-BatchNormalization_f_test(T *input_features, T *output_features, uInt nPlanes,
-                          uInt input_stride, uInt output_stride, uInt nActive,
+BatchNormalization_f_test(T *input_features, T *output_features, Int nPlanes,
+                          Int input_stride, Int output_stride, Int nActive,
                           T *saveMean, T *saveInvStd, T *runningMean,
                           T *runningVar, T *weight, T *bias, T eps, T momentum,
                           T leakiness) {
   __shared__ T W[NTX];
   __shared__ T B[NTX];
-  for (uInt plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
+  for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
        plane += gridDim.x * NTX) {
     if (threadIdx.y == 0) {
       W[threadIdx.x] =
@@ -95,7 +95,7 @@ BatchNormalization_f_test(T *input_features, T *output_features, uInt nPlanes,
   __syncthreads();
   float w = W[threadIdx.x], b = B[threadIdx.x];
-  for (uInt row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
+  for (Int row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
           co = plane + threadIdx.y * output_stride;
        row < nActive;
       row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
@@ -106,40 +106,38 @@ BatchNormalization_f_test(T *input_features, T *output_features, uInt nPlanes,
   }
 }
-template <typename T, uInt NTX, uInt NTY>
+template <typename T, Int NTX, Int NTY>
 void BatchNormalization_ForwardPass(T *input_features, T *output_features,
-                                    uInt nPlanes, uInt input_stride,
-                                    uInt output_stride, uInt nActive,
+                                    Int nPlanes, Int input_stride,
+                                    Int output_stride, Int nActive,
                                     T *saveMean, T *saveInvStd, T *runningMean,
                                     T *runningVar, T *weight, T *bias, T eps,
                                     T momentum, bool train, T leakiness) {
   if (train) {
     BatchNormalization_f_train<
-        T, NTX, NTY><<<std::min((uInt)16, nPlanes / NTX), dim3(NTX, NTY), 0,
-                       THCState_getCurrentStream(state)>>>(
+        T, NTX, NTY><<<std::min((Int)16, nPlanes / NTX), dim3(NTX, NTY)>>>(
        input_features, output_features, nPlanes, input_stride, output_stride,
        nActive, saveMean, saveInvStd, runningMean, runningVar, weight, bias,
        eps, momentum, leakiness);
   } else {
     BatchNormalization_f_test<
-        T, NTX, NTY><<<std::min((uInt)16, nPlanes / NTX), dim3(NTX, NTY), 0,
-                       THCState_getCurrentStream(state)>>>(
+        T, NTX, NTY><<<std::min((Int)16, nPlanes / NTX), dim3(NTX, NTY)>>>(
        input_features, output_features, nPlanes, input_stride, output_stride,
        nActive, saveMean, saveInvStd, runningMean, runningVar, weight, bias,
        eps, momentum, leakiness);
   }
 }
-template <typename T, uInt NTX, uInt NTY>
+template <typename T, Int NTX, Int NTY>
 __global__ void BatchNormalization_b(
     T *input_features, T *d_input_features, T *output_features,
-    T *d_output_features, uInt nPlanes, uInt input_stride, uInt output_stride,
-    uInt nActive, T *saveMean,
+    T *d_output_features, Int nPlanes, Int input_stride, Int output_stride,
+    Int nActive, T *saveMean,
     T *saveInvStd, T *runningMean, T *runningVar, T *weight, T *bias,
     T *d_weight, T *d_bias, T leakiness) {
   __shared__ T t[NTY][NTX];
   __shared__ T t2[NTY][NTX];
-  for (uInt plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
+  for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
        plane += gridDim.x * NTX) {
     if (threadIdx.y == 0) {
       t[0][threadIdx.x] = saveMean[plane];
@@ -153,7 +151,7 @@ BatchNormalization_b(T *input_features, T *d_input_features, T *output_features,
   __syncthreads();
   t[threadIdx.y][threadIdx.x] = 0;
   t2[threadIdx.y][threadIdx.x] = 0;
-  for (uInt row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
+  for (Int row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
           co = plane + threadIdx.y * output_stride;
       row < nActive;
      row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
@@ -180,7 +178,7 @@ BatchNormalization_b(T *input_features, T *d_input_features, T *output_features,
   T k = dotp * _saveInvStd * _saveInvStd / nActive;
-  for (uInt row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
+  for (Int row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
          co = plane + threadIdx.y * output_stride;
      row < nActive;
      row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
@@ -192,17 +190,16 @@ BatchNormalization_b(T *input_features, T *d_input_features, T *output_features,
   }
 }
-template <typename T, uInt NTX, uInt NTY>
+template <typename T, Int NTX, Int NTY>
 void BatchNormalization_BackwardPass(T *input_features, T *d_input_features,
                                      T *output_features, T *d_output_features,
-                                     uInt nPlanes, uInt input_stride,
-                                     uInt output_stride, uInt nActive,
+                                     Int nPlanes, Int input_stride,
+                                     Int output_stride, Int nActive,
                                      T *saveMean, T *saveInvStd,
                                      T *runningMean, T *runningVar, T *weight,
                                      T *bias, T *d_weight, T *d_bias,
                                      T leakiness) {
-  BatchNormalization_b<T, NTX, NTY><<<std::min((uInt)16, nPlanes / NTX),
-                                      dim3(NTX, NTY), 0,
-                                      THCState_getCurrentStream(state)>>>(
+  BatchNormalization_b<T, NTX, NTY><<<std::min((Int)16, nPlanes / NTX),
+                                      dim3(NTX, NTY)>>>(
      input_features, d_input_features, output_features, d_output_features,
      nPlanes, input_stride, output_stride, nActive, saveMean, saveInvStd,
      runningMean, runningVar, weight, bias, d_weight, d_bias, leakiness);
@@ -210,4 +207,4 @@ void BatchNormalization_BackwardPass(T *input_features, T *d_input_features,
 #undef NTX
 #undef NTY
-#endif /* GPU_BATCHNORMALIZATION_H */
+#endif /* CUDA_BATCHNORMALIZATION_H */
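BatchNormalization_f_train above accumulates per-plane partial sums in the shared tiles t and t2 and then folds the NTY rows together to obtain the saved mean and inverse standard deviation. The kernel below is my own stripped-down sketch of that reduction pattern, not code from the commit: it drops strides, leakiness, the affine transform and the running statistics, assumes the usual invstd = 1/sqrt(var + eps), and assumes nPlanes is a multiple of NTX, as the dispatch macros guarantee.

// Minimal per-plane mean / inverse-std reduction (illustrative sketch).
template <typename T, int NTX, int NTY>
__global__ void plane_mean_invstd(const T *x, T *mean, T *invstd, int nPlanes,
                                  int nActive, T eps) {
  __shared__ T s[NTY][NTX], s2[NTY][NTX]; // per-thread partial sums
  for (int p = threadIdx.x + blockIdx.x * NTX; p < nPlanes;
       p += gridDim.x * NTX) {
    T sum = 0, sum2 = 0;
    for (int r = threadIdx.y; r < nActive; r += NTY) { // stride over rows
      T v = x[r * nPlanes + p];
      sum += v;
      sum2 += v * v;
    }
    s[threadIdx.y][threadIdx.x] = sum;
    s2[threadIdx.y][threadIdx.x] = sum2;
    __syncthreads();
    if (threadIdx.y == 0) { // fold the NTY partial rows, as f_train does
      T m = 0, m2 = 0;
      for (int r = 0; r < NTY; r++) {
        m += s[r][threadIdx.x];
        m2 += s2[r][threadIdx.x];
      }
      m /= nActive;
      m2 /= nActive;
      mean[p] = m;
      invstd[p] = T(1) / sqrt(m2 - m * m + eps); // Var[x] = E[x^2] - E[x]^2
    }
    __syncthreads();
  }
}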
sparseconvnet/SCN/generic/GPU/BatchwiseMultiplicativeDropout.cu → sparseconvnet/SCN/CUDA/BatchwiseMultiplicativeDropout.cu
@@ -4,32 +4,28 @@
 // This source code is licensed under the license found in the
 // LICENSE file in the root directory of this source tree.
-#ifndef TH_GENERIC_FILE
-#define TH_GENERIC_FILE "generic/GPU/BatchwiseMultiplicativeDropout.cu"
-#else
+#include "BatchwiseMultiplicativeDropout.h"
 #define SPARSECONVNET_FOO(NTX, NTY)                                          \
   {                                                                          \
     if (nPlanes % NTX == 0) {                                                \
-      BatchwiseMultiplicativeDropout_fp<real, NTX, NTY> << <                 \
-          dim3(std::min(16L, nPlanes / NTX), 16), dim3(NTX, NTY), 0,         \
-          THCState_getCurrentStream(state)>>>                                \
-          (THCTensor_(data)(state, input_features),                          \
-           THCTensor_(data)(state, output_features),                         \
-           THCTensor_(data)(state, noise), nActive, nPlanes, nPlanes,        \
-           nPlanes, alpha);                                                  \
+      BatchwiseMultiplicativeDropout_fp<                                     \
+          T, NTX,                                                            \
+          NTY><<<dim3(std::min(16L, nPlanes / NTX), 16), dim3(NTX, NTY)>>>(  \
+          input_features.data<T>(), output_features.data<T>(),               \
+          noise.data<T>(), nActive, nPlanes, nPlanes, nPlanes, alpha);       \
       return;                                                                \
     }                                                                        \
   }
-extern "C" void scn_R_(BatchwiseMultiplicativeDropout_updateOutput)(
-    THCTensor *input_features, THCTensor *output_features, THCTensor *noise,
-    float alpha) {
-  if (input_features != output_features)
-    THCTensor_(resizeAs)(state, output_features, input_features);
-  auto nActive = input_features->size[0];
-  auto nPlanes = input_features->size[1];
+template <typename T>
+void cuda_BatchwiseMultiplicativeDropout_updateOutput(
+    /*cuda float*/ at::Tensor input_features,
+    /*cuda float*/ at::Tensor output_features,
+    /*cuda float*/ at::Tensor noise, float alpha) {
+  output_features.resize_as_(input_features);
+  auto nActive = input_features.size(0);
+  auto nPlanes = input_features.size(1);
   SPARSECONVNET_FOO(32, 32)
   SPARSECONVNET_FOO(24, 32)
   SPARSECONVNET_FOO(16, 64)
@@ -43,24 +39,24 @@ extern "C" void scn_R_(BatchwiseMultiplicativeDropout_updateOutput)(
 #define SPARSECONVNET_FOO(NTX, NTY)                                          \
   {                                                                          \
     if (nPlanes % NTX == 0) {                                                \
-      BatchwiseMultiplicativeDropout_bp<real, NTX, NTY> << <                 \
-          dim3(std::min(16L, nPlanes / NTX), 16), dim3(NTX, NTY), 0,         \
-          THCState_getCurrentStream(state)>>>                                \
-          (THCTensor_(data)(state, input_features),                          \
-           THCTensor_(data)(state, d_input_features),                        \
-           THCTensor_(data)(state, d_output_features),                       \
-           THCTensor_(data)(state, noise), nActive, nPlanes, nPlanes,        \
-           nPlanes, alpha);                                                  \
+      BatchwiseMultiplicativeDropout_bp<                                     \
+          T, NTX,                                                            \
+          NTY><<<dim3(std::min(16L, nPlanes / NTX), 16), dim3(NTX, NTY)>>>(  \
+          input_features.data<T>(), d_input_features.data<T>(),              \
+          d_output_features.data<T>(), noise.data<T>(), nActive, nPlanes,    \
+          nPlanes, nPlanes, alpha);                                          \
      return;                                                                 \
     }                                                                        \
   }
-extern "C" void scn_R_(BatchwiseMultiplicativeDropout_updateGradInput)(
-    THCTensor *input_features, THCTensor *d_input_features,
-    THCTensor *d_output_features, THCTensor *noise, float alpha) {
-  if (d_input_features != d_output_features)
-    THCTensor_(resizeAs)(state, d_input_features, d_output_features);
-  auto nActive = input_features->size[0];
-  auto nPlanes = input_features->size[1];
+template <typename T>
+void cuda_BatchwiseMultiplicativeDropout_updateGradInput(
+    /*cuda float*/ at::Tensor input_features,
+    /*cuda float*/ at::Tensor d_input_features,
+    /*cuda float*/ at::Tensor d_output_features,
+    /*cuda float*/ at::Tensor noise, float alpha) {
+  d_input_features.resize_as_(d_output_features);
+  auto nActive = input_features.size(0);
+  auto nPlanes = input_features.size(1);
   SPARSECONVNET_FOO(32, 32)
   SPARSECONVNET_FOO(24, 32)
@@ -71,5 +67,3 @@ extern "C" void scn_R_(BatchwiseMultiplicativeDropout_updateGradInput)(
   SPARSECONVNET_FOO(1, 64)
 }
 #undef SPARSECONVNET_FOO
-#endif
sparseconvnet/SCN/generic/GPU/BatchwiseMultiplicativeDropout.h → sparseconvnet/SCN/CUDA/BatchwiseMultiplicativeDropout.h
@@ -4,50 +4,50 @@
 // This source code is licensed under the license found in the
 // LICENSE file in the root directory of this source tree.
-#ifndef GPU_BATCHWISEMULTIPLICATIVEDROPOUT_H
-#define GPU_BATCHWISEMULTIPLICATIVEDROPOUT_H
-template <typename T, uInt NTX, uInt NTY>
+#ifndef CUDA_BATCHWISEMULTIPLICATIVEDROPOUT_H
+#define CUDA_BATCHWISEMULTIPLICATIVEDROPOUT_H
+template <typename T, Int NTX, Int NTY>
 __global__ void BatchwiseMultiplicativeDropout_fp(T *input_features,
                                                   T *output_features, T *noise,
-                                                  uInt nActive, uInt nPlanes,
-                                                  uInt input_stride,
-                                                  uInt output_stride, T alpha) {
+                                                  Int nActive, Int nPlanes,
+                                                  Int input_stride,
+                                                  Int output_stride, T alpha) {
   __shared__ T nz[NTX];
-  for (uInt plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
+  for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
        plane += gridDim.x * NTX) {
     if (threadIdx.y == 0)
       nz[threadIdx.x] = noise[plane];
     __syncthreads();
-    for (uInt row = threadIdx.y + blockIdx.y * NTY; row < nActive;
+    for (Int row = threadIdx.y + blockIdx.y * NTY; row < nActive;
          row += gridDim.y * NTY) {
-      uInt i = row * input_stride + plane;
-      uInt o = row * output_stride + plane;
+      Int i = row * input_stride + plane;
+      Int o = row * output_stride + plane;
       output_features[o] = input_features[i] * nz[threadIdx.x] *
                            ((input_features[i] > 0) ? 1 : alpha);
     }
     __syncthreads();
   }
 }
-template <typename T, uInt NTX, uInt NTY>
+template <typename T, Int NTX, Int NTY>
 __global__ void BatchwiseMultiplicativeDropout_bp(
     T *input_features, T *d_input_features,
-    T *d_output_features, T *noise, uInt nActive, uInt nPlanes,
-    uInt input_stride, uInt output_stride, T alpha) {
+    T *d_output_features, T *noise, Int nActive, Int nPlanes,
+    Int input_stride, Int output_stride, T alpha) {
   __shared__ T nz[NTX];
-  for (uInt plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
+  for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
        plane += gridDim.x * NTX) {
     if (threadIdx.y == 0)
       nz[threadIdx.x] = noise[plane];
     __syncthreads();
-    for (uInt row = threadIdx.y + blockIdx.y * NTY; row < nActive;
+    for (Int row = threadIdx.y + blockIdx.y * NTY; row < nActive;
          row += gridDim.y * NTY) {
-      uInt i = row * input_stride + plane;
-      uInt o = row * output_stride + plane;
+      Int i = row * input_stride + plane;
+      Int o = row * output_stride + plane;
       d_input_features[i] = d_output_features[o] * nz[threadIdx.x] *
                             ((input_features[i] > 0) ? 1 : alpha);
     }
     __syncthreads();
   }
 }
-#endif /* GPU_BATCHWISEMULTIPLICATIVEDROPOUT_H */
+#endif /* CUDA_BATCHWISEMULTIPLICATIVEDROPOUT_H */
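The two kernels above implement batchwise multiplicative dropout fused with a leaky ReLU: every plane is scaled by a single noise value shared across the whole batch, and the slope alpha applies wherever the input is negative. A plain CPU reference of the same arithmetic (my illustration, not part of the commit), for the dense case where both strides equal nPlanes:

#include <vector>
// Reference semantics of BatchwiseMultiplicativeDropout_fp for a dense
// nActive x nPlanes feature matrix.
template <typename T>
void bmd_forward_cpu(const std::vector<T> &in, std::vector<T> &out,
                     const std::vector<T> &noise, int nActive, int nPlanes,
                     T alpha) {
  out.resize(in.size());
  for (int row = 0; row < nActive; ++row)
    for (int p = 0; p < nPlanes; ++p) {
      T v = in[row * nPlanes + p];
      // one noise value per plane, leaky slope alpha on the negative side
      out[row * nPlanes + p] = v * noise[p] * ((v > 0) ? T(1) : alpha);
    }
}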
sparseconvnet/SCN/CUDA/Convolution.cu  (new file, mode 100644)
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "Convolution.h"
#include "RuleBookIterator.h"
template <typename T, Int Dimension>
double cuda_Convolution_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize, /*long*/ at::Tensor filterStride,
    Metadata<Dimension> &m, /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features,
    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor bias) {
  auto _rules =
      m.getRuleBook(inputSize, outputSize, filterSize, filterStride, true);
  Int nActive = m.getNActive(outputSize);
  output_features.resize_({nActive, weight.size(2)});
  if (not bias.numel())
    output_features.zero_();
  double flops = 0;
  if (nActive) {
    auto iF = input_features.data<T>();
    auto oF = output_features.data<T>();
    Int ip = input_features.size(1);
    Int op = output_features.size(1);
    auto w = weight.data<T>();
    if (bias.numel()) {
      auto b = bias.data<T>();
      for (Int i = 0; i < op; i += 32) {
        Int blockDim = min((Int)32, op - i);
        Int gridDim = min((Int)4096, nActive);
        Convolution_fp_bias<<<gridDim, blockDim>>>(oF + i, b + i, op, op,
                                                   nActive);
      }
    }
    Int c = ip * op;
    RULEBOOKITERATOR(dConvolution_forward2<T>(iF, oF, w, rbB, nHotB, ip, ip,
                                              op, op);
                     , w += c; flops += nHotB * c;)
  }
  return flops;
}
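RULEBOOKITERATOR comes from the new RuleBookIterator.h, whose 34 lines are not shown on this page. From the call sites, its first argument is a statement run once per rulebook batch with `rbB` (device pointer to that batch's rules) and `nHotB` (its rule count) in scope, and its second argument is an epilogue run after each batch. The expansion below is purely a guess at its shape, inferred from usage; `rulesToDevice` is an invented helper.

// Hypothetical expansion, NOT the real macro (see CUDA/RuleBookIterator.h):
#define RULEBOOKITERATOR(X, Y)                                               \
  for (auto &rulesB : _rules) {       /* one entry per filter offset */      \
    Int nHotB = rulesB.size() / 2;    /* rules are (input,output) pairs */   \
    Int *rbB = rulesToDevice(rulesB); /* invented: stage rules on the GPU */ \
    X                                 /* e.g. launch dConvolution_forward2 */\
    Y                                 /* e.g. w += c; flops += nHotB * c; */ \
  }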
template <typename T, Int Dimension>
void cuda_Convolution_backward(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize, /*long*/ at::Tensor filterStride,
    Metadata<Dimension> &m, /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features,
    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor d_weight,
    /*cuda float*/ at::Tensor d_bias) {
  auto _rules =
      m.getRuleBook(inputSize, outputSize, filterSize, filterStride, true);
  Int nActive = m.getNActive(outputSize);
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();
  if (nActive) {
    auto iF = input_features.data<T>();
    auto diF = d_input_features.data<T>();
    auto doF = d_output_features.data<T>();
    Int ip = input_features.size(1);
    Int op = d_output_features.size(1);
    auto w = weight.data<T>();
    auto dw = d_weight.data<T>();
    Int c = ip * op;
    RULEBOOKITERATOR(dConvolution_backward_dW2<T>(iF, diF, doF, w, dw, rbB,
                                                  nHotB, ip, ip, op, op);
                     , w += c; dw += c;)
    if (d_bias.numel()) {
      auto db = d_bias.data<T>();
      Convolution_bp_bias(doF, db, op, op, nActive);
    }
  }
}
template <typename T, Int Dimension>
double cuda_SubmanifoldConvolution_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor filterSize,
    Metadata<Dimension> &m, /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features,
    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor bias) {
  auto _rules = m.getSubmanifoldRuleBook(inputSize, filterSize, true);
  Int nActive = m.getNActive(inputSize);
  output_features.resize_({nActive, weight.size(2)});
  if (bias.numel() and nActive)
    output_features.copy_(bias);
  else
    output_features.zero_();
  double flops = 0;
  if (nActive) {
    auto iF = input_features.data<T>();
    auto oF = output_features.data<T>();
    Int ip = input_features.size(1);
    Int op = output_features.size(1);
    auto w = weight.data<T>();
    // if (bias.numel()) {
    //   auto b = bias.data<T>();
    //   for (Int i = 0; i < op; i += 32) {
    //     Int blockDim = min((Int)32, op - i);
    //     Int gridDim = min((Int)4096, nActive);
    //     Convolution_fp_bias<<<gridDim, blockDim>>>(oF + i, b + i, op, op,
    //     nActive);
    //   }
    // }
    Int c = ip * op;
    RULEBOOKITERATOR(dConvolution_forward2<T>(iF, oF, w, rbB, nHotB, ip, ip,
                                              op, op);
                     , w += c; flops += nHotB * c;)
  }
  return flops;
}
template <typename T, Int Dimension>
void cuda_SubmanifoldConvolution_backward(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor filterSize,
    Metadata<Dimension> &m, /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features,
    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor d_weight,
    /*cuda float*/ at::Tensor d_bias) {
  auto _rules = m.getSubmanifoldRuleBook(inputSize, filterSize, true);
  Int nActive = m.getNActive(inputSize);
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();
  if (nActive) {
    auto iF = input_features.data<T>();
    auto diF = d_input_features.data<T>();
    auto doF = d_output_features.data<T>();
    Int ip = input_features.size(1);
    Int op = d_output_features.size(1);
    auto w = weight.data<T>();
    auto dw = d_weight.data<T>();
    Int c = ip * op;
    RULEBOOKITERATOR(dConvolution_backward_dW2<T>(iF, diF, doF, w, dw, rbB,
                                                  nHotB, ip, ip, op, op);
                     , w += c; dw += c;)
    if (d_bias.numel()) {
      auto db = d_bias.data<T>();
      Convolution_bp_bias(doF, db, op, op, nActive);
    }
  }
}
template <typename T, Int Dimension>
double cuda_FullConvolution_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize, /*long*/ at::Tensor filterStride,
    Metadata<Dimension> &mIn, Metadata<Dimension> &mOut,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features,
    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor bias) {
  auto _rules = mIn.getFullConvolutionRuleBook(inputSize, outputSize,
                                               filterSize, filterStride, mOut);
  Int nActive = mOut.getNActive(outputSize);
  output_features.resize_({nActive, weight.size(2)});
  if (not bias.numel())
    output_features.zero_();
  double flops = 0;
  if (nActive) {
    auto iF = input_features.data<T>();
    auto oF = output_features.data<T>();
    Int ip = input_features.size(1);
    Int op = output_features.size(1);
    auto w = weight.data<T>();
    if (bias.numel()) {
      auto b = bias.data<T>();
      for (Int i = 0; i < op; i += 32) {
        Int blockDim = min((Int)32, op - i);
        Int gridDim = min((Int)4096, nActive);
        Convolution_fp_bias<<<gridDim, blockDim>>>(oF + i, b + i, op, op,
                                                   nActive);
      }
    }
    Int c = ip * op;
    RULEBOOKITERATOR(dConvolution_forward2<T>(iF, oF, w, rbB, nHotB, ip, ip,
                                              op, op);
                     , w += c; flops += nHotB * c;)
  }
  return flops;
}
template <typename T, Int Dimension>
void cuda_FullConvolution_backward(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize, /*long*/ at::Tensor filterStride,
    Metadata<Dimension> &mIn, Metadata<Dimension> &mOut,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features,
    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor d_weight,
    /*cuda float*/ at::Tensor d_bias) {
  auto _rules = mIn.getFullConvolutionRuleBook(inputSize, outputSize,
                                               filterSize, filterStride, mOut);
  Int nActive = mOut.getNActive(outputSize);
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();
  if (nActive) {
    auto iF = input_features.data<T>();
    auto diF = d_input_features.data<T>();
    auto doF = d_output_features.data<T>();
    Int ip = input_features.size(1);
    Int op = d_output_features.size(1);
    auto w = weight.data<T>();
    auto dw = d_weight.data<T>();
    Int c = ip * op;
    RULEBOOKITERATOR(dConvolution_backward_dW2<T>(iF, diF, doF, w, dw, rbB,
                                                  nHotB, ip, ip, op, op);
                     , w += c; dw += c;)
    if (d_bias.numel()) {
      auto db = d_bias.data<T>();
      Convolution_bp_bias(doF, db, op, op, nActive);
    }
  }
}
template <typename T, Int Dimension>
double cuda_RandomizedStrideConvolution_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize, /*long*/ at::Tensor filterStride,
    Metadata<Dimension> &m, /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features,
    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor bias) {
  auto _rules = m.getRandomizedStrideRuleBook(inputSize, outputSize,
                                              filterSize, filterStride, true);
  Int nActive = m.getNActive(outputSize);
  output_features.resize_({nActive, weight.size(2)});
  if (not bias.numel())
    output_features.zero_();
  double flops = 0;
  if (nActive) {
    auto iF = input_features.data<T>();
    auto oF = output_features.data<T>();
    Int ip = input_features.size(1);
    Int op = output_features.size(1);
    auto w = weight.data<T>();
    if (bias.numel()) {
      auto b = bias.data<T>();
      for (Int i = 0; i < op; i += 32) {
        Int blockDim = min((Int)32, op - i);
        Int gridDim = min((Int)4096, nActive);
        Convolution_fp_bias<<<gridDim, blockDim>>>(oF + i, b + i, op, op,
                                                   nActive);
      }
    }
    Int c = ip * op;
    RULEBOOKITERATOR(dConvolution_forward2<T>(iF, oF, w, rbB, nHotB, ip, ip,
                                              op, op);
                     , w += c; flops += nHotB * c;)
  }
  return flops;
}
template <typename T, Int Dimension>
void cuda_RandomizedStrideConvolution_backward(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize, /*long*/ at::Tensor filterStride,
    Metadata<Dimension> &m, /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features,
    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor d_weight,
    /*cuda float*/ at::Tensor d_bias) {
  auto _rules = m.getRandomizedStrideRuleBook(inputSize, outputSize,
                                              filterSize, filterStride, true);
  Int nActive = m.getNActive(outputSize);
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();
  if (nActive) {
    auto iF = input_features.data<T>();
    auto diF = d_input_features.data<T>();
    auto doF = d_output_features.data<T>();
    Int ip = input_features.size(1);
    Int op = d_output_features.size(1);
    auto w = weight.data<T>();
    auto dw = d_weight.data<T>();
    Int c = ip * op;
    RULEBOOKITERATOR(dConvolution_backward_dW2<T>(iF, diF, doF, w, dw, rbB,
                                                  nHotB, ip, ip, op, op);
                     , w += c; dw += c;)
    if (d_bias.numel()) {
      auto db = d_bias.data<T>();
      Convolution_bp_bias(doF, db, op, op, nActive);
    }
  }
}
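Each updateOutput above returns a flop count accumulated as flops += nHotB * c with c = ip * op, i.e. one multiply-accumulate per active rule per weight-matrix entry, summed over filter offsets. A worked example with invented sizes:

#include <cstdio>
int main() {
  // Illustrative numbers only: a 3x3x3 filter has 27 offsets; suppose each
  // offset contributes nHotB = 10000 rules, with ip = op = 64 planes.
  long ip = 64, op = 64, nHotB = 10000, offsets = 27;
  long flops = offsets * nHotB * ip * op; // mirrors flops += nHotB * c
  std::printf("%ld\n", flops);            // 1105920000, i.e. ~1.1 GMACs
}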
sparseconvnet/SCN/generic/GPU/Convolution.h → sparseconvnet/SCN/CUDA/Convolution.h
@@ -4,47 +4,47 @@
 // This source code is licensed under the license found in the
 // LICENSE file in the root directory of this source tree.
-#ifndef GPU_CONVOLUTION_H
-#define GPU_CONVOLUTION_H
-#include "../SparseConvNet.h"
+#ifndef CUDA_CONVOLUTION_H
+#define CUDA_CONVOLUTION_H
 template <typename T>
-__global__ void Convolution_fp_bias(T *output_features, T *bias, uInt nPlanes,
-                                    uInt output_stride, uInt nActive) {
+__global__ void Convolution_fp_bias(T *output_features, T *bias, Int nPlanes,
+                                    Int output_stride, Int nActive) {
   __shared__ T b[32];
   b[threadIdx.x] = bias[threadIdx.x];
-  for (uInt row = blockIdx.x; row < nActive; row += 1 << 12) {
+  for (Int row = blockIdx.x; row < nActive; row += 1 << 12) {
     output_features[row * output_stride + threadIdx.x] = b[threadIdx.x];
   }
 }
 template <typename T>
-__global__ void dColumnSum(T *matrix, T *target, uInt nRows, uInt nColumns,
-                           uInt nCOLUMNS) {
-  uInt i = blockIdx.x * 32 + threadIdx.x;
+__global__ void dColumnSum(T *matrix, T *target, Int nRows, Int nColumns,
+                           Int nCOLUMNS) {
+  Int i = blockIdx.x * 32 + threadIdx.x;
   T t = 0;
-  for (uInt j = blockIdx.y; j < nRows; j += 32)
+  for (Int j = blockIdx.y; j < nRows; j += 32)
     t += matrix[j * nCOLUMNS + i];
   atomicAdd(&target[i], t);
 }
 template <typename T>
-void Convolution_bp_bias(T *matrix, T *target, uInt nRows, uInt nColumns,
-                         uInt nCOLUMNS, cudaStream_t stream) {
+void Convolution_bp_bias(T *matrix, T *target, Int nRows, Int nColumns,
+                         Int nCOLUMNS) {
   if (nColumns / 32 > 0)
-    dColumnSum << <dim3(nColumns / 32, 32), 32, 0, stream>>>
-        (matrix, target, nRows, nColumns, nCOLUMNS);
+    dColumnSum<<<dim3(nColumns / 32, 32), 32>>>(matrix, target, nRows,
+                                                nColumns, nCOLUMNS);
   if (nColumns % 32 > 0) {
-    uInt o = nColumns / 32 * 32;
-    dColumnSum << <dim3(1, 32), nColumns - o, 0, stream>>>
-        (matrix + o, target + o, nRows, nColumns, nCOLUMNS);
+    Int o = nColumns / 32 * 32;
+    dColumnSum<<<dim3(1, 32), nColumns - o>>>(matrix + o, target + o, nRows,
+                                              nColumns, nCOLUMNS);
   }
 }
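Convolution_bp_bias reduces the bias gradient by summing columns of the nRows x nColumns gradient matrix: full groups of 32 columns go through one dColumnSum launch and any remainder through a second, with 32 blockIdx.y slices each summing every 32nd row before atomicAdd merges them into target[i]. The split arithmetic, checked on an invented width (illustration only):

#include <cstdio>
int main() {
  int nColumns = 70;        // invented width
  int full = nColumns / 32; // 2 -> dColumnSum<<<dim3(2, 32), 32>>>(m, t, ...)
  int o = full * 32;        // 64
  int tail = nColumns - o;  // 6 -> dColumnSum<<<dim3(1, 32), 6>>>(m+64, t+64, ...)
  std::printf("full tiles: %d, offset: %d, tail threads: %d\n", full, o, tail);
}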
-template <typename T, uInt K, uInt V>
+template <typename T, Int K, Int V>
 __global__ void
-dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, uInt *rules,
-                            uInt nHot, uInt input_nPlanes, uInt input_stride,
-                            uInt output_nPlanes, uInt output_stride) {
+dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules,
+                            Int nHot, Int input_nPlanes, Int input_stride,
+                            Int output_nPlanes, Int output_stride) {
   // nHot must be a multiple of K!!
   // Input x Weight -> Output
@@ -53,17 +53,17 @@ dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, uInt *rules,
   // nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
-  uInt M = input_nPlanes / K;
+  Int M = input_nPlanes / K;
   // N = gridDim.y == output_nPlanes/K
-  uInt n = blockIdx.y;
+  Int n = blockIdx.y;
   outFeatures += n * K;
   w += n * K;
   T O[V];
   __shared__ T W[K][K];
   __shared__ T I[K][K];
-  uInt R0[V];
-  uInt R1[V];
+  Int R0[V];
+  Int R1[V];
   const int tx = threadIdx.x;
   int ty[V];
 #pragma unroll
@@ -76,7 +76,7 @@ dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, uInt *rules,
   for (int v = 0; v < V; v++)
     W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
-  for (uInt s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
+  for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
 #pragma unroll
     for (int v = 0; v < V; v++) {
       R0[v] = rules[2 * (s + ty[v])];
@@ -110,28 +110,28 @@ dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, uInt *rules,
     inFeatures += K;
   }
 }
-template <typename T, uInt K, uInt V>
+template <typename T, Int K, Int V>
 __global__ void
-dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, uInt *rules,
-                            uInt nHot, uInt input_nPlanes, uInt input_stride,
-                            uInt output_nPlanes, uInt output_stride) {
+dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules,
+                            Int nHot, Int input_nPlanes, Int input_stride,
+                            Int output_nPlanes, Int output_stride) {
   // Input x Weight -> Output
   // blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
   // K is a multiple of V,
   // nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
-  uInt M = input_nPlanes / K;
+  Int M = input_nPlanes / K;
   // N = gridDim.y == output_nPlanes/K
-  uInt n = blockIdx.y;
+  Int n = blockIdx.y;
   outFeatures += n * K;
   w += n * K;
   T O[V];
   __shared__ T W[K][K];
   __shared__ T I[K][K];
-  uInt R0[V];
-  uInt R1[V];
+  Int R0[V];
+  Int R1[V];
   const int tx = threadIdx.x;
   int ty[V];
 #pragma unroll
@@ -144,7 +144,7 @@ dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, uInt *rules,
   for (int v = 0; v < V; v++)
     W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
-  for (uInt s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
+  for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
 #pragma unroll
     for (int v = 0; v < V; v++) {
       if (s + ty[v] < nHot) {
@@ -187,27 +187,27 @@ dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, uInt *rules,
 #define FOO(T, K, V)                                                         \
   {                                                                          \
     if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {                \
-      uInt o = (nHot / K) * K;                                               \
+      Int o = (nHot / K) * K;                                                \
       if (o >= K)                                                            \
-        dConvolution_KMxKN_forwardA<T, K, V> << <                            \
-            dim3(std::min(o / K, (uInt)512), output_nPlanes / K),            \
-            dim3(K, K / V), 0, stream>>>                                     \
-            (inFeatures, outFeatures, w, rules, o, input_nPlanes,            \
-             input_stride, output_nPlanes, output_stride);                   \
+        dConvolution_KMxKN_forwardA<                                         \
+            T, K, V><<<dim3(std::min(o / K, (Int)512), output_nPlanes / K),  \
+                       dim3(K, K / V)>>>(                                    \
+            inFeatures, outFeatures, w, rules, o, input_nPlanes,             \
+            input_stride, output_nPlanes, output_stride);                    \
       if (nHot > o)                                                          \
-        dConvolution_KMxKN_forwardB<T, K, V> << <dim3(1, output_nPlanes / K),\
-                                                 dim3(K, K / V), 0, stream>>>\
-            (inFeatures, outFeatures, w, rules + 2 * o, nHot - o,            \
+        dConvolution_KMxKN_forwardB<                                         \
+            T, K,                                                            \
+            V><<<dim3(1, output_nPlanes / K), dim3(K, K / V)>>>(             \
+            inFeatures, outFeatures, w, rules + 2 * o, nHot - o,             \
             input_nPlanes, input_stride, output_nPlanes, output_stride);     \
       return;                                                                \
     }                                                                        \
   }
 template <typename T>
-void dConvolution_forward(T *inFeatures, T *outFeatures, T *w, uInt *rules,
-                          uInt nHot, uInt input_nPlanes, uInt input_stride,
-                          uInt output_nPlanes, uInt output_stride,
-                          cudaStream_t stream) {
+void dConvolution_forward(T *inFeatures, T *outFeatures, T *w, Int *rules,
+                          Int nHot, Int input_nPlanes, Int input_stride,
+                          Int output_nPlanes, Int output_stride) {
   FOO(T, 64, 16)
   FOO(T, 32, 8)
   FOO(T, 16, 4)
@@ -216,10 +216,9 @@ void dConvolution_forward(T *inFeatures, T *outFeatures, T *w, uInt *rules,
 }
 template <>
 void dConvolution_forward<double>(double *inFeatures, double *outFeatures,
-                                  double *w, uInt *rules, uInt nHot,
-                                  uInt input_nPlanes, uInt input_stride,
-                                  uInt output_nPlanes, uInt output_stride,
-                                  cudaStream_t stream) {
+                                  double *w, Int *rules, Int nHot,
+                                  Int input_nPlanes, Int input_stride,
+                                  Int output_nPlanes, Int output_stride) {
   FOO(double, 32, 8)
   FOO(double, 16, 4)
   FOO(double, 8, 2)
@@ -230,15 +229,15 @@ void dConvolution_forward<double>(double *inFeatures, double *outFeatures,
 // dOutput x W^T -> dInput and
 // Input^T x dOutput -> dW
 // blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
-template <typename T, uInt K, uInt V>
+template <typename T, Int K, Int V>
 __global__ void dConvolution_KMxKN_backward_dW_A(
     T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw,
-    uInt *rules, uInt nHot, uInt input_nPlanes, uInt input_stride,
-    uInt output_nPlanes, uInt output_stride) {
+    Int *rules, Int nHot, Int input_nPlanes, Int input_stride,
+    Int output_nPlanes, Int output_stride) {
   // M = gridDim.y == input_nPlanes / K
-  uInt N = output_nPlanes / K;
-  uInt m = blockIdx.y;
+  Int N = output_nPlanes / K;
+  Int m = blockIdx.y;
   inFeatures += m * K;
   dInFeatures += m * K;
   w += m * K * output_nPlanes;
@@ -249,8 +248,8 @@ dConvolution_KMxKN_backward_dW_A(T *inFeatures, T *dInFeatures, T *dOutFeatures,
   __shared__ T I[K][K];
   __shared__ T dO[K][K];
   __shared__ T W[K][K];
-  uInt R0[V];
-  uInt R1[V];
+  Int R0[V];
+  Int R1[V];
   const int tx = threadIdx.x;
   int ty[V];
 #pragma unroll
@@ -265,7 +264,7 @@ dConvolution_KMxKN_backward_dW_A(T *inFeatures, T *dInFeatures, T *dOutFeatures,
     dW[v] = 0;
   }
-  for (uInt s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
+  for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
 #pragma unroll
     for (int v = 0; v < V; v++) {
       R0[v] = rules[2 * (s + ty[v])];
@@ -307,15 +306,15 @@ dConvolution_KMxKN_backward_dW_A(T *inFeatures, T *dInFeatures, T *dOutFeatures,
 // dOutput x W^T -> dInput and
 // Input^T x dOutput -> dW
 // blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
-template <typename T, uInt K, uInt V>
+template <typename T, Int K, Int V>
 __global__ void dConvolution_KMxKN_backward_dW_B(
     T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw,
-    uInt *rules, uInt nHot, uInt input_nPlanes, uInt input_stride,
-    uInt output_nPlanes, uInt output_stride) {
+    Int *rules, Int nHot, Int input_nPlanes, Int input_stride,
+    Int output_nPlanes, Int output_stride) {
   // M = gridDim.y == input_nPlanes / K
-  uInt N = output_nPlanes / K;
-  uInt m = blockIdx.y;
+  Int N = output_nPlanes / K;
+  Int m = blockIdx.y;
   inFeatures += m * K;
   dInFeatures += m * K;
   w += m * K * output_nPlanes;
@@ -326,8 +325,8 @@ dConvolution_KMxKN_backward_dW_B(T *inFeatures, T *dInFeatures, T *dOutFeatures,
   __shared__ T I[K][K];
   __shared__ T dO[K][K];
   __shared__ T W[K][K];
-  uInt R0[V];
-  uInt R1[V];
+  Int R0[V];
+  Int R1[V];
   const int tx = threadIdx.x;
   int ty[V];
 #pragma unroll
@@ -342,7 +341,7 @@ dConvolution_KMxKN_backward_dW_B(T *inFeatures, T *dInFeatures, T *dOutFeatures,
     dW[v] = 0;
   }
-  for (uInt s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
+  for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
 #pragma unroll
     for (int v = 0; v < V; v++) {
       if (s + ty[v] < nHot) {
@@ -392,17 +391,18 @@ dConvolution_KMxKN_backward_dW_B(T *inFeatures, T *dInFeatures, T *dOutFeatures,
 #define FOO(T, K, V)                                                         \
   {                                                                          \
     if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {                \
-      uInt o = (nHot / K) * K;                                               \
+      Int o = (nHot / K) * K;                                                \
       if (o >= K)                                                            \
-        dConvolution_KMxKN_backward_dW_A<T, K, V> << <                       \
-            dim3(std::min(o / K, (uInt)512), input_nPlanes / K),             \
-            dim3(K, K / V), 0, stream>>>                                     \
-            (inFeatures, dInFeatures, dOutFeatures, w, dw, rules, o,         \
+        dConvolution_KMxKN_backward_dW_A<                                    \
+            T, K, V><<<dim3(std::min(o / K, (Int)512), input_nPlanes / K),   \
+                       dim3(K, K / V)>>>(                                    \
+            inFeatures, dInFeatures, dOutFeatures, w, dw, rules, o,          \
             input_nPlanes, input_stride, output_nPlanes, output_stride);     \
       if (nHot > o)                                                          \
-        dConvolution_KMxKN_backward_dW_B<T, K, V> << <                       \
-            dim3(1, input_nPlanes / K), dim3(K, K / V), 0, stream>>>         \
-            (inFeatures, dInFeatures, dOutFeatures, w, dw, rules + 2 * o,    \
+        dConvolution_KMxKN_backward_dW_B<                                    \
+            T, K,                                                            \
+            V><<<dim3(1, input_nPlanes / K), dim3(K, K / V)>>>(              \
+            inFeatures, dInFeatures, dOutFeatures, w, dw, rules + 2 * o,     \
             nHot - o, input_nPlanes, input_stride, output_nPlanes,           \
             output_stride);                                                  \
       return;                                                                \
@@ -411,10 +411,9 @@ dConvolution_KMxKN_backward_dW_B(T *inFeatures, T *dInFeatures, T *dOutFeatures,
 template <typename T>
 void dConvolution_backward_dW(T *inFeatures, T *dInFeatures, T *dOutFeatures,
-                              T *w, T *dw, uInt *rules, uInt nHot,
-                              uInt input_nPlanes, uInt input_stride,
-                              uInt output_nPlanes, uInt output_stride,
-                              cudaStream_t stream) {
+                              T *w, T *dw, Int *rules, Int nHot,
+                              Int input_nPlanes, Int input_stride,
+                              Int output_nPlanes, Int output_stride) {
   FOO(T, 32, 8)
   FOO(T, 16, 4)
   FOO(T, 8, 2)
@@ -422,11 +421,11 @@ void dConvolution_backward_dW(T *inFeatures, T *dInFeatures, T *dOutFeatures,
 }
 #undef FOO
-template <typename T, uInt K, uInt V>
+template <typename T, Int K, Int V>
 __global__ void
-dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, uInt *rules,
-                            uInt nHot, uInt input_nPlanes, uInt input_stride,
-                            uInt output_nPlanes, uInt output_stride) {
+dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
+                            Int nHot, Int input_nPlanes, Int input_stride,
+                            Int output_nPlanes, Int output_stride) {
   // Input x Weight -> Output
   // blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
   // K is a multiple of V,
@@ -434,17 +433,17 @@ dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, uInt *rules,
   // nHot x input_nplanes<=KM -> nHot x output_nPlanes<=KN
   // - parallel over N,nHot - loop over M
-  uInt M = (input_nPlanes + K - 1) / K;
+  Int M = (input_nPlanes + K - 1) / K;
   // N = gridDim.y ~ output_nPlanes/K
-  uInt n = blockIdx.y;
+  Int n = blockIdx.y;
   outFeatures += n * K;
   w += n * K;
-  uInt KO = min(K, output_nPlanes - K * n);
+  Int KO = min(K, output_nPlanes - K * n);
   T O[V];
   __shared__ T W[K][K];
   __shared__ T I[K][K];
-  __shared__ uInt R[K * 2];
+  __shared__ Int R[K * 2];
   const int tx = threadIdx.x;
   int ty[V];
 #pragma unroll
@@ -452,7 +451,7 @@ dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, uInt *rules,
     ty[v] = threadIdx.y + v * (K / V);
   for (int m = 0; m < M; m++) {
-    uInt KI = min(K, input_nPlanes - K * m);
+    Int KI = min(K, input_nPlanes - K * m);
 // Read w
 #pragma unroll
@@ -460,7 +459,7 @@ dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, uInt *rules,
     if (ty[v] < KI and tx < KO)
       W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
-    for (uInt s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
+    for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
 // Read rules for K input/output pairs
 #pragma unroll
       for (int v = 0; v < V; v++) {
@@ -500,48 +499,47 @@ dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, uInt *rules,
 }
 template <typename T>
-void dConvolution_forward2(T *inFeatures, T *outFeatures, T *w, uInt *rules,
-                           uInt nHot, uInt input_nPlanes, uInt input_stride,
-                           uInt output_nPlanes, uInt output_stride,
-                           cudaStream_t stream) {
+void dConvolution_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
+                           Int nHot, Int input_nPlanes, Int input_stride,
+                           Int output_nPlanes, Int output_stride) {
   if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) {
     const int K = 16;
     const int V = 4;
-    dConvolution_KMxKN_forward2<T, K, V>
-        << <dim3(128, (output_nPlanes + K - 1) / K), dim3(K, K / V), 0,
-            stream>>> (inFeatures, outFeatures, w, rules, nHot, input_nPlanes,
-                       input_stride,
+    dConvolution_KMxKN_forward2<
+        T, K, V><<<dim3(128, (output_nPlanes + K - 1) / K), dim3(K, K / V)>>>(
+        inFeatures, outFeatures, w, rules, nHot, input_nPlanes, input_stride,
         output_nPlanes, output_stride);
     return;
   } else {
     dConvolution_forward(inFeatures, outFeatures, w, rules, nHot,
-                         input_nPlanes, input_stride, output_nPlanes,
-                         output_stride, stream);
+                         input_nPlanes, input_stride, output_nPlanes,
+                         output_stride);
   }
 }
 // dOutput x W^T -> dInput and
 // Input^T x dOutput -> dW
 // blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
-template <typename T, uInt K, uInt V>
+template <typename T, Int K, Int V>
 __global__ void dConvolution_KMxKN_backward_dW2(
     T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw,
-    uInt *rules, uInt nHot, uInt input_nPlanes, uInt input_stride,
-    uInt output_nPlanes, uInt output_stride) {
+    Int *rules, Int nHot, Int input_nPlanes, Int input_stride,
+    Int output_nPlanes, Int output_stride) {
   // M = gridDim.y == input_nPlanes / K
-  uInt N = (output_nPlanes + K - 1) / K;
-  uInt m = blockIdx.y;
+  Int N = (output_nPlanes + K - 1) / K;
+  Int m = blockIdx.y;
   inFeatures += m * K;
   dInFeatures += m * K;
   w += m * K * output_nPlanes;
   dw += m * K * output_nPlanes;
-  uInt KI = min(K, input_nPlanes - K * m);
+  Int KI = min(K, input_nPlanes - K * m);
   T dI[V];
   T dW[V];
   __shared__ T I[K][K];
   __shared__ T dO[K][K];
   __shared__ T W[K][K];
-  __shared__ uInt R[K * 2];
+  __shared__ Int R[K * 2];
   const int tx = threadIdx.x;
   int ty[V];
 #pragma unroll
@@ -549,7 +547,7 @@ dConvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
     ty[v] = threadIdx.y + v * (K / V);
   for (int n = 0; n < N; n++) {
-    uInt KO = min(K, output_nPlanes - K * n);
+    Int KO = min(K, output_nPlanes - K * n);
 // Read w, reset dW
 #pragma unroll
@@ -559,7 +557,7 @@ dConvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
       dW[v] = 0;
     }
-    for (uInt s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
+    for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
 // Read rules for K input/output pairs, reset dI[]
 #pragma unroll
       for (int v = 0; v < V; v++) {
@@ -613,23 +611,22 @@ dConvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
 template <typename T>
 void dConvolution_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
-                               T *w, T *dw, uInt *rules, uInt nHot,
-                               uInt input_nPlanes, uInt input_stride,
-                               uInt output_nPlanes, uInt output_stride,
-                               cudaStream_t stream) {
+                               T *w, T *dw, Int *rules, Int nHot,
+                               Int input_nPlanes, Int input_stride,
+                               Int output_nPlanes, Int output_stride) {
   if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) {
     const int K = 16;
     const int V = 4;
-    dConvolution_KMxKN_backward_dW2<T, K, V>
-        << <dim3(128, (input_nPlanes + K - 1) / K), dim3(K, K / V), 0,
-            stream>>> (inFeatures, dInFeatures, dOutFeatures, w, dw, rules,
-                       nHot,
+    dConvolution_KMxKN_backward_dW2<
+        T, K, V><<<dim3(128, (input_nPlanes + K - 1) / K), dim3(K, K / V)>>>(
+        inFeatures, dInFeatures, dOutFeatures, w, dw, rules, nHot,
        input_nPlanes, input_stride, output_nPlanes, output_stride);
     return;
   } else {
     dConvolution_backward_dW(inFeatures, dInFeatures, dOutFeatures, w, dw,
-                             rules, nHot, input_nPlanes, input_stride,
-                             output_nPlanes, output_stride, stream);
+                             rules, nHot, input_nPlanes, input_stride,
+                             output_nPlanes, output_stride);
   }
 }
-#endif /* GPU_CONVOLUTION_H */
+#endif /* CUDA_CONVOLUTION_H */
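The K and V template parameters in the kernels above implement register tiling in the Volkov style the comments mention: blocks are dim3(K, K/V) threads, each thread keeps V partial outputs in registers (the O[V] arrays), and K x K tiles of weights and inputs are staged in __shared__ memory. The toy kernel below is my own dense analogue of that accumulation pattern, without the rulebook indirection; it assumes square n x n matrices with n a multiple of K, launched as tile_matmul<float, K, V><<<dim3(n / K, n / K), dim3(K, K / V)>>>(A, B, C, n).

// Toy KxK-tiled matrix multiply, V outputs per thread (illustration only).
template <typename T, int K, int V>
__global__ void tile_matmul(const T *A, const T *B, T *C, int n) {
  __shared__ T Is[K][K], Ws[K][K];
  T O[V];
  const int tx = threadIdx.x;
  for (int v = 0; v < V; v++)
    O[v] = 0;
  for (int m = 0; m < n / K; m++) { // loop over K-wide tiles of the k axis
    for (int v = 0; v < V; v++) {
      int ty = threadIdx.y + v * (K / V); // same ty[] trick as the kernels
      Is[ty][tx] = A[(blockIdx.x * K + ty) * n + m * K + tx];
      Ws[ty][tx] = B[(m * K + ty) * n + blockIdx.y * K + tx];
    }
    __syncthreads();
    for (int v = 0; v < V; v++) {
      int ty = threadIdx.y + v * (K / V);
      for (int k = 0; k < K; k++)
        O[v] += Is[ty][k] * Ws[k][tx]; // accumulate in registers
    }
    __syncthreads();
  }
  for (int v = 0; v < V; v++) {
    int ty = threadIdx.y + v * (K / V);
    C[(blockIdx.x * K + ty) * n + blockIdx.y * K + tx] = O[v];
  }
}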
sparseconvnet/SCN/CUDA/Deconvolution.cu  (new file, mode 100644)
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "Convolution.h"
#include "Deconvolution.h"
template <typename T, Int Dimension>
double cuda_Deconvolution_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize, /*long*/ at::Tensor filterStride,
    Metadata<Dimension> &m, /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features,
    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor bias) {
  auto _rules =
      m.getRuleBook(outputSize, inputSize, filterSize, filterStride, true);
  Int nActive = m.getNActive(outputSize);
  output_features.resize_({nActive, weight.size(2)});
  if (not bias.numel())
    output_features.zero_();
  auto iF = input_features.data<T>();
  auto oF = output_features.data<T>();
  Int ip = input_features.size(1);
  Int op = output_features.size(1);
  auto w = weight.data<T>();
  double flops = 0;
  if (bias.numel()) {
    auto b = bias.data<T>();
    for (Int i = 0; i < op; i += 32) {
      Int blockDim = min((Int)32, op - i);
      Int gridDim = min((Int)4096, nActive);
      Convolution_fp_bias<<<gridDim, blockDim>>>(oF + i, b + i, op, op,
                                                 nActive);
    }
  }
  Int c = ip * op;
  RULEBOOKITERATOR(dDeconvolution_forward2<T>(iF, oF, w, rbB, nHotB, ip, ip,
                                              op, op);
                   , w += c; flops += nHotB * c;)
  return flops;
}
template <typename T, Int Dimension>
void cuda_Deconvolution_backward(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize, /*long*/ at::Tensor filterStride,
    Metadata<Dimension> &m, /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features,
    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor d_weight,
    /*cuda float*/ at::Tensor d_bias) {
  auto _rules =
      m.getRuleBook(outputSize, inputSize, filterSize, filterStride, true);
  Int nActive = m.getNActive(outputSize);
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();
  auto iF = input_features.data<T>();
  auto diF = d_input_features.data<T>();
  auto doF = d_output_features.data<T>();
  Int ip = input_features.size(1);
  Int op = d_output_features.size(1);
  auto w = weight.data<T>();
  auto dw = d_weight.data<T>();
  Int c = ip * op;
  RULEBOOKITERATOR(dDeconvolution_backward_dW2<T>(iF, diF, doF, w, dw, rbB,
                                                  nHotB, ip, ip, op, op);
                   , w += c; dw += c;)
  if (d_bias.numel()) {
    auto db = d_bias.data<T>();
    Convolution_bp_bias(doF, db, op, op, nActive);
  }
}
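Note the argument order in both functions above: deconvolution requests the rulebook with input and output sizes swapped, i.e. the rule table of the corresponding convolution run in the transposed direction. Side by side (an observation on the code above, not new code):

// Convolution:   rules map input -> output
//   m.getRuleBook(inputSize, outputSize, filterSize, filterStride, true);
// Deconvolution: the same table, built with the roles reversed
//   m.getRuleBook(outputSize, inputSize, filterSize, filterStride, true);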
sparseconvnet/SCN/generic/GPU/Deconvolution.h → sparseconvnet/SCN/CUDA/Deconvolution.h
@@ -4,16 +4,16 @@
 // This source code is licensed under the license found in the
 // LICENSE file in the root directory of this source tree.
-#ifndef GPU_DECONVOLUTION_H
-#define GPU_DECONVOLUTION_H
-#include "../SparseConvNet.h"
+#ifndef CUDA_DECONVOLUTION_H
+#define CUDA_DECONVOLUTION_H
 #include "Convolution.h"
-template <typename T, uInt K, uInt V>
+template <typename T, Int K, Int V>
 __global__ void
-dDeconvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, uInt *rules,
-                              uInt nHot, uInt input_nPlanes, uInt input_stride,
-                              uInt output_nPlanes, uInt output_stride) {
+dDeconvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules,
+                              Int nHot, Int input_nPlanes, Int input_stride,
+                              Int output_nPlanes, Int output_stride) {
   // nHot must be a multiple of K!!
   // Input x Weight -> Output
@@ -22,17 +22,17 @@ dDeconvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, uInt *rules,
   // nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
-  uInt M = input_nPlanes / K;
+  Int M = input_nPlanes / K;
   // N = gridDim.y == output_nPlanes/K
-  uInt n = blockIdx.y;
+  Int n = blockIdx.y;
   outFeatures += n * K;
   w += n * K;
   T O[V];
   __shared__ T W[K][K];
   __shared__ T I[K][K];
-  uInt R0[V];
-  uInt R1[V];
+  Int R0[V];
+  Int R1[V];
   const int tx = threadIdx.x;
   int ty[V];
 #pragma unroll
@@ -45,7 +45,7 @@ dDeconvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, uInt *rules,
   for (int v = 0; v < V; v++)
     W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
-  for (uInt s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
+  for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
 #pragma unroll
     for (int v = 0; v < V; v++) {
       R1[v] = rules[2 * (s + ty[v])];
@@ -79,28 +79,28 @@ dDeconvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, uInt *rules,
     inFeatures += K;
   }
 }
-template <typename T, uInt K, uInt V>
+template <typename T, Int K, Int V>
 __global__ void
-dDeconvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, uInt *rules,
-                              uInt nHot, uInt input_nPlanes, uInt input_stride,
-                              uInt output_nPlanes, uInt output_stride) {
+dDeconvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules,
+                              Int nHot, Int input_nPlanes, Int input_stride,
+                              Int output_nPlanes, Int output_stride) {
   // Input x Weight -> Output
   // blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
   // K is a multiple of V,
   // nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
-  uInt M = input_nPlanes / K;
+  Int M = input_nPlanes / K;
   // N = gridDim.y == output_nPlanes/K
-  uInt n = blockIdx.y;
+  Int n = blockIdx.y;
   outFeatures += n * K;
   w += n * K;
   T O[V];
   __shared__ T W[K][K];
   __shared__ T I[K][K];
-  uInt R0[V];
-  uInt R1[V];
+  Int R0[V];
+  Int R1[V];
   const int tx = threadIdx.x;
   int ty[V];
 #pragma unroll
@@ -113,7 +113,7 @@ dDeconvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, uInt *rules,
   for (int v = 0; v < V; v++)
     W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
-  for (uInt s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
+  for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
 #pragma unroll
     for (int v = 0; v < V; v++) {
       if (s + ty[v] < nHot) {
@@ -156,27 +156,27 @@ dDeconvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, uInt *rules,
 #define FOO(T, K, V)                                                         \
   {                                                                          \
     if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {                \
-      uInt o = (nHot / K) * K;                                               \
+      Int o = (nHot / K) * K;                                                \
       if (o >= K)                                                            \
-        dDeconvolution_KMxKN_forwardA<T, K, V> << <                          \
-            dim3(std::min(o / K, (uInt)512), output_nPlanes / K),            \
-            dim3(K, K / V), 0, stream>>>                                     \
-            (inFeatures, outFeatures, w, rules, o, input_nPlanes,            \
-             input_stride, output_nPlanes, output_stride);                   \
+        dDeconvolution_KMxKN_forwardA<                                       \
+            T, K, V><<<dim3(std::min(o / K, (Int)512), output_nPlanes / K),  \
+                       dim3(K, K / V)>>>(                                    \
+            inFeatures, outFeatures, w, rules, o, input_nPlanes,             \
+            input_stride, output_nPlanes, output_stride);                    \
       if (nHot > o)                                                         \
-        dDeconvolution_KMxKN_forwardB<T, K, V> << <                          \
-            dim3(1, output_nPlanes / K), dim3(K, K / V), 0, stream>>>        \
-            (inFeatures, outFeatures, w, rules + 2 * o, nHot - o,            \
+        dDeconvolution_KMxKN_forwardB<                                       \
+            T, K,                                                            \
+            V><<<dim3(1, output_nPlanes / K), dim3(K, K / V)>>>(             \
+            inFeatures, outFeatures, w, rules + 2 * o, nHot - o,             \
             input_nPlanes, input_stride, output_nPlanes, output_stride);     \
       return;                                                                \
     }                                                                        \
   }
 template <typename T>
-void dDeconvolution_forward(T *inFeatures, T *outFeatures, T *w, uInt *rules,
-                            uInt nHot, uInt input_nPlanes, uInt input_stride,
-                            uInt output_nPlanes, uInt output_stride,
-                            cudaStream_t stream) {
+void dDeconvolution_forward(T *inFeatures, T *outFeatures, T *w, Int *rules,
+                            Int nHot, Int input_nPlanes, Int input_stride,
+                            Int output_nPlanes, Int output_stride) {
   FOO(T, 64, 16)
   FOO(T, 32, 8)
   FOO(T, 16, 4)
@@ -185,10 +185,9 @@ void dDeconvolution_forward(T *inFeatures, T *outFeatures, T *w, uInt *rules,
 }
 template <>
 void dDeconvolution_forward<double>(double *inFeatures, double *outFeatures,
-                                    double *w, uInt *rules, uInt nHot,
-                                    uInt input_nPlanes, uInt input_stride,
-                                    uInt output_nPlanes, uInt output_stride,
-                                    cudaStream_t stream) {
+                                    double *w, Int *rules, Int nHot,
+                                    Int input_nPlanes, Int input_stride,
+                                    Int output_nPlanes, Int output_stride) {
   FOO(double, 32, 8)
   FOO(double, 16, 4)
   FOO(double, 8, 2)
@@ -199,14 +198,14 @@ void dDeconvolution_forward<double>(double *inFeatures, double *outFeatures,
 // dOutput x W^T -> dInput and
 // Input^T x dOutput -> dW
 // blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
-template <typename T, uInt K, uInt V>
+template <typename T, Int K, Int V>
 __global__ void dDeconvolution_KMxKN_backward_dW_A(
-    T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw, uInt *rules,
-    uInt nHot, uInt input_nPlanes, uInt input_stride, uInt output_nPlanes,
-    uInt output_stride) {
+    T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw, Int *rules,
+    Int nHot, Int input_nPlanes, Int input_stride, Int output_nPlanes,
+    Int output_stride) {
   // M = gridDim.y == input_nPlanes / K
-  uInt N = output_nPlanes / K;
-  uInt m = blockIdx.y;
+  Int N = output_nPlanes / K;
+  Int m = blockIdx.y;
   inFeatures += m * K;
   dInFeatures += m * K;
   w += m * K * output_nPlanes;
@@ -217,8 +216,8 @@ __global__ void dDeconvolution_KMxKN_backward_dW_A(
   __shared__ T I[K][K];
   __shared__ T dO[K][K];
   __shared__ T W
[
K
][
K
];
u
Int
R0
[
V
];
u
Int
R1
[
V
];
Int
R0
[
V
];
Int
R1
[
V
];
const
int
tx
=
threadIdx
.
x
;
int
ty
[
V
];
#pragma unroll
...
...
@@ -233,7 +232,7 @@ __global__ void dDeconvolution_KMxKN_backward_dW_A(
dW
[
v
]
=
0
;
}
for
(
u
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
for
(
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
R1
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])];
...
...
@@ -275,14 +274,14 @@ __global__ void dDeconvolution_KMxKN_backward_dW_A(
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template
<
typename
T
,
u
Int
K
,
u
Int
V
>
template
<
typename
T
,
Int
K
,
Int
V
>
__global__
void
dDeconvolution_KMxKN_backward_dW_B
(
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
T
*
w
,
T
*
dw
,
u
Int
*
rules
,
u
Int
nHot
,
u
Int
input_nPlanes
,
u
Int
input_stride
,
u
Int
output_nPlanes
,
u
Int
output_stride
)
{
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
T
*
w
,
T
*
dw
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
// M = gridDim.y == input_nPlanes / K
u
Int
N
=
output_nPlanes
/
K
;
u
Int
m
=
blockIdx
.
y
;
Int
N
=
output_nPlanes
/
K
;
Int
m
=
blockIdx
.
y
;
inFeatures
+=
m
*
K
;
dInFeatures
+=
m
*
K
;
w
+=
m
*
K
*
output_nPlanes
;
...
...
@@ -293,8 +292,8 @@ __global__ void dDeconvolution_KMxKN_backward_dW_B(
__shared__
T
I
[
K
][
K
];
__shared__
T
dO
[
K
][
K
];
__shared__
T
W
[
K
][
K
];
u
Int
R0
[
V
];
u
Int
R1
[
V
];
Int
R0
[
V
];
Int
R1
[
V
];
const
int
tx
=
threadIdx
.
x
;
int
ty
[
V
];
#pragma unroll
...
...
@@ -309,7 +308,7 @@ __global__ void dDeconvolution_KMxKN_backward_dW_B(
dW
[
v
]
=
0
;
}
for
(
u
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
for
(
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
if
(
s
+
ty
[
v
]
<
nHot
)
{
...
...
@@ -359,17 +358,18 @@ __global__ void dDeconvolution_KMxKN_backward_dW_B(
#define FOO(T, K, V) \
{ \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
u
Int o = (nHot / K) * K; \
Int o = (nHot / K) * K; \
if (o >= K) \
dDeconvolution_KMxKN_backward_dW_A<
T, K, V> << <
\
dim3(std::min(o / K, (
u
Int)512), input_nPlanes / K),
\
dim3(K, K / V)
, 0, stream>>>
\
(
inFeatures, dInFeatures, dOutFeatures, w, dw, rules, o, \
dDeconvolution_KMxKN_backward_dW_A<
\
T, K, V><<<
dim3(std::min(o / K, (Int)512), input_nPlanes / K), \
dim3(K, K / V)
>>>(
\
inFeatures, dInFeatures, dOutFeatures, w, dw, rules, o,
\
input_nPlanes, input_stride, output_nPlanes, output_stride); \
if (nHot > o) \
dDeconvolution_KMxKN_backward_dW_B<T, K, V> << < \
dim3(1, input_nPlanes / K), dim3(K, K / V), 0, stream>>> \
(inFeatures, dInFeatures, dOutFeatures, w, dw, rules + 2 * o, \
dDeconvolution_KMxKN_backward_dW_B< \
T, K, \
V><<<dim3(1, input_nPlanes / K), dim3(K, K / V)>>>( \
inFeatures, dInFeatures, dOutFeatures, w, dw, rules + 2 * o, \
nHot - o, input_nPlanes, input_stride, output_nPlanes, \
output_stride); \
return; \
...
...
@@ -378,10 +378,9 @@ __global__ void dDeconvolution_KMxKN_backward_dW_B(
template
<
typename
T
>
void
dDeconvolution_backward_dW
(
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
T
*
w
,
T
*
dw
,
uInt
*
rules
,
uInt
nHot
,
uInt
input_nPlanes
,
uInt
input_stride
,
uInt
output_nPlanes
,
uInt
output_stride
,
cudaStream_t
stream
)
{
T
*
w
,
T
*
dw
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
FOO
(
T
,
32
,
8
)
FOO
(
T
,
16
,
4
)
FOO
(
T
,
8
,
2
)
...
...
@@ -389,11 +388,11 @@ void dDeconvolution_backward_dW(T *inFeatures, T *dInFeatures, T *dOutFeatures,
}
#undef FOO
template
<
typename
T
,
u
Int
K
,
u
Int
V
>
template
<
typename
T
,
Int
K
,
Int
V
>
__global__
void
dDeconvolution_KMxKN_forward2
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
u
Int
*
rules
,
u
Int
nHot
,
u
Int
input_nPlanes
,
u
Int
input_stride
,
u
Int
output_nPlanes
,
u
Int
output_stride
)
{
dDeconvolution_KMxKN_forward2
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// K is a multiple of V,
...
...
@@ -401,17 +400,17 @@ dDeconvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, uInt *rules,
// nHot x input_nplanes<=KM -> nHot x output_nPlanes<=KN
// - parallel over N,nHot - loop over M
u
Int
M
=
(
input_nPlanes
+
K
-
1
)
/
K
;
Int
M
=
(
input_nPlanes
+
K
-
1
)
/
K
;
// N = gridDim.y ~ output_nPlanes/K
u
Int
n
=
blockIdx
.
y
;
Int
n
=
blockIdx
.
y
;
outFeatures
+=
n
*
K
;
w
+=
n
*
K
;
u
Int
KO
=
min
(
K
,
output_nPlanes
-
K
*
n
);
Int
KO
=
min
(
K
,
output_nPlanes
-
K
*
n
);
T
O
[
V
];
__shared__
T
W
[
K
][
K
];
__shared__
T
I
[
K
][
K
];
__shared__
u
Int
R
[
K
*
2
];
__shared__
Int
R
[
K
*
2
];
const
int
tx
=
threadIdx
.
x
;
int
ty
[
V
];
#pragma unroll
...
...
@@ -419,7 +418,7 @@ dDeconvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, uInt *rules,
ty
[
v
]
=
threadIdx
.
y
+
v
*
(
K
/
V
);
for
(
int
m
=
0
;
m
<
M
;
m
++
)
{
u
Int
KI
=
min
(
K
,
input_nPlanes
-
K
*
m
);
Int
KI
=
min
(
K
,
input_nPlanes
-
K
*
m
);
// Read w
#pragma unroll
...
...
@@ -427,7 +426,7 @@ dDeconvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, uInt *rules,
if
(
ty
[
v
]
<
KI
and
tx
<
KO
)
W
[
ty
[
v
]][
tx
]
=
w
[
ty
[
v
]
*
output_nPlanes
+
tx
];
for
(
u
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
for
(
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
// Read rules for K input/output pairs
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
...
...
@@ -467,48 +466,47 @@ dDeconvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, uInt *rules,
}
template
<
typename
T
>
void
dDeconvolution_forward2
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
uInt
*
rules
,
uInt
nHot
,
uInt
input_nPlanes
,
uInt
input_stride
,
uInt
output_nPlanes
,
uInt
output_stride
,
cudaStream_t
stream
)
{
void
dDeconvolution_forward2
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
if
(
input_nPlanes
%
8
!=
0
or
output_nPlanes
%
8
!=
0
)
{
const
int
K
=
16
;
const
int
V
=
4
;
dDeconvolution_KMxKN_forward2
<
T
,
K
,
V
>
<<
<
dim3
(
128
,
(
output_nPlanes
+
K
-
1
)
/
K
),
dim3
(
K
,
K
/
V
)
,
0
,
stream
>>>
(
inFeatures
,
outFeatures
,
w
,
rules
,
nHot
,
input_nPlanes
,
input_stride
,
dDeconvolution_KMxKN_forward2
<
T
,
K
,
V
><<<
dim3
(
128
,
(
output_nPlanes
+
K
-
1
)
/
K
),
dim3
(
K
,
K
/
V
)
>>>
(
inFeatures
,
outFeatures
,
w
,
rules
,
nHot
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
);
return
;
}
else
{
dDeconvolution_forward
(
inFeatures
,
outFeatures
,
w
,
rules
,
nHot
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
,
stream
);
output_stride
);
}
}
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template
<
typename
T
,
u
Int
K
,
u
Int
V
>
template
<
typename
T
,
Int
K
,
Int
V
>
__global__
void
dDeconvolution_KMxKN_backward_dW2
(
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
T
*
w
,
T
*
dw
,
u
Int
*
rules
,
u
Int
nHot
,
u
Int
input_nPlanes
,
u
Int
input_stride
,
u
Int
output_nPlanes
,
u
Int
output_stride
)
{
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
T
*
w
,
T
*
dw
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
// M = gridDim.y == input_nPlanes / K
u
Int
N
=
(
output_nPlanes
+
K
-
1
)
/
K
;
u
Int
m
=
blockIdx
.
y
;
Int
N
=
(
output_nPlanes
+
K
-
1
)
/
K
;
Int
m
=
blockIdx
.
y
;
inFeatures
+=
m
*
K
;
dInFeatures
+=
m
*
K
;
w
+=
m
*
K
*
output_nPlanes
;
dw
+=
m
*
K
*
output_nPlanes
;
u
Int
KI
=
min
(
K
,
input_nPlanes
-
K
*
m
);
Int
KI
=
min
(
K
,
input_nPlanes
-
K
*
m
);
T
dI
[
V
];
T
dW
[
V
];
__shared__
T
I
[
K
][
K
];
__shared__
T
dO
[
K
][
K
];
__shared__
T
W
[
K
][
K
];
__shared__
u
Int
R
[
K
*
2
];
__shared__
Int
R
[
K
*
2
];
const
int
tx
=
threadIdx
.
x
;
int
ty
[
V
];
#pragma unroll
...
...
@@ -516,7 +514,7 @@ __global__ void dDeconvolution_KMxKN_backward_dW2(
ty
[
v
]
=
threadIdx
.
y
+
v
*
(
K
/
V
);
for
(
int
n
=
0
;
n
<
N
;
n
++
)
{
u
Int
KO
=
min
(
K
,
output_nPlanes
-
K
*
n
);
Int
KO
=
min
(
K
,
output_nPlanes
-
K
*
n
);
// Read w, reset dW
#pragma unroll
...
...
@@ -526,7 +524,7 @@ __global__ void dDeconvolution_KMxKN_backward_dW2(
dW
[
v
]
=
0
;
}
for
(
u
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
for
(
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
// Read rules for K input/output pairs, reset dI[]
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
...
...
@@ -580,23 +578,22 @@ __global__ void dDeconvolution_KMxKN_backward_dW2(
template
<
typename
T
>
void
dDeconvolution_backward_dW2
(
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
T
*
w
,
T
*
dw
,
uInt
*
rules
,
uInt
nHot
,
uInt
input_nPlanes
,
uInt
input_stride
,
uInt
output_nPlanes
,
uInt
output_stride
,
cudaStream_t
stream
)
{
T
*
w
,
T
*
dw
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
if
(
input_nPlanes
%
8
!=
0
or
output_nPlanes
%
8
!=
0
)
{
const
int
K
=
16
;
const
int
V
=
4
;
dDeconvolution_KMxKN_backward_dW2
<
T
,
K
,
V
>
<<
<
dim3
(
128
,
(
input_nPlanes
+
K
-
1
)
/
K
),
dim3
(
K
,
K
/
V
)
,
0
,
stream
>>>
(
inFeatures
,
dInFeatures
,
dOutFeatures
,
w
,
dw
,
rules
,
nHot
,
dDeconvolution_KMxKN_backward_dW2
<
T
,
K
,
V
><<<
dim3
(
128
,
(
input_nPlanes
+
K
-
1
)
/
K
),
dim3
(
K
,
K
/
V
)
>>>
(
inFeatures
,
dInFeatures
,
dOutFeatures
,
w
,
dw
,
rules
,
nHot
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
);
return
;
}
else
{
dDeconvolution_backward_dW
(
inFeatures
,
dInFeatures
,
dOutFeatures
,
w
,
dw
,
rules
,
nHot
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
,
stream
);
output_nPlanes
,
output_stride
);
}
}
#endif
/*
GPU
_DECONVOLUTION_H */
#endif
/*
CUDA
_DECONVOLUTION_H */
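Aside: the KMxKN kernels in this header all follow the same Volkov-style blocking: blockDim = (K, K/V), one K x K shared-memory tile for the weights and one for the inputs, and V accumulators per thread (each thread covers rows threadIdx.y + v * (K/V)). A stripped-down dense-matrix sketch of that blocking, assuming K divides every dimension (an illustration of the scheme, not the library's kernel):

// C[M x P] = A[M x N] * B[N x P], launched as
//   tiledMatMul<float, K, V><<<dim3(P / K, M / K), dim3(K, K / V)>>>(...)
template <typename T, int K, int V>
__global__ void tiledMatMul(const T *A, const T *B, T *C, int M, int N, int P) {
  __shared__ T As[K][K], Bs[K][K];
  int tx = threadIdx.x;
  int row0 = blockIdx.y * K, col0 = blockIdx.x * K;
  T acc[V] = {}; // V output values per thread, as in the kernels above
  for (int n = 0; n < N; n += K) {
    for (int v = 0; v < V; ++v) { // cooperatively load both K x K tiles
      int ty = threadIdx.y + v * (K / V);
      As[ty][tx] = A[(row0 + ty) * N + n + tx];
      Bs[ty][tx] = B[(n + ty) * P + col0 + tx];
    }
    __syncthreads();
    for (int v = 0; v < V; ++v) {
      int ty = threadIdx.y + v * (K / V);
      for (int k = 0; k < K; ++k)
        acc[v] += As[ty][k] * Bs[k][tx];
    }
    __syncthreads(); // tiles are reused next iteration
  }
  for (int v = 0; v < V; ++v) {
    int ty = threadIdx.y + v * (K / V);
    C[(row0 + ty) * P + col0 + tx] = acc[v];
  }
}

The real kernels replace the row indices of A and C by rulebook lookups (R0/R1), which is what turns the dense product into a gather-multiply-scatter over active sites.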
sparseconvnet/SCN/CUDA/IOLayers.cu 0 → 100644
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "IOLayers.h"

template <typename T, Int Dimension>
void cuda_InputLayer_updateOutput(Metadata<Dimension> &m,
                                  /*long*/ at::Tensor spatialSize,
                                  /*long*/ at::Tensor input_coords,
                                  /*cuda float*/ at::Tensor input_features,
                                  /*cuda float*/ at::Tensor output_features,
                                  long batchSize, long mode) {
  m.inputLayer(spatialSize, input_coords, batchSize, mode);
  Int nPlanes = input_features.size(1);
  auto &rules = m.inputLayerRuleBook;
  Int maxActive = rules[0][1];
  Int nRows = rules[0][3];
  if (mode == 0) {
    output_features.resize_as_(input_features);
    output_features.copy_(input_features);
  } else {
    output_features.resize_({*m.inputNActive, nPlanes});
    output_features.zero_();
    auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
    auto iF = input_features.data<T>();
    auto oF = output_features.data<T>();
    Int *rb = rulesBuffer.data<Int>();
    cudaMemcpy(rb, &rules[1][0], sizeof(Int) * rules[1].size(),
               cudaMemcpyHostToDevice);
    InputLayer_fp<T><<<std::min(nRows, (Int)32768),
                       std::min(nPlanes, (Int)32)>>>(iF, oF, nRows, maxActive,
                                                     nPlanes, rb, mode == 4);
  }
}

template <typename T, Int Dimension>
void cuda_InputLayer_updateGradInput(
    Metadata<Dimension> &m, /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features) {
  auto &rules = m.inputLayerRuleBook;
  Int nPlanes = d_output_features.size(1);
  auto mode = rules[0][0];
  Int maxActive = rules[0][1];
  Int nRows = rules[0][3];
  if (mode == 0) {
    d_input_features.resize_as_(d_output_features);
    d_input_features.copy_(d_output_features);
  } else {
    d_input_features.resize_({rules[0][2], nPlanes});
    d_input_features.zero_();
    auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
    auto diF = d_input_features.data<T>();
    auto doF = d_output_features.data<T>();
    Int *rb = rulesBuffer.data<Int>();
    cudaMemcpy(rb, &rules[1][0], sizeof(Int) * rules[1].size(),
               cudaMemcpyHostToDevice);
    InputLayer_bp<T><<<std::min(nRows, (Int)32768),
                       std::min(nPlanes, (Int)32)>>>(
        diF, doF, nRows, maxActive, nPlanes, rb, mode == 4);
  }
}

template <typename T, Int Dimension>
void cuda_OutputLayer_updateOutput(Metadata<Dimension> &m,
                                   /*cuda float*/ at::Tensor input_features,
                                   /*cuda float*/ at::Tensor output_features) {
  auto &rules = m.inputLayerRuleBook;
  Int nPlanes = input_features.size(1);
  auto mode = rules[0][0];
  auto maxActive = rules[0][1];
  auto nRows = rules[0][3];
  if (mode == 0) {
    output_features.resize_as_(input_features);
    output_features.copy_(input_features);
  } else {
    output_features.resize_({rules[0][2], nPlanes});
    output_features.zero_();
    auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
    auto iF = input_features.data<T>();
    auto oF = output_features.data<T>();
    Int *rb = rulesBuffer.data<Int>();
    cudaMemcpy(rb, &rules[1][0], sizeof(Int) * rules[1].size(),
               cudaMemcpyHostToDevice);
    InputLayer_bp<T><<<std::min(nRows, (Int)32768),
                       std::min(nPlanes, (Int)32)>>>(oF, iF, nRows, maxActive,
                                                     nPlanes, rb, false);
  }
}

template <typename T, Int Dimension>
void cuda_OutputLayer_updateGradInput(
    Metadata<Dimension> &m, /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features) {
  auto &rules = m.inputLayerRuleBook;
  Int nPlanes = d_output_features.size(1);
  auto mode = rules[0][0];
  auto maxActive = rules[0][1];
  auto nRows = rules[0][3];
  if (mode == 0) {
    d_input_features.resize_as_(d_output_features);
    d_input_features.copy_(d_output_features);
  } else {
    d_input_features.resize_({nRows, nPlanes});
    d_input_features.zero_();
    auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
    auto diF = d_input_features.data<T>();
    auto doF = d_output_features.data<T>();
    Int *rb = rulesBuffer.data<Int>();
    cudaMemcpy(rb, &rules[1][0], sizeof(Int) * rules[1].size(),
               cudaMemcpyHostToDevice);
    InputLayer_fp<T><<<std::min(nRows, (Int)32768),
                       std::min(nPlanes, (Int)32)>>>(
        doF, diF, nRows, maxActive, nPlanes, rb, false);
  }
}

template <typename T, Int Dimension>
void cuda_BLInputLayer_updateOutput(Metadata<Dimension> &m,
                                    /*long*/ at::Tensor spatialSize,
                                    /*long*/ at::Tensor input_coords,
                                    /*cuda float*/ at::Tensor input_features,
                                    /*cuda float*/ at::Tensor output_features,
                                    long mode) {
  m.blLayer(spatialSize, input_coords, mode);
  Int nPlanes = input_features.size(2);
  output_features.resize_({*m.inputNActive, nPlanes});
  output_features.zero_();
  auto &rules = m.blLayerRuleBook;
  Int maxActive = rules[0][1];
  Int nRows = rules[0][4];
  if (mode == 0) {
    output_features.resize_as_(input_features);
    output_features.copy_(input_features);
    output_features.resize_({*m.inputNActive, nPlanes});
  } else {
    auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
    auto iF = input_features.data<T>();
    auto oF = output_features.data<T>();
    Int *rb = rulesBuffer.data<Int>();
    cudaMemcpy(rb, &rules[1][0], sizeof(Int) * rules[1].size(),
               cudaMemcpyHostToDevice);
    InputLayer_fp<T><<<std::min(nRows, (Int)32768),
                       std::min(nPlanes, (Int)32)>>>(iF, oF, nRows, maxActive,
                                                     nPlanes, rb, mode == 4);
  }
}

template <typename T, Int Dimension>
void cuda_BLInputLayer_updateGradInput(
    Metadata<Dimension> &m, /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features) {
  auto &rules = m.blLayerRuleBook;
  Int nPlanes = d_output_features.size(1);
  Int mode = rules[0][0];
  Int maxActive = rules[0][1];
  Int nRows = rules[0][4];
  if (mode == 0) {
    d_input_features.resize_as_(d_output_features);
    d_input_features.copy_(d_output_features);
    d_input_features.resize_({rules[0][2], rules[0][3], nPlanes});
  } else {
    d_input_features.resize_({rules[0][2], rules[0][3], nPlanes});
    d_input_features.zero_();
    auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
    auto diF = d_input_features.data<T>();
    auto doF = d_output_features.data<T>();
    Int *rb = rulesBuffer.data<Int>();
    cudaMemcpy(rb, &rules[1][0], sizeof(Int) * rules[1].size(),
               cudaMemcpyHostToDevice);
    InputLayer_bp<T><<<std::min(nRows, (Int)32768),
                       std::min(nPlanes, (Int)32)>>>(
        diF, doF, nRows, maxActive, nPlanes, rb, mode == 4);
  }
}

template <typename T, Int Dimension>
void cuda_BLOutputLayer_updateOutput(Metadata<Dimension> &m,
                                     /*cuda float*/ at::Tensor input_features,
                                     /*cuda float*/ at::Tensor output_features) {
  auto &rules = m.blLayerRuleBook;
  Int nPlanes = input_features.size(1);
  auto mode = rules[0][0];
  Int maxActive = rules[0][1];
  Int nRows = rules[0][4];
  if (mode == 0) {
    output_features.resize_as_(input_features);
    output_features.copy_(input_features);
    output_features.resize_({rules[0][2], rules[0][3], nPlanes});
  } else {
    output_features.resize_({rules[0][2], rules[0][3], nPlanes});
    output_features.zero_();
    auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
    auto iF = input_features.data<T>();
    auto oF = output_features.data<T>();
    Int *rb = rulesBuffer.data<Int>();
    cudaMemcpy(rb, &rules[1][0], sizeof(Int) * rules[1].size(),
               cudaMemcpyHostToDevice);
    InputLayer_bp<T><<<std::min(nRows, (Int)32768),
                       std::min(nPlanes, (Int)32)>>>(oF, iF, nRows, maxActive,
                                                     nPlanes, rb, false);
  }
}

template <typename T, Int Dimension>
void cuda_BLOutputLayer_updateGradInput(
    Metadata<Dimension> &m, /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features) {
  auto &rules = m.blLayerRuleBook;
  Int nPlanes = d_output_features.size(2);
  Int mode = rules[0][0];
  Int maxActive = rules[0][1];
  Int nRows = rules[0][4];
  if (mode == 0) {
    d_input_features.resize_as_(d_output_features);
    d_input_features.copy_(d_output_features);
    d_input_features.resize_({nRows, nPlanes});
  } else {
    d_input_features.resize_({nRows, nPlanes});
    d_input_features.zero_();
    auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
    auto diF = d_input_features.data<T>();
    auto doF = d_output_features.data<T>();
    Int *rb = rulesBuffer.data<Int>();
    cudaMemcpy(rb, &rules[1][0], sizeof(Int) * rules[1].size(),
               cudaMemcpyHostToDevice);
    InputLayer_fp<T><<<std::min(nRows, (Int)32768),
                       std::min(nPlanes, (Int)32)>>>(
        doF, diF, nRows, maxActive, nPlanes, rb, false);
  }
}
sparseconvnet/SCN/generic/GPU/IOLayers.h → sparseconvnet/SCN/CUDA/IOLayers.h
@@ -4,21 +4,21 @@
 // This source code is licensed under the license found in the
 // LICENSE file in the root directory of this source tree.
-#ifndef GPU_IOLAYERS_H
-#define GPU_IOLAYERS_H
+#ifndef CUDA_IOLAYERS_H
+#define CUDA_IOLAYERS_H
 template <typename T>
 __global__ void InputLayer_fp(T *input_features, T *output_features,
-                              uInt nRows, uInt maxActive, uInt nPlanes,
-                              uInt *rules, bool average) {
+                              Int nRows, Int maxActive, Int nPlanes,
+                              Int *rules, bool average) {
   for (int row = blockIdx.x; row < nRows; row += gridDim.x) {
     T *out = output_features + row * nPlanes;
-    uInt *r = rules + row * (1 + maxActive);
-    uInt nActive = r[0];
+    Int *r = rules + row * (1 + maxActive);
+    Int nActive = r[0];
     T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
     for (int i = 1; i <= nActive; i++) {
       T *inp = input_features + r[i] * nPlanes;
-      for (uInt plane = threadIdx.x; plane < nPlanes; plane += blockDim.x)
+      for (Int plane = threadIdx.x; plane < nPlanes; plane += blockDim.x)
         out[plane] += multiplier * inp[plane];
     }
   }
@@ -26,18 +26,18 @@ __global__ void InputLayer_fp(T *input_features, T *output_features,
 template <typename T>
 __global__ void InputLayer_bp(T *d_input_features, T *d_output_features,
-                              uInt nRows, uInt maxActive, uInt nPlanes,
-                              uInt *rules, bool average) {
+                              Int nRows, Int maxActive, Int nPlanes,
+                              Int *rules, bool average) {
   for (int row = blockIdx.x; row < nRows; row += gridDim.x) {
     T *out = d_output_features + row * nPlanes;
-    uInt *r = rules + row * (1 + maxActive);
-    uInt nActive = r[0];
+    Int *r = rules + row * (1 + maxActive);
+    Int nActive = r[0];
     T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
     for (int i = 1; i <= nActive; i++) {
       T *inp = d_input_features + r[i] * nPlanes;
-      for (uInt plane = threadIdx.x; plane < nPlanes; plane += blockDim.x)
+      for (Int plane = threadIdx.x; plane < nPlanes; plane += blockDim.x)
         atomicAdd(&inp[plane], multiplier * out[plane]);
     }
   }
 }
-#endif /* GPU_IOLAYERS_H */
+#endif /* CUDA_IOLAYERS_H */
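For reference, the rulebook consumed by InputLayer_fp/InputLayer_bp stores one row per output site: a count followed by maxActive input indices, i.e. rows of length 1 + maxActive, with mode == 4 switching summation to averaging. A CPU mirror of the forward gather, handy for checking a kernel port (function name and container types are illustrative, not from the commit):

#include <vector>

// CPU mirror of InputLayer_fp: rules[row * (1 + maxActive)] holds the number
// of contributing input rows; the following entries are their indices.
void inputLayerForwardCPU(const std::vector<float> &in, std::vector<float> &out,
                          int nRows, int maxActive, int nPlanes,
                          const std::vector<int> &rules, bool average) {
  for (int row = 0; row < nRows; ++row) {
    const int *r = rules.data() + row * (1 + maxActive);
    int nActive = r[0];
    float mult = (average && nActive > 0) ? 1.0f / nActive : 1.0f;
    for (int i = 1; i <= nActive; ++i)
      for (int p = 0; p < nPlanes; ++p)
        out[row * nPlanes + p] += mult * in[r[i] * nPlanes + p];
  }
}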
sparseconvnet/SCN/CUDA/LeakyReLU.cu 0 → 100644
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "LeakyReLU.h"

template <typename T>
void cuda_LeakyReLU_updateOutput(/*cuda float*/ at::Tensor input_features,
                                 /*cuda float*/ at::Tensor output_features,
                                 float alpha) {
  output_features.resize_as_(input_features);
  auto n = input_features.numel();
  LeakyReLU_fp<T><<<16, 1024>>>(input_features.data<T>(),
                                output_features.data<T>(), n, alpha);
}

template <typename T>
void cuda_LeakyReLU_updateGradInput(/*cuda float*/ at::Tensor input_features,
                                    /*cuda float*/ at::Tensor d_input_features,
                                    /*cuda float*/ at::Tensor d_output_features,
                                    float alpha) {
  d_input_features.resize_as_(d_output_features);
  auto n = d_input_features.numel();
  LeakyReLU_bp<T><<<16, 1024>>>(input_features.data<T>(),
                                d_input_features.data<T>(),
                                d_output_features.data<T>(), n, alpha);
}
sparseconvnet/SCN/generic/GPU/LeakyReLU.h → sparseconvnet/SCN/CUDA/LeakyReLU.h
@@ -8,16 +8,16 @@
 #define LEAKYRELU_H
 template <typename T>
-__global__ void LeakyReLU_fp(T *input_features, T *output_features, uInt n,
+__global__ void LeakyReLU_fp(T *input_features, T *output_features, Int n,
                              T alpha) {
-  for (uInt i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += 16 * 1024)
+  for (Int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += 16 * 1024)
     output_features[i] =
         (input_features[i] > 0) ? input_features[i] : (input_features[i] * alpha);
 }
 template <typename T>
 __global__ void LeakyReLU_bp(T *input_features, T *d_input_features,
-                             T *d_output_features, uInt n, T alpha) {
-  for (uInt i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += 16 * 1024)
+                             T *d_output_features, Int n, T alpha) {
+  for (Int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += 16 * 1024)
     d_input_features[i] =
         (input_features[i] > 0) ? d_output_features[i] : (d_output_features[i] * alpha);
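The fixed stride 16 * 1024 in LeakyReLU_fp/_bp matches the <<<16, 1024>>> launch in LeakyReLU.cu above. Written with the generic grid-stride idiom, the same loop stays correct under any launch shape; a sketch of that equivalent form (not the committed code):

template <typename T>
__global__ void leakyReLUForward(const T *in, T *out, long n, T alpha) {
  // Generic grid-stride loop: gridDim.x * blockDim.x equals the hard-coded
  // 16 * 1024 stride whenever the kernel is launched as <<<16, 1024>>>.
  for (long i = blockIdx.x * (long)blockDim.x + threadIdx.x; i < n;
       i += (long)gridDim.x * blockDim.x)
    out[i] = (in[i] > 0) ? in[i] : in[i] * alpha;
}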
sparseconvnet/SCN/CUDA/MaxPooling.cu 0 → 100644
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "MaxPooling.h"
#include "RuleBookIterator.h"

template <typename T, Int Dimension>
void cuda_MaxPooling_updateOutput(/*long*/ at::Tensor inputSize,
                                  /*long*/ at::Tensor outputSize,
                                  /*long*/ at::Tensor poolSize,
                                  /*long*/ at::Tensor poolStride,
                                  Metadata<Dimension> &m,
                                  /*cuda float*/ at::Tensor input_features,
                                  /*cuda float*/ at::Tensor output_features,
                                  long nFeaturesToDrop) {
  Int nPlanes = input_features.size(1) - nFeaturesToDrop;
  auto _rules =
      m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  Int nActive = m.getNActive(outputSize);
  output_features.resize_({nActive, nPlanes});
  output_features.zero_();
  auto iF = input_features.data<T>() + nFeaturesToDrop;
  auto oF = output_features.data<T>();
  RULEBOOKITERATOR(cuda_MaxPooling_ForwardPass<T>(
                       iF, oF, nPlanes, input_features.size(1),
                       output_features.size(1), rbB, nHotB);
                   , )
}

template <typename T, Int Dimension>
void cuda_MaxPooling_updateGradInput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor poolSize, /*long*/ at::Tensor poolStride,
    Metadata<Dimension> &m, /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor output_features,
    /*cuda float*/ at::Tensor d_output_features, long nFeaturesToDrop) {
  Int nPlanes = input_features.size(1) - nFeaturesToDrop;
  auto _rules =
      m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();
  auto iF = input_features.data<T>();
  auto oF = output_features.data<T>();
  auto diF = d_input_features.data<T>();
  auto doF = d_output_features.data<T>();
  RULEBOOKITERATOR(cuda_MaxPooling_BackwardPass<T>(
                       iF, diF, oF, doF, nPlanes, input_features.size(1),
                       d_output_features.size(1), rbB, nHotB);
                   , )
}

template <typename T, Int Dimension>
void cuda_RandomizedStrideMaxPooling_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor poolSize, /*long*/ at::Tensor poolStride,
    Metadata<Dimension> &m, /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features, long nFeaturesToDrop) {
  Int nPlanes = input_features.size(1) - nFeaturesToDrop;
  auto _rules = m.getRandomizedStrideRuleBook(inputSize, outputSize, poolSize,
                                              poolStride, true);
  Int nActive = m.getNActive(outputSize);
  output_features.resize_({nActive, nPlanes});
  output_features.zero_();
  auto iF = input_features.data<T>() + nFeaturesToDrop;
  auto oF = output_features.data<T>();
  RULEBOOKITERATOR(cuda_MaxPooling_ForwardPass<T>(
                       iF, oF, nPlanes, input_features.size(1),
                       output_features.size(1), rbB, nHotB);
                   , )
}

template <typename T, Int Dimension>
void cuda_RandomizedStrideMaxPooling_updateGradInput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor poolSize, /*long*/ at::Tensor poolStride,
    Metadata<Dimension> &m, /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor output_features,
    /*cuda float*/ at::Tensor d_output_features, long nFeaturesToDrop) {
  Int nPlanes = input_features.size(1) - nFeaturesToDrop;
  auto _rules = m.getRandomizedStrideRuleBook(inputSize, outputSize, poolSize,
                                              poolStride, true);
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();
  auto iF = input_features.data<T>();
  auto oF = output_features.data<T>();
  auto diF = d_input_features.data<T>();
  auto doF = d_output_features.data<T>();
  RULEBOOKITERATOR(cuda_MaxPooling_BackwardPass<T>(
                       iF, diF, oF, doF, nPlanes, input_features.size(1),
                       d_output_features.size(1), rbB, nHotB);
                   , )
}
sparseconvnet/SCN/generic/GPU/MaxPooling.h → sparseconvnet/SCN/CUDA/MaxPooling.h
@@ -4,26 +4,26 @@
 // This source code is licensed under the license found in the
 // LICENSE file in the root directory of this source tree.
-#ifndef GPU_MAXPOOLING_H
-#define GPU_MAXPOOLING_H
+#ifndef CUDA_MAXPOOLING_H
+#define CUDA_MAXPOOLING_H
 // NTX must be >=2 so r is filled properly
-template <typename T, uInt NTX, uInt NTY>
+template <typename T, Int NTX, Int NTY>
 __global__ void MaxPooling_fp(T *input_features, T *output_features,
-                              uInt nPlanes, uInt input_stride,
-                              uInt output_stride, uInt *rules, uInt nHot) {
-  __shared__ uInt r[NTY * 2];
-  for (uInt n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
+                              Int nPlanes, Int input_stride,
+                              Int output_stride, Int *rules, Int nHot) {
+  __shared__ Int r[NTY * 2];
+  for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
     {
-      uInt i = threadIdx.x + NTX * threadIdx.y;
-      if (i < NTY * 2 and i < 2 * (n - nHot))
+      Int i = threadIdx.x + NTX * threadIdx.y;
+      if (i < NTY * 2 and i < 2 * (nHot - n))
         r[i] = rules[2 * n + i];
     }
     __syncthreads();
     if (n + threadIdx.y < nHot) {
-      uInt i = r[2 * threadIdx.y] * input_stride;
-      uInt o = r[2 * threadIdx.y + 1] * output_stride;
-      for (uInt plane = threadIdx.x; plane < nPlanes; plane += NTX) {
+      Int i = r[2 * threadIdx.y] * input_stride;
+      Int o = r[2 * threadIdx.y + 1] * output_stride;
+      for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX) {
         T inp = input_features[i + plane];
         if (output_features[o + plane] < inp)
           output_features[o + plane] = inp;
@@ -34,30 +34,30 @@ __global__ void MaxPooling_fp(T *input_features, T *output_features,
 }
 template <typename T>
-void MaxPooling_ForwardPass(cudaStream_t stream, T *input_features,
-                            T *output_features, uInt nPlanes,
-                            uInt input_stride, uInt output_stride,
-                            uInt *rules, uInt nHot) {
-  MaxPooling_fp<T, 32, 32> << <32, dim3(32, 32), 0, stream>>>
-      (input_features, output_features, nPlanes, input_stride, output_stride,
+void cuda_MaxPooling_ForwardPass(T *input_features, T *output_features,
+                                 Int nPlanes, Int input_stride,
+                                 Int output_stride, Int *rules, Int nHot) {
+  MaxPooling_fp<T, 32, 32><<<32, dim3(32, 32)>>>(
+      input_features, output_features, nPlanes, input_stride, output_stride,
       rules, nHot);
 }
-template <typename T, uInt NTX, uInt NTY>
+template <typename T, Int NTX, Int NTY>
 __global__ void MaxPooling_bp(T *input_features, T *d_input_features,
                               T *output_features, T *d_output_features,
-                              uInt nPlanes, uInt input_stride,
-                              uInt output_stride, uInt *rules, uInt nHot) {
-  __shared__ uInt r[NTY * 2];
-  for (uInt n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
+                              Int nPlanes, Int input_stride,
+                              Int output_stride, Int *rules, Int nHot) {
+  __shared__ Int r[NTY * 2];
+  for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
     {
-      uInt i = threadIdx.x + NTX * threadIdx.y;
-      if (i < NTY * 2 and i < 2 * (n - nHot))
+      Int i = threadIdx.x + NTX * threadIdx.y;
+      if (i < NTY * 2 and i < 2 * (nHot - n))
        r[i] = rules[2 * n + i];
     }
     __syncthreads();
     if (n + threadIdx.y < nHot) {
-      uInt i = r[2 * threadIdx.y] * input_stride;
-      uInt o = r[2 * threadIdx.y + 1] * output_stride;
-      for (uInt plane = threadIdx.x; plane < nPlanes; plane += NTX)
+      Int i = r[2 * threadIdx.y] * input_stride;
+      Int o = r[2 * threadIdx.y + 1] * output_stride;
+      for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX)
        if (output_features[o + plane] == input_features[i + plane])
          d_input_features[i + plane] += d_output_features[o + plane];
     }
@@ -66,13 +66,12 @@ __global__ void MaxPooling_bp(T *input_features, T *d_input_features,
 }
 template <typename T>
-void MaxPooling_BackwardPass(cudaStream_t stream, T *input_features,
-                             T *d_input_features, T *output_features,
-                             T *d_output_features, uInt nPlanes,
-                             uInt input_stride, uInt output_stride,
-                             uInt *rules, uInt nHot) {
-  MaxPooling_bp<T, 32, 32> << <32, dim3(32, 32), 0, stream>>>
-      (input_features, d_input_features, output_features, d_output_features,
+void cuda_MaxPooling_BackwardPass(T *input_features, T *d_input_features,
+                                  T *output_features, T *d_output_features,
+                                  Int nPlanes, Int input_stride,
+                                  Int output_stride, Int *rules, Int nHot) {
+  MaxPooling_bp<T, 32, 32><<<32, dim3(32, 32)>>>(
+      input_features, d_input_features, output_features, d_output_features,
      nPlanes, input_stride, output_stride, rules, nHot);
 }
-#endif /* GPU_MAXPOOLING_H */
+#endif /* CUDA_MAXPOOLING_H */
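Besides the uInt-to-Int switch, these hunks also correct the shared-rule guard: the old bound 2 * (n - nHot) is never the intended limit (and wraps around for unsigned n < nHot), whereas 2 * (nHot - n) loads exactly the rule pairs remaining in the final partial batch of NTY rows. A CPU reference of the rulebook max pooling, useful for validating the kernels (strides simplified to nPlanes; names illustrative):

#include <vector>

// CPU reference: each rule pair maps an input row to an output row; the
// backward pass routes the gradient to inputs that attained the max,
// matching the equality test in MaxPooling_bp.
void maxPoolCPU(const std::vector<float> &in, std::vector<float> &out,
                const std::vector<int> &rules, int nPlanes) {
  for (size_t n = 0; n + 1 < rules.size(); n += 2) {
    const float *i = in.data() + rules[n] * nPlanes;
    float *o = out.data() + rules[n + 1] * nPlanes;
    for (int p = 0; p < nPlanes; ++p)
      if (o[p] < i[p])
        o[p] = i[p];
  }
}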
sparseconvnet/SCN/CUDA/NetworkInNetwork.cu 0 → 100644
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "Convolution.h"
#include <algorithm>

template <typename T>
double cuda_NetworkInNetwork_updateOutput(
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features,
    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor bias) {
  auto nActive = input_features.size(0);
  auto input_nPlanes = weight.size(0);
  auto output_nPlanes = weight.size(1);
  // One output row per active site, output_nPlanes columns.
  output_features.resize_({nActive, output_nPlanes});
  if (bias.numel())
    output_features.copy_(bias); // bias broadcasts over the rows
  else
    output_features.zero_();
  output_features.addmm_(input_features, weight); // in-place accumulate
  return nActive * input_nPlanes * output_nPlanes; // multiply-add count
}

template <typename T>
void cuda_NetworkInNetwork_updateGradInput(
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features,
    /*cuda float*/ at::Tensor weight) {
  d_input_features.resize_({(int)d_output_features.size(0), weight.size(0)});
  d_input_features.zero_();
  at::mm_out(d_input_features, d_output_features, weight.t());
}

template <typename T>
void cuda_NetworkInNetwork_accGradParameters(
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_output_features,
    /*cuda float*/ at::Tensor d_weight, /*cuda float*/ at::Tensor d_bias) {
  auto nActive = input_features.size(0);
  if (nActive and d_bias.numel())
    at::sum_out(d_bias, d_output_features, {0}, false);
  at::mm_out(d_weight, input_features.t(), d_output_features);
}
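NetworkInNetwork is a 1x1 convolution over the active sites, i.e. a dense product of the [nActive, input_nPlanes] feature matrix with the [input_nPlanes, output_nPlanes] weight matrix. A minimal ATen sketch of the same forward step, as a functional (out-of-place) variant for clarity; a sketch under those shape assumptions, not the committed code:

#include <ATen/ATen.h>

// out = in * W, plus a row-broadcast bias when one is supplied.
at::Tensor ninForward(const at::Tensor &input, const at::Tensor &weight,
                      const at::Tensor &bias) {
  auto out = input.mm(weight); // [nActive, output_nPlanes]
  if (bias.numel())
    out = out + bias;          // bias broadcasts over the rows
  return out;
}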
sparseconvnet/SCN/generic/GPU/RuleBookIterator.h → sparseconvnet/SCN/CUDA/RuleBookIterator.h
@@ -4,32 +4,31 @@
 // This source code is licensed under the license found in the
 // LICENSE file in the root directory of this source tree.
-#ifndef GPU_RULEBOOKITERATOR_H
-#define GPU_RULEBOOKITERATOR_H
+#ifndef CUDA_RULEBOOKITERATOR_H
+#define CUDA_RULEBOOKITERATOR_H
-// Macro to parallelize loading rulebook elements to GPU memory and operating
+// Macro to parallelize loading rulebook elements to CUDA memory and operating
 // on the elements of the rulebook.
 // X is the function to apply.
 // Y is a command to run
 #define RULEBOOKITERATOR(X, Y)                                                \
-  uInt ms = ruleBookMaxSize(_rules);                                          \
-  auto rulesBuffer = THCITensor_(new)(state);                                 \
-  if (THCITensor_(nElement)(state, rulesBuffer) < ms)                         \
-    THCITensor_(resize1d)(state, rulesBuffer, ms);                            \
-  uInt *rbB = (uInt *)THCITensor_(data)(state, rulesBuffer);                  \
+  {                                                                           \
+    Int rbMaxSize = 0;                                                        \
+    for (auto &r : _rules)                                                    \
+      rbMaxSize = std::max(rbMaxSize, (Int)r.size());                         \
+    at::Tensor rulesBuffer = at::CUDA(at_kINT).tensor({rbMaxSize});           \
+    Int *rbB = rulesBuffer.data<Int>();                                       \
     for (int k = 0; k < _rules.size(); ++k) {                                 \
       auto &r = _rules[k];                                                    \
-      uInt nHotB = r.size() / 2;                                              \
+      Int nHotB = r.size() / 2;                                               \
       if (nHotB) {                                                            \
-        cudaMemcpy(rbB, &r[0], sizeof(uInt) * 2 * nHotB,                      \
+        cudaMemcpy(rbB, &r[0], sizeof(Int) * 2 * nHotB,                       \
                    cudaMemcpyHostToDevice);                                   \
       }                                                                       \
       if (nHotB) {                                                            \
        X                                                                      \
      }                                                                        \
      Y                                                                        \
    }                                                                          \
-  THCITensor_(free)(state, rulesBuffer);
+  }
-#endif /* GPU_RULEBOOKITERATOR_H */
+#endif /* CUDA_RULEBOOKITERATOR_H */
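For readers porting call sites: inside X the macro exposes rbB (a device pointer to the copied rule pairs) and nHotB (the pair count for the current group), and Y runs once per group. A function-style sketch of the same iteration, with the callback signatures chosen for illustration:

#include <cstdint>
#include <cuda_runtime.h>
#include <functional>
#include <vector>

using Int = int32_t;

// Function-style equivalent of RULEBOOKITERATOR: copy each non-empty
// rulebook group to a device buffer, run the body, then the epilogue.
void forEachRuleBookGroup(const std::vector<std::vector<Int>> &rules,
                          Int *deviceBuf,
                          const std::function<void(Int *, Int)> &body,
                          const std::function<void()> &epilogue) {
  for (auto &r : rules) {
    Int nHot = (Int)r.size() / 2;
    if (nHot) {
      cudaMemcpy(deviceBuf, r.data(), sizeof(Int) * 2 * nHot,
                 cudaMemcpyHostToDevice);
      body(deviceBuf, nHot); // the macro's "X"
    }
    epilogue();              // the macro's "Y"
  }
}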
sparseconvnet/SCN/CUDA/SparseToDense.cu 0 → 100644
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "SparseToDense.h"

template <typename T, Int Dimension>
void cuda_SparseToDense_updateOutput(/*long*/ at::Tensor inputSize,
                                     Metadata<Dimension> &m,
                                     /*cuda float*/ at::Tensor input_features,
                                     /*cuda float*/ at::Tensor output_features,
                                     long nPlanes) {
  {
    std::array<long, Dimension + 2> sz;
    sz[0] = m.grids.begin()->second.size(); // batch size
    sz[1] = nPlanes;
    long *in_sz = inputSize.data<long>();
    for (Int i = 0; i < Dimension; ++i)
      sz[i + 2] = in_sz[i];
    output_features.resize_(sz);
    output_features.zero_();
  }
  if (input_features.ndimension() == 2) {
    auto _rules = m.getSparseToDenseRuleBook(inputSize, true);
    Int _nPlanes = input_features.size(1);
    auto iF = input_features.data<T>();
    auto oF = output_features.data<T>();
    long spatialVolume = inputSize.prod().data<long>()[0];
    RULEBOOKITERATOR(SparseToDense_ForwardPass<T>(iF, oF, _nPlanes,
                                                  spatialVolume, rbB, nHotB);
                     , oF += _nPlanes * spatialVolume;)
  }
}

template <typename T, Int Dimension>
void cuda_SparseToDense_updateGradInput(
    /*long*/ at::Tensor inputSize, Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features) {
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();
  if (input_features.ndimension() == 2) {
    auto _rules = m.getSparseToDenseRuleBook(inputSize, true);
    long spatialVolume = inputSize.prod().data<long>()[0];
    Int _nPlanes = d_input_features.size(1);
    auto diF = d_input_features.data<T>();
    auto doF = d_output_features.data<T>();
    RULEBOOKITERATOR(SparseToDense_BackwardPass<T>(diF, doF, _nPlanes,
                                                   spatialVolume, rbB, nHotB);
                     , doF += _nPlanes * spatialVolume;)
  }
}
sparseconvnet/SCN/generic/GPU/SparseToDense.h → sparseconvnet/SCN/CUDA/SparseToDense.h
@@ -4,28 +4,27 @@
 // This source code is licensed under the license found in the
 // LICENSE file in the root directory of this source tree.
-#ifndef GPU_SPARSETODENSE_H
-#define GPU_SPARSETODENSE_H
-#include "../SparseConvNet.h"
-//#include <THC/THCAtomics.cuh>
+#ifndef CUDA_SPARSETODENSE_H
+#define CUDA_SPARSETODENSE_H
 // NTX must be >=2 so r is filled properly
-template <typename T, uInt NTX, uInt NTY>
+template <typename T, Int NTX, Int NTY>
 __global__ void SparseToDense_fp(T *input_features, T *output_features,
-                                 uInt nPlanes, uInt spatialVolume,
-                                 uInt *rules, uInt nHot) {
-  __shared__ uInt r[NTY * 2];
-  for (uInt n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
+                                 Int nPlanes, Int spatialVolume,
+                                 Int *rules, Int nHot) {
+  __shared__ Int r[NTY * 2];
+  for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
     {
-      uInt i = threadIdx.x + NTX * threadIdx.y;
-      if (i < NTY * 2 and i < 2 * (n - nHot))
+      Int i = threadIdx.x + NTX * threadIdx.y;
+      if (i < NTY * 2 and i < 2 * (nHot - n))
        r[i] = rules[2 * n + i];
     }
     __syncthreads();
     if (n + threadIdx.y < nHot) {
       T *i = input_features + r[2 * threadIdx.y] * nPlanes;
       T *o = output_features + r[2 * threadIdx.y + 1];
-      for (uInt plane = threadIdx.x; plane < nPlanes; plane += NTX)
+      for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX)
        o[plane * spatialVolume] = i[plane];
     }
     __syncthreads();
@@ -33,29 +32,29 @@ __global__ void SparseToDense_fp(T *input_features, T *output_features,
 }
 template <typename T>
-void SparseToDense_ForwardPass(cudaStream_t stream, T *input_features,
-                               T *output_features, uInt nPlanes,
-                               uInt spatialVolume, uInt *rules, uInt nHot) {
-  SparseToDense_fp<T, 32, 32> << <32, dim3(32, 32), 0, stream>>>
-      (input_features, output_features, nPlanes, spatialVolume, rules, nHot);
+void SparseToDense_ForwardPass(T *input_features, T *output_features,
+                               Int nPlanes, Int spatialVolume, Int *rules,
+                               Int nHot) {
+  SparseToDense_fp<T, 32, 32><<<32, dim3(32, 32)>>>(
+      input_features, output_features, nPlanes, spatialVolume, rules, nHot);
 }
 // NTX must be >=2 so r is filled properly
-template <typename T, uInt NTX, uInt NTY>
+template <typename T, Int NTX, Int NTY>
 __global__ void SparseToDense_bp(T *d_input_features, T *d_output_features,
-                                 uInt nPlanes, uInt spatialVolume,
-                                 uInt *rules, uInt nHot) {
-  __shared__ uInt r[NTY * 2];
-  for (uInt n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
+                                 Int nPlanes, Int spatialVolume,
+                                 Int *rules, Int nHot) {
+  __shared__ Int r[NTY * 2];
+  for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
     {
-      uInt i = threadIdx.x + NTX * threadIdx.y;
-      if (i < NTY * 2 and i < 2 * (n - nHot))
+      Int i = threadIdx.x + NTX * threadIdx.y;
+      if (i < NTY * 2 and i < 2 * (nHot - n))
        r[i] = rules[2 * n + i];
     }
     __syncthreads();
     if (n + threadIdx.y < nHot) {
       T *d_i = d_input_features + r[2 * threadIdx.y] * nPlanes;
       T *d_o = d_output_features + r[2 * threadIdx.y + 1];
-      for (uInt plane = threadIdx.x; plane < nPlanes; plane += NTX)
+      for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX)
        d_i[plane] = d_o[plane * spatialVolume];
     }
     __syncthreads();
@@ -63,11 +62,10 @@ __global__ void SparseToDense_bp(T *d_input_features, T *d_output_features,
 }
 template <typename T>
-void SparseToDense_BackwardPass(cudaStream_t stream, T *d_input_features,
-                                T *d_output_features, uInt nPlanes,
-                                uInt spatialVolume, uInt *rules, uInt nHot) {
-  SparseToDense_bp<T, 32, 32> << <32, dim3(32, 32), 0, stream>>>
-      (d_input_features, d_output_features, nPlanes, spatialVolume, rules,
-       nHot);
+void SparseToDense_BackwardPass(T *d_input_features, T *d_output_features,
+                                Int nPlanes, Int spatialVolume, Int *rules,
+                                Int nHot) {
+  SparseToDense_bp<T, 32, 32><<<32, dim3(32, 32)>>>(
+      d_input_features, d_output_features, nPlanes, spatialVolume, rules,
+      nHot);
 }
-#endif /* GPU_SPARSETODENSE_H */
+#endif /* CUDA_SPARSETODENSE_H */
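In SparseToDense_fp the second rule entry is already a flat offset into one sample's [nPlanes, spatial...] block, so plane p of a site lands at offset + p * spatialVolume; the RULEBOOKITERATOR epilogue "oF += _nPlanes * spatialVolume;" in SparseToDense.cu then advances to the next batch sample. As a one-line helper spelling out that indexing (a sketch, not part of the commit):

// Flat index of feature plane `plane` for a dense offset within one sample
// of a [nPlanes, spatial...] block.
inline long denseIndex(long denseOffset, long plane, long spatialVolume) {
  return denseOffset + plane * spatialVolume;
}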
sparseconvnet/SCN/CUDA/UnPooling.cu 0 → 100644
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "RuleBookIterator.h"
#include "UnPooling.h"

template <typename T, Int Dimension>
void cuda_UnPooling_updateOutput(/*long*/ at::Tensor inputSize,
                                 /*long*/ at::Tensor outputSize,
                                 /*long*/ at::Tensor poolSize,
                                 /*long*/ at::Tensor poolStride,
                                 Metadata<Dimension> &m,
                                 /*cuda float*/ at::Tensor input_features,
                                 /*cuda float*/ at::Tensor output_features,
                                 long nFeaturesToDrop) {
  Int nPlanes = input_features.size(1) - nFeaturesToDrop;
  auto _rules =
      m.getRuleBook(outputSize, inputSize, poolSize, poolStride, true);
  Int nActive = m.getNActive(outputSize);
  output_features.resize_({nActive, input_features.size(1) - nFeaturesToDrop});
  output_features.zero_();
  auto iF = input_features.data<T>() + nFeaturesToDrop;
  auto oF = output_features.data<T>();
  RULEBOOKITERATOR(cuda_UnPooling_ForwardPass<T>(
                       iF, oF, nPlanes, input_features.size(1),
                       output_features.size(1), rbB, nHotB);
                   , )
}

template <typename T, Int Dimension>
void cuda_UnPooling_updateGradInput(/*long*/ at::Tensor inputSize,
                                    /*long*/ at::Tensor outputSize,
                                    /*long*/ at::Tensor poolSize,
                                    /*long*/ at::Tensor poolStride,
                                    Metadata<Dimension> &m,
                                    /*cuda float*/ at::Tensor input_features,
                                    /*cuda float*/ at::Tensor d_input_features,
                                    /*cuda float*/ at::Tensor d_output_features,
                                    long nFeaturesToDrop) {
  Int nPlanes = input_features.size(1) - nFeaturesToDrop;
  auto _rules =
      m.getRuleBook(outputSize, inputSize, poolSize, poolStride, true);
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();
  auto diF = d_input_features.data<T>() + nFeaturesToDrop;
  auto doF = d_output_features.data<T>();
  RULEBOOKITERATOR(cuda_UnPooling_BackwardPass<T>(
                       diF, doF, nPlanes, input_features.size(1),
                       d_output_features.size(1), rbB, nHotB);
                   , )
}
sparseconvnet/SCN/CUDA/UnPooling.h 0 → 100644
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CUDA_UNPOOLING_H
#define CUDA_UNPOOLING_H

// NTX must be >=2 so r is filled properly
template <typename T, Int NTX, Int NTY>
__global__ void UnPooling_fp(T *input_features, T *output_features,
                             Int nPlanes, Int input_stride, Int output_stride,
                             Int *rules, Int nHot) {
  __shared__ Int r[NTY * 2];
  for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
    {
      Int i = threadIdx.x + NTX * threadIdx.y;
      if (i < NTY * 2 and i < 2 * (nHot - n))
        r[i] = rules[2 * n + i];
    }
    __syncthreads();
    if (n + threadIdx.y < nHot) {
      Int i = r[2 * threadIdx.y + 1] * input_stride;
      Int o = r[2 * threadIdx.y] * output_stride;
      for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX)
        output_features[o + plane] += input_features[i + plane];
    }
    __syncthreads();
  }
}

template <typename T>
void cuda_UnPooling_ForwardPass(T *input_features, T *output_features,
                                Int nPlanes, Int input_stride,
                                Int output_stride, Int *rules, Int nHot) {
  UnPooling_fp<T, 32, 32><<<32, dim3(32, 32)>>>(
      input_features, output_features, nPlanes, input_stride, output_stride,
      rules, nHot);
}

template <typename T, Int NTX, Int NTY>
__global__ void UnPooling_bp(T *d_input_features, T *d_output_features,
                             Int nPlanes, Int input_stride, Int output_stride,
                             Int *rules, Int nHot) {
  __shared__ Int r[NTY * 2];
  for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
    {
      Int i = threadIdx.x + NTX * threadIdx.y;
      if (i < NTY * 2 and i < 2 * (nHot - n))
        r[i] = rules[2 * n + i];
    }
    __syncthreads();
    if (n + threadIdx.y < nHot) {
      Int i = r[2 * threadIdx.y + 1] * input_stride;
      Int o = r[2 * threadIdx.y] * output_stride;
      for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX)
        d_input_features[i + plane] += d_output_features[o + plane];
    }
    __syncthreads();
  }
}

template <typename T>
void cuda_UnPooling_BackwardPass(T *d_input_features, T *d_output_features,
                                 Int nPlanes, Int input_stride,
                                 Int output_stride, Int *rules, Int nHot) {
  UnPooling_bp<T, 32, 32><<<32, dim3(32, 32)>>>(
      d_input_features, d_output_features, nPlanes, input_stride,
      output_stride, rules, nHot);
}
#endif /* CUDA_UNPOOLING_H */