Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
SparseConvNet
Commits
2c4ed608
Commit
2c4ed608
authored
Jun 20, 2018
by
Benjamin Thomas Graham
Browse files
Goodbye THNN. Hello ATen!
parent
6d4475db
Changes
145
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1648 additions
and
307 deletions
+1648
-307
sparseconvnet/SCN/CUDA/BatchNormalization.cu
sparseconvnet/SCN/CUDA/BatchNormalization.cu
+94
-0
sparseconvnet/SCN/CUDA/BatchNormalization.h
sparseconvnet/SCN/CUDA/BatchNormalization.h
+32
-35
sparseconvnet/SCN/CUDA/BatchwiseMultiplicativeDropout.cu
sparseconvnet/SCN/CUDA/BatchwiseMultiplicativeDropout.cu
+69
-0
sparseconvnet/SCN/CUDA/BatchwiseMultiplicativeDropout.h
sparseconvnet/SCN/CUDA/BatchwiseMultiplicativeDropout.h
+19
-19
sparseconvnet/SCN/CUDA/Convolution.cu
sparseconvnet/SCN/CUDA/Convolution.cu
+318
-0
sparseconvnet/SCN/CUDA/Convolution.h
sparseconvnet/SCN/CUDA/Convolution.h
+122
-125
sparseconvnet/SCN/CUDA/Deconvolution.cu
sparseconvnet/SCN/CUDA/Deconvolution.cu
+83
-0
sparseconvnet/SCN/CUDA/Deconvolution.h
sparseconvnet/SCN/CUDA/Deconvolution.h
+108
-111
sparseconvnet/SCN/CUDA/IOLayers.cu
sparseconvnet/SCN/CUDA/IOLayers.cu
+244
-0
sparseconvnet/SCN/CUDA/IOLayers.h
sparseconvnet/SCN/CUDA/IOLayers.h
+13
-13
sparseconvnet/SCN/CUDA/LeakyReLU.cu
sparseconvnet/SCN/CUDA/LeakyReLU.cu
+29
-0
sparseconvnet/SCN/CUDA/LeakyReLU.h
sparseconvnet/SCN/CUDA/LeakyReLU.h
+4
-4
sparseconvnet/SCN/CUDA/MaxPooling.cu
sparseconvnet/SCN/CUDA/MaxPooling.cu
+103
-0
sparseconvnet/SCN/CUDA/MaxPooling.h
sparseconvnet/SCN/CUDA/MaxPooling.h
+77
-0
sparseconvnet/SCN/CUDA/NetworkInNetwork.cu
sparseconvnet/SCN/CUDA/NetworkInNetwork.cu
+47
-0
sparseconvnet/SCN/CUDA/RuleBookIterator.h
sparseconvnet/SCN/CUDA/RuleBookIterator.h
+34
-0
sparseconvnet/SCN/CUDA/SparseToDense.cu
sparseconvnet/SCN/CUDA/SparseToDense.cu
+56
-0
sparseconvnet/SCN/CUDA/SparseToDense.h
sparseconvnet/SCN/CUDA/SparseToDense.h
+71
-0
sparseconvnet/SCN/CUDA/UnPooling.cu
sparseconvnet/SCN/CUDA/UnPooling.cu
+54
-0
sparseconvnet/SCN/CUDA/UnPooling.h
sparseconvnet/SCN/CUDA/UnPooling.h
+71
-0
No files found.
sparseconvnet/SCN/CUDA/BatchNormalization.cu
0 → 100644
View file @
2c4ed608
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "BatchNormalization.h"
#define BN_F_MACRO(N) \
if (nPlanes % N == 0) { \
BatchNormalization_ForwardPass<T, N, 64>( \
input_features.data<T>(), output_features.data<T>(), nPlanes, \
input_stride, output_stride, nActive, saveMean.data<T>(), \
saveInvStd.data<T>(), runningMean.data<T>(), runningVar.data<T>(), \
OptionalTensorData<T>(weight), OptionalTensorData<T>(bias), eps, momentum, \
train, leakiness); \
}
template
<
typename
T
>
void
cuda_BatchNormalization_updateOutput
(
/*cuda float*/
at
::
Tensor
input_features
,
/*cuda float*/
at
::
Tensor
output_features
,
/*cuda float*/
at
::
Tensor
saveMean
,
/*cuda float*/
at
::
Tensor
saveInvStd
,
/*cuda float*/
at
::
Tensor
runningMean
,
/*cuda float*/
at
::
Tensor
runningVar
,
/*cuda float*/
at
::
Tensor
weight
,
/*cuda float*/
at
::
Tensor
bias
,
T
eps
,
T
momentum
,
bool
train
,
T
leakiness
)
{
output_features
.
resize_as_
(
input_features
);
if
(
input_features
.
ndimension
()
==
2
)
{
auto
nActive
=
input_features
.
size
(
0
);
auto
nPlanes
=
input_features
.
size
(
1
);
auto
input_stride
=
input_features
.
stride
(
0
);
auto
output_stride
=
output_features
.
stride
(
0
);
BN_F_MACRO
(
16
)
else
BN_F_MACRO
(
12
)
else
BN_F_MACRO
(
8
)
else
BN_F_MACRO
(
4
)
else
BN_F_MACRO
(
1
)
}
}
template
<
typename
T
>
void
cuda_BatchNormalizationInTensor_updateOutput
(
/*cuda float*/
at
::
Tensor
input_features
,
/*cuda float*/
at
::
Tensor
output_features
,
/*cuda float*/
at
::
Tensor
saveMean
,
/*cuda float*/
at
::
Tensor
saveInvStd
,
/*cuda float*/
at
::
Tensor
runningMean
,
/*cuda float*/
at
::
Tensor
runningVar
,
/*cuda float*/
at
::
Tensor
weight
,
/*cuda float*/
at
::
Tensor
bias
,
T
eps
,
T
momentum
,
bool
train
,
T
leakiness
)
{
if
(
input_features
.
ndimension
()
==
2
)
{
auto
nActive
=
input_features
.
size
(
0
);
auto
nPlanes
=
input_features
.
size
(
1
);
auto
input_stride
=
input_features
.
stride
(
0
);
auto
output_stride
=
output_features
.
stride
(
0
);
BN_F_MACRO
(
16
)
else
BN_F_MACRO
(
12
)
else
BN_F_MACRO
(
8
)
else
BN_F_MACRO
(
4
)
else
BN_F_MACRO
(
1
)
}
}
#undef BN_F_MACRO
#define BN_B_MACRO(N) \
if (nPlanes % N == 0) { \
BatchNormalization_BackwardPass<T, N, 64>( \
input_features.data<T>(), d_input_features.data<T>(), \
output_features.data<T>(), d_output_features.data<T>(), nPlanes, \
input_stride, output_stride, nActive, saveMean.data<T>(), \
saveInvStd.data<T>(), runningMean.data<T>(), runningVar.data<T>(), \
OptionalTensorData<T>(weight), OptionalTensorData<T>(bias), \
OptionalTensorData<T>(d_weight), OptionalTensorData<T>(d_bias), leakiness); \
}
template
<
typename
T
>
void
cuda_BatchNormalization_backward
(
/*cuda float*/
at
::
Tensor
input_features
,
/*cuda float*/
at
::
Tensor
d_input_features
,
/*cuda float*/
at
::
Tensor
output_features
,
/*cuda float*/
at
::
Tensor
d_output_features
,
/*cuda float*/
at
::
Tensor
saveMean
,
/*cuda float*/
at
::
Tensor
saveInvStd
,
/*cuda float*/
at
::
Tensor
runningMean
,
/*cuda float*/
at
::
Tensor
runningVar
,
/*cuda float*/
at
::
Tensor
weight
,
/*cuda float*/
at
::
Tensor
bias
,
/*cuda float*/
at
::
Tensor
d_weight
,
/*cuda float*/
at
::
Tensor
d_bias
,
T
leakiness
)
{
d_input_features
.
resize_as_
(
d_output_features
);
if
(
input_features
.
ndimension
()
==
2
)
{
auto
nActive
=
input_features
.
size
(
0
);
auto
nPlanes
=
input_features
.
size
(
1
);
auto
input_stride
=
input_features
.
stride
(
0
);
auto
output_stride
=
output_features
.
stride
(
0
);
BN_B_MACRO
(
16
)
else
BN_B_MACRO
(
12
)
else
BN_B_MACRO
(
8
)
else
BN_B_MACRO
(
4
)
else
BN_B_MACRO
(
1
)
}
}
sparseconvnet/SCN/
generic/GPU
/BatchNormalization.h
→
sparseconvnet/SCN/
CUDA
/BatchNormalization.h
View file @
2c4ed608
...
@@ -4,9 +4,9 @@
...
@@ -4,9 +4,9 @@
// This source code is licensed under the license found in the
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
// LICENSE file in the root directory of this source tree.
#ifndef
GPU
_BATCHNORMALIZATION_H
#ifndef
CUDA
_BATCHNORMALIZATION_H
#define
GPU
_BATCHNORMALIZATION_H
#define
CUDA
_BATCHNORMALIZATION_H
#include "../SparseConvNet.h"
#include <cassert>
#include <cassert>
// input_stride and output_stride are normally the same as nPlanes; allow larger
// input_stride and output_stride are normally the same as nPlanes; allow larger
...
@@ -14,22 +14,22 @@
...
@@ -14,22 +14,22 @@
// NTX ~ 16 - nPlanes must be a multiple of this
// NTX ~ 16 - nPlanes must be a multiple of this
// NTY ~ 64 - at least 4
// NTY ~ 64 - at least 4
template
<
typename
T
,
u
Int
NTX
,
u
Int
NTY
>
template
<
typename
T
,
Int
NTX
,
Int
NTY
>
__global__
void
__global__
void
BatchNormalization_f_train
(
T
*
input_features
,
T
*
output_features
,
u
Int
nPlanes
,
BatchNormalization_f_train
(
T
*
input_features
,
T
*
output_features
,
Int
nPlanes
,
u
Int
input_stride
,
u
Int
output_stride
,
u
Int
nActive
,
Int
input_stride
,
Int
output_stride
,
Int
nActive
,
T
*
saveMean
,
T
*
saveInvStd
,
T
*
runningMean
,
T
*
saveMean
,
T
*
saveInvStd
,
T
*
runningMean
,
T
*
runningVar
,
T
*
weight
,
T
*
bias
,
T
eps
,
T
momentum
,
T
*
runningVar
,
T
*
weight
,
T
*
bias
,
T
eps
,
T
momentum
,
T
leakiness
)
{
T
leakiness
)
{
__shared__
T
t
[
NTY
][
NTX
];
__shared__
T
t
[
NTY
][
NTX
];
__shared__
T
t2
[
NTY
][
NTX
];
__shared__
T
t2
[
NTY
][
NTX
];
for
(
u
Int
plane
=
threadIdx
.
x
+
blockIdx
.
x
*
NTX
;
plane
<
nPlanes
;
for
(
Int
plane
=
threadIdx
.
x
+
blockIdx
.
x
*
NTX
;
plane
<
nPlanes
;
plane
+=
gridDim
.
x
*
NTX
)
{
plane
+=
gridDim
.
x
*
NTX
)
{
t
[
threadIdx
.
y
][
threadIdx
.
x
]
=
0
;
t
[
threadIdx
.
y
][
threadIdx
.
x
]
=
0
;
t2
[
threadIdx
.
y
][
threadIdx
.
x
]
=
0
;
t2
[
threadIdx
.
y
][
threadIdx
.
x
]
=
0
;
for
(
u
Int
row
=
threadIdx
.
y
,
c
=
plane
+
threadIdx
.
y
*
input_stride
;
for
(
Int
row
=
threadIdx
.
y
,
c
=
plane
+
threadIdx
.
y
*
input_stride
;
row
<
nActive
;
row
+=
NTY
,
c
+=
input_stride
*
NTY
)
{
row
<
nActive
;
row
+=
NTY
,
c
+=
input_stride
*
NTY
)
{
T
i
=
input_features
[
c
];
T
i
=
input_features
[
c
];
t
[
threadIdx
.
y
][
threadIdx
.
x
]
+=
i
;
t
[
threadIdx
.
y
][
threadIdx
.
x
]
+=
i
;
...
@@ -38,7 +38,7 @@ BatchNormalization_f_train(T *input_features, T *output_features, uInt nPlanes,
...
@@ -38,7 +38,7 @@ BatchNormalization_f_train(T *input_features, T *output_features, uInt nPlanes,
__syncthreads
();
__syncthreads
();
T
_saveMean
=
0
;
T
_saveMean
=
0
;
T
_saveInvStd
=
0
;
T
_saveInvStd
=
0
;
for
(
u
Int
row
=
0
;
row
<
NTY
;
row
++
)
{
for
(
Int
row
=
0
;
row
<
NTY
;
row
++
)
{
_saveMean
+=
t
[
row
][
threadIdx
.
x
];
_saveMean
+=
t
[
row
][
threadIdx
.
x
];
_saveInvStd
+=
t2
[
row
][
threadIdx
.
x
];
_saveInvStd
+=
t2
[
row
][
threadIdx
.
x
];
}
}
...
@@ -65,7 +65,7 @@ BatchNormalization_f_train(T *input_features, T *output_features, uInt nPlanes,
...
@@ -65,7 +65,7 @@ BatchNormalization_f_train(T *input_features, T *output_features, uInt nPlanes,
T
W
=
t
[
0
][
threadIdx
.
x
];
T
W
=
t
[
0
][
threadIdx
.
x
];
T
B
=
t
[
1
][
threadIdx
.
x
];
T
B
=
t
[
1
][
threadIdx
.
x
];
for
(
u
Int
row
=
threadIdx
.
y
,
ci
=
plane
+
threadIdx
.
y
*
input_stride
,
for
(
Int
row
=
threadIdx
.
y
,
ci
=
plane
+
threadIdx
.
y
*
input_stride
,
co
=
plane
+
threadIdx
.
y
*
output_stride
;
co
=
plane
+
threadIdx
.
y
*
output_stride
;
row
<
nActive
;
row
<
nActive
;
row
+=
NTY
,
ci
+=
input_stride
*
NTY
,
co
+=
output_stride
*
NTY
)
{
row
+=
NTY
,
ci
+=
input_stride
*
NTY
,
co
+=
output_stride
*
NTY
)
{
...
@@ -75,16 +75,16 @@ BatchNormalization_f_train(T *input_features, T *output_features, uInt nPlanes,
...
@@ -75,16 +75,16 @@ BatchNormalization_f_train(T *input_features, T *output_features, uInt nPlanes,
__syncthreads
();
__syncthreads
();
}
}
}
}
template
<
typename
T
,
u
Int
NTX
,
u
Int
NTY
>
template
<
typename
T
,
Int
NTX
,
Int
NTY
>
__global__
void
__global__
void
BatchNormalization_f_test
(
T
*
input_features
,
T
*
output_features
,
u
Int
nPlanes
,
BatchNormalization_f_test
(
T
*
input_features
,
T
*
output_features
,
Int
nPlanes
,
u
Int
input_stride
,
u
Int
output_stride
,
u
Int
nActive
,
Int
input_stride
,
Int
output_stride
,
Int
nActive
,
T
*
saveMean
,
T
*
saveInvStd
,
T
*
runningMean
,
T
*
saveMean
,
T
*
saveInvStd
,
T
*
runningMean
,
T
*
runningVar
,
T
*
weight
,
T
*
bias
,
T
eps
,
T
momentum
,
T
*
runningVar
,
T
*
weight
,
T
*
bias
,
T
eps
,
T
momentum
,
T
leakiness
)
{
T
leakiness
)
{
__shared__
T
W
[
NTX
];
__shared__
T
W
[
NTX
];
__shared__
T
B
[
NTX
];
__shared__
T
B
[
NTX
];
for
(
u
Int
plane
=
threadIdx
.
x
+
blockIdx
.
x
*
NTX
;
plane
<
nPlanes
;
for
(
Int
plane
=
threadIdx
.
x
+
blockIdx
.
x
*
NTX
;
plane
<
nPlanes
;
plane
+=
gridDim
.
x
*
NTX
)
{
plane
+=
gridDim
.
x
*
NTX
)
{
if
(
threadIdx
.
y
==
0
)
{
if
(
threadIdx
.
y
==
0
)
{
W
[
threadIdx
.
x
]
=
W
[
threadIdx
.
x
]
=
...
@@ -95,7 +95,7 @@ BatchNormalization_f_test(T *input_features, T *output_features, uInt nPlanes,
...
@@ -95,7 +95,7 @@ BatchNormalization_f_test(T *input_features, T *output_features, uInt nPlanes,
__syncthreads
();
__syncthreads
();
float
w
=
W
[
threadIdx
.
x
],
b
=
B
[
threadIdx
.
x
];
float
w
=
W
[
threadIdx
.
x
],
b
=
B
[
threadIdx
.
x
];
for
(
u
Int
row
=
threadIdx
.
y
,
ci
=
plane
+
threadIdx
.
y
*
input_stride
,
for
(
Int
row
=
threadIdx
.
y
,
ci
=
plane
+
threadIdx
.
y
*
input_stride
,
co
=
plane
+
threadIdx
.
y
*
output_stride
;
co
=
plane
+
threadIdx
.
y
*
output_stride
;
row
<
nActive
;
row
<
nActive
;
row
+=
NTY
,
ci
+=
input_stride
*
NTY
,
co
+=
output_stride
*
NTY
)
{
row
+=
NTY
,
ci
+=
input_stride
*
NTY
,
co
+=
output_stride
*
NTY
)
{
...
@@ -106,40 +106,38 @@ BatchNormalization_f_test(T *input_features, T *output_features, uInt nPlanes,
...
@@ -106,40 +106,38 @@ BatchNormalization_f_test(T *input_features, T *output_features, uInt nPlanes,
}
}
}
}
template
<
typename
T
,
u
Int
NTX
,
u
Int
NTY
>
template
<
typename
T
,
Int
NTX
,
Int
NTY
>
void
BatchNormalization_ForwardPass
(
T
*
input_features
,
T
*
output_features
,
void
BatchNormalization_ForwardPass
(
T
*
input_features
,
T
*
output_features
,
u
Int
nPlanes
,
u
Int
input_stride
,
Int
nPlanes
,
Int
input_stride
,
u
Int
output_stride
,
u
Int
nActive
,
Int
output_stride
,
Int
nActive
,
T
*
saveMean
,
T
*
saveInvStd
,
T
*
runningMean
,
T
*
saveMean
,
T
*
saveInvStd
,
T
*
runningMean
,
T
*
runningVar
,
T
*
weight
,
T
*
bias
,
T
eps
,
T
*
runningVar
,
T
*
weight
,
T
*
bias
,
T
eps
,
T
momentum
,
bool
train
,
T
leakiness
)
{
T
momentum
,
bool
train
,
T
leakiness
)
{
if
(
train
)
{
if
(
train
)
{
BatchNormalization_f_train
<
BatchNormalization_f_train
<
T
,
NTX
,
NTY
><<<
std
::
min
((
uInt
)
16
,
nPlanes
/
NTX
),
dim3
(
NTX
,
NTY
),
0
,
T
,
NTX
,
NTY
><<<
std
::
min
((
Int
)
16
,
nPlanes
/
NTX
),
dim3
(
NTX
,
NTY
)
>>>
(
THCState_getCurrentStream
(
state
)
>>>
(
input_features
,
output_features
,
nPlanes
,
input_stride
,
output_stride
,
input_features
,
output_features
,
nPlanes
,
input_stride
,
output_stride
,
nActive
,
saveMean
,
saveInvStd
,
runningMean
,
runningVar
,
weight
,
bias
,
nActive
,
saveMean
,
saveInvStd
,
runningMean
,
runningVar
,
weight
,
bias
,
eps
,
momentum
,
leakiness
);
eps
,
momentum
,
leakiness
);
}
else
{
}
else
{
BatchNormalization_f_test
<
BatchNormalization_f_test
<
T
,
NTX
,
NTY
><<<
std
::
min
((
uInt
)
16
,
nPlanes
/
NTX
),
dim3
(
NTX
,
NTY
),
0
,
T
,
NTX
,
NTY
><<<
std
::
min
((
Int
)
16
,
nPlanes
/
NTX
),
dim3
(
NTX
,
NTY
)
>>>
(
THCState_getCurrentStream
(
state
)
>>>
(
input_features
,
output_features
,
nPlanes
,
input_stride
,
output_stride
,
input_features
,
output_features
,
nPlanes
,
input_stride
,
output_stride
,
nActive
,
saveMean
,
saveInvStd
,
runningMean
,
runningVar
,
weight
,
bias
,
nActive
,
saveMean
,
saveInvStd
,
runningMean
,
runningVar
,
weight
,
bias
,
eps
,
momentum
,
leakiness
);
eps
,
momentum
,
leakiness
);
}
}
}
}
template
<
typename
T
,
u
Int
NTX
,
u
Int
NTY
>
template
<
typename
T
,
Int
NTX
,
Int
NTY
>
__global__
void
__global__
void
BatchNormalization_b
(
T
*
input_features
,
T
*
d_input_features
,
T
*
output_features
,
BatchNormalization_b
(
T
*
input_features
,
T
*
d_input_features
,
T
*
output_features
,
T
*
d_output_features
,
u
Int
nPlanes
,
u
Int
input_stride
,
T
*
d_output_features
,
Int
nPlanes
,
Int
input_stride
,
u
Int
output_stride
,
u
Int
nActive
,
T
*
saveMean
,
Int
output_stride
,
Int
nActive
,
T
*
saveMean
,
T
*
saveInvStd
,
T
*
runningMean
,
T
*
runningVar
,
T
*
weight
,
T
*
saveInvStd
,
T
*
runningMean
,
T
*
runningVar
,
T
*
weight
,
T
*
bias
,
T
*
d_weight
,
T
*
d_bias
,
T
leakiness
)
{
T
*
bias
,
T
*
d_weight
,
T
*
d_bias
,
T
leakiness
)
{
__shared__
T
t
[
NTY
][
NTX
];
__shared__
T
t
[
NTY
][
NTX
];
__shared__
T
t2
[
NTY
][
NTX
];
__shared__
T
t2
[
NTY
][
NTX
];
for
(
u
Int
plane
=
threadIdx
.
x
+
blockIdx
.
x
*
NTX
;
plane
<
nPlanes
;
for
(
Int
plane
=
threadIdx
.
x
+
blockIdx
.
x
*
NTX
;
plane
<
nPlanes
;
plane
+=
gridDim
.
x
*
NTX
)
{
plane
+=
gridDim
.
x
*
NTX
)
{
if
(
threadIdx
.
y
==
0
)
{
if
(
threadIdx
.
y
==
0
)
{
t
[
0
][
threadIdx
.
x
]
=
saveMean
[
plane
];
t
[
0
][
threadIdx
.
x
]
=
saveMean
[
plane
];
...
@@ -153,7 +151,7 @@ BatchNormalization_b(T *input_features, T *d_input_features, T *output_features,
...
@@ -153,7 +151,7 @@ BatchNormalization_b(T *input_features, T *d_input_features, T *output_features,
__syncthreads
();
__syncthreads
();
t
[
threadIdx
.
y
][
threadIdx
.
x
]
=
0
;
t
[
threadIdx
.
y
][
threadIdx
.
x
]
=
0
;
t2
[
threadIdx
.
y
][
threadIdx
.
x
]
=
0
;
t2
[
threadIdx
.
y
][
threadIdx
.
x
]
=
0
;
for
(
u
Int
row
=
threadIdx
.
y
,
ci
=
plane
+
threadIdx
.
y
*
input_stride
,
for
(
Int
row
=
threadIdx
.
y
,
ci
=
plane
+
threadIdx
.
y
*
input_stride
,
co
=
plane
+
threadIdx
.
y
*
output_stride
;
co
=
plane
+
threadIdx
.
y
*
output_stride
;
row
<
nActive
;
row
<
nActive
;
row
+=
NTY
,
ci
+=
input_stride
*
NTY
,
co
+=
output_stride
*
NTY
)
{
row
+=
NTY
,
ci
+=
input_stride
*
NTY
,
co
+=
output_stride
*
NTY
)
{
...
@@ -180,7 +178,7 @@ BatchNormalization_b(T *input_features, T *d_input_features, T *output_features,
...
@@ -180,7 +178,7 @@ BatchNormalization_b(T *input_features, T *d_input_features, T *output_features,
T
k
=
dotp
*
_saveInvStd
*
_saveInvStd
/
nActive
;
T
k
=
dotp
*
_saveInvStd
*
_saveInvStd
/
nActive
;
for
(
u
Int
row
=
threadIdx
.
y
,
ci
=
plane
+
threadIdx
.
y
*
input_stride
,
for
(
Int
row
=
threadIdx
.
y
,
ci
=
plane
+
threadIdx
.
y
*
input_stride
,
co
=
plane
+
threadIdx
.
y
*
output_stride
;
co
=
plane
+
threadIdx
.
y
*
output_stride
;
row
<
nActive
;
row
<
nActive
;
row
+=
NTY
,
ci
+=
input_stride
*
NTY
,
co
+=
output_stride
*
NTY
)
{
row
+=
NTY
,
ci
+=
input_stride
*
NTY
,
co
+=
output_stride
*
NTY
)
{
...
@@ -192,17 +190,16 @@ BatchNormalization_b(T *input_features, T *d_input_features, T *output_features,
...
@@ -192,17 +190,16 @@ BatchNormalization_b(T *input_features, T *d_input_features, T *output_features,
}
}
}
}
template
<
typename
T
,
u
Int
NTX
,
u
Int
NTY
>
template
<
typename
T
,
Int
NTX
,
Int
NTY
>
void
BatchNormalization_BackwardPass
(
T
*
input_features
,
T
*
d_input_features
,
void
BatchNormalization_BackwardPass
(
T
*
input_features
,
T
*
d_input_features
,
T
*
output_features
,
T
*
d_output_features
,
T
*
output_features
,
T
*
d_output_features
,
u
Int
nPlanes
,
u
Int
input_stride
,
Int
nPlanes
,
Int
input_stride
,
u
Int
output_stride
,
u
Int
nActive
,
Int
output_stride
,
Int
nActive
,
T
*
saveMean
,
T
*
saveInvStd
,
T
*
runningMean
,
T
*
saveMean
,
T
*
saveInvStd
,
T
*
runningMean
,
T
*
runningVar
,
T
*
weight
,
T
*
bias
,
T
*
runningVar
,
T
*
weight
,
T
*
bias
,
T
*
d_weight
,
T
*
d_bias
,
T
leakiness
)
{
T
*
d_weight
,
T
*
d_bias
,
T
leakiness
)
{
BatchNormalization_b
<
T
,
NTX
,
BatchNormalization_b
<
NTY
><<<
std
::
min
((
uInt
)
16
,
nPlanes
/
NTX
),
dim3
(
NTX
,
NTY
),
T
,
NTX
,
NTY
><<<
std
::
min
((
Int
)
16
,
nPlanes
/
NTX
),
dim3
(
NTX
,
NTY
)
>>>
(
0
,
THCState_getCurrentStream
(
state
)
>>>
(
input_features
,
d_input_features
,
output_features
,
d_output_features
,
input_features
,
d_input_features
,
output_features
,
d_output_features
,
nPlanes
,
input_stride
,
output_stride
,
nActive
,
saveMean
,
saveInvStd
,
nPlanes
,
input_stride
,
output_stride
,
nActive
,
saveMean
,
saveInvStd
,
runningMean
,
runningVar
,
weight
,
bias
,
d_weight
,
d_bias
,
leakiness
);
runningMean
,
runningVar
,
weight
,
bias
,
d_weight
,
d_bias
,
leakiness
);
...
@@ -210,4 +207,4 @@ void BatchNormalization_BackwardPass(T *input_features, T *d_input_features,
...
@@ -210,4 +207,4 @@ void BatchNormalization_BackwardPass(T *input_features, T *d_input_features,
#undef NTX
#undef NTX
#undef NTY
#undef NTY
#endif
/*
GPU
_BATCHNORMALIZATION_H */
#endif
/*
CUDA
_BATCHNORMALIZATION_H */
sparseconvnet/SCN/
generic/GPU
/BatchwiseMultiplicativeDropout.cu
→
sparseconvnet/SCN/
CUDA
/BatchwiseMultiplicativeDropout.cu
View file @
2c4ed608
...
@@ -4,32 +4,28 @@
...
@@ -4,32 +4,28 @@
// This source code is licensed under the license found in the
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/GPU/BatchwiseMultiplicativeDropout.cu"
#else
#include "BatchwiseMultiplicativeDropout.h"
#include "BatchwiseMultiplicativeDropout.h"
#define SPARSECONVNET_FOO(NTX, NTY) \
#define SPARSECONVNET_FOO(NTX, NTY) \
{ \
{ \
if (nPlanes % NTX == 0) { \
if (nPlanes % NTX == 0) { \
BatchwiseMultiplicativeDropout_fp<real, NTX, NTY> << < \
BatchwiseMultiplicativeDropout_fp< \
dim3(std::min(16L, nPlanes / NTX), 16), dim3(NTX, NTY), 0, \
T, NTX, \
THCState_getCurrentStream(state)>>> \
NTY><<<dim3(std::min(16L, nPlanes / NTX), 16), dim3(NTX, NTY)>>>( \
(THCTensor_(data)(state, input_features), \
input_features.data<T>(), output_features.data<T>(), \
THCTensor_(data)(state, output_features), \
noise.data<T>(), nActive, nPlanes, nPlanes, nPlanes, alpha); \
THCTensor_(data)(state, noise), nActive, nPlanes, nPlanes, nPlanes, \
alpha); \
return; \
return; \
} \
} \
}
}
extern
"C"
void
scn_R_
(
BatchwiseMultiplicativeDropout_updateOutput
)(
template
<
typename
T
>
THCTensor
*
input_features
,
THCTensor
*
output_features
,
THCTensor
*
noise
,
void
cuda_BatchwiseMultiplicativeDropout_updateOutput
(
/*cuda float*/
at
::
Tensor
input_features
,
/*cuda float*/
at
::
Tensor
output_features
,
/*cuda float*/
at
::
Tensor
noise
,
float
alpha
)
{
float
alpha
)
{
if
(
input_features
!=
output_features
)
output_features
.
resize_as_
(
input_features
);
THCTensor_
(
resizeAs
)(
state
,
output_features
,
input_features
);
auto
nActive
=
input_features
.
size
(
0
);
auto
nActive
=
input_features
->
size
[
0
];
auto
nPlanes
=
input_features
.
size
(
1
);
auto
nPlanes
=
input_features
->
size
[
1
];
SPARSECONVNET_FOO
(
32
,
32
)
SPARSECONVNET_FOO
(
32
,
32
)
SPARSECONVNET_FOO
(
24
,
32
)
SPARSECONVNET_FOO
(
24
,
32
)
SPARSECONVNET_FOO
(
16
,
64
)
SPARSECONVNET_FOO
(
16
,
64
)
...
@@ -43,24 +39,24 @@ extern "C" void scn_R_(BatchwiseMultiplicativeDropout_updateOutput)(
...
@@ -43,24 +39,24 @@ extern "C" void scn_R_(BatchwiseMultiplicativeDropout_updateOutput)(
#define SPARSECONVNET_FOO(NTX, NTY) \
#define SPARSECONVNET_FOO(NTX, NTY) \
{ \
{ \
if (nPlanes % NTX == 0) { \
if (nPlanes % NTX == 0) { \
BatchwiseMultiplicativeDropout_bp<real, NTX, NTY> << < \
BatchwiseMultiplicativeDropout_bp< \
dim3(std::min(16L, nPlanes / NTX), 16), dim3(NTX, NTY), 0, \
T, NTX, \
THCState_getCurrentStream(state)>>> \
NTY><<<dim3(std::min(16L, nPlanes / NTX), 16), dim3(NTX, NTY)>>>( \
(THCTensor_(data)(state, input_features), \
input_features.data<T>(), d_input_features.data<T>(), \
THCTensor_(data)(state, d_input_features), \
d_output_features.data<T>(), noise.data<T>(), nActive, nPlanes, \
THCTensor_(data)(state, d_output_features), \
nPlanes, nPlanes, alpha); \
THCTensor_(data)(state, noise), nActive, nPlanes, nPlanes, nPlanes, \
alpha); \
return; \
return; \
} \
} \
}
}
extern
"C"
void
scn_R_
(
BatchwiseMultiplicativeDropout_updateGradInput
)(
template
<
typename
T
>
THCTensor
*
input_features
,
THCTensor
*
d_input_features
,
void
cuda_BatchwiseMultiplicativeDropout_updateGradInput
(
THCTensor
*
d_output_features
,
THCTensor
*
noise
,
float
alpha
)
{
/*cuda float*/
at
::
Tensor
input_features
,
if
(
d_input_features
!=
d_output_features
)
/*cuda float*/
at
::
Tensor
d_input_features
,
THCTensor_
(
resizeAs
)(
state
,
d_input_features
,
d_output_features
);
/*cuda float*/
at
::
Tensor
d_output_features
,
auto
nActive
=
input_features
->
size
[
0
];
/*cuda float*/
at
::
Tensor
noise
,
float
alpha
)
{
auto
nPlanes
=
input_features
->
size
[
1
];
d_input_features
.
resize_as_
(
d_output_features
);
auto
nActive
=
input_features
.
size
(
0
);
auto
nPlanes
=
input_features
.
size
(
1
);
SPARSECONVNET_FOO
(
32
,
32
)
SPARSECONVNET_FOO
(
32
,
32
)
SPARSECONVNET_FOO
(
24
,
32
)
SPARSECONVNET_FOO
(
24
,
32
)
...
@@ -71,5 +67,3 @@ extern "C" void scn_R_(BatchwiseMultiplicativeDropout_updateGradInput)(
...
@@ -71,5 +67,3 @@ extern "C" void scn_R_(BatchwiseMultiplicativeDropout_updateGradInput)(
SPARSECONVNET_FOO
(
1
,
64
)
SPARSECONVNET_FOO
(
1
,
64
)
}
}
#undef SPARSECONVNET_FOO
#undef SPARSECONVNET_FOO
#endif
sparseconvnet/SCN/
generic/GPU
/BatchwiseMultiplicativeDropout.h
→
sparseconvnet/SCN/
CUDA
/BatchwiseMultiplicativeDropout.h
View file @
2c4ed608
...
@@ -4,50 +4,50 @@
...
@@ -4,50 +4,50 @@
// This source code is licensed under the license found in the
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
// LICENSE file in the root directory of this source tree.
#ifndef
GPU
_BATCHWISEMULTIPLICATIVEDROPOUT_H
#ifndef
CUDA
_BATCHWISEMULTIPLICATIVEDROPOUT_H
#define
GPU
_BATCHWISEMULTIPLICATIVEDROPOUT_H
#define
CUDA
_BATCHWISEMULTIPLICATIVEDROPOUT_H
template
<
typename
T
,
u
Int
NTX
,
u
Int
NTY
>
template
<
typename
T
,
Int
NTX
,
Int
NTY
>
__global__
void
BatchwiseMultiplicativeDropout_fp
(
T
*
input_features
,
__global__
void
BatchwiseMultiplicativeDropout_fp
(
T
*
input_features
,
T
*
output_features
,
T
*
noise
,
T
*
output_features
,
T
*
noise
,
u
Int
nActive
,
u
Int
nPlanes
,
Int
nActive
,
Int
nPlanes
,
u
Int
input_stride
,
Int
input_stride
,
u
Int
output_stride
,
T
alpha
)
{
Int
output_stride
,
T
alpha
)
{
__shared__
T
nz
[
NTX
];
__shared__
T
nz
[
NTX
];
for
(
u
Int
plane
=
threadIdx
.
x
+
blockIdx
.
x
*
NTX
;
plane
<
nPlanes
;
for
(
Int
plane
=
threadIdx
.
x
+
blockIdx
.
x
*
NTX
;
plane
<
nPlanes
;
plane
+=
gridDim
.
x
*
NTX
)
{
plane
+=
gridDim
.
x
*
NTX
)
{
if
(
threadIdx
.
y
==
0
)
if
(
threadIdx
.
y
==
0
)
nz
[
threadIdx
.
x
]
=
noise
[
plane
];
nz
[
threadIdx
.
x
]
=
noise
[
plane
];
__syncthreads
();
__syncthreads
();
for
(
u
Int
row
=
threadIdx
.
y
+
blockIdx
.
y
*
NTY
;
row
<
nActive
;
for
(
Int
row
=
threadIdx
.
y
+
blockIdx
.
y
*
NTY
;
row
<
nActive
;
row
+=
gridDim
.
y
*
NTY
)
{
row
+=
gridDim
.
y
*
NTY
)
{
u
Int
i
=
row
*
input_stride
+
plane
;
Int
i
=
row
*
input_stride
+
plane
;
u
Int
o
=
row
*
output_stride
+
plane
;
Int
o
=
row
*
output_stride
+
plane
;
output_features
[
o
]
=
input_features
[
i
]
*
nz
[
threadIdx
.
x
]
*
output_features
[
o
]
=
input_features
[
i
]
*
nz
[
threadIdx
.
x
]
*
((
input_features
[
i
]
>
0
)
?
1
:
alpha
);
((
input_features
[
i
]
>
0
)
?
1
:
alpha
);
}
}
__syncthreads
();
__syncthreads
();
}
}
}
}
template
<
typename
T
,
u
Int
NTX
,
u
Int
NTY
>
template
<
typename
T
,
Int
NTX
,
Int
NTY
>
__global__
void
__global__
void
BatchwiseMultiplicativeDropout_bp
(
T
*
input_features
,
T
*
d_input_features
,
BatchwiseMultiplicativeDropout_bp
(
T
*
input_features
,
T
*
d_input_features
,
T
*
d_output_features
,
T
*
noise
,
u
Int
nActive
,
T
*
d_output_features
,
T
*
noise
,
Int
nActive
,
u
Int
nPlanes
,
u
Int
input_stride
,
Int
nPlanes
,
Int
input_stride
,
u
Int
output_stride
,
T
alpha
)
{
Int
output_stride
,
T
alpha
)
{
__shared__
T
nz
[
NTX
];
__shared__
T
nz
[
NTX
];
for
(
u
Int
plane
=
threadIdx
.
x
+
blockIdx
.
x
*
NTX
;
plane
<
nPlanes
;
for
(
Int
plane
=
threadIdx
.
x
+
blockIdx
.
x
*
NTX
;
plane
<
nPlanes
;
plane
+=
gridDim
.
x
*
NTX
)
{
plane
+=
gridDim
.
x
*
NTX
)
{
if
(
threadIdx
.
y
==
0
)
if
(
threadIdx
.
y
==
0
)
nz
[
threadIdx
.
x
]
=
noise
[
plane
];
nz
[
threadIdx
.
x
]
=
noise
[
plane
];
__syncthreads
();
__syncthreads
();
for
(
u
Int
row
=
threadIdx
.
y
+
blockIdx
.
y
*
NTY
;
row
<
nActive
;
for
(
Int
row
=
threadIdx
.
y
+
blockIdx
.
y
*
NTY
;
row
<
nActive
;
row
+=
gridDim
.
y
*
NTY
)
{
row
+=
gridDim
.
y
*
NTY
)
{
u
Int
i
=
row
*
input_stride
+
plane
;
Int
i
=
row
*
input_stride
+
plane
;
u
Int
o
=
row
*
output_stride
+
plane
;
Int
o
=
row
*
output_stride
+
plane
;
d_input_features
[
i
]
=
d_output_features
[
o
]
*
nz
[
threadIdx
.
x
]
*
d_input_features
[
i
]
=
d_output_features
[
o
]
*
nz
[
threadIdx
.
x
]
*
((
input_features
[
i
]
>
0
)
?
1
:
alpha
);
((
input_features
[
i
]
>
0
)
?
1
:
alpha
);
}
}
__syncthreads
();
__syncthreads
();
}
}
}
}
#endif
/*
GPU
_BATCHWISEMULTIPLICATIVEDROPOUT_H */
#endif
/*
CUDA
_BATCHWISEMULTIPLICATIVEDROPOUT_H */
sparseconvnet/SCN/CUDA/Convolution.cu
0 → 100644
View file @
2c4ed608
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "Convolution.h"
#include "RuleBookIterator.h"
template
<
typename
T
,
Int
Dimension
>
double
cuda_Convolution_updateOutput
(
/*long*/
at
::
Tensor
inputSize
,
/*long*/
at
::
Tensor
outputSize
,
/*long*/
at
::
Tensor
filterSize
,
/*long*/
at
::
Tensor
filterStride
,
Metadata
<
Dimension
>
&
m
,
/*cuda float*/
at
::
Tensor
input_features
,
/*cuda float*/
at
::
Tensor
output_features
,
/*cuda float*/
at
::
Tensor
weight
,
/*cuda float*/
at
::
Tensor
bias
)
{
auto
_rules
=
m
.
getRuleBook
(
inputSize
,
outputSize
,
filterSize
,
filterStride
,
true
);
Int
nActive
=
m
.
getNActive
(
outputSize
);
output_features
.
resize_
({
nActive
,
weight
.
size
(
2
)});
if
(
not
bias
.
numel
())
output_features
.
zero_
();
double
flops
=
0
;
if
(
nActive
)
{
auto
iF
=
input_features
.
data
<
T
>
();
auto
oF
=
output_features
.
data
<
T
>
();
Int
ip
=
input_features
.
size
(
1
);
Int
op
=
output_features
.
size
(
1
);
auto
w
=
weight
.
data
<
T
>
();
if
(
bias
.
numel
())
{
auto
b
=
bias
.
data
<
T
>
();
for
(
Int
i
=
0
;
i
<
op
;
i
+=
32
)
{
Int
blockDim
=
min
((
Int
)
32
,
op
-
i
);
Int
gridDim
=
min
((
Int
)
4096
,
nActive
);
Convolution_fp_bias
<<<
gridDim
,
blockDim
>>>
(
oF
+
i
,
b
+
i
,
op
,
op
,
nActive
);
}
}
Int
c
=
ip
*
op
;
RULEBOOKITERATOR
(
dConvolution_forward2
<
T
>
(
iF
,
oF
,
w
,
rbB
,
nHotB
,
ip
,
ip
,
op
,
op
);
,
w
+=
c
;
flops
+=
nHotB
*
c
;)
}
return
flops
;
}
template
<
typename
T
,
Int
Dimension
>
void
cuda_Convolution_backward
(
/*long*/
at
::
Tensor
inputSize
,
/*long*/
at
::
Tensor
outputSize
,
/*long*/
at
::
Tensor
filterSize
,
/*long*/
at
::
Tensor
filterStride
,
Metadata
<
Dimension
>
&
m
,
/*cuda float*/
at
::
Tensor
input_features
,
/*cuda float*/
at
::
Tensor
d_input_features
,
/*cuda float*/
at
::
Tensor
d_output_features
,
/*cuda float*/
at
::
Tensor
weight
,
/*cuda float*/
at
::
Tensor
d_weight
,
/*cuda float*/
at
::
Tensor
d_bias
)
{
auto
_rules
=
m
.
getRuleBook
(
inputSize
,
outputSize
,
filterSize
,
filterStride
,
true
);
Int
nActive
=
m
.
getNActive
(
outputSize
);
d_input_features
.
resize_as_
(
input_features
);
d_input_features
.
zero_
();
if
(
nActive
)
{
auto
iF
=
input_features
.
data
<
T
>
();
auto
diF
=
d_input_features
.
data
<
T
>
();
auto
doF
=
d_output_features
.
data
<
T
>
();
Int
ip
=
input_features
.
size
(
1
);
Int
op
=
d_output_features
.
size
(
1
);
auto
w
=
weight
.
data
<
T
>
();
auto
dw
=
d_weight
.
data
<
T
>
();
Int
c
=
ip
*
op
;
RULEBOOKITERATOR
(
dConvolution_backward_dW2
<
T
>
(
iF
,
diF
,
doF
,
w
,
dw
,
rbB
,
nHotB
,
ip
,
ip
,
op
,
op
);
,
w
+=
c
;
dw
+=
c
;)
if
(
d_bias
.
numel
())
{
auto
db
=
d_bias
.
data
<
T
>
();
Convolution_bp_bias
(
doF
,
db
,
op
,
op
,
nActive
);
}
}
}
template
<
typename
T
,
Int
Dimension
>
double
cuda_SubmanifoldConvolution_updateOutput
(
/*long*/
at
::
Tensor
inputSize
,
/*long*/
at
::
Tensor
filterSize
,
Metadata
<
Dimension
>
&
m
,
/*cuda float*/
at
::
Tensor
input_features
,
/*cuda float*/
at
::
Tensor
output_features
,
/*cuda float*/
at
::
Tensor
weight
,
/*cuda float*/
at
::
Tensor
bias
)
{
auto
_rules
=
m
.
getSubmanifoldRuleBook
(
inputSize
,
filterSize
,
true
);
Int
nActive
=
m
.
getNActive
(
inputSize
);
output_features
.
resize_
({
nActive
,
weight
.
size
(
2
)});
if
(
bias
.
numel
()
and
nActive
)
output_features
.
copy_
(
bias
);
else
output_features
.
zero_
();
double
flops
=
0
;
if
(
nActive
)
{
auto
iF
=
input_features
.
data
<
T
>
();
auto
oF
=
output_features
.
data
<
T
>
();
Int
ip
=
input_features
.
size
(
1
);
Int
op
=
output_features
.
size
(
1
);
auto
w
=
weight
.
data
<
T
>
();
// if (bias.numel()) {
// auto b = bias.data<T>();
// for (Int i = 0; i < op; i += 32) {
// Int blockDim = min((Int)32, op - i);
// Int gridDim = min((Int)4096, nActive);
// Convolution_fp_bias<<<gridDim, blockDim>>>(oF + i, b + i, op, op,
// nActive);
// }
// }
Int
c
=
ip
*
op
;
RULEBOOKITERATOR
(
dConvolution_forward2
<
T
>
(
iF
,
oF
,
w
,
rbB
,
nHotB
,
ip
,
ip
,
op
,
op
);
,
w
+=
c
;
flops
+=
nHotB
*
c
;)
}
return
flops
;
}
// Backward pass of the submanifold sparse convolution: computes gradients
// w.r.t. the input features, the weights and (optionally) the bias.
//
// Parameters:
//   inputSize, filterSize : long tensors describing spatial/filter dims.
//   m                     : sparsity metadata; rulebook must match forward.
//   input_features        : forward-pass input (needed for dW).
//   d_input_features      : output; resized/zeroed here, then accumulated.
//   d_output_features     : incoming gradient, one row per active site.
//   weight, d_weight      : weights and their gradient accumulator.
//   d_bias                : optional; d_bias.numel() == 0 skips bias grads.
template <typename T, Int Dimension>
void cuda_SubmanifoldConvolution_backward(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor filterSize,
    Metadata<Dimension> &m, /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features,
    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor d_weight,
    /*cuda float*/ at::Tensor d_bias) {
  auto _rules = m.getSubmanifoldRuleBook(inputSize, filterSize, true);
  Int nActive = m.getNActive(inputSize);
  d_input_features.resize_as_(input_features);
  d_input_features.zero_(); // kernels accumulate into dI, so start from zero
  if (nActive) {
    auto iF = input_features.data<T>();
    auto diF = d_input_features.data<T>();
    auto doF = d_output_features.data<T>();
    Int ip = input_features.size(1);    // input planes
    Int op = d_output_features.size(1); // output planes
    auto w = weight.data<T>();
    auto dw = d_weight.data<T>();
    Int c = ip * op; // weight-matrix size of one rulebook entry
    // One fused dInput/dW kernel per rulebook entry; rbB/nHotB are bound by
    // the RULEBOOKITERATOR macro, which also advances w/dw between entries.
    RULEBOOKITERATOR(
        dConvolution_backward_dW2<T>(iF, diF, doF, w, dw, rbB, nHotB, ip, ip,
                                     op, op);
        , w += c; dw += c;)
    if (d_bias.numel()) {
      auto db = d_bias.data<T>();
      // Bias gradient: column-sum of d_output_features (via dColumnSum).
      // NOTE(review): args are (matrix, target, op, op, nActive); confirm this
      // matches Convolution_bp_bias's (nRows, nColumns, nCOLUMNS) ordering.
      Convolution_bp_bias(doF, db, op, op, nActive);
    }
  }
}
// Forward pass of a "full" sparse convolution: input and output live in two
// different metadata objects (mIn, mOut); the rulebook maps mIn's active
// sites onto mOut's, growing the active set per filterSize/filterStride.
//
// Parameters:
//   inputSize, outputSize         : long tensors, spatial extents in/out.
//   filterSize, filterStride      : long tensors, filter geometry.
//   mIn, mOut                     : metadata for input and output grids.
//   input_features                : (nActiveIn x ip) feature matrix.
//   output_features               : resized here to (nActive x weight.size(2)).
//   weight                        : convolution weights.
//   bias                          : optional; written by Convolution_fp_bias
//                                   in 32-column strips when present.
// Returns: multiply-accumulate count (sum of nHotB * ip * op), as a double.
template <typename T, Int Dimension>
double cuda_FullConvolution_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize, /*long*/ at::Tensor filterStride,
    Metadata<Dimension> &mIn, Metadata<Dimension> &mOut,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features,
    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor bias) {
  auto _rules = mIn.getFullConvolutionRuleBook(inputSize, outputSize,
                                               filterSize, filterStride, mOut);
  Int nActive = mOut.getNActive(outputSize);
  output_features.resize_({nActive, weight.size(2)});
  // With a bias, every output row is overwritten by the bias kernel below, so
  // zeroing is only needed in the bias-free case.
  if (not bias.numel())
    output_features.zero_();
  double flops = 0;
  if (nActive) {
    auto iF = input_features.data<T>();
    auto oF = output_features.data<T>();
    Int ip = input_features.size(1);  // input planes
    Int op = output_features.size(1); // output planes
    auto w = weight.data<T>();
    if (bias.numel()) {
      auto b = bias.data<T>();
      // Initialize output rows with the bias, 32 planes at a time (the kernel
      // stages <=32 bias values in shared memory; grid capped at 4096 rows
      // with the kernel grid-striding over nActive).
      for (Int i = 0; i < op; i += 32) {
        Int blockDim = min((Int)32, op - i);
        Int gridDim = min((Int)4096, nActive);
        Convolution_fp_bias<<<gridDim, blockDim>>>(oF + i, b + i, op, op,
                                                   nActive);
      }
    }
    Int c = ip * op; // weight-matrix size of one rulebook entry
    // rbB/nHotB are bound per rulebook entry by RULEBOOKITERATOR; the second
    // argument advances the weight pointer and tallies flops between entries.
    RULEBOOKITERATOR(
        dConvolution_forward2<T>(iF, oF, w, rbB, nHotB, ip, ip, op, op);
        , w += c; flops += nHotB * c;)
  }
  return flops;
}
// Backward pass of the full sparse convolution: gradients w.r.t. input
// features, weights and (optionally) bias, using the same two-metadata
// rulebook as the forward pass.
//
// Parameters:
//   inputSize, outputSize, filterSize, filterStride : geometry (long tensors).
//   mIn, mOut            : metadata for input and output grids.
//   input_features       : forward-pass input (needed for dW).
//   d_input_features     : output; resized/zeroed here, then accumulated.
//   d_output_features    : incoming gradient, one row per active output site.
//   weight, d_weight     : weights and their gradient accumulator.
//   d_bias               : optional; d_bias.numel() == 0 skips bias grads.
template <typename T, Int Dimension>
void cuda_FullConvolution_backward(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize, /*long*/ at::Tensor filterStride,
    Metadata<Dimension> &mIn, Metadata<Dimension> &mOut,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features,
    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor d_weight,
    /*cuda float*/ at::Tensor d_bias) {
  auto _rules = mIn.getFullConvolutionRuleBook(inputSize, outputSize,
                                               filterSize, filterStride, mOut);
  Int nActive = mOut.getNActive(outputSize);
  d_input_features.resize_as_(input_features);
  d_input_features.zero_(); // kernels accumulate into dI, so start from zero
  if (nActive) {
    auto iF = input_features.data<T>();
    auto diF = d_input_features.data<T>();
    auto doF = d_output_features.data<T>();
    Int ip = input_features.size(1);    // input planes
    Int op = d_output_features.size(1); // output planes
    auto w = weight.data<T>();
    auto dw = d_weight.data<T>();
    Int c = ip * op; // weight-matrix size of one rulebook entry
    // Fused dInput/dW kernel per rulebook entry; RULEBOOKITERATOR binds
    // rbB/nHotB and advances w/dw between entries.
    RULEBOOKITERATOR(
        dConvolution_backward_dW2<T>(iF, diF, doF, w, dw, rbB, nHotB, ip, ip,
                                     op, op);
        , w += c; dw += c;)
    if (d_bias.numel()) {
      auto db = d_bias.data<T>();
      // Bias gradient: column-sum of d_output_features (via dColumnSum).
      // NOTE(review): args are (matrix, target, op, op, nActive); confirm this
      // matches Convolution_bp_bias's (nRows, nColumns, nCOLUMNS) ordering.
      Convolution_bp_bias(doF, db, op, op, nActive);
    }
  }
}
// Forward pass of a strided sparse convolution whose input->output site
// assignment is randomized (rulebook built by getRandomizedStrideRuleBook).
// Structure mirrors cuda_FullConvolution_updateOutput but uses a single
// metadata object.
//
// Parameters:
//   inputSize, outputSize, filterSize, filterStride : geometry (long tensors).
//   m               : sparsity metadata; owns the randomized-stride rulebook.
//   input_features  : (nActiveIn x ip) feature matrix.
//   output_features : resized here to (nActive x weight.size(2)).
//   weight          : convolution weights.
//   bias            : optional; written via Convolution_fp_bias when present.
// Returns: multiply-accumulate count (sum of nHotB * ip * op), as a double.
template <typename T, Int Dimension>
double cuda_RandomizedStrideConvolution_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize, /*long*/ at::Tensor filterStride,
    Metadata<Dimension> &m, /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features,
    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor bias) {
  auto _rules = m.getRandomizedStrideRuleBook(inputSize, outputSize,
                                              filterSize, filterStride, true);
  Int nActive = m.getNActive(outputSize);
  output_features.resize_({nActive, weight.size(2)});
  // With a bias, every output row is overwritten by the bias kernel below, so
  // zeroing is only needed in the bias-free case.
  if (not bias.numel())
    output_features.zero_();
  double flops = 0;
  if (nActive) {
    auto iF = input_features.data<T>();
    auto oF = output_features.data<T>();
    Int ip = input_features.size(1);  // input planes
    Int op = output_features.size(1); // output planes
    auto w = weight.data<T>();
    if (bias.numel()) {
      auto b = bias.data<T>();
      // Initialize output rows with the bias, 32 planes at a time (kernel
      // stages <=32 bias values in shared memory; grid capped at 4096 rows
      // with the kernel grid-striding over nActive).
      for (Int i = 0; i < op; i += 32) {
        Int blockDim = min((Int)32, op - i);
        Int gridDim = min((Int)4096, nActive);
        Convolution_fp_bias<<<gridDim, blockDim>>>(oF + i, b + i, op, op,
                                                   nActive);
      }
    }
    Int c = ip * op; // weight-matrix size of one rulebook entry
    // rbB/nHotB are bound per rulebook entry by RULEBOOKITERATOR; the second
    // argument advances the weight pointer and tallies flops between entries.
    RULEBOOKITERATOR(
        dConvolution_forward2<T>(iF, oF, w, rbB, nHotB, ip, ip, op, op);
        , w += c; flops += nHotB * c;)
  }
  return flops;
}
// Backward pass of the randomized-stride sparse convolution: gradients
// w.r.t. input features, weights and (optionally) bias. The rulebook must
// be the same one used in the forward pass (the randomized assignment is
// cached in the metadata, reused via the `true` flag).
//
// Parameters:
//   inputSize, outputSize, filterSize, filterStride : geometry (long tensors).
//   m                 : sparsity metadata holding the cached rulebook.
//   input_features    : forward-pass input (needed for dW).
//   d_input_features  : output; resized/zeroed here, then accumulated.
//   d_output_features : incoming gradient, one row per active output site.
//   weight, d_weight  : weights and their gradient accumulator.
//   d_bias            : optional; d_bias.numel() == 0 skips bias grads.
template <typename T, Int Dimension>
void cuda_RandomizedStrideConvolution_backward(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize, /*long*/ at::Tensor filterStride,
    Metadata<Dimension> &m, /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features,
    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor d_weight,
    /*cuda float*/ at::Tensor d_bias) {
  auto _rules = m.getRandomizedStrideRuleBook(inputSize, outputSize,
                                              filterSize, filterStride, true);
  Int nActive = m.getNActive(outputSize);
  d_input_features.resize_as_(input_features);
  d_input_features.zero_(); // kernels accumulate into dI, so start from zero
  if (nActive) {
    auto iF = input_features.data<T>();
    auto diF = d_input_features.data<T>();
    auto doF = d_output_features.data<T>();
    Int ip = input_features.size(1);    // input planes
    Int op = d_output_features.size(1); // output planes
    auto w = weight.data<T>();
    auto dw = d_weight.data<T>();
    Int c = ip * op; // weight-matrix size of one rulebook entry
    // Fused dInput/dW kernel per rulebook entry; RULEBOOKITERATOR binds
    // rbB/nHotB and advances w/dw between entries.
    RULEBOOKITERATOR(
        dConvolution_backward_dW2<T>(iF, diF, doF, w, dw, rbB, nHotB, ip, ip,
                                     op, op);
        , w += c; dw += c;)
    if (d_bias.numel()) {
      auto db = d_bias.data<T>();
      // Bias gradient: column-sum of d_output_features (via dColumnSum).
      // NOTE(review): args are (matrix, target, op, op, nActive); confirm this
      // matches Convolution_bp_bias's (nRows, nColumns, nCOLUMNS) ordering.
      Convolution_bp_bias(doF, db, op, op, nActive);
    }
  }
}
sparseconvnet/SCN/
generic/GPU
/Convolution.h
→
sparseconvnet/SCN/
CUDA
/Convolution.h
View file @
2c4ed608
...
@@ -4,47 +4,47 @@
...
@@ -4,47 +4,47 @@
// This source code is licensed under the license found in the
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
// LICENSE file in the root directory of this source tree.
#ifndef
GPU
_CONVOLUTION_H
#ifndef
CUDA
_CONVOLUTION_H
#define
GPU
_CONVOLUTION_H
#define
CUDA
_CONVOLUTION_H
#include "../SparseConvNet.h"
template
<
typename
T
>
template
<
typename
T
>
__global__
void
Convolution_fp_bias
(
T
*
output_features
,
T
*
bias
,
u
Int
nPlanes
,
__global__
void
Convolution_fp_bias
(
T
*
output_features
,
T
*
bias
,
Int
nPlanes
,
u
Int
output_stride
,
u
Int
nActive
)
{
Int
output_stride
,
Int
nActive
)
{
__shared__
T
b
[
32
];
__shared__
T
b
[
32
];
b
[
threadIdx
.
x
]
=
bias
[
threadIdx
.
x
];
b
[
threadIdx
.
x
]
=
bias
[
threadIdx
.
x
];
for
(
u
Int
row
=
blockIdx
.
x
;
row
<
nActive
;
row
+=
1
<<
12
)
{
for
(
Int
row
=
blockIdx
.
x
;
row
<
nActive
;
row
+=
1
<<
12
)
{
output_features
[
row
*
output_stride
+
threadIdx
.
x
]
=
b
[
threadIdx
.
x
];
output_features
[
row
*
output_stride
+
threadIdx
.
x
]
=
b
[
threadIdx
.
x
];
}
}
}
}
template
<
typename
T
>
template
<
typename
T
>
__global__
void
dColumnSum
(
T
*
matrix
,
T
*
target
,
u
Int
nRows
,
u
Int
nColumns
,
__global__
void
dColumnSum
(
T
*
matrix
,
T
*
target
,
Int
nRows
,
Int
nColumns
,
u
Int
nCOLUMNS
)
{
Int
nCOLUMNS
)
{
u
Int
i
=
blockIdx
.
x
*
32
+
threadIdx
.
x
;
Int
i
=
blockIdx
.
x
*
32
+
threadIdx
.
x
;
T
t
=
0
;
T
t
=
0
;
for
(
u
Int
j
=
blockIdx
.
y
;
j
<
nRows
;
j
+=
32
)
for
(
Int
j
=
blockIdx
.
y
;
j
<
nRows
;
j
+=
32
)
t
+=
matrix
[
j
*
nCOLUMNS
+
i
];
t
+=
matrix
[
j
*
nCOLUMNS
+
i
];
atomicAdd
(
&
target
[
i
],
t
);
atomicAdd
(
&
target
[
i
],
t
);
}
}
template
<
typename
T
>
template
<
typename
T
>
void
Convolution_bp_bias
(
T
*
matrix
,
T
*
target
,
u
Int
nRows
,
u
Int
nColumns
,
void
Convolution_bp_bias
(
T
*
matrix
,
T
*
target
,
Int
nRows
,
Int
nColumns
,
u
Int
nCOLUMNS
,
cudaStream_t
stream
)
{
Int
nCOLUMNS
)
{
if
(
nColumns
/
32
>
0
)
if
(
nColumns
/
32
>
0
)
dColumnSum
<<
<
dim3
(
nColumns
/
32
,
32
),
32
,
0
,
stream
>>>
dColumnSum
<<<
dim3
(
nColumns
/
32
,
32
),
32
>>>
(
(
matrix
,
target
,
nRows
,
nColumns
,
nCOLUMNS
);
matrix
,
target
,
nRows
,
nColumns
,
nCOLUMNS
);
if
(
nColumns
%
32
>
0
)
{
if
(
nColumns
%
32
>
0
)
{
u
Int
o
=
nColumns
/
32
*
32
;
Int
o
=
nColumns
/
32
*
32
;
dColumnSum
<<
<
dim3
(
1
,
32
),
nColumns
-
o
,
0
,
stream
>>>
dColumnSum
<<<
dim3
(
1
,
32
),
nColumns
-
o
>>>
(
(
matrix
+
o
,
target
+
o
,
nRows
,
nColumns
,
nCOLUMNS
);
matrix
+
o
,
target
+
o
,
nRows
,
nColumns
,
nCOLUMNS
);
}
}
}
}
template
<
typename
T
,
u
Int
K
,
u
Int
V
>
template
<
typename
T
,
Int
K
,
Int
V
>
__global__
void
__global__
void
dConvolution_KMxKN_forwardA
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
u
Int
*
rules
,
dConvolution_KMxKN_forwardA
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
Int
*
rules
,
u
Int
nHot
,
u
Int
input_nPlanes
,
u
Int
input_stride
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
u
Int
output_nPlanes
,
u
Int
output_stride
)
{
Int
output_nPlanes
,
Int
output_stride
)
{
// nHot must be a multiple of K!!
// nHot must be a multiple of K!!
// Input x Weight -> Output
// Input x Weight -> Output
...
@@ -53,17 +53,17 @@ dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, uInt *rules,
...
@@ -53,17 +53,17 @@ dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, uInt *rules,
// nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
// nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
u
Int
M
=
input_nPlanes
/
K
;
Int
M
=
input_nPlanes
/
K
;
// N = gridDim.y == output_nPlanes/K
// N = gridDim.y == output_nPlanes/K
u
Int
n
=
blockIdx
.
y
;
Int
n
=
blockIdx
.
y
;
outFeatures
+=
n
*
K
;
outFeatures
+=
n
*
K
;
w
+=
n
*
K
;
w
+=
n
*
K
;
T
O
[
V
];
T
O
[
V
];
__shared__
T
W
[
K
][
K
];
__shared__
T
W
[
K
][
K
];
__shared__
T
I
[
K
][
K
];
__shared__
T
I
[
K
][
K
];
u
Int
R0
[
V
];
Int
R0
[
V
];
u
Int
R1
[
V
];
Int
R1
[
V
];
const
int
tx
=
threadIdx
.
x
;
const
int
tx
=
threadIdx
.
x
;
int
ty
[
V
];
int
ty
[
V
];
#pragma unroll
#pragma unroll
...
@@ -76,7 +76,7 @@ dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, uInt *rules,
...
@@ -76,7 +76,7 @@ dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, uInt *rules,
for
(
int
v
=
0
;
v
<
V
;
v
++
)
for
(
int
v
=
0
;
v
<
V
;
v
++
)
W
[
ty
[
v
]][
tx
]
=
w
[
ty
[
v
]
*
output_nPlanes
+
tx
];
W
[
ty
[
v
]][
tx
]
=
w
[
ty
[
v
]
*
output_nPlanes
+
tx
];
for
(
u
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
for
(
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
R0
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])];
R0
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])];
...
@@ -110,28 +110,28 @@ dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, uInt *rules,
...
@@ -110,28 +110,28 @@ dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, uInt *rules,
inFeatures
+=
K
;
inFeatures
+=
K
;
}
}
}
}
template
<
typename
T
,
u
Int
K
,
u
Int
V
>
template
<
typename
T
,
Int
K
,
Int
V
>
__global__
void
__global__
void
dConvolution_KMxKN_forwardB
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
u
Int
*
rules
,
dConvolution_KMxKN_forwardB
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
Int
*
rules
,
u
Int
nHot
,
u
Int
input_nPlanes
,
u
Int
input_stride
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
u
Int
output_nPlanes
,
u
Int
output_stride
)
{
Int
output_nPlanes
,
Int
output_stride
)
{
// Input x Weight -> Output
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// K is a multiple of V,
// K is a multiple of V,
// nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
// nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
u
Int
M
=
input_nPlanes
/
K
;
Int
M
=
input_nPlanes
/
K
;
// N = gridDim.y == output_nPlanes/K
// N = gridDim.y == output_nPlanes/K
u
Int
n
=
blockIdx
.
y
;
Int
n
=
blockIdx
.
y
;
outFeatures
+=
n
*
K
;
outFeatures
+=
n
*
K
;
w
+=
n
*
K
;
w
+=
n
*
K
;
T
O
[
V
];
T
O
[
V
];
__shared__
T
W
[
K
][
K
];
__shared__
T
W
[
K
][
K
];
__shared__
T
I
[
K
][
K
];
__shared__
T
I
[
K
][
K
];
u
Int
R0
[
V
];
Int
R0
[
V
];
u
Int
R1
[
V
];
Int
R1
[
V
];
const
int
tx
=
threadIdx
.
x
;
const
int
tx
=
threadIdx
.
x
;
int
ty
[
V
];
int
ty
[
V
];
#pragma unroll
#pragma unroll
...
@@ -144,7 +144,7 @@ dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, uInt *rules,
...
@@ -144,7 +144,7 @@ dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, uInt *rules,
for
(
int
v
=
0
;
v
<
V
;
v
++
)
for
(
int
v
=
0
;
v
<
V
;
v
++
)
W
[
ty
[
v
]][
tx
]
=
w
[
ty
[
v
]
*
output_nPlanes
+
tx
];
W
[
ty
[
v
]][
tx
]
=
w
[
ty
[
v
]
*
output_nPlanes
+
tx
];
for
(
u
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
for
(
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
if
(
s
+
ty
[
v
]
<
nHot
)
{
if
(
s
+
ty
[
v
]
<
nHot
)
{
...
@@ -187,27 +187,27 @@ dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, uInt *rules,
...
@@ -187,27 +187,27 @@ dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, uInt *rules,
#define FOO(T, K, V) \
#define FOO(T, K, V) \
{ \
{ \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
u
Int o = (nHot / K) * K; \
Int o = (nHot / K) * K; \
if (o >= K) \
if (o >= K) \
dConvolution_KMxKN_forwardA<
T, K, V> << <
\
dConvolution_KMxKN_forwardA<
\
dim3(std::min(o / K, (
u
Int)512), output_nPlanes / K),
\
T, K, V><<<
dim3(std::min(o / K, (Int)512), output_nPlanes / K), \
dim3(K, K / V)
, 0, stream>>>
\
dim3(K, K / V)
>>>(
\
(
inFeatures, outFeatures, w, rules, o, input_nPlanes,
\
inFeatures, outFeatures, w, rules, o, input_nPlanes,
input_stride,
\
input_stride,
output_nPlanes, output_stride); \
output_nPlanes, output_stride);
\
if (nHot > o) \
if (nHot > o) \
dConvolution_KMxKN_forwardB<T, K, V> << <dim3(1, output_nPlanes / K), \
dConvolution_KMxKN_forwardB< \
dim3(K, K / V), 0, stream>>> \
T, K, \
(inFeatures, outFeatures, w, rules + 2 * o, nHot - o, \
V><<<dim3(1, output_nPlanes / K), dim3(K, K / V)>>>( \
input_nPlanes, input_stride, output_nPlanes, output_stride); \
inFeatures, outFeatures, w, rules + 2 * o, nHot - o, \
input_nPlanes, input_stride, output_nPlanes, output_stride); \
return; \
return; \
} \
} \
}
}
template
<
typename
T
>
template
<
typename
T
>
void
dConvolution_forward
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
uInt
*
rules
,
void
dConvolution_forward
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
Int
*
rules
,
uInt
nHot
,
uInt
input_nPlanes
,
uInt
input_stride
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
uInt
output_nPlanes
,
uInt
output_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
cudaStream_t
stream
)
{
FOO
(
T
,
64
,
16
)
FOO
(
T
,
64
,
16
)
FOO
(
T
,
32
,
8
)
FOO
(
T
,
32
,
8
)
FOO
(
T
,
16
,
4
)
FOO
(
T
,
16
,
4
)
...
@@ -216,10 +216,9 @@ void dConvolution_forward(T *inFeatures, T *outFeatures, T *w, uInt *rules,
...
@@ -216,10 +216,9 @@ void dConvolution_forward(T *inFeatures, T *outFeatures, T *w, uInt *rules,
}
}
template
<
>
template
<
>
void
dConvolution_forward
<
double
>
(
double
*
inFeatures
,
double
*
outFeatures
,
void
dConvolution_forward
<
double
>
(
double
*
inFeatures
,
double
*
outFeatures
,
double
*
w
,
uInt
*
rules
,
uInt
nHot
,
double
*
w
,
Int
*
rules
,
Int
nHot
,
uInt
input_nPlanes
,
uInt
input_stride
,
Int
input_nPlanes
,
Int
input_stride
,
uInt
output_nPlanes
,
uInt
output_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
cudaStream_t
stream
)
{
FOO
(
double
,
32
,
8
)
FOO
(
double
,
32
,
8
)
FOO
(
double
,
16
,
4
)
FOO
(
double
,
16
,
4
)
FOO
(
double
,
8
,
2
)
FOO
(
double
,
8
,
2
)
...
@@ -230,15 +229,15 @@ void dConvolution_forward<double>(double *inFeatures, double *outFeatures,
...
@@ -230,15 +229,15 @@ void dConvolution_forward<double>(double *inFeatures, double *outFeatures,
// dOutput x W^T -> dInput and
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template
<
typename
T
,
u
Int
K
,
u
Int
V
>
template
<
typename
T
,
Int
K
,
Int
V
>
__global__
void
__global__
void
dConvolution_KMxKN_backward_dW_A
(
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
dConvolution_KMxKN_backward_dW_A
(
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
T
*
w
,
T
*
dw
,
u
Int
*
rules
,
u
Int
nHot
,
T
*
w
,
T
*
dw
,
Int
*
rules
,
Int
nHot
,
u
Int
input_nPlanes
,
u
Int
input_stride
,
Int
input_nPlanes
,
Int
input_stride
,
u
Int
output_nPlanes
,
u
Int
output_stride
)
{
Int
output_nPlanes
,
Int
output_stride
)
{
// M = gridDim.y == input_nPlanes / K
// M = gridDim.y == input_nPlanes / K
u
Int
N
=
output_nPlanes
/
K
;
Int
N
=
output_nPlanes
/
K
;
u
Int
m
=
blockIdx
.
y
;
Int
m
=
blockIdx
.
y
;
inFeatures
+=
m
*
K
;
inFeatures
+=
m
*
K
;
dInFeatures
+=
m
*
K
;
dInFeatures
+=
m
*
K
;
w
+=
m
*
K
*
output_nPlanes
;
w
+=
m
*
K
*
output_nPlanes
;
...
@@ -249,8 +248,8 @@ dConvolution_KMxKN_backward_dW_A(T *inFeatures, T *dInFeatures, T *dOutFeatures,
...
@@ -249,8 +248,8 @@ dConvolution_KMxKN_backward_dW_A(T *inFeatures, T *dInFeatures, T *dOutFeatures,
__shared__
T
I
[
K
][
K
];
__shared__
T
I
[
K
][
K
];
__shared__
T
dO
[
K
][
K
];
__shared__
T
dO
[
K
][
K
];
__shared__
T
W
[
K
][
K
];
__shared__
T
W
[
K
][
K
];
u
Int
R0
[
V
];
Int
R0
[
V
];
u
Int
R1
[
V
];
Int
R1
[
V
];
const
int
tx
=
threadIdx
.
x
;
const
int
tx
=
threadIdx
.
x
;
int
ty
[
V
];
int
ty
[
V
];
#pragma unroll
#pragma unroll
...
@@ -265,7 +264,7 @@ dConvolution_KMxKN_backward_dW_A(T *inFeatures, T *dInFeatures, T *dOutFeatures,
...
@@ -265,7 +264,7 @@ dConvolution_KMxKN_backward_dW_A(T *inFeatures, T *dInFeatures, T *dOutFeatures,
dW
[
v
]
=
0
;
dW
[
v
]
=
0
;
}
}
for
(
u
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
for
(
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
R0
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])];
R0
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])];
...
@@ -307,15 +306,15 @@ dConvolution_KMxKN_backward_dW_A(T *inFeatures, T *dInFeatures, T *dOutFeatures,
...
@@ -307,15 +306,15 @@ dConvolution_KMxKN_backward_dW_A(T *inFeatures, T *dInFeatures, T *dOutFeatures,
// dOutput x W^T -> dInput and
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template
<
typename
T
,
u
Int
K
,
u
Int
V
>
template
<
typename
T
,
Int
K
,
Int
V
>
__global__
void
__global__
void
dConvolution_KMxKN_backward_dW_B
(
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
dConvolution_KMxKN_backward_dW_B
(
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
T
*
w
,
T
*
dw
,
u
Int
*
rules
,
u
Int
nHot
,
T
*
w
,
T
*
dw
,
Int
*
rules
,
Int
nHot
,
u
Int
input_nPlanes
,
u
Int
input_stride
,
Int
input_nPlanes
,
Int
input_stride
,
u
Int
output_nPlanes
,
u
Int
output_stride
)
{
Int
output_nPlanes
,
Int
output_stride
)
{
// M = gridDim.y == input_nPlanes / K
// M = gridDim.y == input_nPlanes / K
u
Int
N
=
output_nPlanes
/
K
;
Int
N
=
output_nPlanes
/
K
;
u
Int
m
=
blockIdx
.
y
;
Int
m
=
blockIdx
.
y
;
inFeatures
+=
m
*
K
;
inFeatures
+=
m
*
K
;
dInFeatures
+=
m
*
K
;
dInFeatures
+=
m
*
K
;
w
+=
m
*
K
*
output_nPlanes
;
w
+=
m
*
K
*
output_nPlanes
;
...
@@ -326,8 +325,8 @@ dConvolution_KMxKN_backward_dW_B(T *inFeatures, T *dInFeatures, T *dOutFeatures,
...
@@ -326,8 +325,8 @@ dConvolution_KMxKN_backward_dW_B(T *inFeatures, T *dInFeatures, T *dOutFeatures,
__shared__
T
I
[
K
][
K
];
__shared__
T
I
[
K
][
K
];
__shared__
T
dO
[
K
][
K
];
__shared__
T
dO
[
K
][
K
];
__shared__
T
W
[
K
][
K
];
__shared__
T
W
[
K
][
K
];
u
Int
R0
[
V
];
Int
R0
[
V
];
u
Int
R1
[
V
];
Int
R1
[
V
];
const
int
tx
=
threadIdx
.
x
;
const
int
tx
=
threadIdx
.
x
;
int
ty
[
V
];
int
ty
[
V
];
#pragma unroll
#pragma unroll
...
@@ -342,7 +341,7 @@ dConvolution_KMxKN_backward_dW_B(T *inFeatures, T *dInFeatures, T *dOutFeatures,
...
@@ -342,7 +341,7 @@ dConvolution_KMxKN_backward_dW_B(T *inFeatures, T *dInFeatures, T *dOutFeatures,
dW
[
v
]
=
0
;
dW
[
v
]
=
0
;
}
}
for
(
u
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
for
(
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
if
(
s
+
ty
[
v
]
<
nHot
)
{
if
(
s
+
ty
[
v
]
<
nHot
)
{
...
@@ -392,29 +391,29 @@ dConvolution_KMxKN_backward_dW_B(T *inFeatures, T *dInFeatures, T *dOutFeatures,
...
@@ -392,29 +391,29 @@ dConvolution_KMxKN_backward_dW_B(T *inFeatures, T *dInFeatures, T *dOutFeatures,
#define FOO(T, K, V) \
#define FOO(T, K, V) \
{ \
{ \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
u
Int o = (nHot / K) * K; \
Int o = (nHot / K) * K; \
if (o >= K) \
if (o >= K) \
dConvolution_KMxKN_backward_dW_A<
T, K, V> << <
\
dConvolution_KMxKN_backward_dW_A<
\
dim3(std::min(o / K, (
u
Int)512), input_nPlanes / K),
\
T, K, V><<<
dim3(std::min(o / K, (Int)512), input_nPlanes / K), \
dim3(K, K / V)
, 0, stream>>>
\
dim3(K, K / V)
>>>(
\
(
inFeatures, dInFeatures, dOutFeatures, w, dw, rules, o, \
inFeatures, dInFeatures, dOutFeatures, w, dw, rules, o,
\
input_nPlanes, input_stride, output_nPlanes, output_stride); \
input_nPlanes, input_stride, output_nPlanes, output_stride);
\
if (nHot > o) \
if (nHot > o) \
dConvolution_KMxKN_backward_dW_B<T, K, V> << < \
dConvolution_KMxKN_backward_dW_B< \
dim3(1, input_nPlanes / K), dim3(K, K / V), 0, stream>>> \
T, K, \
(inFeatures, dInFeatures, dOutFeatures, w, dw, rules + 2 * o, \
V><<<dim3(1, input_nPlanes / K), dim3(K, K / V)>>>( \
nHot - o, input_nPlanes, input_stride, output_nPlanes, \
inFeatures, dInFeatures, dOutFeatures, w, dw, rules + 2 * o, \
output_stride); \
nHot - o, input_nPlanes, input_stride, output_nPlanes, \
output_stride); \
return; \
return; \
} \
} \
}
}
template
<
typename
T
>
template
<
typename
T
>
void
dConvolution_backward_dW
(
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
void
dConvolution_backward_dW
(
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
T
*
w
,
T
*
dw
,
uInt
*
rules
,
uInt
nHot
,
T
*
w
,
T
*
dw
,
Int
*
rules
,
Int
nHot
,
uInt
input_nPlanes
,
uInt
input_stride
,
Int
input_nPlanes
,
Int
input_stride
,
uInt
output_nPlanes
,
uInt
output_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
cudaStream_t
stream
)
{
FOO
(
T
,
32
,
8
)
FOO
(
T
,
32
,
8
)
FOO
(
T
,
16
,
4
)
FOO
(
T
,
16
,
4
)
FOO
(
T
,
8
,
2
)
FOO
(
T
,
8
,
2
)
...
@@ -422,11 +421,11 @@ void dConvolution_backward_dW(T *inFeatures, T *dInFeatures, T *dOutFeatures,
...
@@ -422,11 +421,11 @@ void dConvolution_backward_dW(T *inFeatures, T *dInFeatures, T *dOutFeatures,
}
}
#undef FOO
#undef FOO
template
<
typename
T
,
u
Int
K
,
u
Int
V
>
template
<
typename
T
,
Int
K
,
Int
V
>
__global__
void
__global__
void
dConvolution_KMxKN_forward2
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
u
Int
*
rules
,
dConvolution_KMxKN_forward2
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
Int
*
rules
,
u
Int
nHot
,
u
Int
input_nPlanes
,
u
Int
input_stride
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
u
Int
output_nPlanes
,
u
Int
output_stride
)
{
Int
output_nPlanes
,
Int
output_stride
)
{
// Input x Weight -> Output
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// K is a multiple of V,
// K is a multiple of V,
...
@@ -434,17 +433,17 @@ dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, uInt *rules,
...
@@ -434,17 +433,17 @@ dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, uInt *rules,
// nHot x input_nplanes<=KM -> nHot x output_nPlanes<=KN
// nHot x input_nplanes<=KM -> nHot x output_nPlanes<=KN
// - parallel over N,nHot - loop over M
// - parallel over N,nHot - loop over M
u
Int
M
=
(
input_nPlanes
+
K
-
1
)
/
K
;
Int
M
=
(
input_nPlanes
+
K
-
1
)
/
K
;
// N = gridDim.y ~ output_nPlanes/K
// N = gridDim.y ~ output_nPlanes/K
u
Int
n
=
blockIdx
.
y
;
Int
n
=
blockIdx
.
y
;
outFeatures
+=
n
*
K
;
outFeatures
+=
n
*
K
;
w
+=
n
*
K
;
w
+=
n
*
K
;
u
Int
KO
=
min
(
K
,
output_nPlanes
-
K
*
n
);
Int
KO
=
min
(
K
,
output_nPlanes
-
K
*
n
);
T
O
[
V
];
T
O
[
V
];
__shared__
T
W
[
K
][
K
];
__shared__
T
W
[
K
][
K
];
__shared__
T
I
[
K
][
K
];
__shared__
T
I
[
K
][
K
];
__shared__
u
Int
R
[
K
*
2
];
__shared__
Int
R
[
K
*
2
];
const
int
tx
=
threadIdx
.
x
;
const
int
tx
=
threadIdx
.
x
;
int
ty
[
V
];
int
ty
[
V
];
#pragma unroll
#pragma unroll
...
@@ -452,7 +451,7 @@ dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, uInt *rules,
...
@@ -452,7 +451,7 @@ dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, uInt *rules,
ty
[
v
]
=
threadIdx
.
y
+
v
*
(
K
/
V
);
ty
[
v
]
=
threadIdx
.
y
+
v
*
(
K
/
V
);
for
(
int
m
=
0
;
m
<
M
;
m
++
)
{
for
(
int
m
=
0
;
m
<
M
;
m
++
)
{
u
Int
KI
=
min
(
K
,
input_nPlanes
-
K
*
m
);
Int
KI
=
min
(
K
,
input_nPlanes
-
K
*
m
);
// Read w
// Read w
#pragma unroll
#pragma unroll
...
@@ -460,7 +459,7 @@ dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, uInt *rules,
...
@@ -460,7 +459,7 @@ dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, uInt *rules,
if
(
ty
[
v
]
<
KI
and
tx
<
KO
)
if
(
ty
[
v
]
<
KI
and
tx
<
KO
)
W
[
ty
[
v
]][
tx
]
=
w
[
ty
[
v
]
*
output_nPlanes
+
tx
];
W
[
ty
[
v
]][
tx
]
=
w
[
ty
[
v
]
*
output_nPlanes
+
tx
];
for
(
u
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
for
(
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
// Read rules for K input/output pairs
// Read rules for K input/output pairs
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
...
@@ -500,48 +499,47 @@ dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, uInt *rules,
...
@@ -500,48 +499,47 @@ dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, uInt *rules,
}
}
template
<
typename
T
>
template
<
typename
T
>
void
dConvolution_forward2
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
uInt
*
rules
,
void
dConvolution_forward2
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
Int
*
rules
,
uInt
nHot
,
uInt
input_nPlanes
,
uInt
input_stride
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
uInt
output_nPlanes
,
uInt
output_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
cudaStream_t
stream
)
{
if
(
input_nPlanes
%
8
!=
0
or
output_nPlanes
%
8
!=
0
)
{
if
(
input_nPlanes
%
8
!=
0
or
output_nPlanes
%
8
!=
0
)
{
const
int
K
=
16
;
const
int
K
=
16
;
const
int
V
=
4
;
const
int
V
=
4
;
dConvolution_KMxKN_forward2
<
T
,
K
,
V
>
<<
<
dConvolution_KMxKN_forward2
<
dim3
(
128
,
(
output_nPlanes
+
K
-
1
)
/
K
),
dim3
(
K
,
K
/
V
)
,
0
,
stream
>>>
T
,
K
,
V
><<<
dim3
(
128
,
(
output_nPlanes
+
K
-
1
)
/
K
),
dim3
(
K
,
K
/
V
)
>>>
(
(
inFeatures
,
outFeatures
,
w
,
rules
,
nHot
,
input_nPlanes
,
input_stride
,
inFeatures
,
outFeatures
,
w
,
rules
,
nHot
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
);
output_nPlanes
,
output_stride
);
return
;
return
;
}
else
{
}
else
{
dConvolution_forward
(
inFeatures
,
outFeatures
,
w
,
rules
,
nHot
,
input_nPlanes
,
dConvolution_forward
(
inFeatures
,
outFeatures
,
w
,
rules
,
nHot
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
,
stream
);
input_stride
,
output_nPlanes
,
output_stride
);
}
}
}
}
// dOutput x W^T -> dInput and
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template
<
typename
T
,
u
Int
K
,
u
Int
V
>
template
<
typename
T
,
Int
K
,
Int
V
>
__global__
void
__global__
void
dConvolution_KMxKN_backward_dW2
(
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
dConvolution_KMxKN_backward_dW2
(
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
T
*
w
,
T
*
dw
,
u
Int
*
rules
,
u
Int
nHot
,
T
*
w
,
T
*
dw
,
Int
*
rules
,
Int
nHot
,
u
Int
input_nPlanes
,
u
Int
input_stride
,
Int
input_nPlanes
,
Int
input_stride
,
u
Int
output_nPlanes
,
u
Int
output_stride
)
{
Int
output_nPlanes
,
Int
output_stride
)
{
// M = gridDim.y == input_nPlanes / K
// M = gridDim.y == input_nPlanes / K
u
Int
N
=
(
output_nPlanes
+
K
-
1
)
/
K
;
Int
N
=
(
output_nPlanes
+
K
-
1
)
/
K
;
u
Int
m
=
blockIdx
.
y
;
Int
m
=
blockIdx
.
y
;
inFeatures
+=
m
*
K
;
inFeatures
+=
m
*
K
;
dInFeatures
+=
m
*
K
;
dInFeatures
+=
m
*
K
;
w
+=
m
*
K
*
output_nPlanes
;
w
+=
m
*
K
*
output_nPlanes
;
dw
+=
m
*
K
*
output_nPlanes
;
dw
+=
m
*
K
*
output_nPlanes
;
u
Int
KI
=
min
(
K
,
input_nPlanes
-
K
*
m
);
Int
KI
=
min
(
K
,
input_nPlanes
-
K
*
m
);
T
dI
[
V
];
T
dI
[
V
];
T
dW
[
V
];
T
dW
[
V
];
__shared__
T
I
[
K
][
K
];
__shared__
T
I
[
K
][
K
];
__shared__
T
dO
[
K
][
K
];
__shared__
T
dO
[
K
][
K
];
__shared__
T
W
[
K
][
K
];
__shared__
T
W
[
K
][
K
];
__shared__
u
Int
R
[
K
*
2
];
__shared__
Int
R
[
K
*
2
];
const
int
tx
=
threadIdx
.
x
;
const
int
tx
=
threadIdx
.
x
;
int
ty
[
V
];
int
ty
[
V
];
#pragma unroll
#pragma unroll
...
@@ -549,7 +547,7 @@ dConvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
...
@@ -549,7 +547,7 @@ dConvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
ty
[
v
]
=
threadIdx
.
y
+
v
*
(
K
/
V
);
ty
[
v
]
=
threadIdx
.
y
+
v
*
(
K
/
V
);
for
(
int
n
=
0
;
n
<
N
;
n
++
)
{
for
(
int
n
=
0
;
n
<
N
;
n
++
)
{
u
Int
KO
=
min
(
K
,
output_nPlanes
-
K
*
n
);
Int
KO
=
min
(
K
,
output_nPlanes
-
K
*
n
);
// Read w, reset dW
// Read w, reset dW
#pragma unroll
#pragma unroll
...
@@ -559,7 +557,7 @@ dConvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
...
@@ -559,7 +557,7 @@ dConvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
dW
[
v
]
=
0
;
dW
[
v
]
=
0
;
}
}
for
(
u
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
for
(
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
// Read rules for K input/output pairs, reset dI[]
// Read rules for K input/output pairs, reset dI[]
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
...
@@ -613,23 +611,22 @@ dConvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
...
@@ -613,23 +611,22 @@ dConvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
template
<
typename
T
>
template
<
typename
T
>
void
dConvolution_backward_dW2
(
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
void
dConvolution_backward_dW2
(
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
T
*
w
,
T
*
dw
,
uInt
*
rules
,
uInt
nHot
,
T
*
w
,
T
*
dw
,
Int
*
rules
,
Int
nHot
,
uInt
input_nPlanes
,
uInt
input_stride
,
Int
input_nPlanes
,
Int
input_stride
,
uInt
output_nPlanes
,
uInt
output_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
cudaStream_t
stream
)
{
if
(
input_nPlanes
%
8
!=
0
or
output_nPlanes
%
8
!=
0
)
{
if
(
input_nPlanes
%
8
!=
0
or
output_nPlanes
%
8
!=
0
)
{
const
int
K
=
16
;
const
int
K
=
16
;
const
int
V
=
4
;
const
int
V
=
4
;
dConvolution_KMxKN_backward_dW2
<
T
,
K
,
V
>
<<
<
dConvolution_KMxKN_backward_dW2
<
T
,
K
,
V
><<<
dim3
(
128
,
(
input_nPlanes
+
K
-
1
)
/
K
),
dim3
(
K
,
K
/
V
)
,
0
,
stream
>>>
dim3
(
128
,
(
input_nPlanes
+
K
-
1
)
/
K
),
dim3
(
K
,
K
/
V
)
>>>
(
(
inFeatures
,
dInFeatures
,
dOutFeatures
,
w
,
dw
,
rules
,
nHot
,
inFeatures
,
dInFeatures
,
dOutFeatures
,
w
,
dw
,
rules
,
nHot
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
);
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
);
return
;
return
;
}
else
{
}
else
{
dConvolution_backward_dW
(
inFeatures
,
dInFeatures
,
dOutFeatures
,
w
,
dw
,
dConvolution_backward_dW
(
inFeatures
,
dInFeatures
,
dOutFeatures
,
w
,
dw
,
rules
,
nHot
,
input_nPlanes
,
input_stride
,
rules
,
nHot
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
,
stream
);
output_nPlanes
,
output_stride
);
}
}
}
}
#endif
/*
GPU
_CONVOLUTION_H */
#endif
/*
CUDA
_CONVOLUTION_H */
sparseconvnet/SCN/CUDA/Deconvolution.cu
0 → 100644
View file @
2c4ed608
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "Convolution.h"
#include "Deconvolution.h"
template
<
typename
T
,
Int
Dimension
>
double
cuda_Deconvolution_updateOutput
(
/*long*/
at
::
Tensor
inputSize
,
/*long*/
at
::
Tensor
outputSize
,
/*long*/
at
::
Tensor
filterSize
,
/*long*/
at
::
Tensor
filterStride
,
Metadata
<
Dimension
>
&
m
,
/*cuda float*/
at
::
Tensor
input_features
,
/*cuda float*/
at
::
Tensor
output_features
,
/*cuda float*/
at
::
Tensor
weight
,
/*cuda float*/
at
::
Tensor
bias
)
{
auto
_rules
=
m
.
getRuleBook
(
outputSize
,
inputSize
,
filterSize
,
filterStride
,
true
);
Int
nActive
=
m
.
getNActive
(
outputSize
);
output_features
.
resize_
({
nActive
,
weight
.
size
(
2
)});
if
(
not
bias
.
numel
())
output_features
.
zero_
();
auto
iF
=
input_features
.
data
<
T
>
();
auto
oF
=
output_features
.
data
<
T
>
();
Int
ip
=
input_features
.
size
(
1
);
Int
op
=
output_features
.
size
(
1
);
auto
w
=
weight
.
data
<
T
>
();
double
flops
=
0
;
if
(
bias
.
numel
())
{
auto
b
=
bias
.
data
<
T
>
();
for
(
Int
i
=
0
;
i
<
op
;
i
+=
32
)
{
Int
blockDim
=
min
((
Int
)
32
,
op
-
i
);
Int
gridDim
=
min
((
Int
)
4096
,
nActive
);
Convolution_fp_bias
<<<
gridDim
,
blockDim
>>>
(
oF
+
i
,
b
+
i
,
op
,
op
,
nActive
);
}
}
Int
c
=
ip
*
op
;
RULEBOOKITERATOR
(
dDeconvolution_forward2
<
T
>
(
iF
,
oF
,
w
,
rbB
,
nHotB
,
ip
,
ip
,
op
,
op
);
,
w
+=
c
;
flops
+=
nHotB
*
c
;)
return
flops
;
}
template
<
typename
T
,
Int
Dimension
>
void
cuda_Deconvolution_backward
(
/*long*/
at
::
Tensor
inputSize
,
/*long*/
at
::
Tensor
outputSize
,
/*long*/
at
::
Tensor
filterSize
,
/*long*/
at
::
Tensor
filterStride
,
Metadata
<
Dimension
>
&
m
,
/*cuda float*/
at
::
Tensor
input_features
,
/*cuda float*/
at
::
Tensor
d_input_features
,
/*cuda float*/
at
::
Tensor
d_output_features
,
/*cuda float*/
at
::
Tensor
weight
,
/*cuda float*/
at
::
Tensor
d_weight
,
/*cuda float*/
at
::
Tensor
d_bias
)
{
auto
_rules
=
m
.
getRuleBook
(
outputSize
,
inputSize
,
filterSize
,
filterStride
,
true
);
Int
nActive
=
m
.
getNActive
(
outputSize
);
d_input_features
.
resize_as_
(
input_features
);
d_input_features
.
zero_
();
auto
iF
=
input_features
.
data
<
T
>
();
auto
diF
=
d_input_features
.
data
<
T
>
();
auto
doF
=
d_output_features
.
data
<
T
>
();
Int
ip
=
input_features
.
size
(
1
);
Int
op
=
d_output_features
.
size
(
1
);
auto
w
=
weight
.
data
<
T
>
();
auto
dw
=
d_weight
.
data
<
T
>
();
Int
c
=
ip
*
op
;
RULEBOOKITERATOR
(
dDeconvolution_backward_dW2
<
T
>
(
iF
,
diF
,
doF
,
w
,
dw
,
rbB
,
nHotB
,
ip
,
ip
,
op
,
op
);
,
w
+=
c
;
dw
+=
c
;)
if
(
d_bias
.
numel
())
{
auto
db
=
d_bias
.
data
<
T
>
();
Convolution_bp_bias
(
doF
,
db
,
op
,
op
,
nActive
);
}
}
sparseconvnet/SCN/
generic/GPU
/Deconvolution.h
→
sparseconvnet/SCN/
CUDA
/Deconvolution.h
View file @
2c4ed608
...
@@ -4,16 +4,16 @@
...
@@ -4,16 +4,16 @@
// This source code is licensed under the license found in the
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
// LICENSE file in the root directory of this source tree.
#ifndef
GPU
_DECONVOLUTION_H
#ifndef
CUDA
_DECONVOLUTION_H
#define
GPU
_DECONVOLUTION_H
#define
CUDA
_DECONVOLUTION_H
#include "../SparseConvNet.h"
#include "Convolution.h"
#include "Convolution.h"
template
<
typename
T
,
u
Int
K
,
u
Int
V
>
template
<
typename
T
,
Int
K
,
Int
V
>
__global__
void
__global__
void
dDeconvolution_KMxKN_forwardA
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
u
Int
*
rules
,
dDeconvolution_KMxKN_forwardA
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
Int
*
rules
,
u
Int
nHot
,
u
Int
input_nPlanes
,
u
Int
input_stride
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
u
Int
output_nPlanes
,
u
Int
output_stride
)
{
Int
output_nPlanes
,
Int
output_stride
)
{
// nHot must be a multiple of K!!
// nHot must be a multiple of K!!
// Input x Weight -> Output
// Input x Weight -> Output
...
@@ -22,17 +22,17 @@ dDeconvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, uInt *rules,
...
@@ -22,17 +22,17 @@ dDeconvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, uInt *rules,
// nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
// nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
u
Int
M
=
input_nPlanes
/
K
;
Int
M
=
input_nPlanes
/
K
;
// N = gridDim.y == output_nPlanes/K
// N = gridDim.y == output_nPlanes/K
u
Int
n
=
blockIdx
.
y
;
Int
n
=
blockIdx
.
y
;
outFeatures
+=
n
*
K
;
outFeatures
+=
n
*
K
;
w
+=
n
*
K
;
w
+=
n
*
K
;
T
O
[
V
];
T
O
[
V
];
__shared__
T
W
[
K
][
K
];
__shared__
T
W
[
K
][
K
];
__shared__
T
I
[
K
][
K
];
__shared__
T
I
[
K
][
K
];
u
Int
R0
[
V
];
Int
R0
[
V
];
u
Int
R1
[
V
];
Int
R1
[
V
];
const
int
tx
=
threadIdx
.
x
;
const
int
tx
=
threadIdx
.
x
;
int
ty
[
V
];
int
ty
[
V
];
#pragma unroll
#pragma unroll
...
@@ -45,7 +45,7 @@ dDeconvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, uInt *rules,
...
@@ -45,7 +45,7 @@ dDeconvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, uInt *rules,
for
(
int
v
=
0
;
v
<
V
;
v
++
)
for
(
int
v
=
0
;
v
<
V
;
v
++
)
W
[
ty
[
v
]][
tx
]
=
w
[
ty
[
v
]
*
output_nPlanes
+
tx
];
W
[
ty
[
v
]][
tx
]
=
w
[
ty
[
v
]
*
output_nPlanes
+
tx
];
for
(
u
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
for
(
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
R1
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])];
R1
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])];
...
@@ -79,28 +79,28 @@ dDeconvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, uInt *rules,
...
@@ -79,28 +79,28 @@ dDeconvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, uInt *rules,
inFeatures
+=
K
;
inFeatures
+=
K
;
}
}
}
}
template
<
typename
T
,
u
Int
K
,
u
Int
V
>
template
<
typename
T
,
Int
K
,
Int
V
>
__global__
void
__global__
void
dDeconvolution_KMxKN_forwardB
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
u
Int
*
rules
,
dDeconvolution_KMxKN_forwardB
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
Int
*
rules
,
u
Int
nHot
,
u
Int
input_nPlanes
,
u
Int
input_stride
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
u
Int
output_nPlanes
,
u
Int
output_stride
)
{
Int
output_nPlanes
,
Int
output_stride
)
{
// Input x Weight -> Output
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// K is a multiple of V,
// K is a multiple of V,
// nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
// nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
u
Int
M
=
input_nPlanes
/
K
;
Int
M
=
input_nPlanes
/
K
;
// N = gridDim.y == output_nPlanes/K
// N = gridDim.y == output_nPlanes/K
u
Int
n
=
blockIdx
.
y
;
Int
n
=
blockIdx
.
y
;
outFeatures
+=
n
*
K
;
outFeatures
+=
n
*
K
;
w
+=
n
*
K
;
w
+=
n
*
K
;
T
O
[
V
];
T
O
[
V
];
__shared__
T
W
[
K
][
K
];
__shared__
T
W
[
K
][
K
];
__shared__
T
I
[
K
][
K
];
__shared__
T
I
[
K
][
K
];
u
Int
R0
[
V
];
Int
R0
[
V
];
u
Int
R1
[
V
];
Int
R1
[
V
];
const
int
tx
=
threadIdx
.
x
;
const
int
tx
=
threadIdx
.
x
;
int
ty
[
V
];
int
ty
[
V
];
#pragma unroll
#pragma unroll
...
@@ -113,7 +113,7 @@ dDeconvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, uInt *rules,
...
@@ -113,7 +113,7 @@ dDeconvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, uInt *rules,
for
(
int
v
=
0
;
v
<
V
;
v
++
)
for
(
int
v
=
0
;
v
<
V
;
v
++
)
W
[
ty
[
v
]][
tx
]
=
w
[
ty
[
v
]
*
output_nPlanes
+
tx
];
W
[
ty
[
v
]][
tx
]
=
w
[
ty
[
v
]
*
output_nPlanes
+
tx
];
for
(
u
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
for
(
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
if
(
s
+
ty
[
v
]
<
nHot
)
{
if
(
s
+
ty
[
v
]
<
nHot
)
{
...
@@ -156,27 +156,27 @@ dDeconvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, uInt *rules,
...
@@ -156,27 +156,27 @@ dDeconvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, uInt *rules,
#define FOO(T, K, V) \
#define FOO(T, K, V) \
{ \
{ \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
u
Int o = (nHot / K) * K; \
Int o = (nHot / K) * K; \
if (o >= K) \
if (o >= K) \
dDeconvolution_KMxKN_forwardA<
T, K, V> << <
\
dDeconvolution_KMxKN_forwardA<
\
dim3(std::min(o / K, (
u
Int)512), output_nPlanes / K),
\
T, K, V><<<
dim3(std::min(o / K, (Int)512), output_nPlanes / K), \
dim3(K, K / V)
, 0, stream>>>
\
dim3(K, K / V)
>>>(
\
(
inFeatures, outFeatures, w, rules, o, input_nPlanes,
\
inFeatures, outFeatures, w, rules, o, input_nPlanes,
input_stride,
\
input_stride,
output_nPlanes, output_stride); \
output_nPlanes, output_stride);
\
if (nHot > o) \
if (nHot > o) \
dDeconvolution_KMxKN_forwardB<T, K, V> << < \
dDeconvolution_KMxKN_forwardB< \
dim3(1, output_nPlanes / K), dim3(K, K / V), 0, stream>>> \
T, K, \
(inFeatures, outFeatures, w, rules + 2 * o, nHot - o, \
V><<<dim3(1, output_nPlanes / K), dim3(K, K / V)>>>( \
input_nPlanes, input_stride, output_nPlanes, output_stride); \
inFeatures, outFeatures, w, rules + 2 * o, nHot - o, \
input_nPlanes, input_stride, output_nPlanes, output_stride); \
return; \
return; \
} \
} \
}
}
template
<
typename
T
>
template
<
typename
T
>
void
dDeconvolution_forward
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
uInt
*
rules
,
void
dDeconvolution_forward
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
Int
*
rules
,
uInt
nHot
,
uInt
input_nPlanes
,
uInt
input_stride
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
uInt
output_nPlanes
,
uInt
output_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
cudaStream_t
stream
)
{
FOO
(
T
,
64
,
16
)
FOO
(
T
,
64
,
16
)
FOO
(
T
,
32
,
8
)
FOO
(
T
,
32
,
8
)
FOO
(
T
,
16
,
4
)
FOO
(
T
,
16
,
4
)
...
@@ -185,10 +185,9 @@ void dDeconvolution_forward(T *inFeatures, T *outFeatures, T *w, uInt *rules,
...
@@ -185,10 +185,9 @@ void dDeconvolution_forward(T *inFeatures, T *outFeatures, T *w, uInt *rules,
}
}
template
<
>
template
<
>
void
dDeconvolution_forward
<
double
>
(
double
*
inFeatures
,
double
*
outFeatures
,
void
dDeconvolution_forward
<
double
>
(
double
*
inFeatures
,
double
*
outFeatures
,
double
*
w
,
uInt
*
rules
,
uInt
nHot
,
double
*
w
,
Int
*
rules
,
Int
nHot
,
uInt
input_nPlanes
,
uInt
input_stride
,
Int
input_nPlanes
,
Int
input_stride
,
uInt
output_nPlanes
,
uInt
output_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
cudaStream_t
stream
)
{
FOO
(
double
,
32
,
8
)
FOO
(
double
,
32
,
8
)
FOO
(
double
,
16
,
4
)
FOO
(
double
,
16
,
4
)
FOO
(
double
,
8
,
2
)
FOO
(
double
,
8
,
2
)
...
@@ -199,14 +198,14 @@ void dDeconvolution_forward<double>(double *inFeatures, double *outFeatures,
...
@@ -199,14 +198,14 @@ void dDeconvolution_forward<double>(double *inFeatures, double *outFeatures,
// dOutput x W^T -> dInput and
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template
<
typename
T
,
u
Int
K
,
u
Int
V
>
template
<
typename
T
,
Int
K
,
Int
V
>
__global__
void
dDeconvolution_KMxKN_backward_dW_A
(
__global__
void
dDeconvolution_KMxKN_backward_dW_A
(
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
T
*
w
,
T
*
dw
,
u
Int
*
rules
,
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
T
*
w
,
T
*
dw
,
Int
*
rules
,
u
Int
nHot
,
u
Int
input_nPlanes
,
u
Int
input_stride
,
u
Int
output_nPlanes
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
u
Int
output_stride
)
{
Int
output_stride
)
{
// M = gridDim.y == input_nPlanes / K
// M = gridDim.y == input_nPlanes / K
u
Int
N
=
output_nPlanes
/
K
;
Int
N
=
output_nPlanes
/
K
;
u
Int
m
=
blockIdx
.
y
;
Int
m
=
blockIdx
.
y
;
inFeatures
+=
m
*
K
;
inFeatures
+=
m
*
K
;
dInFeatures
+=
m
*
K
;
dInFeatures
+=
m
*
K
;
w
+=
m
*
K
*
output_nPlanes
;
w
+=
m
*
K
*
output_nPlanes
;
...
@@ -217,8 +216,8 @@ __global__ void dDeconvolution_KMxKN_backward_dW_A(
...
@@ -217,8 +216,8 @@ __global__ void dDeconvolution_KMxKN_backward_dW_A(
__shared__
T
I
[
K
][
K
];
__shared__
T
I
[
K
][
K
];
__shared__
T
dO
[
K
][
K
];
__shared__
T
dO
[
K
][
K
];
__shared__
T
W
[
K
][
K
];
__shared__
T
W
[
K
][
K
];
u
Int
R0
[
V
];
Int
R0
[
V
];
u
Int
R1
[
V
];
Int
R1
[
V
];
const
int
tx
=
threadIdx
.
x
;
const
int
tx
=
threadIdx
.
x
;
int
ty
[
V
];
int
ty
[
V
];
#pragma unroll
#pragma unroll
...
@@ -233,7 +232,7 @@ __global__ void dDeconvolution_KMxKN_backward_dW_A(
...
@@ -233,7 +232,7 @@ __global__ void dDeconvolution_KMxKN_backward_dW_A(
dW
[
v
]
=
0
;
dW
[
v
]
=
0
;
}
}
for
(
u
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
for
(
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
R1
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])];
R1
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])];
...
@@ -275,14 +274,14 @@ __global__ void dDeconvolution_KMxKN_backward_dW_A(
...
@@ -275,14 +274,14 @@ __global__ void dDeconvolution_KMxKN_backward_dW_A(
// dOutput x W^T -> dInput and
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template
<
typename
T
,
u
Int
K
,
u
Int
V
>
template
<
typename
T
,
Int
K
,
Int
V
>
__global__
void
dDeconvolution_KMxKN_backward_dW_B
(
__global__
void
dDeconvolution_KMxKN_backward_dW_B
(
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
T
*
w
,
T
*
dw
,
u
Int
*
rules
,
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
T
*
w
,
T
*
dw
,
Int
*
rules
,
u
Int
nHot
,
u
Int
input_nPlanes
,
u
Int
input_stride
,
u
Int
output_nPlanes
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
u
Int
output_stride
)
{
Int
output_stride
)
{
// M = gridDim.y == input_nPlanes / K
// M = gridDim.y == input_nPlanes / K
u
Int
N
=
output_nPlanes
/
K
;
Int
N
=
output_nPlanes
/
K
;
u
Int
m
=
blockIdx
.
y
;
Int
m
=
blockIdx
.
y
;
inFeatures
+=
m
*
K
;
inFeatures
+=
m
*
K
;
dInFeatures
+=
m
*
K
;
dInFeatures
+=
m
*
K
;
w
+=
m
*
K
*
output_nPlanes
;
w
+=
m
*
K
*
output_nPlanes
;
...
@@ -293,8 +292,8 @@ __global__ void dDeconvolution_KMxKN_backward_dW_B(
...
@@ -293,8 +292,8 @@ __global__ void dDeconvolution_KMxKN_backward_dW_B(
__shared__
T
I
[
K
][
K
];
__shared__
T
I
[
K
][
K
];
__shared__
T
dO
[
K
][
K
];
__shared__
T
dO
[
K
][
K
];
__shared__
T
W
[
K
][
K
];
__shared__
T
W
[
K
][
K
];
u
Int
R0
[
V
];
Int
R0
[
V
];
u
Int
R1
[
V
];
Int
R1
[
V
];
const
int
tx
=
threadIdx
.
x
;
const
int
tx
=
threadIdx
.
x
;
int
ty
[
V
];
int
ty
[
V
];
#pragma unroll
#pragma unroll
...
@@ -309,7 +308,7 @@ __global__ void dDeconvolution_KMxKN_backward_dW_B(
...
@@ -309,7 +308,7 @@ __global__ void dDeconvolution_KMxKN_backward_dW_B(
dW
[
v
]
=
0
;
dW
[
v
]
=
0
;
}
}
for
(
u
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
for
(
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
if
(
s
+
ty
[
v
]
<
nHot
)
{
if
(
s
+
ty
[
v
]
<
nHot
)
{
...
@@ -359,29 +358,29 @@ __global__ void dDeconvolution_KMxKN_backward_dW_B(
...
@@ -359,29 +358,29 @@ __global__ void dDeconvolution_KMxKN_backward_dW_B(
#define FOO(T, K, V) \
#define FOO(T, K, V) \
{ \
{ \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
u
Int o = (nHot / K) * K; \
Int o = (nHot / K) * K; \
if (o >= K) \
if (o >= K) \
dDeconvolution_KMxKN_backward_dW_A<
T, K, V> << <
\
dDeconvolution_KMxKN_backward_dW_A<
\
dim3(std::min(o / K, (
u
Int)512), input_nPlanes / K),
\
T, K, V><<<
dim3(std::min(o / K, (Int)512), input_nPlanes / K), \
dim3(K, K / V)
, 0, stream>>>
\
dim3(K, K / V)
>>>(
\
(
inFeatures, dInFeatures, dOutFeatures, w, dw, rules, o, \
inFeatures, dInFeatures, dOutFeatures, w, dw, rules, o,
\
input_nPlanes, input_stride, output_nPlanes, output_stride); \
input_nPlanes, input_stride, output_nPlanes, output_stride);
\
if (nHot > o) \
if (nHot > o) \
dDeconvolution_KMxKN_backward_dW_B<T, K, V> << < \
dDeconvolution_KMxKN_backward_dW_B< \
dim3(1, input_nPlanes / K), dim3(K, K / V), 0, stream>>> \
T, K, \
(inFeatures, dInFeatures, dOutFeatures, w, dw, rules + 2 * o, \
V><<<dim3(1, input_nPlanes / K), dim3(K, K / V)>>>( \
nHot - o, input_nPlanes, input_stride, output_nPlanes, \
inFeatures, dInFeatures, dOutFeatures, w, dw, rules + 2 * o, \
output_stride); \
nHot - o, input_nPlanes, input_stride, output_nPlanes, \
output_stride); \
return; \
return; \
} \
} \
}
}
template
<
typename
T
>
template
<
typename
T
>
void
dDeconvolution_backward_dW
(
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
void
dDeconvolution_backward_dW
(
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
T
*
w
,
T
*
dw
,
uInt
*
rules
,
uInt
nHot
,
T
*
w
,
T
*
dw
,
Int
*
rules
,
Int
nHot
,
uInt
input_nPlanes
,
uInt
input_stride
,
Int
input_nPlanes
,
Int
input_stride
,
uInt
output_nPlanes
,
uInt
output_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
cudaStream_t
stream
)
{
FOO
(
T
,
32
,
8
)
FOO
(
T
,
32
,
8
)
FOO
(
T
,
16
,
4
)
FOO
(
T
,
16
,
4
)
FOO
(
T
,
8
,
2
)
FOO
(
T
,
8
,
2
)
...
@@ -389,11 +388,11 @@ void dDeconvolution_backward_dW(T *inFeatures, T *dInFeatures, T *dOutFeatures,
...
@@ -389,11 +388,11 @@ void dDeconvolution_backward_dW(T *inFeatures, T *dInFeatures, T *dOutFeatures,
}
}
#undef FOO
#undef FOO
template
<
typename
T
,
u
Int
K
,
u
Int
V
>
template
<
typename
T
,
Int
K
,
Int
V
>
__global__
void
__global__
void
dDeconvolution_KMxKN_forward2
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
u
Int
*
rules
,
dDeconvolution_KMxKN_forward2
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
Int
*
rules
,
u
Int
nHot
,
u
Int
input_nPlanes
,
u
Int
input_stride
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
u
Int
output_nPlanes
,
u
Int
output_stride
)
{
Int
output_nPlanes
,
Int
output_stride
)
{
// Input x Weight -> Output
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// K is a multiple of V,
// K is a multiple of V,
...
@@ -401,17 +400,17 @@ dDeconvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, uInt *rules,
...
@@ -401,17 +400,17 @@ dDeconvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, uInt *rules,
// nHot x input_nplanes<=KM -> nHot x output_nPlanes<=KN
// nHot x input_nplanes<=KM -> nHot x output_nPlanes<=KN
// - parallel over N,nHot - loop over M
// - parallel over N,nHot - loop over M
u
Int
M
=
(
input_nPlanes
+
K
-
1
)
/
K
;
Int
M
=
(
input_nPlanes
+
K
-
1
)
/
K
;
// N = gridDim.y ~ output_nPlanes/K
// N = gridDim.y ~ output_nPlanes/K
u
Int
n
=
blockIdx
.
y
;
Int
n
=
blockIdx
.
y
;
outFeatures
+=
n
*
K
;
outFeatures
+=
n
*
K
;
w
+=
n
*
K
;
w
+=
n
*
K
;
u
Int
KO
=
min
(
K
,
output_nPlanes
-
K
*
n
);
Int
KO
=
min
(
K
,
output_nPlanes
-
K
*
n
);
T
O
[
V
];
T
O
[
V
];
__shared__
T
W
[
K
][
K
];
__shared__
T
W
[
K
][
K
];
__shared__
T
I
[
K
][
K
];
__shared__
T
I
[
K
][
K
];
__shared__
u
Int
R
[
K
*
2
];
__shared__
Int
R
[
K
*
2
];
const
int
tx
=
threadIdx
.
x
;
const
int
tx
=
threadIdx
.
x
;
int
ty
[
V
];
int
ty
[
V
];
#pragma unroll
#pragma unroll
...
@@ -419,7 +418,7 @@ dDeconvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, uInt *rules,
...
@@ -419,7 +418,7 @@ dDeconvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, uInt *rules,
ty
[
v
]
=
threadIdx
.
y
+
v
*
(
K
/
V
);
ty
[
v
]
=
threadIdx
.
y
+
v
*
(
K
/
V
);
for
(
int
m
=
0
;
m
<
M
;
m
++
)
{
for
(
int
m
=
0
;
m
<
M
;
m
++
)
{
u
Int
KI
=
min
(
K
,
input_nPlanes
-
K
*
m
);
Int
KI
=
min
(
K
,
input_nPlanes
-
K
*
m
);
// Read w
// Read w
#pragma unroll
#pragma unroll
...
@@ -427,7 +426,7 @@ dDeconvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, uInt *rules,
...
@@ -427,7 +426,7 @@ dDeconvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, uInt *rules,
if
(
ty
[
v
]
<
KI
and
tx
<
KO
)
if
(
ty
[
v
]
<
KI
and
tx
<
KO
)
W
[
ty
[
v
]][
tx
]
=
w
[
ty
[
v
]
*
output_nPlanes
+
tx
];
W
[
ty
[
v
]][
tx
]
=
w
[
ty
[
v
]
*
output_nPlanes
+
tx
];
for
(
u
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
for
(
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
// Read rules for K input/output pairs
// Read rules for K input/output pairs
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
...
@@ -467,48 +466,47 @@ dDeconvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, uInt *rules,
...
@@ -467,48 +466,47 @@ dDeconvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, uInt *rules,
}
}
template
<
typename
T
>
template
<
typename
T
>
void
dDeconvolution_forward2
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
uInt
*
rules
,
void
dDeconvolution_forward2
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
Int
*
rules
,
uInt
nHot
,
uInt
input_nPlanes
,
uInt
input_stride
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
uInt
output_nPlanes
,
uInt
output_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
cudaStream_t
stream
)
{
if
(
input_nPlanes
%
8
!=
0
or
output_nPlanes
%
8
!=
0
)
{
if
(
input_nPlanes
%
8
!=
0
or
output_nPlanes
%
8
!=
0
)
{
const
int
K
=
16
;
const
int
K
=
16
;
const
int
V
=
4
;
const
int
V
=
4
;
dDeconvolution_KMxKN_forward2
<
T
,
K
,
V
>
<<
<
dDeconvolution_KMxKN_forward2
<
T
,
K
,
V
><<<
dim3
(
128
,
(
output_nPlanes
+
K
-
1
)
/
K
),
dim3
(
K
,
K
/
V
)
,
0
,
stream
>>>
dim3
(
128
,
(
output_nPlanes
+
K
-
1
)
/
K
),
dim3
(
K
,
K
/
V
)
>>>
(
(
inFeatures
,
outFeatures
,
w
,
rules
,
nHot
,
input_nPlanes
,
input_stride
,
inFeatures
,
outFeatures
,
w
,
rules
,
nHot
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
);
output_nPlanes
,
output_stride
);
return
;
return
;
}
else
{
}
else
{
dDeconvolution_forward
(
inFeatures
,
outFeatures
,
w
,
rules
,
nHot
,
dDeconvolution_forward
(
inFeatures
,
outFeatures
,
w
,
rules
,
nHot
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
,
stream
);
output_stride
);
}
}
}
}
// dOutput x W^T -> dInput and
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template
<
typename
T
,
u
Int
K
,
u
Int
V
>
template
<
typename
T
,
Int
K
,
Int
V
>
__global__
void
dDeconvolution_KMxKN_backward_dW2
(
__global__
void
dDeconvolution_KMxKN_backward_dW2
(
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
T
*
w
,
T
*
dw
,
u
Int
*
rules
,
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
T
*
w
,
T
*
dw
,
Int
*
rules
,
u
Int
nHot
,
u
Int
input_nPlanes
,
u
Int
input_stride
,
u
Int
output_nPlanes
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
u
Int
output_stride
)
{
Int
output_stride
)
{
// M = gridDim.y == input_nPlanes / K
// M = gridDim.y == input_nPlanes / K
u
Int
N
=
(
output_nPlanes
+
K
-
1
)
/
K
;
Int
N
=
(
output_nPlanes
+
K
-
1
)
/
K
;
u
Int
m
=
blockIdx
.
y
;
Int
m
=
blockIdx
.
y
;
inFeatures
+=
m
*
K
;
inFeatures
+=
m
*
K
;
dInFeatures
+=
m
*
K
;
dInFeatures
+=
m
*
K
;
w
+=
m
*
K
*
output_nPlanes
;
w
+=
m
*
K
*
output_nPlanes
;
dw
+=
m
*
K
*
output_nPlanes
;
dw
+=
m
*
K
*
output_nPlanes
;
u
Int
KI
=
min
(
K
,
input_nPlanes
-
K
*
m
);
Int
KI
=
min
(
K
,
input_nPlanes
-
K
*
m
);
T
dI
[
V
];
T
dI
[
V
];
T
dW
[
V
];
T
dW
[
V
];
__shared__
T
I
[
K
][
K
];
__shared__
T
I
[
K
][
K
];
__shared__
T
dO
[
K
][
K
];
__shared__
T
dO
[
K
][
K
];
__shared__
T
W
[
K
][
K
];
__shared__
T
W
[
K
][
K
];
__shared__
u
Int
R
[
K
*
2
];
__shared__
Int
R
[
K
*
2
];
const
int
tx
=
threadIdx
.
x
;
const
int
tx
=
threadIdx
.
x
;
int
ty
[
V
];
int
ty
[
V
];
#pragma unroll
#pragma unroll
...
@@ -516,7 +514,7 @@ __global__ void dDeconvolution_KMxKN_backward_dW2(
...
@@ -516,7 +514,7 @@ __global__ void dDeconvolution_KMxKN_backward_dW2(
ty
[
v
]
=
threadIdx
.
y
+
v
*
(
K
/
V
);
ty
[
v
]
=
threadIdx
.
y
+
v
*
(
K
/
V
);
for
(
int
n
=
0
;
n
<
N
;
n
++
)
{
for
(
int
n
=
0
;
n
<
N
;
n
++
)
{
u
Int
KO
=
min
(
K
,
output_nPlanes
-
K
*
n
);
Int
KO
=
min
(
K
,
output_nPlanes
-
K
*
n
);
// Read w, reset dW
// Read w, reset dW
#pragma unroll
#pragma unroll
...
@@ -526,7 +524,7 @@ __global__ void dDeconvolution_KMxKN_backward_dW2(
...
@@ -526,7 +524,7 @@ __global__ void dDeconvolution_KMxKN_backward_dW2(
dW
[
v
]
=
0
;
dW
[
v
]
=
0
;
}
}
for
(
u
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
for
(
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
// Read rules for K input/output pairs, reset dI[]
// Read rules for K input/output pairs, reset dI[]
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
...
@@ -580,23 +578,22 @@ __global__ void dDeconvolution_KMxKN_backward_dW2(
...
@@ -580,23 +578,22 @@ __global__ void dDeconvolution_KMxKN_backward_dW2(
template
<
typename
T
>
template
<
typename
T
>
void
dDeconvolution_backward_dW2
(
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
void
dDeconvolution_backward_dW2
(
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
T
*
w
,
T
*
dw
,
uInt
*
rules
,
uInt
nHot
,
T
*
w
,
T
*
dw
,
Int
*
rules
,
Int
nHot
,
uInt
input_nPlanes
,
uInt
input_stride
,
Int
input_nPlanes
,
Int
input_stride
,
uInt
output_nPlanes
,
uInt
output_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
cudaStream_t
stream
)
{
if
(
input_nPlanes
%
8
!=
0
or
output_nPlanes
%
8
!=
0
)
{
if
(
input_nPlanes
%
8
!=
0
or
output_nPlanes
%
8
!=
0
)
{
const
int
K
=
16
;
const
int
K
=
16
;
const
int
V
=
4
;
const
int
V
=
4
;
dDeconvolution_KMxKN_backward_dW2
<
T
,
K
,
V
>
<<
<
dDeconvolution_KMxKN_backward_dW2
<
T
,
K
,
V
><<<
dim3
(
128
,
(
input_nPlanes
+
K
-
1
)
/
K
),
dim3
(
K
,
K
/
V
)
,
0
,
stream
>>>
dim3
(
128
,
(
input_nPlanes
+
K
-
1
)
/
K
),
dim3
(
K
,
K
/
V
)
>>>
(
(
inFeatures
,
dInFeatures
,
dOutFeatures
,
w
,
dw
,
rules
,
nHot
,
inFeatures
,
dInFeatures
,
dOutFeatures
,
w
,
dw
,
rules
,
nHot
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
);
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
);
return
;
return
;
}
else
{
}
else
{
dDeconvolution_backward_dW
(
inFeatures
,
dInFeatures
,
dOutFeatures
,
w
,
dw
,
dDeconvolution_backward_dW
(
inFeatures
,
dInFeatures
,
dOutFeatures
,
w
,
dw
,
rules
,
nHot
,
input_nPlanes
,
input_stride
,
rules
,
nHot
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
,
stream
);
output_nPlanes
,
output_stride
);
}
}
}
}
#endif
/*
GPU
_DECONVOLUTION_H */
#endif
/*
CUDA
_DECONVOLUTION_H */
sparseconvnet/SCN/CUDA/IOLayers.cu
0 → 100644
View file @
2c4ed608
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "IOLayers.h"
template
<
typename
T
,
Int
Dimension
>
void
cuda_InputLayer_updateOutput
(
Metadata
<
Dimension
>
&
m
,
/*long*/
at
::
Tensor
spatialSize
,
/*long*/
at
::
Tensor
input_coords
,
/*cuda float*/
at
::
Tensor
input_features
,
/*cuda float*/
at
::
Tensor
output_features
,
long
batchSize
,
long
mode
)
{
m
.
inputLayer
(
spatialSize
,
input_coords
,
batchSize
,
mode
);
Int
nPlanes
=
input_features
.
size
(
1
);
auto
&
rules
=
m
.
inputLayerRuleBook
;
Int
maxActive
=
rules
[
0
][
1
];
Int
nRows
=
rules
[
0
][
3
];
if
(
mode
==
0
)
{
output_features
.
resize_as_
(
input_features
);
output_features
.
copy_
(
input_features
);
}
else
{
output_features
.
resize_
({
*
m
.
inputNActive
,
nPlanes
});
output_features
.
zero_
();
auto
rulesBuffer
=
at
::
CUDA
(
at_kINT
).
tensor
({(
int
)
rules
[
1
].
size
()});
auto
iF
=
input_features
.
data
<
T
>
();
auto
oF
=
output_features
.
data
<
T
>
();
Int
*
rb
=
rulesBuffer
.
data
<
Int
>
();
cudaMemcpy
(
rb
,
&
rules
[
1
][
0
],
sizeof
(
Int
)
*
rules
[
1
].
size
(),
cudaMemcpyHostToDevice
);
InputLayer_fp
<
T
><<<
std
::
min
(
nRows
,
(
Int
)
32768
),
std
::
min
(
nPlanes
,
(
Int
)
32
)
>>>
(
iF
,
oF
,
nRows
,
maxActive
,
nPlanes
,
rb
,
mode
==
4
);
}
}
template
<
typename
T
,
Int
Dimension
>
void
cuda_InputLayer_updateGradInput
(
Metadata
<
Dimension
>
&
m
,
/*cuda float*/
at
::
Tensor
d_input_features
,
/*cuda float*/
at
::
Tensor
d_output_features
)
{
auto
&
rules
=
m
.
inputLayerRuleBook
;
Int
nPlanes
=
d_output_features
.
size
(
1
);
auto
mode
=
rules
[
0
][
0
];
Int
maxActive
=
rules
[
0
][
1
];
Int
nRows
=
rules
[
0
][
3
];
if
(
mode
==
0
)
{
d_input_features
.
resize_as_
(
d_output_features
);
d_input_features
.
copy_
(
d_output_features
);
}
else
{
d_input_features
.
resize_
({
rules
[
0
][
2
],
nPlanes
});
d_input_features
.
zero_
();
auto
rulesBuffer
=
at
::
CUDA
(
at_kINT
).
tensor
({(
int
)
rules
[
1
].
size
()});
auto
diF
=
d_input_features
.
data
<
T
>
();
auto
doF
=
d_output_features
.
data
<
T
>
();
Int
*
rb
=
rulesBuffer
.
data
<
Int
>
();
cudaMemcpy
(
rb
,
&
rules
[
1
][
0
],
sizeof
(
Int
)
*
rules
[
1
].
size
(),
cudaMemcpyHostToDevice
);
InputLayer_bp
<
T
><<<
std
::
min
(
nRows
,
(
Int
)
32768
),
std
::
min
(
nPlanes
,
(
Int
)
32
)
>>>
(
diF
,
doF
,
nRows
,
maxActive
,
nPlanes
,
rb
,
mode
==
4
);
}
}
template
<
typename
T
,
Int
Dimension
>
void
cuda_OutputLayer_updateOutput
(
Metadata
<
Dimension
>
&
m
,
/*cuda float*/
at
::
Tensor
input_features
,
/*cuda float*/
at
::
Tensor
output_features
)
{
auto
&
rules
=
m
.
inputLayerRuleBook
;
Int
nPlanes
=
input_features
.
size
(
1
);
auto
mode
=
rules
[
0
][
0
];
auto
maxActive
=
rules
[
0
][
1
];
auto
nRows
=
rules
[
0
][
3
];
if
(
mode
==
0
)
{
output_features
.
resize_as_
(
input_features
);
output_features
.
copy_
(
input_features
);
}
else
{
output_features
.
resize_
({
rules
[
0
][
2
],
nPlanes
});
output_features
.
zero_
();
auto
rulesBuffer
=
at
::
CUDA
(
at_kINT
).
tensor
({(
int
)
rules
[
1
].
size
()});
auto
iF
=
input_features
.
data
<
T
>
();
auto
oF
=
output_features
.
data
<
T
>
();
Int
*
rb
=
rulesBuffer
.
data
<
Int
>
();
cudaMemcpy
(
rb
,
&
rules
[
1
][
0
],
sizeof
(
Int
)
*
rules
[
1
].
size
(),
cudaMemcpyHostToDevice
);
InputLayer_bp
<
T
><<<
std
::
min
(
nRows
,
(
Int
)
32768
),
std
::
min
(
nPlanes
,
(
Int
)
32
)
>>>
(
oF
,
iF
,
nRows
,
maxActive
,
nPlanes
,
rb
,
false
);
}
}
// Backward pass of the sparse OutputLayer.
// Mirror of cuda_OutputLayer_updateOutput: since the forward used the input
// layer's backward kernel, the gradient uses the input layer's forward kernel
// (InputLayer_fp), gathering d_output_features rows into d_input_features.
template <typename T, Int Dimension>
void cuda_OutputLayer_updateGradInput(Metadata<Dimension> &m,
                                      /*cuda float*/ at::Tensor d_input_features,
                                      /*cuda float*/ at::Tensor d_output_features) {
  auto &rules = m.inputLayerRuleBook;
  Int nPlanes = d_output_features.size(1);
  auto mode = rules[0][0];
  auto maxActive = rules[0][1];
  auto nRows = rules[0][3];
  if (mode == 0) {
    // Mode 0: identity — gradient passes straight through.
    d_input_features.resize_as_(d_output_features);
    d_input_features.copy_(d_output_features);
  } else {
    // Gradient has one row per rulebook row.
    d_input_features.resize_({nRows, nPlanes});
    d_input_features.zero_();
    // Stage the host-side rulebook into device memory for the kernel.
    auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
    auto diF = d_input_features.data<T>();
    auto doF = d_output_features.data<T>();
    Int *rb = rulesBuffer.data<Int>();
    cudaMemcpy(rb, &rules[1][0], sizeof(Int) * rules[1].size(),
               cudaMemcpyHostToDevice);
    // average=false: plain sum/gather, no normalization in the backward pass.
    InputLayer_fp<T><<<std::min(nRows, (Int)32768),
                       std::min(nPlanes, (Int)32)>>>(
        doF, diF, nRows, maxActive, nPlanes, rb, false);
  }
}
// Forward pass of the batched "BL" (batch x length) input layer.
// Builds the BL rulebook from the padded input coordinates, then gathers the
// 3-D input_features (batch, length, planes — rank implied by size(2) below)
// into a flat (nActive, nPlanes) sparse feature matrix.
// mode==4 selects averaging in the kernel (average=mode==4).
template <typename T, Int Dimension>
void cuda_BLInputLayer_updateOutput(Metadata<Dimension> &m,
                                    /*long*/ at::Tensor spatialSize,
                                    /*long*/ at::Tensor input_coords,
                                    /*cuda float*/ at::Tensor input_features,
                                    /*cuda float*/ at::Tensor output_features,
                                    long mode) {
  // Construct the rulebook / active-site metadata for this input.
  m.blLayer(spatialSize, input_coords, mode);
  Int nPlanes = input_features.size(2);
  output_features.resize_({*m.inputNActive, nPlanes});
  output_features.zero_();
  auto &rules = m.blLayerRuleBook;
  // BL rulebook header: [1]=maxActive, [4]=nRows — as used below.
  Int maxActive = rules[0][1];
  Int nRows = rules[0][4];
  if (mode == 0) {
    // Mode 0: raw copy; the trailing resize_ flattens (batch, length) into
    // a single row dimension without touching the data.
    output_features.resize_as_(input_features);
    output_features.copy_(input_features);
    output_features.resize_({*m.inputNActive, nPlanes});
  } else {
    // Stage the host-side rulebook into device memory for the kernel.
    auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
    auto iF = input_features.data<T>();
    auto oF = output_features.data<T>();
    Int *rb = rulesBuffer.data<Int>();
    cudaMemcpy(rb, &rules[1][0], sizeof(Int) * rules[1].size(),
               cudaMemcpyHostToDevice);
    InputLayer_fp<T><<<std::min(nRows, (Int)32768),
                       std::min(nPlanes, (Int)32)>>>(
        iF, oF, nRows, maxActive, nPlanes, rb, mode == 4);
  }
}
// Backward pass of the BL input layer: scatters d_output_features back into
// the (batch, length, planes) gradient tensor using InputLayer_bp, which
// atomically accumulates into d_input_features.
// rules[0][2] and rules[0][3] are used as the first two dims of the gradient
// (the original batch and length sizes).
template <typename T, Int Dimension>
void cuda_BLInputLayer_updateGradInput(Metadata<Dimension> &m,
                                       /*cuda float*/ at::Tensor d_input_features,
                                       /*cuda float*/ at::Tensor d_output_features) {
  auto &rules = m.blLayerRuleBook;
  Int nPlanes = d_output_features.size(1);
  Int mode = rules[0][0];
  Int maxActive = rules[0][1];
  Int nRows = rules[0][4];
  if (mode == 0) {
    // Mode 0: pass-through copy, then reshape flat rows back to 3-D.
    d_input_features.resize_as_(d_output_features);
    d_input_features.copy_(d_output_features);
    d_input_features.resize_({rules[0][2], rules[0][3], nPlanes});
  } else {
    d_input_features.resize_({rules[0][2], rules[0][3], nPlanes});
    d_input_features.zero_();
    // Stage the host-side rulebook into device memory for the kernel.
    auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
    auto diF = d_input_features.data<T>();
    auto doF = d_output_features.data<T>();
    Int *rb = rulesBuffer.data<Int>();
    cudaMemcpy(rb, &rules[1][0], sizeof(Int) * rules[1].size(),
               cudaMemcpyHostToDevice);
    // average matches the forward pass: mode==4 divides by nActive.
    InputLayer_bp<T><<<std::min(nRows, (Int)32768),
                       std::min(nPlanes, (Int)32)>>>(
        diF, doF, nRows, maxActive, nPlanes, rb, mode == 4);
  }
}
// Forward pass of the BL output layer: inverse of the BL input layer.
// Scatters the flat (nActive, nPlanes) sparse features back into the
// (rules[0][2], rules[0][3], nPlanes) batched layout, reusing the input
// layer's backward kernel (InputLayer_bp) with average=false.
template <typename T, Int Dimension>
void cuda_BLOutputLayer_updateOutput(Metadata<Dimension> &m,
                                     /*cuda float*/ at::Tensor input_features,
                                     /*cuda float*/ at::Tensor output_features) {
  auto &rules = m.blLayerRuleBook;
  Int nPlanes = input_features.size(1);
  auto mode = rules[0][0];
  Int maxActive = rules[0][1];
  Int nRows = rules[0][4];
  if (mode == 0) {
    // Mode 0: copy, then reshape flat rows back to the 3-D batched layout.
    output_features.resize_as_(input_features);
    output_features.copy_(input_features);
    output_features.resize_({rules[0][2], rules[0][3], nPlanes});
  } else {
    output_features.resize_({rules[0][2], rules[0][3], nPlanes});
    output_features.zero_();
    // Stage the host-side rulebook into device memory for the kernel.
    auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
    auto iF = input_features.data<T>();
    auto oF = output_features.data<T>();
    Int *rb = rulesBuffer.data<Int>();
    cudaMemcpy(rb, &rules[1][0], sizeof(Int) * rules[1].size(),
               cudaMemcpyHostToDevice);
    InputLayer_bp<T><<<std::min(nRows, (Int)32768),
                       std::min(nPlanes, (Int)32)>>>(
        oF, iF, nRows, maxActive, nPlanes, rb, false);
  }
}
// Backward pass of the BL output layer: gathers the 3-D batched gradient
// (nPlanes taken from d_output_features.size(2)) back into the flat
// (nRows, nPlanes) sparse gradient using InputLayer_fp, average=false.
template <typename T, Int Dimension>
void cuda_BLOutputLayer_updateGradInput(Metadata<Dimension> &m,
                                        /*cuda float*/ at::Tensor d_input_features,
                                        /*cuda float*/ at::Tensor d_output_features) {
  auto &rules = m.blLayerRuleBook;
  Int nPlanes = d_output_features.size(2);
  Int mode = rules[0][0];
  Int maxActive = rules[0][1];
  Int nRows = rules[0][4];
  if (mode == 0) {
    // Mode 0: copy, then flatten the batched gradient to (nRows, nPlanes).
    d_input_features.resize_as_(d_output_features);
    d_input_features.copy_(d_output_features);
    d_input_features.resize_({nRows, nPlanes});
  } else {
    d_input_features.resize_({nRows, nPlanes});
    d_input_features.zero_();
    // Stage the host-side rulebook into device memory for the kernel.
    auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
    auto diF = d_input_features.data<T>();
    auto doF = d_output_features.data<T>();
    Int *rb = rulesBuffer.data<Int>();
    cudaMemcpy(rb, &rules[1][0], sizeof(Int) * rules[1].size(),
               cudaMemcpyHostToDevice);
    InputLayer_fp<T><<<std::min(nRows, (Int)32768),
                       std::min(nPlanes, (Int)32)>>>(
        doF, diF, nRows, maxActive, nPlanes, rb, false);
  }
}
sparseconvnet/SCN/
generic/GPU
/IOLayers.h
→
sparseconvnet/SCN/
CUDA
/IOLayers.h
View file @
2c4ed608
...
@@ -4,21 +4,21 @@
...
@@ -4,21 +4,21 @@
// This source code is licensed under the license found in the
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
// LICENSE file in the root directory of this source tree.
#ifndef
GPU
_IOLAYERS_H
#ifndef
CUDA
_IOLAYERS_H
#define
GPU
_IOLAYERS_H
#define
CUDA
_IOLAYERS_H
template
<
typename
T
>
template
<
typename
T
>
__global__
void
InputLayer_fp
(
T
*
input_features
,
T
*
output_features
,
__global__
void
InputLayer_fp
(
T
*
input_features
,
T
*
output_features
,
u
Int
nRows
,
u
Int
maxActive
,
u
Int
nPlanes
,
Int
nRows
,
Int
maxActive
,
Int
nPlanes
,
u
Int
*
rules
,
bool
average
)
{
Int
*
rules
,
bool
average
)
{
for
(
int
row
=
blockIdx
.
x
;
row
<
nRows
;
row
+=
gridDim
.
x
)
{
for
(
int
row
=
blockIdx
.
x
;
row
<
nRows
;
row
+=
gridDim
.
x
)
{
T
*
out
=
output_features
+
row
*
nPlanes
;
T
*
out
=
output_features
+
row
*
nPlanes
;
u
Int
*
r
=
rules
+
row
*
(
1
+
maxActive
);
Int
*
r
=
rules
+
row
*
(
1
+
maxActive
);
u
Int
nActive
=
r
[
0
];
Int
nActive
=
r
[
0
];
T
multiplier
=
(
average
and
nActive
>
0
)
?
1.0
f
/
nActive
:
1.0
f
;
T
multiplier
=
(
average
and
nActive
>
0
)
?
1.0
f
/
nActive
:
1.0
f
;
for
(
int
i
=
1
;
i
<=
nActive
;
i
++
)
{
for
(
int
i
=
1
;
i
<=
nActive
;
i
++
)
{
T
*
inp
=
input_features
+
r
[
i
]
*
nPlanes
;
T
*
inp
=
input_features
+
r
[
i
]
*
nPlanes
;
for
(
u
Int
plane
=
threadIdx
.
x
;
plane
<
nPlanes
;
plane
+=
blockDim
.
x
)
for
(
Int
plane
=
threadIdx
.
x
;
plane
<
nPlanes
;
plane
+=
blockDim
.
x
)
out
[
plane
]
+=
multiplier
*
inp
[
plane
];
out
[
plane
]
+=
multiplier
*
inp
[
plane
];
}
}
}
}
...
@@ -26,18 +26,18 @@ __global__ void InputLayer_fp(T *input_features, T *output_features,
...
@@ -26,18 +26,18 @@ __global__ void InputLayer_fp(T *input_features, T *output_features,
template
<
typename
T
>
template
<
typename
T
>
__global__
void
InputLayer_bp
(
T
*
d_input_features
,
T
*
d_output_features
,
__global__
void
InputLayer_bp
(
T
*
d_input_features
,
T
*
d_output_features
,
u
Int
nRows
,
u
Int
maxActive
,
u
Int
nPlanes
,
Int
nRows
,
Int
maxActive
,
Int
nPlanes
,
u
Int
*
rules
,
bool
average
)
{
Int
*
rules
,
bool
average
)
{
for
(
int
row
=
blockIdx
.
x
;
row
<
nRows
;
row
+=
gridDim
.
x
)
{
for
(
int
row
=
blockIdx
.
x
;
row
<
nRows
;
row
+=
gridDim
.
x
)
{
T
*
out
=
d_output_features
+
row
*
nPlanes
;
T
*
out
=
d_output_features
+
row
*
nPlanes
;
u
Int
*
r
=
rules
+
row
*
(
1
+
maxActive
);
Int
*
r
=
rules
+
row
*
(
1
+
maxActive
);
u
Int
nActive
=
r
[
0
];
Int
nActive
=
r
[
0
];
T
multiplier
=
(
average
and
nActive
>
0
)
?
1.0
f
/
nActive
:
1.0
f
;
T
multiplier
=
(
average
and
nActive
>
0
)
?
1.0
f
/
nActive
:
1.0
f
;
for
(
int
i
=
1
;
i
<=
nActive
;
i
++
)
{
for
(
int
i
=
1
;
i
<=
nActive
;
i
++
)
{
T
*
inp
=
d_input_features
+
r
[
i
]
*
nPlanes
;
T
*
inp
=
d_input_features
+
r
[
i
]
*
nPlanes
;
for
(
u
Int
plane
=
threadIdx
.
x
;
plane
<
nPlanes
;
plane
+=
blockDim
.
x
)
for
(
Int
plane
=
threadIdx
.
x
;
plane
<
nPlanes
;
plane
+=
blockDim
.
x
)
atomicAdd
(
&
inp
[
plane
],
multiplier
*
out
[
plane
]);
atomicAdd
(
&
inp
[
plane
],
multiplier
*
out
[
plane
]);
}
}
}
}
}
}
#endif
/*
GPU
_IOLAYERS_H */
#endif
/*
CUDA
_IOLAYERS_H */
sparseconvnet/SCN/CUDA/LeakyReLU.cu
0 → 100644
View file @
2c4ed608
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "LeakyReLU.h"
// Elementwise LeakyReLU forward: output[i] = input[i] > 0 ? input[i]
//                                                         : alpha * input[i].
// Launch config is fixed at 16 blocks x 1024 threads; the kernel itself
// strides by 16*1024, so any element count is covered.
template <typename T>
void cuda_LeakyReLU_updateOutput(/*cuda float*/ at::Tensor input_features,
                                 /*cuda float*/ at::Tensor output_features,
                                 float alpha) {
  output_features.resize_as_(input_features);
  auto count = input_features.numel();
  T *inPtr = input_features.data<T>();
  T *outPtr = output_features.data<T>();
  LeakyReLU_fp<T><<<16, 1024>>>(inPtr, outPtr, count, alpha);
}
// Elementwise LeakyReLU backward: d_input[i] = d_output[i] scaled by alpha
// wherever the forward input was <= 0, passed through otherwise.
// Same fixed 16x1024 launch as the forward; the kernel grid-strides.
template <typename T>
void cuda_LeakyReLU_updateGradInput(/*cuda float*/ at::Tensor input_features,
                                    /*cuda float*/ at::Tensor d_input_features,
                                    /*cuda float*/ at::Tensor d_output_features,
                                    float alpha) {
  d_input_features.resize_as_(d_output_features);
  auto count = d_input_features.numel();
  T *inPtr = input_features.data<T>();
  T *dInPtr = d_input_features.data<T>();
  T *dOutPtr = d_output_features.data<T>();
  LeakyReLU_bp<T><<<16, 1024>>>(inPtr, dInPtr, dOutPtr, count, alpha);
}
sparseconvnet/SCN/
generic/GPU
/LeakyReLU.h
→
sparseconvnet/SCN/
CUDA
/LeakyReLU.h
View file @
2c4ed608
...
@@ -8,16 +8,16 @@
...
@@ -8,16 +8,16 @@
#define LEAKYRELU_H
#define LEAKYRELU_H
template
<
typename
T
>
template
<
typename
T
>
__global__
void
LeakyReLU_fp
(
T
*
input_features
,
T
*
output_features
,
u
Int
n
,
__global__
void
LeakyReLU_fp
(
T
*
input_features
,
T
*
output_features
,
Int
n
,
T
alpha
)
{
T
alpha
)
{
for
(
u
Int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
i
<
n
;
i
+=
16
*
1024
)
for
(
Int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
i
<
n
;
i
+=
16
*
1024
)
output_features
[
i
]
=
(
input_features
[
i
]
>
0
)
?
input_features
[
i
]
output_features
[
i
]
=
(
input_features
[
i
]
>
0
)
?
input_features
[
i
]
:
(
input_features
[
i
]
*
alpha
);
:
(
input_features
[
i
]
*
alpha
);
}
}
template
<
typename
T
>
template
<
typename
T
>
__global__
void
LeakyReLU_bp
(
T
*
input_features
,
T
*
d_input_features
,
__global__
void
LeakyReLU_bp
(
T
*
input_features
,
T
*
d_input_features
,
T
*
d_output_features
,
u
Int
n
,
T
alpha
)
{
T
*
d_output_features
,
Int
n
,
T
alpha
)
{
for
(
u
Int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
i
<
n
;
i
+=
16
*
1024
)
for
(
Int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
i
<
n
;
i
+=
16
*
1024
)
d_input_features
[
i
]
=
(
input_features
[
i
]
>
0
)
d_input_features
[
i
]
=
(
input_features
[
i
]
>
0
)
?
d_output_features
[
i
]
?
d_output_features
[
i
]
:
(
d_output_features
[
i
]
*
alpha
);
:
(
d_output_features
[
i
]
*
alpha
);
...
...
sparseconvnet/SCN/CUDA/MaxPooling.cu
0 → 100644
View file @
2c4ed608
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "MaxPooling.h"
#include "RuleBookIterator.h"
// Sparse max-pooling forward pass.
// Builds/fetches the pooling rulebook from the metadata, sizes the output to
// (nActive, nPlanes), and runs the forward kernel over each rulebook chunk
// via the RULEBOOKITERATOR macro (which stages rules on the device and binds
// rbB / nHotB for the expression passed as its first argument).
// nFeaturesToDrop skips that many leading feature planes of the input.
template <typename T, Int Dimension>
void cuda_MaxPooling_updateOutput(/*long*/ at::Tensor inputSize,
                                  /*long*/ at::Tensor outputSize,
                                  /*long*/ at::Tensor poolSize,
                                  /*long*/ at::Tensor poolStride,
                                  Metadata<Dimension> &m,
                                  /*cuda float*/ at::Tensor input_features,
                                  /*cuda float*/ at::Tensor output_features,
                                  long nFeaturesToDrop) {
  Int nPlanes = input_features.size(1) - nFeaturesToDrop;
  auto _rules =
      m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  Int nActive = m.getNActive(outputSize);
  output_features.resize_({nActive, nPlanes});
  // NOTE: output starts at zero; the kernel overwrites with larger inputs
  // (see MaxPooling_fp's "<" compare), so zero is the identity baseline here.
  output_features.zero_();
  // Offset past the dropped planes; full input row width is still passed as
  // the input stride so row indexing stays correct.
  auto iF = input_features.data<T>() + nFeaturesToDrop;
  auto oF = output_features.data<T>();
  RULEBOOKITERATOR(cuda_MaxPooling_ForwardPass<T>(
                       iF, oF, nPlanes, input_features.size(1),
                       output_features.size(1), rbB, nHotB);
                   , )
}
// Sparse max-pooling backward pass.
// Routes each output gradient back to the input site(s) whose forward value
// equals the pooled maximum (see MaxPooling_bp). d_input_features is zeroed
// first so non-selected sites receive zero gradient.
template <typename T, Int Dimension>
void cuda_MaxPooling_updateGradInput(/*long*/ at::Tensor inputSize,
                                     /*long*/ at::Tensor outputSize,
                                     /*long*/ at::Tensor poolSize,
                                     /*long*/ at::Tensor poolStride,
                                     Metadata<Dimension> &m,
                                     /*cuda float*/ at::Tensor input_features,
                                     /*cuda float*/ at::Tensor d_input_features,
                                     /*cuda float*/ at::Tensor output_features,
                                     /*cuda float*/ at::Tensor d_output_features,
                                     long nFeaturesToDrop) {
  Int nPlanes = input_features.size(1) - nFeaturesToDrop;
  auto _rules =
      m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();
  // NOTE(review): unlike the forward pass, these pointers are not offset by
  // nFeaturesToDrop — confirm intended when nFeaturesToDrop > 0.
  auto iF = input_features.data<T>();
  auto oF = output_features.data<T>();
  auto diF = d_input_features.data<T>();
  auto doF = d_output_features.data<T>();
  // RULEBOOKITERATOR stages each rulebook chunk on the device and binds
  // rbB / nHotB for the kernel-launch expression.
  RULEBOOKITERATOR(cuda_MaxPooling_BackwardPass<T>(
                       iF, diF, oF, doF, nPlanes, input_features.size(1),
                       d_output_features.size(1), rbB, nHotB);
                   , )
}
// Forward pass of max pooling with randomized strides.
// Identical to cuda_MaxPooling_updateOutput except the rulebook comes from
// getRandomizedStrideRuleBook, which randomizes pooling-window placement.
template <typename T, Int Dimension>
void cuda_RandomizedStrideMaxPooling_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor poolSize, /*long*/ at::Tensor poolStride,
    Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features, long nFeaturesToDrop) {
  Int nPlanes = input_features.size(1) - nFeaturesToDrop;
  auto _rules = m.getRandomizedStrideRuleBook(inputSize, outputSize, poolSize,
                                              poolStride, true);
  Int nActive = m.getNActive(outputSize);
  output_features.resize_({nActive, nPlanes});
  output_features.zero_();
  // Skip the dropped leading planes; original row stride is passed below.
  auto iF = input_features.data<T>() + nFeaturesToDrop;
  auto oF = output_features.data<T>();
  // RULEBOOKITERATOR binds rbB / nHotB per rulebook chunk.
  RULEBOOKITERATOR(cuda_MaxPooling_ForwardPass<T>(
                       iF, oF, nPlanes, input_features.size(1),
                       output_features.size(1), rbB, nHotB);
                   , )
}
// Backward pass of max pooling with randomized strides.
// Identical to cuda_MaxPooling_updateGradInput except the rulebook comes
// from getRandomizedStrideRuleBook (must match the forward pass's rulebook).
template <typename T, Int Dimension>
void cuda_RandomizedStrideMaxPooling_updateGradInput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor poolSize, /*long*/ at::Tensor poolStride,
    Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor output_features,
    /*cuda float*/ at::Tensor d_output_features, long nFeaturesToDrop) {
  Int nPlanes = input_features.size(1) - nFeaturesToDrop;
  auto _rules = m.getRandomizedStrideRuleBook(inputSize, outputSize, poolSize,
                                              poolStride, true);
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();
  // NOTE(review): pointers not offset by nFeaturesToDrop here (the forward
  // pass offsets iF) — confirm intended when nFeaturesToDrop > 0.
  auto iF = input_features.data<T>();
  auto oF = output_features.data<T>();
  auto diF = d_input_features.data<T>();
  auto doF = d_output_features.data<T>();
  RULEBOOKITERATOR(cuda_MaxPooling_BackwardPass<T>(
                       iF, diF, oF, doF, nPlanes, input_features.size(1),
                       d_output_features.size(1), rbB, nHotB);
                   , )
}
sparseconvnet/SCN/
generic/GPU
/MaxPooling.h
→
sparseconvnet/SCN/
CUDA
/MaxPooling.h
View file @
2c4ed608
...
@@ -4,26 +4,26 @@
...
@@ -4,26 +4,26 @@
// This source code is licensed under the license found in the
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
// LICENSE file in the root directory of this source tree.
#ifndef
GPU
_MAXPOOLING_H
#ifndef
CUDA
_MAXPOOLING_H
#define
GPU
_MAXPOOLING_H
#define
CUDA
_MAXPOOLING_H
// NTX must be >=2 so r is filled properly
// NTX must be >=2 so r is filled properly
template
<
typename
T
,
u
Int
NTX
,
u
Int
NTY
>
template
<
typename
T
,
Int
NTX
,
Int
NTY
>
__global__
void
MaxPooling_fp
(
T
*
input_features
,
T
*
output_features
,
__global__
void
MaxPooling_fp
(
T
*
input_features
,
T
*
output_features
,
u
Int
nPlanes
,
u
Int
input_stride
,
Int
nPlanes
,
Int
input_stride
,
Int
output_stride
,
uInt
output_stride
,
u
Int
*
rules
,
u
Int
nHot
)
{
Int
*
rules
,
Int
nHot
)
{
__shared__
u
Int
r
[
NTY
*
2
];
__shared__
Int
r
[
NTY
*
2
];
for
(
u
Int
n
=
blockIdx
.
x
*
NTY
;
n
<
nHot
;
n
+=
gridDim
.
x
*
NTY
)
{
for
(
Int
n
=
blockIdx
.
x
*
NTY
;
n
<
nHot
;
n
+=
gridDim
.
x
*
NTY
)
{
{
{
u
Int
i
=
threadIdx
.
x
+
NTX
*
threadIdx
.
y
;
Int
i
=
threadIdx
.
x
+
NTX
*
threadIdx
.
y
;
if
(
i
<
NTY
*
2
and
i
<
2
*
(
n
-
n
Hot
))
if
(
i
<
NTY
*
2
and
i
<
2
*
(
n
Hot
-
n
))
r
[
i
]
=
rules
[
2
*
n
+
i
];
r
[
i
]
=
rules
[
2
*
n
+
i
];
}
}
__syncthreads
();
__syncthreads
();
if
(
n
+
threadIdx
.
y
<
nHot
)
{
if
(
n
+
threadIdx
.
y
<
nHot
)
{
u
Int
i
=
r
[
2
*
threadIdx
.
y
]
*
input_stride
;
Int
i
=
r
[
2
*
threadIdx
.
y
]
*
input_stride
;
u
Int
o
=
r
[
2
*
threadIdx
.
y
+
1
]
*
output_stride
;
Int
o
=
r
[
2
*
threadIdx
.
y
+
1
]
*
output_stride
;
for
(
u
Int
plane
=
threadIdx
.
x
;
plane
<
nPlanes
;
plane
+=
NTX
)
{
for
(
Int
plane
=
threadIdx
.
x
;
plane
<
nPlanes
;
plane
+=
NTX
)
{
T
inp
=
input_features
[
i
+
plane
];
T
inp
=
input_features
[
i
+
plane
];
if
(
output_features
[
o
+
plane
]
<
inp
)
if
(
output_features
[
o
+
plane
]
<
inp
)
output_features
[
o
+
plane
]
=
inp
;
output_features
[
o
+
plane
]
=
inp
;
...
@@ -34,30 +34,30 @@ __global__ void MaxPooling_fp(T *input_features, T *output_features,
...
@@ -34,30 +34,30 @@ __global__ void MaxPooling_fp(T *input_features, T *output_features,
}
}
template
<
typename
T
>
template
<
typename
T
>
void
MaxPooling_ForwardPass
(
cudaStream_t
stream
,
T
*
in
put_features
,
void
cuda_
MaxPooling_ForwardPass
(
T
*
input_features
,
T
*
out
put_features
,
T
*
output_features
,
u
Int
nPlanes
,
u
Int
input_stride
,
Int
nPlanes
,
Int
input_stride
,
u
Int
output_stride
,
u
Int
*
rules
,
u
Int
nHot
)
{
Int
output_stride
,
Int
*
rules
,
Int
nHot
)
{
MaxPooling_fp
<
T
,
32
,
32
>
<<
<
32
,
dim3
(
32
,
32
)
,
0
,
stream
>>>
MaxPooling_fp
<
T
,
32
,
32
><<<
32
,
dim3
(
32
,
32
)
>>>
(
(
input_features
,
output_features
,
nPlanes
,
input_stride
,
output_stride
,
input_features
,
output_features
,
nPlanes
,
input_stride
,
output_stride
,
rules
,
nHot
);
rules
,
nHot
);
}
}
template
<
typename
T
,
u
Int
NTX
,
u
Int
NTY
>
template
<
typename
T
,
Int
NTX
,
Int
NTY
>
__global__
void
MaxPooling_bp
(
T
*
input_features
,
T
*
d_input_features
,
__global__
void
MaxPooling_bp
(
T
*
input_features
,
T
*
d_input_features
,
T
*
output_features
,
T
*
d_output_features
,
T
*
output_features
,
T
*
d_output_features
,
u
Int
nPlanes
,
u
Int
input_stride
,
Int
nPlanes
,
Int
input_stride
,
Int
output_stride
,
uInt
output_stride
,
u
Int
*
rules
,
u
Int
nHot
)
{
Int
*
rules
,
Int
nHot
)
{
__shared__
u
Int
r
[
NTY
*
2
];
__shared__
Int
r
[
NTY
*
2
];
for
(
u
Int
n
=
blockIdx
.
x
*
NTY
;
n
<
nHot
;
n
+=
gridDim
.
x
*
NTY
)
{
for
(
Int
n
=
blockIdx
.
x
*
NTY
;
n
<
nHot
;
n
+=
gridDim
.
x
*
NTY
)
{
{
{
u
Int
i
=
threadIdx
.
x
+
NTX
*
threadIdx
.
y
;
Int
i
=
threadIdx
.
x
+
NTX
*
threadIdx
.
y
;
if
(
i
<
NTY
*
2
and
i
<
2
*
(
n
-
n
Hot
))
if
(
i
<
NTY
*
2
and
i
<
2
*
(
n
Hot
-
n
))
r
[
i
]
=
rules
[
2
*
n
+
i
];
r
[
i
]
=
rules
[
2
*
n
+
i
];
}
}
__syncthreads
();
__syncthreads
();
if
(
n
+
threadIdx
.
y
<
nHot
)
{
if
(
n
+
threadIdx
.
y
<
nHot
)
{
u
Int
i
=
r
[
2
*
threadIdx
.
y
]
*
input_stride
;
Int
i
=
r
[
2
*
threadIdx
.
y
]
*
input_stride
;
u
Int
o
=
r
[
2
*
threadIdx
.
y
+
1
]
*
output_stride
;
Int
o
=
r
[
2
*
threadIdx
.
y
+
1
]
*
output_stride
;
for
(
u
Int
plane
=
threadIdx
.
x
;
plane
<
nPlanes
;
plane
+=
NTX
)
for
(
Int
plane
=
threadIdx
.
x
;
plane
<
nPlanes
;
plane
+=
NTX
)
if
(
output_features
[
o
+
plane
]
==
input_features
[
i
+
plane
])
if
(
output_features
[
o
+
plane
]
==
input_features
[
i
+
plane
])
d_input_features
[
i
+
plane
]
+=
d_output_features
[
o
+
plane
];
d_input_features
[
i
+
plane
]
+=
d_output_features
[
o
+
plane
];
}
}
...
@@ -66,13 +66,12 @@ __global__ void MaxPooling_bp(T *input_features, T *d_input_features,
...
@@ -66,13 +66,12 @@ __global__ void MaxPooling_bp(T *input_features, T *d_input_features,
}
}
template
<
typename
T
>
template
<
typename
T
>
void
MaxPooling_BackwardPass
(
cudaStream_t
stream
,
T
*
input_features
,
void
cuda_MaxPooling_BackwardPass
(
T
*
input_features
,
T
*
d_input_features
,
T
*
d_input_features
,
T
*
output_features
,
T
*
output_features
,
T
*
d_output_features
,
T
*
d_output_features
,
uInt
nPlanes
,
Int
nPlanes
,
Int
input_stride
,
uInt
input_stride
,
uInt
output_stride
,
uInt
*
rules
,
Int
output_stride
,
Int
*
rules
,
Int
nHot
)
{
uInt
nHot
)
{
MaxPooling_bp
<
T
,
32
,
32
><<<
32
,
dim3
(
32
,
32
)
>>>
(
MaxPooling_bp
<
T
,
32
,
32
>
<<
<
32
,
dim3
(
32
,
32
),
0
,
stream
>>>
input_features
,
d_input_features
,
output_features
,
d_output_features
,
(
input_features
,
d_input_features
,
output_features
,
d_output_features
,
nPlanes
,
input_stride
,
output_stride
,
rules
,
nHot
);
nPlanes
,
input_stride
,
output_stride
,
rules
,
nHot
);
}
}
#endif
/*
GPU
_MAXPOOLING_H */
#endif
/*
CUDA
_MAXPOOLING_H */
sparseconvnet/SCN/CUDA/NetworkInNetwork.cu
0 → 100644
View file @
2c4ed608
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "Convolution.h"
#include <algorithm>
// NetworkInNetwork (1x1 convolution) forward pass:
//   output = input * weight (+ bias), computed as a single dense GEMM.
// weight is (input_nPlanes, output_nPlanes); input_features is
// (nActive, input_nPlanes); output_features becomes
// (nActive, output_nPlanes).
// Returns the multiply count (nActive * in * out) for FLOP accounting.
template <typename T>
double cuda_NetworkInNetwork_updateOutput(
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features,
    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor bias) {
  auto nActive = input_features.size(0);
  auto input_nPlanes = weight.size(0);
  auto output_nPlanes = weight.size(1);
  // BUG FIX: the output of (nActive x in) * (in x out) has output_nPlanes
  // columns, not input_nPlanes; bias (length output_nPlanes) is broadcast
  // into every row as the additive term of the GEMM.
  output_features.resize_({nActive, output_nPlanes});
  if (bias.numel())
    output_features.copy_(bias);
  else
    output_features.zero_();
  // BUG FIX: addmm() is out-of-place and its result was being discarded;
  // addmm_() accumulates the matrix product into output_features in place.
  output_features.addmm_(input_features, weight);
  return nActive * input_nPlanes * output_nPlanes;
}
// NetworkInNetwork backward pass w.r.t. the input:
//   d_input = d_output * weight^T, computed with a dense GEMM into a
//   freshly sized (nActive, input_nPlanes) gradient tensor.
template <typename T>
void cuda_NetworkInNetwork_updateGradInput(
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features,
    /*cuda float*/ at::Tensor weight) {
  auto rowCount = (int)d_output_features.size(0);
  d_input_features.resize_({rowCount, weight.size(0)});
  d_input_features.zero_();
  auto weightT = weight.t();
  at::mm_out(d_input_features, d_output_features, weightT);
}
// NetworkInNetwork parameter gradients:
//   d_weight = input^T * d_output;  d_bias = column-sums of d_output.
template <typename T>
void cuda_NetworkInNetwork_accGradParameters(
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_output_features,
    /*cuda float*/ at::Tensor d_weight, /*cuda float*/ at::Tensor d_bias) {
  auto nActive = input_features.size(0);
  // Only reduce the bias gradient when there are active rows and a bias.
  if (nActive and d_bias.numel())
    at::sum_out(d_bias, d_output_features, {0}, false);
  // NOTE(review): despite the "acc" name, mm_out / sum_out OVERWRITE
  // d_weight / d_bias rather than accumulating into them — confirm callers
  // expect overwrite semantics.
  at::mm_out(d_weight, input_features.t(), d_output_features);
}
sparseconvnet/SCN/
generic/GPU
/RuleBookIterator.h
→
sparseconvnet/SCN/
CUDA
/RuleBookIterator.h
View file @
2c4ed608
...
@@ -4,32 +4,31 @@
...
@@ -4,32 +4,31 @@
// This source code is licensed under the license found in the
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
// LICENSE file in the root directory of this source tree.
#ifndef
GPU
_RULEBOOKITERATOR_H
#ifndef
CUDA
_RULEBOOKITERATOR_H
#define
GPU
_RULEBOOKITERATOR_H
#define
CUDA
_RULEBOOKITERATOR_H
// Macro to parallelize loading rulebook elements to
GPU
memory and operating
// Macro to parallelize loading rulebook elements to
CUDA
memory and operating
// on the elements of the rulebook.
// on the elements of the rulebook.
// X is the function to apply.
// X is the function to apply.
// Y is a command to run
// Y is a command to run
#define RULEBOOKITERATOR(X, Y) \
#define RULEBOOKITERATOR(X, Y) \
uInt ms = ruleBookMaxSize(_rules); \
{ \
auto rulesBuffer = THCITensor_(new)(state); \
Int rbMaxSize = 0; \
if (THCITensor_(nElement)(state, rulesBuffer) < ms) \
for (auto &r : _rules) \
THCITensor_(resize1d)(state, rulesBuffer, ms); \
rbMaxSize = std::max(rbMaxSize, (Int)r.size()); \
uInt *rbB = (uInt *)THCITensor_(data)(state, rulesBuffer); \
at::Tensor rulesBuffer = at::CUDA(at_kINT).tensor({rbMaxSize}); \
for (int k = 0; k < _rules.size(); ++k) { \
Int *rbB = rulesBuffer.data<Int>(); \
auto &r = _rules[k]; \
for (int k = 0; k < _rules.size(); ++k) { \
uInt nHotB = r.size() / 2; \
auto &r = _rules[k]; \
if (nHotB) { \
Int nHotB = r.size() / 2; \
cudaMemcpy(rbB, &r[0], sizeof(uInt) * 2 * nHotB, \
if (nHotB) { \
cudaMemcpyHostToDevice); \
cudaMemcpy(rbB, &r[0], sizeof(Int) * 2 * nHotB, \
cudaMemcpyHostToDevice); \
X \
} \
Y \
} \
} \
if (nHotB) { \
}
X \
} \
Y \
} \
THCITensor_(free)(state, rulesBuffer);
#endif
/*
GPU
_RULEBOOKITERATOR_H */
#endif
/*
CUDA
_RULEBOOKITERATOR_H */
sparseconvnet/SCN/CUDA/SparseToDense.cu
0 → 100644
View file @
2c4ed608
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "SparseToDense.h"
// Converts a sparse feature tensor into a dense NCHW-style tensor of shape
// (batch, nPlanes, spatial dims...). Inactive sites stay zero.
template <typename T, Int Dimension>
void cuda_SparseToDense_updateOutput(/*long*/ at::Tensor inputSize,
                                     Metadata<Dimension> &m,
                                     /*cuda float*/ at::Tensor input_features,
                                     /*cuda float*/ at::Tensor output_features,
                                     long nPlanes) {
  {
    // Assemble the dense output shape: batch, planes, then spatial extents.
    std::array<long, Dimension + 2> sz;
    sz[0] = m.grids.begin()->second.size(); // batch size
    sz[1] = nPlanes;
    long *in_sz = inputSize.data<long>();
    for (Int i = 0; i < Dimension; ++i)
      sz[i + 2] = in_sz[i];
    output_features.resize_(sz);
    output_features.zero_();
  }
  // Only scatter when there are features in the flat (rows, planes) layout.
  if (input_features.ndimension() == 2) {
    auto _rules = m.getSparseToDenseRuleBook(inputSize, true);
    Int _nPlanes = input_features.size(1);
    auto iF = input_features.data<T>();
    auto oF = output_features.data<T>();
    long spatialVolume = inputSize.prod().data<long>()[0];
    // One rulebook chunk per batch sample; after each chunk advance oF by
    // one dense sample (the macro's second argument runs between chunks).
    RULEBOOKITERATOR(SparseToDense_ForwardPass<T>(iF, oF, _nPlanes,
                                                  spatialVolume, rbB, nHotB);
                     , oF += _nPlanes * spatialVolume;)
  }
}
// Backward of SparseToDense: gathers gradients from the dense layout back
// into the sparse (rows, planes) gradient tensor.
template <typename T, Int Dimension>
void cuda_SparseToDense_updateGradInput(/*long*/ at::Tensor inputSize,
                                        Metadata<Dimension> &m,
                                        /*cuda float*/ at::Tensor input_features,
                                        /*cuda float*/ at::Tensor d_input_features,
                                        /*cuda float*/ at::Tensor d_output_features) {
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();
  // Only gather when the forward input was the flat (rows, planes) layout.
  if (input_features.ndimension() == 2) {
    auto _rules = m.getSparseToDenseRuleBook(inputSize, true);
    long spatialVolume = inputSize.prod().data<long>()[0];
    Int _nPlanes = d_input_features.size(1);
    auto diF = d_input_features.data<T>();
    auto doF = d_output_features.data<T>();
    // One rulebook chunk per batch sample; advance doF past one dense
    // sample between chunks (macro's second argument).
    RULEBOOKITERATOR(SparseToDense_BackwardPass<T>(diF, doF, _nPlanes,
                                                   spatialVolume, rbB, nHotB);
                     , doF += _nPlanes * spatialVolume;)
  }
}
sparseconvnet/SCN/
generic/GPU
/SparseToDense.h
→
sparseconvnet/SCN/
CUDA
/SparseToDense.h
View file @
2c4ed608
...
@@ -4,28 +4,27 @@
...
@@ -4,28 +4,27 @@
// This source code is licensed under the license found in the
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
// LICENSE file in the root directory of this source tree.
#ifndef GPU_SPARSETODENSE_H
#ifndef CUDA_SPARSETODENSE_H
#define GPU_SPARSETODENSE_H
#define CUDA_SPARSETODENSE_H
#include "../SparseConvNet.h"
//#include <THC/THCAtomics.cuh>
// NTX must be >=2 so r is filled properly
// NTX must be >=2 so r is filled properly
template
<
typename
T
,
u
Int
NTX
,
u
Int
NTY
>
template
<
typename
T
,
Int
NTX
,
Int
NTY
>
__global__
void
SparseToDense_fp
(
T
*
input_features
,
T
*
output_features
,
__global__
void
SparseToDense_fp
(
T
*
input_features
,
T
*
output_features
,
u
Int
nPlanes
,
u
Int
spatialVolume
,
u
Int
*
rules
,
Int
nPlanes
,
Int
spatialVolume
,
Int
*
rules
,
u
Int
nHot
)
{
Int
nHot
)
{
__shared__
u
Int
r
[
NTY
*
2
];
__shared__
Int
r
[
NTY
*
2
];
for
(
u
Int
n
=
blockIdx
.
x
*
NTY
;
n
<
nHot
;
n
+=
gridDim
.
x
*
NTY
)
{
for
(
Int
n
=
blockIdx
.
x
*
NTY
;
n
<
nHot
;
n
+=
gridDim
.
x
*
NTY
)
{
{
{
u
Int
i
=
threadIdx
.
x
+
NTX
*
threadIdx
.
y
;
Int
i
=
threadIdx
.
x
+
NTX
*
threadIdx
.
y
;
if
(
i
<
NTY
*
2
and
i
<
2
*
(
n
-
n
Hot
))
if
(
i
<
NTY
*
2
and
i
<
2
*
(
n
Hot
-
n
))
r
[
i
]
=
rules
[
2
*
n
+
i
];
r
[
i
]
=
rules
[
2
*
n
+
i
];
}
}
__syncthreads
();
__syncthreads
();
if
(
n
+
threadIdx
.
y
<
nHot
)
{
if
(
n
+
threadIdx
.
y
<
nHot
)
{
T
*
i
=
input_features
+
r
[
2
*
threadIdx
.
y
]
*
nPlanes
;
T
*
i
=
input_features
+
r
[
2
*
threadIdx
.
y
]
*
nPlanes
;
T
*
o
=
output_features
+
r
[
2
*
threadIdx
.
y
+
1
];
T
*
o
=
output_features
+
r
[
2
*
threadIdx
.
y
+
1
];
for
(
u
Int
plane
=
threadIdx
.
x
;
plane
<
nPlanes
;
plane
+=
NTX
)
for
(
Int
plane
=
threadIdx
.
x
;
plane
<
nPlanes
;
plane
+=
NTX
)
o
[
plane
*
spatialVolume
]
=
i
[
plane
];
o
[
plane
*
spatialVolume
]
=
i
[
plane
];
}
}
__syncthreads
();
__syncthreads
();
...
@@ -33,29 +32,29 @@ __global__ void SparseToDense_fp(T *input_features, T *output_features,
...
@@ -33,29 +32,29 @@ __global__ void SparseToDense_fp(T *input_features, T *output_features,
}
}
template
<
typename
T
>
template
<
typename
T
>
void
SparseToDense_ForwardPass
(
cudaStream_t
stream
,
T
*
in
put_features
,
void
SparseToDense_ForwardPass
(
T
*
input_features
,
T
*
out
put_features
,
T
*
output_features
,
u
Int
nPlan
es
,
Int
nPlanes
,
Int
spatialVolume
,
Int
*
rul
es
,
uInt
spatialVolume
,
uInt
*
rules
,
u
Int
nHot
)
{
Int
nHot
)
{
SparseToDense_fp
<
T
,
32
,
32
>
<<
<
32
,
dim3
(
32
,
32
)
,
0
,
stream
>>>
SparseToDense_fp
<
T
,
32
,
32
><<<
32
,
dim3
(
32
,
32
)
>>>
(
(
input_features
,
output_features
,
nPlanes
,
spatialVolume
,
rules
,
nHot
);
input_features
,
output_features
,
nPlanes
,
spatialVolume
,
rules
,
nHot
);
}
}
// Backward pass for SparseToDense: copy gradients from the dense output
// tensor back into the rows of the sparse input-feature matrix.
//
// rules holds 2*nHot Ints, laid out as nHot (sparseRow, denseOffset) pairs:
// r[2k] indexes a row of d_input_features (row-major, nPlanes wide) and
// r[2k+1] is the base offset of that site in d_output_features, whose planes
// are spatialVolume elements apart (dense planes-major layout — established
// by the d_o[plane * spatialVolume] read below).
//
// Launch shape: blocks of dim3(NTX, NTY); each y-row of NTX threads handles
// one rule, NTY rules per block per grid-stride iteration.
// NTX must be >=2 so r is filled properly (the first NTY*2 threads of the
// block each write one shared slot).
template <typename T, Int NTX, Int NTY>
__global__ void SparseToDense_bp(T *d_input_features, T *d_output_features,
                                 Int nPlanes, Int spatialVolume, Int *rules,
                                 Int nHot) {
  __shared__ Int r[NTY * 2];
  for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
    {
      // Stage the next batch of rule pairs into shared memory.
      Int i = threadIdx.x + NTX * threadIdx.y;
      // Second clause guards the final, partial batch: only 2*(nHot - n)
      // rule entries remain.  (The pre-fix code had 2*(n - nHot) here,
      // which never loads anything — this is the corrected form.)
      if (i < NTY * 2 and i < 2 * (nHot - n))
        r[i] = rules[2 * n + i];
    }
    __syncthreads();
    if (n + threadIdx.y < nHot) {
      T *d_i = d_input_features + r[2 * threadIdx.y] * nPlanes;
      T *d_o = d_output_features + r[2 * threadIdx.y + 1];
      // Gather one gradient row: sparse side is contiguous over planes,
      // dense side strides by spatialVolume per plane.
      for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX)
        d_i[plane] = d_o[plane * spatialVolume];
    }
    // Barrier before the next iteration overwrites r[].
    __syncthreads();
  }
}
// Host-side launcher for SparseToDense_bp: scatters gradients from a dense
// tensor (plane stride spatialVolume) back into nHot sparse feature rows
// (row stride nPlanes), following the (sparseRow, denseOffset) pairs in
// rules (2*nHot Ints, device pointer).
// Launches on the default stream (this commit drops the explicit
// cudaStream_t parameter of the THNN version).
template <typename T>
void SparseToDense_BackwardPass(T *d_input_features, T *d_output_features,
                                Int nPlanes, Int spatialVolume, Int *rules,
                                Int nHot) {
  // Fixed 32 x dim3(32, 32) launch; the kernel's stride loop over n
  // covers any nHot.
  SparseToDense_bp<T, 32, 32><<<32, dim3(32, 32)>>>(
      d_input_features, d_output_features, nPlanes, spatialVolume, rules,
      nHot);
}
#endif
/*
GPU
_SPARSETODENSE_H */
#endif
/*
CUDA
_SPARSETODENSE_H */
sparseconvnet/SCN/CUDA/UnPooling.cu
0 → 100644
View file @
2c4ed608
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "RuleBookIterator.h"
#include "UnPooling.h"
// Forward pass of the UnPooling layer: expand each input site's features
// into the (larger) set of output sites it unpools to, as described by the
// rulebook obtained from the layer's Metadata.
// inputSize/outputSize/poolSize/poolStride are long tensors describing the
// spatial geometry; input_features/output_features are CUDA float tensors.
// The first nFeaturesToDrop feature planes of the input are skipped.
template <typename T, Int Dimension>
void cuda_UnPooling_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor poolSize, /*long*/ at::Tensor poolStride,
    Metadata<Dimension> &m, /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features, long nFeaturesToDrop) {
  // Number of feature planes actually copied (input width minus the
  // dropped leading planes).
  Int nPlanes = input_features.size(1) - nFeaturesToDrop;
  // NOTE(review): sizes are passed as (outputSize, inputSize) — presumably
  // unpooling reuses the pooling rulebook in reverse; confirm against
  // Metadata::getRuleBook.
  auto _rules =
      m.getRuleBook(outputSize, inputSize, poolSize, poolStride, true);
  Int nActive = m.getNActive(outputSize);
  // One output row per active output site, same (reduced) feature width.
  output_features.resize_({nActive, input_features.size(1) -
                                        nFeaturesToDrop});
  // Zero first: the kernel accumulates with +=.
  output_features.zero_();
  // Skip the dropped planes by offsetting the input data pointer.
  auto iF = input_features.data<T>() + nFeaturesToDrop;
  auto oF = output_features.data<T>();
  // RULEBOOKITERATOR (RuleBookIterator.h) runs the given statement once per
  // rulebook chunk, binding rbB (device rules pointer) and nHotB (rule
  // count) inside the macro body.
  RULEBOOKITERATOR(cuda_UnPooling_ForwardPass<T>(
                       iF, oF, nPlanes, input_features.size(1),
                       output_features.size(1), rbB, nHotB);
                   , )
}
// Backward pass of the UnPooling layer: accumulate output-side gradients
// back onto each input site, using the same rulebook as the forward pass.
// d_input_features is resized to match input_features and zeroed before
// accumulation; the first nFeaturesToDrop planes receive no gradient.
template <typename T, Int Dimension>
void cuda_UnPooling_updateGradInput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor poolSize, /*long*/ at::Tensor poolStride,
    Metadata<Dimension> &m, /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features, long nFeaturesToDrop) {
  // Planes that actually carry gradient (input width minus dropped planes).
  Int nPlanes = input_features.size(1) - nFeaturesToDrop;
  // Same (outputSize, inputSize) rulebook as the forward pass.
  auto _rules =
      m.getRuleBook(outputSize, inputSize, poolSize, poolStride, true);
  d_input_features.resize_as_(input_features);
  // Zero first: the kernel accumulates with +=.
  d_input_features.zero_();
  // Offset past the dropped planes so gradient lands on the copied ones.
  auto diF = d_input_features.data<T>() + nFeaturesToDrop;
  auto doF = d_output_features.data<T>();
  // rbB / nHotB are bound per rulebook chunk by the macro.
  RULEBOOKITERATOR(cuda_UnPooling_BackwardPass<T>(
                       diF, doF, nPlanes, input_features.size(1),
                       d_output_features.size(1), rbB, nHotB);
                   , )
}
sparseconvnet/SCN/CUDA/UnPooling.h
0 → 100644
View file @
2c4ed608
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CUDA_UNPOOLING_H
#define CUDA_UNPOOLING_H
// NTX must be >=2 so r is filled properly
// UnPooling forward kernel: for each rule pair (outputRow, inputRow),
// add the input row's first nPlanes features into the output row.
// rules holds 2*nHot Ints; r[2k] indexes the output row and r[2k+1] the
// input row (see the i/o assignments below).  Rows are addressed with
// independent strides so caller can pass feature matrices of different
// widths.  Launch with dim3(NTX, NTY) threads per block: each y-row of
// NTX threads handles one rule, NTY rules per grid-stride iteration.
template <typename T, Int NTX, Int NTY>
__global__ void UnPooling_fp(T *input_features, T *output_features,
                             Int nPlanes, Int input_stride,
                             Int output_stride, Int *rules, Int nHot) {
  // Shared staging buffer for the NTY rule pairs of this iteration.
  __shared__ Int r[NTY * 2];
  for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
    {
      Int i = threadIdx.x + NTX * threadIdx.y;
      // Second clause guards the final, partial batch of rules.
      if (i < NTY * 2 and i < 2 * (nHot - n))
        r[i] = rules[2 * n + i];
    }
    __syncthreads();
    if (n + threadIdx.y < nHot) {
      // i = base of the source (input) row, o = base of the target
      // (output) row for this thread-row's rule.
      Int i = r[2 * threadIdx.y + 1] * input_stride;
      Int o = r[2 * threadIdx.y] * output_stride;
      for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX)
        output_features[o + plane] += input_features[i + plane];
    }
    // Barrier before the next iteration overwrites r[].
    __syncthreads();
  }
}
// Launch UnPooling_fp over one rulebook chunk: accumulates each input
// feature row into the output rows named by the nHot rule pairs.
template <typename T>
void cuda_UnPooling_ForwardPass(T *input_features, T *output_features,
                                Int nPlanes, Int input_stride,
                                Int output_stride, Int *rules, Int nHot) {
  const dim3 threads(32, 32); // NTX x NTY thread tile, matching <T, 32, 32>
  UnPooling_fp<T, 32, 32>
      <<<32, threads>>>(input_features, output_features, nPlanes,
                        input_stride, output_stride, rules, nHot);
}
// UnPooling backward kernel: mirror of UnPooling_fp — for each rule pair
// (outputRow, inputRow), add the output row's gradient into the input
// row's gradient.  Same shared-memory staging scheme and launch contract
// as UnPooling_fp (dim3(NTX, NTY) threads; NTX must be >=2 so r is
// filled properly).
template <typename T, Int NTX, Int NTY>
__global__ void UnPooling_bp(T *d_input_features, T *d_output_features,
                             Int nPlanes, Int input_stride,
                             Int output_stride, Int *rules, Int nHot) {
  // Shared staging buffer for the NTY rule pairs of this iteration.
  __shared__ Int r[NTY * 2];
  for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
    {
      Int i = threadIdx.x + NTX * threadIdx.y;
      // Second clause guards the final, partial batch of rules.
      if (i < NTY * 2 and i < 2 * (nHot - n))
        r[i] = rules[2 * n + i];
    }
    __syncthreads();
    if (n + threadIdx.y < nHot) {
      // Gradient flows in reverse: read from the output-gradient row,
      // accumulate into the input-gradient row.
      Int i = r[2 * threadIdx.y + 1] * input_stride;
      Int o = r[2 * threadIdx.y] * output_stride;
      for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX)
        d_input_features[i + plane] += d_output_features[o + plane];
    }
    // Barrier before the next iteration overwrites r[].
    __syncthreads();
  }
}
// Launch UnPooling_bp over one rulebook chunk: accumulates output-side
// gradients back into the input-gradient rows named by the nHot rule pairs.
template <typename T>
void cuda_UnPooling_BackwardPass(T *d_input_features, T *d_output_features,
                                 Int nPlanes, Int input_stride,
                                 Int output_stride, Int *rules, Int nHot) {
  const dim3 threads(32, 32); // NTX x NTY thread tile, matching <T, 32, 32>
  UnPooling_bp<T, 32, 32>
      <<<32, threads>>>(d_input_features, d_output_features, nPlanes,
                        input_stride, output_stride, rules, nHot);
}
#endif
/* CUDA_UNPOOLING_H */
Prev
1
2
3
4
5
6
7
8
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment