Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
SparseConvNet
Commits
de3743f6
Commit
de3743f6
authored
Jul 13, 2018
by
Benjamin Thomas Graham
Browse files
Factor out CUDA code
parent
f0407b36
Changes
96
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
2180 additions
and
2218 deletions
+2180
-2218
sparseconvnet/SCN/CUDA/BatchwiseMultiplicativeDropout.h
sparseconvnet/SCN/CUDA/BatchwiseMultiplicativeDropout.h
+0
-53
sparseconvnet/SCN/CUDA/Convolution.cpp
sparseconvnet/SCN/CUDA/Convolution.cpp
+289
-0
sparseconvnet/SCN/CUDA/Convolution.cu
sparseconvnet/SCN/CUDA/Convolution.cu
+603
-269
sparseconvnet/SCN/CUDA/Convolution.h
sparseconvnet/SCN/CUDA/Convolution.h
+0
-632
sparseconvnet/SCN/CUDA/Deconvolution.cpp
sparseconvnet/SCN/CUDA/Deconvolution.cpp
+84
-0
sparseconvnet/SCN/CUDA/Deconvolution.cu
sparseconvnet/SCN/CUDA/Deconvolution.cu
+589
-69
sparseconvnet/SCN/CUDA/Deconvolution.h
sparseconvnet/SCN/CUDA/Deconvolution.h
+0
-599
sparseconvnet/SCN/CUDA/IOLayers.cpp
sparseconvnet/SCN/CUDA/IOLayers.cpp
+226
-0
sparseconvnet/SCN/CUDA/IOLayers.cu
sparseconvnet/SCN/CUDA/IOLayers.cu
+54
-228
sparseconvnet/SCN/CUDA/IOLayers.h
sparseconvnet/SCN/CUDA/IOLayers.h
+0
-43
sparseconvnet/SCN/CUDA/LeakyReLU.cpp
sparseconvnet/SCN/CUDA/LeakyReLU.cpp
+32
-0
sparseconvnet/SCN/CUDA/LeakyReLU.cu
sparseconvnet/SCN/CUDA/LeakyReLU.cu
+21
-19
sparseconvnet/SCN/CUDA/LeakyReLU.h
sparseconvnet/SCN/CUDA/LeakyReLU.h
+0
-25
sparseconvnet/SCN/CUDA/MaxPooling.cpp
sparseconvnet/SCN/CUDA/MaxPooling.cpp
+104
-0
sparseconvnet/SCN/CUDA/MaxPooling.cu
sparseconvnet/SCN/CUDA/MaxPooling.cu
+61
-87
sparseconvnet/SCN/CUDA/MaxPooling.h
sparseconvnet/SCN/CUDA/MaxPooling.h
+0
-77
sparseconvnet/SCN/CUDA/NetworkInNetwork.cpp
sparseconvnet/SCN/CUDA/NetworkInNetwork.cpp
+0
-2
sparseconvnet/SCN/CUDA/SparseToDense.cpp
sparseconvnet/SCN/CUDA/SparseToDense.cpp
+60
-0
sparseconvnet/SCN/CUDA/SparseToDense.cu
sparseconvnet/SCN/CUDA/SparseToDense.cu
+57
-44
sparseconvnet/SCN/CUDA/SparseToDense.h
sparseconvnet/SCN/CUDA/SparseToDense.h
+0
-71
No files found.
sparseconvnet/SCN/CUDA/BatchwiseMultiplicativeDropout.h
deleted
100644 → 0
View file @
f0407b36
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CUDA_BATCHWISEMULTIPLICATIVEDROPOUT_H
#define CUDA_BATCHWISEMULTIPLICATIVEDROPOUT_H
// Forward pass of batchwise multiplicative dropout, fused with a
// leaky-ReLU-style gate:
//   output[o] = input[i] * noise[plane] * (input[i] > 0 ? 1 : alpha)
// `noise` holds one multiplier per feature plane (indexed by `plane`), so the
// same factor is applied to every active site of that plane — hence
// "batchwise". Threads are laid out as (NTX planes) x (NTY rows); both the
// plane loop and the row loop are grid-stride loops.
template <typename T, Int NTX, Int NTY>
__global__ void
BatchwiseMultiplicativeDropout_fp(T *input_features, T *output_features,
                                  T *noise, Int nActive, Int nPlanes,
                                  Int input_stride, Int output_stride,
                                  T alpha) {
  // Stage this block's NTX-wide slice of `noise` in shared memory; loaded
  // once per plane-band by the threadIdx.y == 0 threads.
  __shared__ T nz[NTX];
  for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
       plane += gridDim.x * NTX) {
    if (threadIdx.y == 0)
      nz[threadIdx.x] = noise[plane];
    __syncthreads(); // nz[] must be populated before any thread reads it
    for (Int row = threadIdx.y + blockIdx.y * NTY; row < nActive;
         row += gridDim.y * NTY) {
      Int i = row * input_stride + plane;  // offset into input_features
      Int o = row * output_stride + plane; // offset into output_features
      output_features[o] = input_features[i] * nz[threadIdx.x] *
                           ((input_features[i] > 0) ? 1 : alpha);
    }
    // All threads must finish with nz[] before the next iteration
    // overwrites it with the next plane-band's noise values.
    __syncthreads();
  }
}
// Backward pass of batchwise multiplicative dropout. Mirrors the forward
// kernel: the gradient is scaled by the same per-plane noise factor and the
// same (input > 0 ? 1 : alpha) gate evaluated on the saved forward input:
//   d_input[i] = d_output[o] * noise[plane] * (input[i] > 0 ? 1 : alpha)
// Thread layout and loop structure are identical to
// BatchwiseMultiplicativeDropout_fp above.
template <typename T, Int NTX, Int NTY>
__global__ void
BatchwiseMultiplicativeDropout_bp(T *input_features, T *d_input_features,
                                  T *d_output_features, T *noise, Int nActive,
                                  Int nPlanes, Int input_stride,
                                  Int output_stride, T alpha) {
  // Per-plane noise factors staged in shared memory for this plane-band.
  __shared__ T nz[NTX];
  for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
       plane += gridDim.x * NTX) {
    if (threadIdx.y == 0)
      nz[threadIdx.x] = noise[plane];
    __syncthreads(); // nz[] ready before use
    for (Int row = threadIdx.y + blockIdx.y * NTY; row < nActive;
         row += gridDim.y * NTY) {
      Int i = row * input_stride + plane;  // offset into (d_)input_features
      Int o = row * output_stride + plane; // offset into d_output_features
      d_input_features[i] = d_output_features[o] * nz[threadIdx.x] *
                            ((input_features[i] > 0) ? 1 : alpha);
    }
    // Protect nz[] from being overwritten while still in use.
    __syncthreads();
  }
}
#endif
/* CUDA_BATCHWISEMULTIPLICATIVEDROPOUT_H */
sparseconvnet/SCN/CUDA/Convolution.cpp
0 → 100644
View file @
de3743f6
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
// Forward declarations of the CUDA launchers implemented in Convolution.cu
// (this .cpp/.cu split is the point of the "Factor out CUDA code" change).
// `Int` and `RuleBook` are project types declared elsewhere.

// Broadcast the bias vector `b` (nPlanes entries) into each of the
// nActiveOut rows of the output-feature matrix `of`.
template <typename T>
void Convolution_fp_bias(T *of, T *b, Int nPlanes, Int nActiveOut);

// Bias gradient: reduce `matrix` column-wise into `target`.
// NOTE(review): the exact meaning of nRows / nColumns / nCOLUMNS (nCOLUMNS is
// presumably the row stride) is defined by the kernel in Convolution.cu —
// verify there.
template <typename T>
void Convolution_bp_bias(T *matrix, T *target, Int nRows, Int nColumns,
                         Int nCOLUMNS);

// Rulebook-driven sparse convolution forward pass. Returns a double —
// presumably the flop count performed (the callers below return it as the
// function result); confirm in Convolution.cu.
template <typename T>
double dConvolution_forward2(T *inFeatures, T *outFeatures, T *w,
                             RuleBook _rules, Int input_nPlanes,
                             Int input_stride, Int output_nPlanes,
                             Int output_stride);

// Rulebook-driven backward pass: accumulates gradients w.r.t. the input into
// dInFeatures and w.r.t. the weights into dw.
template <typename T>
void dConvolution_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
                               T *w, T *dw, RuleBook _rules, Int input_nPlanes,
                               Int input_stride, Int output_nPlanes,
                               Int output_stride);
// Host-side driver for the strided sparse convolution forward pass.
// Builds (or retrieves) the rulebook for this input/output geometry, sizes
// the output tensor, seeds it with the bias (or zeros), and hands the real
// work to the CUDA launcher dConvolution_forward2.
// Returns the double reported by dConvolution_forward2 (0 when there are no
// active output sites).
template <typename T, Int Dimension>
double cuda_Convolution_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize, /*long*/ at::Tensor filterStride,
    Metadata<Dimension> &m, /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features,
    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor bias) {
  auto rules =
      m.getRuleBook(inputSize, outputSize, filterSize, filterStride, true);
  Int nActiveOut = m.getNActive(outputSize);
  if (!nActiveOut)
    return 0; // nothing to compute for an empty output
  // weight is (filterVolume?, inPlanes, outPlanes); dims 1 and 2 give the
  // per-site matrix shape.
  Int inPlanes = weight.size(1);
  Int outPlanes = weight.size(2);
  output_features.resize_({nActiveOut, outPlanes});
  auto inPtr = input_features.data<T>();
  auto outPtr = output_features.data<T>();
  auto wPtr = weight.data<T>();
  // Initialize the output: broadcast bias rows when a bias is present,
  // otherwise clear to zero so the convolution can accumulate into it.
  if (bias.numel())
    Convolution_fp_bias(outPtr, bias.data<T>(), outPlanes, nActiveOut);
  else
    output_features.zero_();
  return dConvolution_forward2<T>(inPtr, outPtr, wPtr, rules, inPlanes,
                                  inPlanes, outPlanes, outPlanes);
}
// Host-side driver for the strided sparse convolution backward pass.
// Rebuilds the same rulebook as the forward pass, resizes and zeroes the
// input-gradient tensor, then launches dConvolution_backward_dW2 (input and
// weight gradients) and, if a bias gradient tensor is provided, the bias
// reduction Convolution_bp_bias.
template <typename T, Int Dimension>
void cuda_Convolution_backward(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize, /*long*/ at::Tensor filterStride,
    Metadata<Dimension> &m, /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features,
    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor d_weight,
    /*cuda float*/ at::Tensor d_bias) {
  auto _rules =
      m.getRuleBook(inputSize, outputSize, filterSize, filterStride, true);
  Int nActiveIn = m.getNActive(inputSize);
  Int nActiveOut = m.getNActive(outputSize);
  if (nActiveOut) {
    Int ip = weight.size(1); // input planes
    Int op = weight.size(2); // output planes
    // Gradients accumulate, so the buffer must start zeroed.
    d_input_features.resize_({nActiveIn, ip});
    d_input_features.zero_();
    auto iF = input_features.data<T>();
    auto diF = d_input_features.data<T>();
    auto doF = d_output_features.data<T>();
    auto w = weight.data<T>();
    auto dw = d_weight.data<T>();
    dConvolution_backward_dW2<T>(iF, diF, doF, w, dw, _rules, ip, ip, op, op);
    if (d_bias.numel()) {
      auto db = d_bias.data<T>();
      // NOTE(review): per the declaration the parameters are
      // (matrix, target, nRows, nColumns, nCOLUMNS); passing (op, op,
      // nActiveOut) puts nActiveOut in the nCOLUMNS slot even though doF is
      // nActiveOut rows x op columns — verify against the kernel in
      // Convolution.cu.
      Convolution_bp_bias(doF, db, op, op, nActiveOut);
    }
  }
}
// Host-side driver for the submanifold convolution forward pass.
// Identical in structure to cuda_Convolution_updateOutput, except that the
// rulebook is the submanifold one (output sites coincide with input sites,
// so nActive is read from inputSize and there is no filterStride).
// Returns the double from dConvolution_forward2 (0 when no active sites).
template <typename T, Int Dimension>
double cuda_SubmanifoldConvolution_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor filterSize,
    Metadata<Dimension> &m, /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features,
    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor bias) {
  auto _rules = m.getSubmanifoldRuleBook(inputSize, filterSize, true);
  Int nActive = m.getNActive(inputSize);
  if (nActive) {
    Int ip = weight.size(1); // input planes
    Int op = weight.size(2); // output planes
    output_features.resize_({nActive, op});
    auto iF = input_features.data<T>();
    auto oF = output_features.data<T>();
    auto w = weight.data<T>();
    // Seed the output with the bias (or zeros) before accumulating.
    if (bias.numel())
      Convolution_fp_bias(oF, bias.data<T>(), op, nActive);
    else
      output_features.zero_();
    return dConvolution_forward2<T>(iF, oF, w, _rules, ip, ip, op, op);
  } else {
    return 0;
  }
}
// Host-side driver for the submanifold convolution backward pass.
// Same structure as cuda_Convolution_backward, using the submanifold
// rulebook; input and output share the same active-site count (nActive).
template <typename T, Int Dimension>
void cuda_SubmanifoldConvolution_backward(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor filterSize,
    Metadata<Dimension> &m, /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features,
    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor d_weight,
    /*cuda float*/ at::Tensor d_bias) {
  auto _rules = m.getSubmanifoldRuleBook(inputSize, filterSize, true);
  Int nActive = m.getNActive(inputSize);
  if (nActive) {
    Int ip = weight.size(1); // input planes
    Int op = weight.size(2); // output planes
    // Input gradients accumulate; start from zero.
    d_input_features.resize_({nActive, ip});
    d_input_features.zero_();
    auto iF = input_features.data<T>();
    auto diF = d_input_features.data<T>();
    auto doF = d_output_features.data<T>();
    auto w = weight.data<T>();
    auto dw = d_weight.data<T>();
    dConvolution_backward_dW2<T>(iF, diF, doF, w, dw, _rules, ip, ip, op, op);
    if (d_bias.numel()) {
      auto db = d_bias.data<T>();
      // NOTE(review): see cuda_Convolution_backward — argument order for
      // Convolution_bp_bias (op, op, nActive) looks suspicious; verify
      // against the kernel in Convolution.cu.
      Convolution_bp_bias(doF, db, op, op, nActive);
    }
  }
}
// Host-side driver for the "full convolution" forward pass, which maps
// between two metadata objects: input sites live in mIn, output sites are
// generated into mOut by getFullConvolutionRuleBook. Otherwise identical in
// structure to cuda_Convolution_updateOutput.
// Returns the double from dConvolution_forward2 (0 when no active outputs).
template <typename T, Int Dimension>
double cuda_FullConvolution_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize, /*long*/ at::Tensor filterStride,
    Metadata<Dimension> &mIn, Metadata<Dimension> &mOut,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features,
    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor bias) {
  auto _rules = mIn.getFullConvolutionRuleBook(inputSize, outputSize,
                                               filterSize, filterStride, mOut);
  Int nActiveOut = mOut.getNActive(outputSize);
  if (nActiveOut) {
    Int ip = weight.size(1); // input planes
    Int op = weight.size(2); // output planes
    output_features.resize_({nActiveOut, op});
    auto iF = input_features.data<T>();
    auto oF = output_features.data<T>();
    auto w = weight.data<T>();
    // Seed the output with the bias (or zeros) before accumulating.
    if (bias.numel())
      Convolution_fp_bias(oF, bias.data<T>(), op, nActiveOut);
    else
      output_features.zero_();
    return dConvolution_forward2<T>(iF, oF, w, _rules, ip, ip, op, op);
  } else {
    return 0;
  }
}
// Host-side driver for the full convolution backward pass. Active-site
// counts come from the two metadata objects: nActiveIn from mIn (input side)
// and nActiveOut from mOut (output side). Otherwise identical in structure
// to cuda_Convolution_backward.
template <typename T, Int Dimension>
void cuda_FullConvolution_backward(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize, /*long*/ at::Tensor filterStride,
    Metadata<Dimension> &mIn, Metadata<Dimension> &mOut,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features,
    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor d_weight,
    /*cuda float*/ at::Tensor d_bias) {
  auto _rules = mIn.getFullConvolutionRuleBook(inputSize, outputSize,
                                               filterSize, filterStride, mOut);
  Int nActiveIn = mIn.getNActive(inputSize);
  Int nActiveOut = mOut.getNActive(outputSize);
  if (nActiveOut) {
    Int ip = weight.size(1); // input planes
    Int op = weight.size(2); // output planes
    // Input gradients accumulate; start from zero.
    d_input_features.resize_({nActiveIn, ip});
    d_input_features.zero_();
    auto iF = input_features.data<T>();
    auto diF = d_input_features.data<T>();
    auto doF = d_output_features.data<T>();
    auto w = weight.data<T>();
    auto dw = d_weight.data<T>();
    dConvolution_backward_dW2<T>(iF, diF, doF, w, dw, _rules, ip, ip, op, op);
    if (d_bias.numel()) {
      auto db = d_bias.data<T>();
      // NOTE(review): see cuda_Convolution_backward — argument order for
      // Convolution_bp_bias (op, op, nActiveOut) looks suspicious; verify
      // against the kernel in Convolution.cu.
      Convolution_bp_bias(doF, db, op, op, nActiveOut);
    }
  }
}
// Host-side driver for the randomized-stride convolution forward pass.
// Same flow as cuda_Convolution_updateOutput, but the rulebook comes from
// getRandomizedStrideRuleBook.
// Returns the double reported by dConvolution_forward2 (0 when there are no
// active output sites).
template <typename T, Int Dimension>
double cuda_RandomizedStrideConvolution_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize, /*long*/ at::Tensor filterStride,
    Metadata<Dimension> &m, /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features,
    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor bias) {
  auto rules = m.getRandomizedStrideRuleBook(inputSize, outputSize, filterSize,
                                             filterStride, true);
  Int nActiveOut = m.getNActive(outputSize);
  if (!nActiveOut)
    return 0; // empty output: nothing to do
  Int inPlanes = weight.size(1);
  Int outPlanes = weight.size(2);
  output_features.resize_({nActiveOut, outPlanes});
  auto inPtr = input_features.data<T>();
  auto outPtr = output_features.data<T>();
  auto wPtr = weight.data<T>();
  // Seed the output rows with the bias when present; otherwise clear so the
  // convolution can accumulate into them.
  if (bias.numel())
    Convolution_fp_bias(outPtr, bias.data<T>(), outPlanes, nActiveOut);
  else
    output_features.zero_();
  return dConvolution_forward2<T>(inPtr, outPtr, wPtr, rules, inPlanes,
                                  inPlanes, outPlanes, outPlanes);
}
// Host-side driver for the randomized-stride convolution backward pass.
// Same structure as cuda_Convolution_backward, using the randomized-stride
// rulebook.
template <typename T, Int Dimension>
void cuda_RandomizedStrideConvolution_backward(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize, /*long*/ at::Tensor filterStride,
    Metadata<Dimension> &m, /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features,
    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor d_weight,
    /*cuda float*/ at::Tensor d_bias) {
  auto _rules = m.getRandomizedStrideRuleBook(inputSize, outputSize,
                                              filterSize, filterStride, true);
  Int nActiveIn = m.getNActive(inputSize);
  Int nActiveOut = m.getNActive(outputSize);
  if (nActiveOut) {
    Int ip = weight.size(1); // input planes
    Int op = weight.size(2); // output planes
    // Input gradients accumulate; start from zero.
    d_input_features.resize_({nActiveIn, ip});
    d_input_features.zero_();
    auto iF = input_features.data<T>();
    auto diF = d_input_features.data<T>();
    auto doF = d_output_features.data<T>();
    auto w = weight.data<T>();
    auto dw = d_weight.data<T>();
    dConvolution_backward_dW2<T>(iF, diF, doF, w, dw, _rules, ip, ip, op, op);
    if (d_bias.numel()) {
      auto db = d_bias.data<T>();
      // NOTE(review): see cuda_Convolution_backward — argument order for
      // Convolution_bp_bias (op, op, nActiveOut) looks suspicious; verify
      // against the kernel in Convolution.cu.
      Convolution_bp_bias(doF, db, op, op, nActiveOut);
    }
  }
}
sparseconvnet/SCN/CUDA/Convolution.cu
View file @
de3743f6
...
...
@@ -4,315 +4,649 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "Convolution.h"
#include "RuleBookIterator.h"
template
<
typename
T
,
Int
Dimension
>
double
cuda_Convolution_updateOutput
(
/*long*/
at
::
Tensor
inputSize
,
/*long*/
at
::
Tensor
outputSize
,
/*long*/
at
::
Tensor
filterSize
,
/*long*/
at
::
Tensor
filterStride
,
Metadata
<
Dimension
>
&
m
,
/*cuda float*/
at
::
Tensor
input_features
,
/*cuda float*/
at
::
Tensor
output_features
,
/*cuda float*/
at
::
Tensor
weight
,
/*cuda float*/
at
::
Tensor
bias
)
{
auto
_rules
=
m
.
getRuleBook
(
inputSize
,
outputSize
,
filterSize
,
filterStride
,
true
);
Int
nActive
=
m
.
getNActive
(
outputSize
);
output_features
.
resize_
({
nActive
,
weight
.
size
(
2
)});
if
(
not
bias
.
numel
())
output_features
.
zero_
();
template
<
typename
T
>
__global__
void
Convolution_fp_bias_
(
T
*
output_features
,
T
*
bias
,
Int
nPlanes
,
Int
nActive
)
{
Int
n
=
blockIdx
.
x
*
32
+
threadIdx
.
x
;
T
b
=
bias
[
n
];
output_features
+=
n
;
for
(
Int
row
=
blockIdx
.
y
;
row
<
nActive
;
row
+=
gridDim
.
y
)
{
output_features
[
row
*
nPlanes
]
=
b
;
}
}
double
flops
=
0
;
if
(
nActive
)
{
auto
iF
=
input_features
.
data
<
T
>
();
auto
oF
=
output_features
.
data
<
T
>
();
Int
ip
=
input_features
.
size
(
1
);
Int
op
=
output_features
.
size
(
1
);
auto
w
=
weight
.
data
<
T
>
();
if
(
bias
.
numel
())
{
auto
b
=
bias
.
data
<
T
>
();
for
(
Int
i
=
0
;
i
<
op
;
i
+=
32
)
{
Int
blockDim
=
min
((
Int
)
32
,
op
-
i
);
Int
gridDim
=
min
((
Int
)
4096
,
nActive
);
Convolution_fp_bias
<<<
gridDim
,
blockDim
>>>
(
oF
+
i
,
b
+
i
,
op
,
op
,
template
<
typename
T
>
void
Convolution_fp_bias
(
T
*
oF
,
T
*
b
,
Int
nPlanes
,
Int
nActive
)
{
if
(
nPlanes
/
32
>
0
)
Convolution_fp_bias_
<<<
dim3
(
nPlanes
/
32
,
4096
),
32
>>>
(
oF
,
b
,
nPlanes
,
nActive
);
if
(
nPlanes
%
32
>
0
)
{
Int
o
=
nPlanes
/
32
*
32
;
Convolution_fp_bias_
<<<
dim3
(
1
,
4096
),
nPlanes
-
o
>>>
(
oF
+
o
,
b
+
o
,
nPlanes
,
nActive
);
}
}
template
<
typename
T
>
__global__
void
dColumnSum
(
T
*
matrix
,
T
*
target
,
Int
nRows
,
Int
nColumns
,
Int
nCOLUMNS
)
{
Int
i
=
blockIdx
.
x
*
32
+
threadIdx
.
x
;
T
t
=
0
;
for
(
Int
j
=
blockIdx
.
y
;
j
<
nRows
;
j
+=
32
)
t
+=
matrix
[
j
*
nCOLUMNS
+
i
];
atomicAdd
(
&
target
[
i
],
t
);
}
template
<
typename
T
>
void
Convolution_bp_bias
(
T
*
matrix
,
T
*
target
,
Int
nRows
,
Int
nColumns
,
Int
nCOLUMNS
)
{
if
(
nColumns
/
32
>
0
)
dColumnSum
<<<
dim3
(
nColumns
/
32
,
32
),
32
>>>
(
matrix
,
target
,
nRows
,
nColumns
,
nCOLUMNS
);
if
(
nColumns
%
32
>
0
)
{
Int
o
=
nColumns
/
32
*
32
;
dColumnSum
<<<
dim3
(
1
,
32
),
nColumns
-
o
>>>
(
matrix
+
o
,
target
+
o
,
nRows
,
nColumns
,
nCOLUMNS
);
}
Int
c
=
ip
*
op
;
RULEBOOKITERATOR
(
dConvolution_forward2
<
T
>
(
iF
,
oF
,
w
,
rbB
,
nHotB
,
ip
,
ip
,
op
,
op
);
,
w
+=
c
;
flops
+=
nHotB
*
c
;)
}
return
flops
;
}
template
<
typename
T
,
Int
Dimension
>
void
cuda_Convolution_backward
(
/*long*/
at
::
Tensor
inputSize
,
/*long*/
at
::
Tensor
outputSize
,
/*long*/
at
::
Tensor
filterSize
,
/*long*/
at
::
Tensor
filterStride
,
Metadata
<
Dimension
>
&
m
,
/*cuda float*/
at
::
Tensor
input_features
,
/*cuda float*/
at
::
Tensor
d_input_features
,
/*cuda float*/
at
::
Tensor
d_output_features
,
/*cuda float*/
at
::
Tensor
weight
,
/*cuda float*/
at
::
Tensor
d_weight
,
/*cuda float*/
at
::
Tensor
d_bias
)
{
auto
_rules
=
m
.
getRuleBook
(
inputSize
,
outputSize
,
filterSize
,
filterStride
,
true
);
Int
nActive
=
m
.
getNActive
(
outputSize
);
d_input_features
.
resize_as_
(
input_features
);
d_input_features
.
zero_
();
if
(
nActive
)
{
auto
iF
=
input_features
.
data
<
T
>
();
auto
diF
=
d_input_features
.
data
<
T
>
();
auto
doF
=
d_output_features
.
data
<
T
>
();
Int
ip
=
input_features
.
size
(
1
);
Int
op
=
d_output_features
.
size
(
1
);
auto
w
=
weight
.
data
<
T
>
();
auto
dw
=
d_weight
.
data
<
T
>
();
Int
c
=
ip
*
op
;
RULEBOOKITERATOR
(
dConvolution_backward_dW2
<
T
>
(
iF
,
diF
,
doF
,
w
,
dw
,
rbB
,
nHotB
,
ip
,
ip
,
op
,
op
);
,
w
+=
c
;
dw
+=
c
;)
template
<
typename
T
,
Int
K
,
Int
V
>
__global__
void
dConvolution_KMxKN_forwardA
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
// nHot must be a multiple of K!!
if
(
d_bias
.
numel
())
{
auto
db
=
d_bias
.
data
<
T
>
();
Convolution_bp_bias
(
doF
,
db
,
op
,
op
,
nActive
);
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// K is a multiple of V,
// nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
Int
M
=
input_nPlanes
/
K
;
// N = gridDim.y == output_nPlanes/K
Int
n
=
blockIdx
.
y
;
outFeatures
+=
n
*
K
;
w
+=
n
*
K
;
T
O
[
V
];
__shared__
T
W
[
K
][
K
];
__shared__
T
I
[
K
][
K
];
Int
R0
[
V
];
Int
R1
[
V
];
const
int
tx
=
threadIdx
.
x
;
int
ty
[
V
];
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
ty
[
v
]
=
threadIdx
.
y
+
v
*
(
K
/
V
);
for
(
int
m
=
0
;
m
<
M
;
m
++
)
{
// Read w
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
W
[
ty
[
v
]][
tx
]
=
w
[
ty
[
v
]
*
output_nPlanes
+
tx
];
for
(
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
R0
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])];
R1
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])
+
1
];
}
__syncthreads
();
// Read input, reset O[]
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
I
[
ty
[
v
]][
tx
]
=
inFeatures
[
R0
[
v
]
*
input_stride
+
tx
];
O
[
v
]
=
0
;
}
__syncthreads
();
#pragma unroll
for
(
int
k
=
0
;
k
<
K
;
k
++
)
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
O
[
v
]
+=
I
[
ty
[
v
]][
k
]
*
W
[
k
][
tx
];
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
O
[
v
]
+=
outFeatures
[
R1
[
v
]
*
output_stride
+
tx
];
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
outFeatures
[
R1
[
v
]
*
output_stride
+
tx
]
=
O
[
v
];
__syncthreads
();
}
w
+=
K
*
output_nPlanes
;
inFeatures
+=
K
;
}
}
template
<
typename
T
,
Int
K
,
Int
V
>
__global__
void
dConvolution_KMxKN_forwardB
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// K is a multiple of V,
template
<
typename
T
,
Int
Dimension
>
double
cuda_SubmanifoldConvolution_updateOutput
(
/*long*/
at
::
Tensor
inputSize
,
/*long*/
at
::
Tensor
filterSize
,
Metadata
<
Dimension
>
&
m
,
/*cuda float*/
at
::
Tensor
input_features
,
/*cuda float*/
at
::
Tensor
output_features
,
/*cuda float*/
at
::
Tensor
weight
,
/*cuda float*/
at
::
Tensor
bias
)
{
auto
_rules
=
m
.
getSubmanifoldRuleBook
(
inputSize
,
filterSize
,
true
);
Int
nActive
=
m
.
getNActive
(
inputSize
);
output_features
.
resize_
({
nActive
,
weight
.
size
(
2
)});
if
(
bias
.
numel
()
and
nActive
)
output_features
.
copy_
(
bias
);
else
output_features
.
zero_
();
// nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
double
flops
=
0
;
if
(
nActive
)
{
auto
iF
=
input_features
.
data
<
T
>
();
auto
oF
=
output_features
.
data
<
T
>
();
Int
ip
=
input_features
.
size
(
1
);
Int
op
=
output_features
.
size
(
1
);
auto
w
=
weight
.
data
<
T
>
();
// if (bias.numel()) {
// auto b = bias.data<T>();
// for (Int i = 0; i < op; i += 32) {
// Int blockDim = min((Int)32, op - i);
// Int gridDim = min((Int)4096, nActive);
// Convolution_fp_bias<<<gridDim, blockDim>>>(oF + i, b + i, op, op,
// nActive);
// }
// }
Int
c
=
ip
*
op
;
RULEBOOKITERATOR
(
dConvolution_forward2
<
T
>
(
iF
,
oF
,
w
,
rbB
,
nHotB
,
ip
,
ip
,
op
,
op
);
,
w
+=
c
;
flops
+=
nHotB
*
c
;)
Int
M
=
input_nPlanes
/
K
;
// N = gridDim.y == output_nPlanes/K
Int
n
=
blockIdx
.
y
;
outFeatures
+=
n
*
K
;
w
+=
n
*
K
;
T
O
[
V
];
__shared__
T
W
[
K
][
K
];
__shared__
T
I
[
K
][
K
];
Int
R0
[
V
];
Int
R1
[
V
];
const
int
tx
=
threadIdx
.
x
;
int
ty
[
V
];
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
ty
[
v
]
=
threadIdx
.
y
+
v
*
(
K
/
V
);
for
(
int
m
=
0
;
m
<
M
;
m
++
)
{
// Read w
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
W
[
ty
[
v
]][
tx
]
=
w
[
ty
[
v
]
*
output_nPlanes
+
tx
];
for
(
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
if
(
s
+
ty
[
v
]
<
nHot
)
{
R0
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])];
R1
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])
+
1
];
}
}
__syncthreads
();
// Read input, reset O[]
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
if
(
s
+
ty
[
v
]
<
nHot
)
I
[
ty
[
v
]][
tx
]
=
inFeatures
[
R0
[
v
]
*
input_stride
+
tx
];
O
[
v
]
=
0
;
}
__syncthreads
();
#pragma unroll
for
(
int
k
=
0
;
k
<
K
;
k
++
)
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
O
[
v
]
+=
I
[
ty
[
v
]][
k
]
*
W
[
k
][
tx
];
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
if
(
s
+
ty
[
v
]
<
nHot
)
O
[
v
]
+=
outFeatures
[
R1
[
v
]
*
output_stride
+
tx
];
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
if
(
s
+
ty
[
v
]
<
nHot
)
outFeatures
[
R1
[
v
]
*
output_stride
+
tx
]
=
O
[
v
];
__syncthreads
();
}
w
+=
K
*
output_nPlanes
;
inFeatures
+=
K
;
}
return
flops
;
}
template
<
typename
T
,
Int
Dimension
>
void
cuda_SubmanifoldConvolution_backward
(
/*long*/
at
::
Tensor
inputSize
,
/*long*/
at
::
Tensor
filterSize
,
Metadata
<
Dimension
>
&
m
,
/*cuda float*/
at
::
Tensor
input_features
,
/*cuda float*/
at
::
Tensor
d_input_features
,
/*cuda float*/
at
::
Tensor
d_output_features
,
/*cuda float*/
at
::
Tensor
weight
,
/*cuda float*/
at
::
Tensor
d_weight
,
/*cuda float*/
at
::
Tensor
d_bias
)
{
auto
_rules
=
m
.
getSubmanifoldRuleBook
(
inputSize
,
filterSize
,
true
);
Int
nActive
=
m
.
getNActive
(
inputSize
);
d_input_features
.
resize_as_
(
input_features
);
d_input_features
.
zero_
();
if
(
nActive
)
{
auto
iF
=
input_features
.
data
<
T
>
();
auto
diF
=
d_input_features
.
data
<
T
>
();
auto
doF
=
d_output_features
.
data
<
T
>
();
Int
ip
=
input_features
.
size
(
1
);
Int
op
=
d_output_features
.
size
(
1
);
auto
w
=
weight
.
data
<
T
>
();
auto
dw
=
d_weight
.
data
<
T
>
();
Int
c
=
ip
*
op
;
RULEBOOKITERATOR
(
dConvolution_backward_dW2
<
T
>
(
iF
,
diF
,
doF
,
w
,
dw
,
rbB
,
nHotB
,
ip
,
ip
,
op
,
op
);
,
w
+=
c
;
dw
+=
c
;)
#define FOO(T, K, V) \
{ \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
Int o = (nHot / K) * K; \
if (o >= K) \
dConvolution_KMxKN_forwardA< \
T, K, V><<<dim3(std::min(o / K, (Int)512), output_nPlanes / K), \
dim3(K, K / V)>>>(inFeatures, outFeatures, w, rules, o, \
input_nPlanes, input_stride, \
output_nPlanes, output_stride); \
if (nHot > o) \
dConvolution_KMxKN_forwardB< \
T, K, V><<<dim3(1, output_nPlanes / K), dim3(K, K / V)>>>( \
inFeatures, outFeatures, w, rules + 2 * o, nHot - o, \
input_nPlanes, input_stride, output_nPlanes, output_stride); \
return; \
} \
}
template
<
typename
T
>
void
dConvolution_forward
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
FOO
(
T
,
64
,
16
)
FOO
(
T
,
32
,
8
)
FOO
(
T
,
16
,
4
)
FOO
(
T
,
8
,
2
)
assert
(
false
);
}
template
<
>
void
dConvolution_forward
<
double
>
(
double
*
inFeatures
,
double
*
outFeatures
,
double
*
w
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
FOO
(
double
,
32
,
8
)
FOO
(
double
,
16
,
4
)
FOO
(
double
,
8
,
2
)
assert
(
false
);
}
#undef FOO
if
(
d_bias
.
numel
())
{
auto
db
=
d_bias
.
data
<
T
>
();
Convolution_bp_bias
(
doF
,
db
,
op
,
op
,
nActive
);
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template
<
typename
T
,
Int
K
,
Int
V
>
__global__
void
dConvolution_KMxKN_backward_dW_A
(
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
T
*
w
,
T
*
dw
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
// M = gridDim.y == input_nPlanes / K
Int
N
=
output_nPlanes
/
K
;
Int
m
=
blockIdx
.
y
;
inFeatures
+=
m
*
K
;
dInFeatures
+=
m
*
K
;
w
+=
m
*
K
*
output_nPlanes
;
dw
+=
m
*
K
*
output_nPlanes
;
T
dI
[
V
];
T
dW
[
V
];
__shared__
T
I
[
K
][
K
];
__shared__
T
dO
[
K
][
K
];
__shared__
T
W
[
K
][
K
];
Int
R0
[
V
];
Int
R1
[
V
];
const
int
tx
=
threadIdx
.
x
;
int
ty
[
V
];
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
ty
[
v
]
=
threadIdx
.
y
+
v
*
(
K
/
V
);
for
(
int
n
=
0
;
n
<
N
;
n
++
)
{
// Read w, reset dW
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
W
[
ty
[
v
]][
tx
]
=
w
[
ty
[
v
]
*
output_nPlanes
+
tx
];
dW
[
v
]
=
0
;
}
for
(
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
R0
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])];
R1
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])
+
1
];
dI
[
v
]
=
0
;
}
__syncthreads
();
// Read input and dOutput
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
I
[
ty
[
v
]][
tx
]
=
inFeatures
[
R0
[
v
]
*
input_stride
+
tx
];
dO
[
ty
[
v
]][
tx
]
=
dOutFeatures
[
R1
[
v
]
*
output_stride
+
tx
];
}
__syncthreads
();
#pragma unroll
for
(
int
k
=
0
;
k
<
K
;
k
++
)
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
dI
[
v
]
+=
dO
[
ty
[
v
]][
k
]
*
W
[
tx
][
k
];
dW
[
v
]
+=
I
[
k
][
ty
[
v
]]
*
dO
[
k
][
tx
];
}
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
dI
[
v
]
+=
dInFeatures
[
R0
[
v
]
*
input_stride
+
tx
];
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
dInFeatures
[
R0
[
v
]
*
input_stride
+
tx
]
=
dI
[
v
];
__syncthreads
();
}
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
atomicAdd
(
&
dw
[
ty
[
v
]
*
output_nPlanes
+
tx
],
dW
[
v
]);
w
+=
K
;
dw
+=
K
;
dOutFeatures
+=
K
;
}
}
template
<
typename
T
,
Int
Dimension
>
double
cuda_FullConvolution_updateOutput
(
/*long*/
at
::
Tensor
inputSize
,
/*long*/
at
::
Tensor
outputSize
,
/*long*/
at
::
Tensor
filterSize
,
/*long*/
at
::
Tensor
filterStride
,
Metadata
<
Dimension
>
&
mIn
,
Metadata
<
Dimension
>
&
mOut
,
/*cuda float*/
at
::
Tensor
input_features
,
/*cuda float*/
at
::
Tensor
output_features
,
/*cuda float*/
at
::
Tensor
weight
,
/*cuda float*/
at
::
Tensor
bias
)
{
auto
_rules
=
mIn
.
getFullConvolutionRuleBook
(
inputSize
,
outputSize
,
filterSize
,
filterStride
,
mOut
);
Int
nActive
=
mOut
.
getNActive
(
outputSize
);
output_features
.
resize_
({
nActive
,
weight
.
size
(
2
)});
if
(
not
bias
.
numel
())
output_features
.
zero_
();
double
flops
=
0
;
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template
<
typename
T
,
Int
K
,
Int
V
>
__global__
void
dConvolution_KMxKN_backward_dW_B
(
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
T
*
w
,
T
*
dw
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
// M = gridDim.y == input_nPlanes / K
Int
N
=
output_nPlanes
/
K
;
Int
m
=
blockIdx
.
y
;
inFeatures
+=
m
*
K
;
dInFeatures
+=
m
*
K
;
w
+=
m
*
K
*
output_nPlanes
;
dw
+=
m
*
K
*
output_nPlanes
;
if
(
nActive
)
{
auto
iF
=
input_features
.
data
<
T
>
();
auto
oF
=
output_features
.
data
<
T
>
();
Int
ip
=
input_features
.
size
(
1
);
Int
op
=
output_features
.
size
(
1
);
auto
w
=
weight
.
data
<
T
>
();
if
(
bias
.
numel
())
{
auto
b
=
bias
.
data
<
T
>
();
for
(
Int
i
=
0
;
i
<
op
;
i
+=
32
)
{
Int
blockDim
=
min
((
Int
)
32
,
op
-
i
);
Int
gridDim
=
min
((
Int
)
4096
,
nActive
);
Convolution_fp_bias
<<<
gridDim
,
blockDim
>>>
(
oF
+
i
,
b
+
i
,
op
,
op
,
nActive
);
T
dI
[
V
];
T
dW
[
V
];
__shared__
T
I
[
K
][
K
];
__shared__
T
dO
[
K
][
K
];
__shared__
T
W
[
K
][
K
];
Int
R0
[
V
];
Int
R1
[
V
];
const
int
tx
=
threadIdx
.
x
;
int
ty
[
V
];
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
ty
[
v
]
=
threadIdx
.
y
+
v
*
(
K
/
V
);
for
(
int
n
=
0
;
n
<
N
;
n
++
)
{
// Read w, reset dW
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
W
[
ty
[
v
]][
tx
]
=
w
[
ty
[
v
]
*
output_nPlanes
+
tx
];
dW
[
v
]
=
0
;
}
for
(
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
if
(
s
+
ty
[
v
]
<
nHot
)
{
R0
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])];
R1
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])
+
1
];
}
Int
c
=
ip
*
op
;
RULEBOOKITERATOR
(
dConvolution_forward2
<
T
>
(
iF
,
oF
,
w
,
rbB
,
nHotB
,
ip
,
ip
,
op
,
op
);
,
w
+=
c
;
flops
+=
nHotB
*
c
;)
dI
[
v
]
=
0
;
}
__syncthreads
();
// Read input and dOutput
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
if
(
s
+
ty
[
v
]
<
nHot
)
{
I
[
ty
[
v
]][
tx
]
=
inFeatures
[
R0
[
v
]
*
input_stride
+
tx
];
dO
[
ty
[
v
]][
tx
]
=
dOutFeatures
[
R1
[
v
]
*
output_stride
+
tx
];
}
else
{
I
[
ty
[
v
]][
tx
]
=
0
;
dO
[
ty
[
v
]][
tx
]
=
0
;
}
__syncthreads
();
#pragma unroll
for
(
int
k
=
0
;
k
<
K
;
k
++
)
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
dI
[
v
]
+=
dO
[
ty
[
v
]][
k
]
*
W
[
tx
][
k
];
dW
[
v
]
+=
I
[
k
][
ty
[
v
]]
*
dO
[
k
][
tx
];
}
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
if
(
s
+
ty
[
v
]
<
nHot
)
dI
[
v
]
+=
dInFeatures
[
R0
[
v
]
*
input_stride
+
tx
];
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
if
(
s
+
ty
[
v
]
<
nHot
)
dInFeatures
[
R0
[
v
]
*
input_stride
+
tx
]
=
dI
[
v
];
__syncthreads
();
}
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
atomicAdd
(
&
dw
[
ty
[
v
]
*
output_nPlanes
+
tx
],
dW
[
v
]);
w
+=
K
;
dw
+=
K
;
dOutFeatures
+=
K
;
}
return
flops
;
}
template
<
typename
T
,
Int
Dimension
>
void
cuda_FullConvolution_backward
(
/*long*/
at
::
Tensor
inputSize
,
/*long*/
at
::
Tensor
outputSize
,
/*long*/
at
::
Tensor
filterSize
,
/*long*/
at
::
Tensor
filterStride
,
Metadata
<
Dimension
>
&
mIn
,
Metadata
<
Dimension
>
&
mOut
,
/*cuda float*/
at
::
Tensor
input_features
,
/*cuda float*/
at
::
Tensor
d_input_features
,
/*cuda float*/
at
::
Tensor
d_output_features
,
/*cuda float*/
at
::
Tensor
weight
,
/*cuda float*/
at
::
Tensor
d_weight
,
/*cuda float*/
at
::
Tensor
d_bias
)
{
auto
_rules
=
mIn
.
getFullConvolutionRuleBook
(
inputSize
,
outputSize
,
filterSize
,
filterStride
,
mOut
);
Int
nActive
=
mOut
.
getNActive
(
outputSize
);
d_input_features
.
resize_as_
(
input_features
);
d_input_features
.
zero_
();
if
(
nActive
)
{
auto
iF
=
input_features
.
data
<
T
>
();
auto
diF
=
d_input_features
.
data
<
T
>
();
auto
doF
=
d_output_features
.
data
<
T
>
();
Int
ip
=
input_features
.
size
(
1
);
Int
op
=
d_output_features
.
size
(
1
);
auto
w
=
weight
.
data
<
T
>
();
auto
dw
=
d_weight
.
data
<
T
>
();
Int
c
=
ip
*
op
;
RULEBOOKITERATOR
(
dConvolution_backward_dW2
<
T
>
(
iF
,
diF
,
doF
,
w
,
dw
,
rbB
,
nHotB
,
ip
,
ip
,
op
,
op
);
,
w
+=
c
;
dw
+=
c
;)
// Helper for dConvolution_backward_dW below. If both plane counts are exact
// multiples of K, launch the K-tiled gradient kernels and return from the
// enclosing function. Kernel _A handles the first o = (nHot/K)*K rule pairs
// (an exact multiple of K, so it needs no bounds checks); kernel _B mops up
// the ragged tail (nHot - o pairs) with per-row bounds checks.
#define FOO(T, K, V)                                                           \
  {                                                                            \
    if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {                  \
      Int o = (nHot / K) * K;                                                  \
      if (o >= K)                                                              \
        dConvolution_KMxKN_backward_dW_A<                                      \
            T, K, V><<<dim3(std::min(o / K, (Int)512), input_nPlanes / K),     \
                       dim3(K, K / V)>>>(                                      \
            inFeatures, dInFeatures, dOutFeatures, w, dw, rules, o,            \
            input_nPlanes, input_stride, output_nPlanes, output_stride);       \
      if (nHot > o)                                                            \
        dConvolution_KMxKN_backward_dW_B<                                      \
            T, K, V><<<dim3(1, input_nPlanes / K), dim3(K, K / V)>>>(          \
            inFeatures, dInFeatures, dOutFeatures, w, dw, rules + 2 * o,       \
            nHot - o, input_nPlanes, input_stride, output_nPlanes,             \
            output_stride);                                                    \
      return;                                                                  \
    }                                                                          \
  }
// Backward pass dispatcher: picks the largest tile size K that divides both
// input_nPlanes and output_nPlanes and launches the corresponding tiled
// gradient kernels via the FOO macro above (which returns on success).
// Reaching assert(false) means neither plane count was a multiple of 8;
// such ragged sizes are handled by the *2 kernel path elsewhere in the file.
template <typename T>
void dConvolution_backward_dW(T *inFeatures, T *dInFeatures, T *dOutFeatures,
                              T *w, T *dw, Int *rules, Int nHot,
                              Int input_nPlanes, Int input_stride,
                              Int output_nPlanes, Int output_stride) {
  FOO(T, 32, 8)
  FOO(T, 16, 4)
  FOO(T, 8, 2)
  assert(false);
}
#undef FOO
template
<
typename
T
,
Int
K
,
Int
V
>
__global__
void
dConvolution_KMxKN_forward2
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// K is a multiple of V,
// nHot x input_nplanes<=KM -> nHot x output_nPlanes<=KN
// - parallel over N,nHot - loop over M
Int
M
=
(
input_nPlanes
+
K
-
1
)
/
K
;
// N = gridDim.y ~ output_nPlanes/K
Int
n
=
blockIdx
.
y
;
outFeatures
+=
n
*
K
;
w
+=
n
*
K
;
Int
KO
=
min
(
K
,
output_nPlanes
-
K
*
n
);
T
O
[
V
];
__shared__
T
W
[
K
][
K
];
__shared__
T
I
[
K
][
K
];
__shared__
Int
R
[
K
*
2
];
const
int
tx
=
threadIdx
.
x
;
int
ty
[
V
];
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
ty
[
v
]
=
threadIdx
.
y
+
v
*
(
K
/
V
);
for
(
int
m
=
0
;
m
<
M
;
m
++
)
{
Int
KI
=
min
(
K
,
input_nPlanes
-
K
*
m
);
// Read w
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
if
(
ty
[
v
]
<
KI
and
tx
<
KO
)
W
[
ty
[
v
]][
tx
]
=
w
[
ty
[
v
]
*
output_nPlanes
+
tx
];
for
(
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
// Read rules for K input/output pairs
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
if
(
ty
[
v
]
<
2
)
{
int
q
=
ty
[
v
]
*
K
+
tx
;
if
(
s
+
q
/
2
<
nHot
)
R
[
q
]
=
rules
[
2
*
s
+
q
];
}
}
__syncthreads
();
// Read input, reset O[]
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
if
(
tx
<
KI
and
s
+
ty
[
v
]
<
nHot
)
I
[
ty
[
v
]][
tx
]
=
inFeatures
[
R
[
2
*
ty
[
v
]]
*
input_stride
+
tx
];
O
[
v
]
=
0
;
}
__syncthreads
();
if
(
d_bias
.
numel
())
{
auto
db
=
d_bias
.
data
<
T
>
();
Convolution_bp_bias
(
doF
,
db
,
op
,
op
,
nActive
);
#pragma unroll
for
(
int
k
=
0
;
k
<
KI
;
k
++
)
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
O
[
v
]
+=
I
[
ty
[
v
]][
k
]
*
W
[
k
][
tx
];
__syncthreads
();
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
if
(
tx
<
KO
and
s
+
ty
[
v
]
<
nHot
)
outFeatures
[
R
[
2
*
ty
[
v
]
+
1
]
*
output_stride
+
tx
]
+=
O
[
v
];
__syncthreads
();
}
w
+=
K
*
output_nPlanes
;
inFeatures
+=
K
;
}
}
template
<
typename
T
,
Int
Dimension
>
double
cuda_RandomizedStrideConvolution_updateOutput
(
/*long*/
at
::
Tensor
inputSize
,
/*long*/
at
::
Tensor
outputSize
,
/*long*/
at
::
Tensor
filterSize
,
/*long*/
at
::
Tensor
filterStride
,
Metadata
<
Dimension
>
&
m
,
/*cuda float*/
at
::
Tensor
input_features
,
/*cuda float*/
at
::
Tensor
output_features
,
/*cuda float*/
at
::
Tensor
weight
,
/*cuda float*/
at
::
Tensor
bias
)
{
auto
_rules
=
m
.
getRandomizedStrideRuleBook
(
inputSize
,
outputSize
,
filterSize
,
filterStride
,
true
);
Int
nActive
=
m
.
getNActive
(
outputSize
);
output_features
.
resize_
({
nActive
,
weight
.
size
(
2
)});
if
(
not
bias
.
numel
())
output_features
.
zero_
();
double
flops
=
0
;
if
(
nActive
)
{
auto
iF
=
input_features
.
data
<
T
>
();
auto
oF
=
output_features
.
data
<
T
>
();
Int
ip
=
input_features
.
size
(
1
);
Int
op
=
output_features
.
size
(
1
);
auto
w
=
weight
.
data
<
T
>
();
if
(
bias
.
numel
())
{
auto
b
=
bias
.
data
<
T
>
();
for
(
Int
i
=
0
;
i
<
op
;
i
+=
32
)
{
Int
blockDim
=
min
((
Int
)
32
,
op
-
i
);
Int
gridDim
=
min
((
Int
)
4096
,
nActive
);
Convolution_fp_bias
<<<
gridDim
,
blockDim
>>>
(
oF
+
i
,
b
+
i
,
op
,
op
,
nActive
);
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template
<
typename
T
,
Int
K
,
Int
V
>
__global__
void
dConvolution_KMxKN_backward_dW2
(
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
T
*
w
,
T
*
dw
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
// M = gridDim.y == input_nPlanes / K
Int
N
=
(
output_nPlanes
+
K
-
1
)
/
K
;
Int
m
=
blockIdx
.
y
;
inFeatures
+=
m
*
K
;
dInFeatures
+=
m
*
K
;
w
+=
m
*
K
*
output_nPlanes
;
dw
+=
m
*
K
*
output_nPlanes
;
Int
KI
=
min
(
K
,
input_nPlanes
-
K
*
m
);
T
dI
[
V
];
T
dW
[
V
];
__shared__
T
I
[
K
][
K
];
__shared__
T
dO
[
K
][
K
];
__shared__
T
W
[
K
][
K
];
__shared__
Int
R
[
K
*
2
];
const
int
tx
=
threadIdx
.
x
;
int
ty
[
V
];
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
ty
[
v
]
=
threadIdx
.
y
+
v
*
(
K
/
V
);
for
(
int
n
=
0
;
n
<
N
;
n
++
)
{
Int
KO
=
min
(
K
,
output_nPlanes
-
K
*
n
);
// Read w, reset dW
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
if
(
ty
[
v
]
<
KI
and
tx
<
KO
)
W
[
ty
[
v
]][
tx
]
=
w
[
ty
[
v
]
*
output_nPlanes
+
tx
];
dW
[
v
]
=
0
;
}
for
(
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
// Read rules for K input/output pairs, reset dI[]
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
if
(
ty
[
v
]
<
2
)
{
int
q
=
ty
[
v
]
*
K
+
tx
;
if
(
s
+
q
/
2
<
nHot
)
R
[
q
]
=
rules
[
2
*
s
+
q
];
}
dI
[
v
]
=
0
;
}
__syncthreads
();
// Read input and dOutput
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
if
(
tx
<
KI
and
s
+
ty
[
v
]
<
nHot
)
I
[
ty
[
v
]][
tx
]
=
inFeatures
[
R
[
2
*
ty
[
v
]]
*
input_stride
+
tx
];
else
I
[
ty
[
v
]][
tx
]
=
0
;
if
(
tx
<
KO
and
s
+
ty
[
v
]
<
nHot
)
dO
[
ty
[
v
]][
tx
]
=
dOutFeatures
[
R
[
2
*
ty
[
v
]
+
1
]
*
output_stride
+
tx
];
else
dO
[
ty
[
v
]][
tx
]
=
0
;
}
__syncthreads
();
#pragma unroll
for
(
int
k
=
0
;
k
<
KO
;
k
++
)
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
dI
[
v
]
+=
dO
[
ty
[
v
]][
k
]
*
W
[
tx
][
k
];
#pragma unroll
for
(
int
k
=
0
;
k
<
K
;
k
++
)
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
dW
[
v
]
+=
I
[
k
][
ty
[
v
]]
*
dO
[
k
][
tx
];
__syncthreads
();
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
if
(
tx
<
KI
and
s
+
ty
[
v
]
<
nHot
)
dInFeatures
[
R
[
2
*
ty
[
v
]]
*
input_stride
+
tx
]
+=
dI
[
v
];
__syncthreads
();
}
Int
c
=
ip
*
op
;
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
if
(
ty
[
v
]
<
KI
and
tx
<
KO
)
atomicAdd
(
&
dw
[
ty
[
v
]
*
output_nPlanes
+
tx
],
dW
[
v
]);
w
+=
K
;
dw
+=
K
;
dOutFeatures
+=
K
;
}
}
template
<
typename
T
>
double
dConvolution_forward2
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
RuleBook
_rules
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
Int
c
=
input_nPlanes
*
output_nPlanes
;
double
flops
=
0
;
if
(
input_nPlanes
%
8
!=
0
or
output_nPlanes
%
8
!=
0
)
{
const
int
K
=
16
;
const
int
V
=
4
;
RULEBOOKITERATOR
(
dConvolution_forward2
<
T
>
(
iF
,
oF
,
w
,
rbB
,
nHotB
,
ip
,
ip
,
op
,
op
);
(
dConvolution_KMxKN_forward2
<
T
,
K
,
V
><<<
dim3
(
128
,
(
output_nPlanes
+
K
-
1
)
/
K
),
dim3
(
K
,
K
/
V
)
>>>
(
inFeatures
,
outFeatures
,
w
,
rbB
,
nHotB
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
));
,
w
+=
c
;
flops
+=
nHotB
*
c
;)
}
else
{
RULEBOOKITERATOR
(
dConvolution_forward
(
inFeatures
,
outFeatures
,
w
,
rbB
,
nHotB
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
);
,
w
+=
c
;
flops
+=
nHotB
*
c
;)
}
return
flops
;
}
template
<
typename
T
,
Int
Dimension
>
void
cuda_RandomizedStrideConvolution_backward
(
/*long*/
at
::
Tensor
inputSize
,
/*long*/
at
::
Tensor
outputSize
,
/*long*/
at
::
Tensor
filterSize
,
/*long*/
at
::
Tensor
filterStride
,
Metadata
<
Dimension
>
&
m
,
/*cuda float*/
at
::
Tensor
input_features
,
/*cuda float*/
at
::
Tensor
d_input_features
,
/*cuda float*/
at
::
Tensor
d_output_features
,
/*cuda float*/
at
::
Tensor
weight
,
/*cuda float*/
at
::
Tensor
d_weight
,
/*cuda float*/
at
::
Tensor
d_bias
)
{
auto
_rules
=
m
.
getRandomizedStrideRuleBook
(
inputSize
,
outputSize
,
filterSize
,
filterStride
,
true
);
Int
nActive
=
m
.
getNActive
(
outputSize
);
d_input_features
.
resize_as_
(
input_features
);
d_input_features
.
zero_
();
if
(
nActive
)
{
auto
iF
=
input_features
.
data
<
T
>
();
auto
diF
=
d_input_features
.
data
<
T
>
();
auto
doF
=
d_output_features
.
data
<
T
>
();
Int
ip
=
input_features
.
size
(
1
);
Int
op
=
d_output_features
.
size
(
1
);
auto
w
=
weight
.
data
<
T
>
();
auto
dw
=
d_weight
.
data
<
T
>
();
Int
c
=
ip
*
op
;
RULEBOOKITERATOR
(
dConvolution_backward_dW2
<
T
>
(
iF
,
diF
,
doF
,
w
,
dw
,
rbB
,
nHotB
,
ip
,
ip
,
op
,
op
);
template
<
typename
T
>
void
dConvolution_backward_dW2
(
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
T
*
w
,
T
*
dw
,
RuleBook
_rules
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
Int
c
=
input_nPlanes
*
output_nPlanes
;
if
(
input_nPlanes
%
8
!=
0
or
output_nPlanes
%
8
!=
0
)
{
const
int
K
=
16
;
const
int
V
=
4
;
RULEBOOKITERATOR
(
(
dConvolution_KMxKN_backward_dW2
<
T
,
K
,
V
><<<
dim3
(
128
,
(
input_nPlanes
+
K
-
1
)
/
K
),
dim3
(
K
,
K
/
V
)
>>>
(
inFeatures
,
dInFeatures
,
dOutFeatures
,
w
,
dw
,
rbB
,
nHotB
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
));
,
w
+=
c
;
dw
+=
c
;)
}
else
{
RULEBOOKITERATOR
(
dConvolution_backward_dW
(
inFeatures
,
dInFeatures
,
dOutFeatures
,
w
,
dw
,
rbB
,
nHotB
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
);
,
w
+=
c
;
dw
+=
c
;)
if
(
d_bias
.
numel
())
{
auto
db
=
d_bias
.
data
<
T
>
();
Convolution_bp_bias
(
doF
,
db
,
op
,
op
,
nActive
);
}
}
}
sparseconvnet/SCN/CUDA/Convolution.h
deleted
100644 → 0
View file @
f0407b36
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CUDA_CONVOLUTION_H
#define CUDA_CONVOLUTION_H
// Initialize every active output row with the bias vector, overwriting any
// previous contents. Launched with blockDim.x == number of planes handled
// (at most 32) and a grid of up to 4096 blocks; the hard-coded row stride
// 1 << 12 (= 4096) matches the caller's min((Int)4096, nActive) grid size —
// NOTE(review): these two constants must stay in sync.
// nPlanes is currently unused inside the kernel.
template <typename T>
__global__ void Convolution_fp_bias(T *output_features, T *bias, Int nPlanes,
                                    Int output_stride, Int nActive) {
  __shared__ T b[32];
  // Each thread stages and reads back only its own bias entry, so no
  // __syncthreads() is needed between the write and the reads below.
  b[threadIdx.x] = bias[threadIdx.x];
  for (Int row = blockIdx.x; row < nActive; row += 1 << 12) {
    output_features[row * output_stride + threadIdx.x] = b[threadIdx.x];
  }
}
// Sum a (strided) matrix along its rows and atomically accumulate the column
// totals into target. Each thread owns one column (blockIdx.x * 32 +
// threadIdx.x); the 32 blocks in the y-dimension split the rows between them,
// each starting at row blockIdx.y and striding by 32. atomicAdd merges the
// 32 partial sums per column. nCOLUMNS is the physical row stride of matrix,
// which may exceed the logical column count nColumns.
template <typename T>
__global__ void dColumnSum(T *matrix, T *target, Int nRows, Int nColumns,
                           Int nCOLUMNS) {
  Int i = blockIdx.x * 32 + threadIdx.x;
  T t = 0;
  for (Int j = blockIdx.y; j < nRows; j += 32)
    t += matrix[j * nCOLUMNS + i];
  atomicAdd(&target[i], t);
}
template
<
typename
T
>
void
Convolution_bp_bias
(
T
*
matrix
,
T
*
target
,
Int
nRows
,
Int
nColumns
,
Int
nCOLUMNS
)
{
if
(
nColumns
/
32
>
0
)
dColumnSum
<<<
dim3
(
nColumns
/
32
,
32
),
32
>>>
(
matrix
,
target
,
nRows
,
nColumns
,
nCOLUMNS
);
if
(
nColumns
%
32
>
0
)
{
Int
o
=
nColumns
/
32
*
32
;
dColumnSum
<<<
dim3
(
1
,
32
),
nColumns
-
o
>>>
(
matrix
+
o
,
target
+
o
,
nRows
,
nColumns
,
nCOLUMNS
);
}
}
// Forward pass, aligned variant: gather input rows via the rulebook, multiply
// by a KxK weight tile in shared memory, and accumulate into the scattered
// output rows. `rules` is a flat array of (inputRow, outputRow) index pairs.
// Requires nHot to be a multiple of K — there are no bounds checks here; the
// ragged tail is handled by dConvolution_KMxKN_forwardB.
template <typename T, Int K, Int V>
__global__ void dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures,
                                            T *w, Int *rules, Int nHot,
                                            Int input_nPlanes, Int input_stride,
                                            Int output_nPlanes,
                                            Int output_stride) {
  // nHot must be a multiple of K!!
  // Input x Weight -> Output
  // blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
  // K is a multiple of V,
  // nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
  Int M = input_nPlanes / K;
  // N = gridDim.y == output_nPlanes/K
  Int n = blockIdx.y;
  outFeatures += n * K; // this block's K-wide output column slice
  w += n * K;
  T O[V]; // per-thread output accumulators, one per handled row
  __shared__ T W[K][K];
  __shared__ T I[K][K];
  Int R0[V]; // input row indices for this thread's V rows
  Int R1[V]; // output row indices for this thread's V rows
  const int tx = threadIdx.x;
  int ty[V];
// Each thread covers V rows of the KxK tile, spaced K/V apart.
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int m = 0; m < M; m++) {
    // Read w
#pragma unroll
    for (int v = 0; v < V; v++)
      W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
    for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
      for (int v = 0; v < V; v++) {
        R0[v] = rules[2 * (s + ty[v])];
        R1[v] = rules[2 * (s + ty[v]) + 1];
      }
      __syncthreads();
      // Read input, reset O[]
#pragma unroll
      for (int v = 0; v < V; v++) {
        I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
        O[v] = 0;
      }
      __syncthreads();
// KxK tile multiply: O row = I row . W column.
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          O[v] += I[ty[v]][k] * W[k][tx];
// Read-modify-write accumulation into the output rows selected by R1.
#pragma unroll
      for (int v = 0; v < V; v++)
        O[v] += outFeatures[R1[v] * output_stride + tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        outFeatures[R1[v] * output_stride + tx] = O[v];
      __syncthreads();
    }
    // Advance to the next K-wide slab of input planes.
    w += K * output_nPlanes;
    inFeatures += K;
  }
}
// Forward pass, tail variant: identical to dConvolution_KMxKN_forwardA except
// that every rulebook access and output update is guarded by s + ty[v] < nHot,
// so nHot need not be a multiple of K. Out-of-range rows leave stale data in
// the shared I tile, but that is harmless: each thread multiplies only its own
// rows, and the guarded store never writes the garbage accumulators back.
template <typename T, Int K, Int V>
__global__ void dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures,
                                            T *w, Int *rules, Int nHot,
                                            Int input_nPlanes, Int input_stride,
                                            Int output_nPlanes,
                                            Int output_stride) {
  // Input x Weight -> Output
  // blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
  // K is a multiple of V,
  // nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
  Int M = input_nPlanes / K;
  // N = gridDim.y == output_nPlanes/K
  Int n = blockIdx.y;
  outFeatures += n * K;
  w += n * K;
  T O[V];
  __shared__ T W[K][K];
  __shared__ T I[K][K];
  Int R0[V];
  Int R1[V];
  const int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int m = 0; m < M; m++) {
    // Read w
#pragma unroll
    for (int v = 0; v < V; v++)
      W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
    for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (s + ty[v] < nHot) {
          R0[v] = rules[2 * (s + ty[v])];
          R1[v] = rules[2 * (s + ty[v]) + 1];
        }
      }
      __syncthreads();
      // Read input, reset O[]
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (s + ty[v] < nHot)
          I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
        O[v] = 0;
      }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          O[v] += I[ty[v]][k] * W[k][tx];
// Accumulate into and write back only the in-range output rows.
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nHot)
          O[v] += outFeatures[R1[v] * output_stride + tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nHot)
          outFeatures[R1[v] * output_stride + tx] = O[v];
      __syncthreads();
    }
    w += K * output_nPlanes;
    inFeatures += K;
  }
}
// Helper for dConvolution_forward below. If both plane counts are exact
// multiples of K, launch the K-tiled forward kernels and return from the
// enclosing function: kernel A covers the first o = (nHot/K)*K rule pairs
// (a multiple of K, no bounds checks), kernel B the ragged tail.
#define FOO(T, K, V)                                                           \
  {                                                                            \
    if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {                  \
      Int o = (nHot / K) * K;                                                  \
      if (o >= K)                                                              \
        dConvolution_KMxKN_forwardA<                                           \
            T, K, V><<<dim3(std::min(o / K, (Int)512), output_nPlanes / K),    \
                       dim3(K, K / V)>>>(                                      \
            inFeatures, outFeatures, w, rules, o, input_nPlanes, input_stride, \
            output_nPlanes, output_stride);                                    \
      if (nHot > o)                                                            \
        dConvolution_KMxKN_forwardB<                                           \
            T, K,                                                              \
            V><<<dim3(1, output_nPlanes / K), dim3(K, K / V)>>>(               \
            inFeatures, outFeatures, w, rules + 2 * o, nHot - o,               \
            input_nPlanes, input_stride, output_nPlanes, output_stride);       \
      return;                                                                  \
    }                                                                          \
  }
// Forward dispatcher: try tile sizes from largest to smallest; FOO returns
// from this function once both plane counts divide evenly. Reaching
// assert(false) means neither plane count was a multiple of 8 (ragged sizes
// go through the *2 kernel path instead).
template <typename T>
void dConvolution_forward(T *inFeatures, T *outFeatures, T *w, Int *rules,
                          Int nHot, Int input_nPlanes, Int input_stride,
                          Int output_nPlanes, Int output_stride) {
  FOO(T, 64, 16)
  FOO(T, 32, 8)
  FOO(T, 16, 4)
  FOO(T, 8, 2)
  assert(false);
}
// double specialization: skips the K=64 tile — presumably because the
// shared-memory tiles (two K*K arrays of 8-byte doubles) would not fit at
// K=64; largest double tile is 32. TODO confirm against target architecture
// shared-memory limits.
template <>
void dConvolution_forward<double>(double *inFeatures, double *outFeatures,
                                  double *w, Int *rules, Int nHot,
                                  Int input_nPlanes, Int input_stride,
                                  Int output_nPlanes, Int output_stride) {
  FOO(double, 32, 8)
  FOO(double, 16, 4)
  FOO(double, 8, 2)
  assert(false);
}
#undef FOO
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
// Backward pass, aligned variant: per KxK tile, accumulate the input gradient
// (dO . W^T, read-modify-write into dInFeatures) and the weight gradient
// (I^T . dO, merged into dw with atomicAdd across blocks). Requires nHot to
// be a multiple of K; the ragged tail is handled by the _B variant.
template <typename T, Int K, Int V>
__global__ void dConvolution_KMxKN_backward_dW_A(
    T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw, Int *rules,
    Int nHot, Int input_nPlanes, Int input_stride, Int output_nPlanes,
    Int output_stride) {
  // M = gridDim.y == input_nPlanes / K
  Int N = output_nPlanes / K;
  Int m = blockIdx.y;
  // Shift all pointers to this block's K-wide slab of input planes.
  inFeatures += m * K;
  dInFeatures += m * K;
  w += m * K * output_nPlanes;
  dw += m * K * output_nPlanes;
  T dI[V]; // per-thread input-gradient accumulators
  T dW[V]; // per-thread weight-gradient accumulators
  __shared__ T I[K][K];
  __shared__ T dO[K][K];
  __shared__ T W[K][K];
  Int R0[V];
  Int R1[V];
  const int tx = threadIdx.x;
  int ty[V];
// Each thread covers V rows of the tile, spaced K/V apart.
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int n = 0; n < N; n++) {
    // Read w, reset dW
#pragma unroll
    for (int v = 0; v < V; v++) {
      W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
      dW[v] = 0;
    }
    for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
      for (int v = 0; v < V; v++) {
        R0[v] = rules[2 * (s + ty[v])];
        R1[v] = rules[2 * (s + ty[v]) + 1];
        dI[v] = 0;
      }
      __syncthreads();
      // Read input and dOutput
#pragma unroll
      for (int v = 0; v < V; v++) {
        I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
        dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx];
      }
      __syncthreads();
// dI row = dO row . W^T column; dW = I^T . dO (note transposed I access).
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++) {
          dI[v] += dO[ty[v]][k] * W[tx][k];
          dW[v] += I[k][ty[v]] * dO[k][tx];
        }
// Read-modify-write accumulation into dInFeatures.
#pragma unroll
      for (int v = 0; v < V; v++)
        dI[v] += dInFeatures[R0[v] * input_stride + tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        dInFeatures[R0[v] * input_stride + tx] = dI[v];
      __syncthreads();
    }
// dw is shared between blocks in the x-dimension, hence atomicAdd.
#pragma unroll
    for (int v = 0; v < V; v++)
      atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
    // Advance to the next K-wide slab of output planes.
    w += K;
    dw += K;
    dOutFeatures += K;
  }
}
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
// Backward pass, tail variant of dConvolution_KMxKN_backward_dW_A: every
// rulebook access and dInFeatures update is guarded by s + ty[v] < nHot.
// Out-of-range rows of I and dO are explicitly zeroed because the dW
// accumulation reads whole columns (I[k][ty[v]], dO[k][tx]) across all k,
// so stale rows would corrupt the weight gradient.
template <typename T, Int K, Int V>
__global__ void dConvolution_KMxKN_backward_dW_B(
    T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw, Int *rules,
    Int nHot, Int input_nPlanes, Int input_stride, Int output_nPlanes,
    Int output_stride) {
  // M = gridDim.y == input_nPlanes / K
  Int N = output_nPlanes / K;
  Int m = blockIdx.y;
  inFeatures += m * K;
  dInFeatures += m * K;
  w += m * K * output_nPlanes;
  dw += m * K * output_nPlanes;
  T dI[V];
  T dW[V];
  __shared__ T I[K][K];
  __shared__ T dO[K][K];
  __shared__ T W[K][K];
  Int R0[V];
  Int R1[V];
  const int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int n = 0; n < N; n++) {
    // Read w, reset dW
#pragma unroll
    for (int v = 0; v < V; v++) {
      W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
      dW[v] = 0;
    }
    for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (s + ty[v] < nHot) {
          R0[v] = rules[2 * (s + ty[v])];
          R1[v] = rules[2 * (s + ty[v]) + 1];
        }
        dI[v] = 0;
      }
      __syncthreads();
      // Read input and dOutput
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nHot) {
          I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
          dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx];
        } else {
          // Zero-pad so the column-wise dW reads below stay correct.
          I[ty[v]][tx] = 0;
          dO[ty[v]][tx] = 0;
        }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++) {
          dI[v] += dO[ty[v]][k] * W[tx][k];
          dW[v] += I[k][ty[v]] * dO[k][tx];
        }
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nHot)
          dI[v] += dInFeatures[R0[v] * input_stride + tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nHot)
          dInFeatures[R0[v] * input_stride + tx] = dI[v];
      __syncthreads();
    }
#pragma unroll
    for (int v = 0; v < V; v++)
      atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
    w += K;
    dw += K;
    dOutFeatures += K;
  }
}
// Helper for dConvolution_backward_dW below: same tile-dispatch pattern as
// the forward FOO, but launching the gradient kernels. Kernel _A handles the
// first o = (nHot/K)*K rule pairs without bounds checks; kernel _B handles
// the ragged tail. Returns from the enclosing function on success.
#define FOO(T, K, V)                                                           \
  {                                                                            \
    if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {                  \
      Int o = (nHot / K) * K;                                                  \
      if (o >= K)                                                              \
        dConvolution_KMxKN_backward_dW_A<                                      \
            T, K, V><<<dim3(std::min(o / K, (Int)512), input_nPlanes / K),     \
                       dim3(K, K / V)>>>(                                      \
            inFeatures, dInFeatures, dOutFeatures, w, dw, rules, o,            \
            input_nPlanes, input_stride, output_nPlanes, output_stride);       \
      if (nHot > o)                                                            \
        dConvolution_KMxKN_backward_dW_B<                                      \
            T, K,                                                              \
            V><<<dim3(1, input_nPlanes / K), dim3(K, K / V)>>>(                \
            inFeatures, dInFeatures, dOutFeatures, w, dw, rules + 2 * o,       \
            nHot - o, input_nPlanes, input_stride, output_nPlanes,             \
            output_stride);                                                    \
      return;                                                                  \
    }                                                                          \
  }
// Backward dispatcher: try tile sizes from largest to smallest; FOO returns
// once both plane counts divide evenly. Reaching assert(false) means neither
// plane count was a multiple of 8 (ragged sizes use dConvolution_backward_dW2
// below instead).
template <typename T>
void dConvolution_backward_dW(T *inFeatures, T *dInFeatures, T *dOutFeatures,
                              T *w, T *dw, Int *rules, Int nHot,
                              Int input_nPlanes, Int input_stride,
                              Int output_nPlanes, Int output_stride) {
  FOO(T, 32, 8)
  FOO(T, 16, 4)
  FOO(T, 8, 2)
  assert(false);
}
#undef FOO
// Generic forward kernel for ragged plane counts: unlike forwardA/B, the
// plane counts need not be multiples of K. Each tile clamps its valid extent
// to KI = min(K, remaining input planes) and KO = min(K, remaining output
// planes), guarding shared-memory loads and the final output accumulation.
// Rule pairs are staged cooperatively in shared memory (R holds K pairs).
// The output is accumulated with += (callers pre-fill it with bias or zeros).
template <typename T, Int K, Int V>
__global__ void dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures,
                                            T *w, Int *rules, Int nHot,
                                            Int input_nPlanes, Int input_stride,
                                            Int output_nPlanes,
                                            Int output_stride) {
  // Input x Weight -> Output
  // blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
  // K is a multiple of V,
  // nHot x input_nplanes<=KM -> nHot x output_nPlanes<=KN
  // - parallel over N,nHot - loop over M
  Int M = (input_nPlanes + K - 1) / K;
  // N = gridDim.y ~ output_nPlanes/K
  Int n = blockIdx.y;
  outFeatures += n * K;
  w += n * K;
  Int KO = min(K, output_nPlanes - K * n); // valid output columns in tile
  T O[V];
  __shared__ T W[K][K];
  __shared__ T I[K][K];
  __shared__ Int R[K * 2]; // K (inputRow, outputRow) pairs, interleaved
  const int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int m = 0; m < M; m++) {
    Int KI = min(K, input_nPlanes - K * m); // valid input columns in tile
    // Read w
#pragma unroll
    for (int v = 0; v < V; v++)
      if (ty[v] < KI and tx < KO)
        W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
    for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
// Read rules for K input/output pairs
// (only the first two rows of threads participate; q indexes the
// 2*K flat entries).
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (ty[v] < 2) {
          int q = ty[v] * K + tx;
          if (s + q / 2 < nHot)
            R[q] = rules[2 * s + q];
        }
      }
      __syncthreads();
      // Read input, reset O[]
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (tx < KI and s + ty[v] < nHot)
          I[ty[v]][tx] = inFeatures[R[2 * ty[v]] * input_stride + tx];
        O[v] = 0;
      }
      __syncthreads();
// Multiply only over the KI valid input columns.
#pragma unroll
      for (int k = 0; k < KI; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          O[v] += I[ty[v]][k] * W[k][tx];
      __syncthreads();
#pragma unroll
      for (int v = 0; v < V; v++)
        if (tx < KO and s + ty[v] < nHot)
          outFeatures[R[2 * ty[v] + 1] * output_stride + tx] += O[v];
      __syncthreads();
    }
    w += K * output_nPlanes;
    inFeatures += K;
  }
}
template
<
typename
T
>
void
dConvolution_forward2
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
if
(
input_nPlanes
%
8
!=
0
or
output_nPlanes
%
8
!=
0
)
{
const
int
K
=
16
;
const
int
V
=
4
;
dConvolution_KMxKN_forward2
<
T
,
K
,
V
><<<
dim3
(
128
,
(
output_nPlanes
+
K
-
1
)
/
K
),
dim3
(
K
,
K
/
V
)
>>>
(
inFeatures
,
outFeatures
,
w
,
rules
,
nHot
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
);
return
;
}
else
{
dConvolution_forward
(
inFeatures
,
outFeatures
,
w
,
rules
,
nHot
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
);
}
}
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
// Generic backward kernel for ragged plane counts: clamps each tile to
// KI = min(K, remaining input planes) and KO = min(K, remaining output
// planes). Out-of-range rows of I and dO are zero-padded because the dW
// accumulation reads whole columns across all k. dw updates use atomicAdd
// (blocks in the x-dimension accumulate into the same weight gradient).
template <typename T, Int K, Int V>
__global__ void dConvolution_KMxKN_backward_dW2(
    T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw, Int *rules,
    Int nHot, Int input_nPlanes, Int input_stride, Int output_nPlanes,
    Int output_stride) {
  // M = gridDim.y == input_nPlanes / K
  Int N = (output_nPlanes + K - 1) / K;
  Int m = blockIdx.y;
  inFeatures += m * K;
  dInFeatures += m * K;
  w += m * K * output_nPlanes;
  dw += m * K * output_nPlanes;
  Int KI = min(K, input_nPlanes - K * m); // valid input columns in tile
  T dI[V];
  T dW[V];
  __shared__ T I[K][K];
  __shared__ T dO[K][K];
  __shared__ T W[K][K];
  __shared__ Int R[K * 2]; // K (inputRow, outputRow) pairs, interleaved
  const int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int n = 0; n < N; n++) {
    Int KO = min(K, output_nPlanes - K * n); // valid output columns in tile
    // Read w, reset dW
#pragma unroll
    for (int v = 0; v < V; v++) {
      if (ty[v] < KI and tx < KO)
        W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
      dW[v] = 0;
    }
    for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
// Read rules for K input/output pairs, reset dI[]
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (ty[v] < 2) {
          int q = ty[v] * K + tx;
          if (s + q / 2 < nHot)
            R[q] = rules[2 * s + q];
        }
        dI[v] = 0;
      }
      __syncthreads();
      // Read input and dOutput
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (tx < KI and s + ty[v] < nHot)
          I[ty[v]][tx] = inFeatures[R[2 * ty[v]] * input_stride + tx];
        else
          I[ty[v]][tx] = 0; // zero-pad for the column-wise dW reads below
        if (tx < KO and s + ty[v] < nHot)
          dO[ty[v]][tx] = dOutFeatures[R[2 * ty[v] + 1] * output_stride + tx];
        else
          dO[ty[v]][tx] = 0;
      }
      __syncthreads();
// Input gradient: only the KO valid output columns contribute.
#pragma unroll
      for (int k = 0; k < KO; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          dI[v] += dO[ty[v]][k] * W[tx][k];
// Weight gradient: full K range is safe thanks to the zero padding.
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          dW[v] += I[k][ty[v]] * dO[k][tx];
      __syncthreads();
#pragma unroll
      for (int v = 0; v < V; v++)
        if (tx < KI and s + ty[v] < nHot)
          dInFeatures[R[2 * ty[v]] * input_stride + tx] += dI[v];
      __syncthreads();
    }
#pragma unroll
    for (int v = 0; v < V; v++)
      if (ty[v] < KI and tx < KO)
        atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
    w += K;
    dw += K;
    dOutFeatures += K;
  }
}
template
<
typename
T
>
void
dConvolution_backward_dW2
(
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
T
*
w
,
T
*
dw
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
if
(
input_nPlanes
%
8
!=
0
or
output_nPlanes
%
8
!=
0
)
{
const
int
K
=
16
;
const
int
V
=
4
;
dConvolution_KMxKN_backward_dW2
<
T
,
K
,
V
><<<
dim3
(
128
,
(
input_nPlanes
+
K
-
1
)
/
K
),
dim3
(
K
,
K
/
V
)
>>>
(
inFeatures
,
dInFeatures
,
dOutFeatures
,
w
,
dw
,
rules
,
nHot
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
);
return
;
}
else
{
dConvolution_backward_dW
(
inFeatures
,
dInFeatures
,
dOutFeatures
,
w
,
dw
,
rules
,
nHot
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
);
}
}
#endif
/* CUDA_CONVOLUTION_H */
sparseconvnet/SCN/CUDA/Deconvolution.cpp
0 → 100644
View file @
de3743f6
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
// Forward declarations; the definitions live in the companion CUDA
// translation unit (presumably Deconvolution.cu — see this commit's diff).
// Deconvolution forward over the rulebook. Returns a double — in the
// analogous convolution path this is the flop count; assumed the same
// here (TODO confirm).
template <typename T>
double dDeconvolution_forward2(T *inFeatures, T *outFeatures, T *w,
                               RuleBook _rules, Int input_nPlanes,
                               Int input_stride, Int output_nPlanes,
                               Int output_stride);
// Deconvolution backward over the rulebook: computes the input gradient
// (into dInFeatures) and the weight gradient (into dw).
template <typename T>
void dDeconvolution_backward_dW2(T *inFeatures, T *dInFeatures,
                                 T *dOutFeatures, T *w, T *dw, RuleBook _rules,
                                 Int input_nPlanes, Int input_stride,
                                 Int output_nPlanes, Int output_stride);
// Deconvolution forward: build the rulebook (note the swapped output/input
// sizes relative to convolution), size the output to the active output sites,
// seed it with bias (or zeros), then run the rulebook GEMM.
// Returns the value of dDeconvolution_forward2 (flop count in the analogous
// convolution path), or 0 when there are no active output sites.
template <typename T, Int Dimension>
double cuda_Deconvolution_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize,
    /*long*/ at::Tensor filterStride, Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features,
    /*cuda float*/ at::Tensor weight,
    /*cuda float*/ at::Tensor bias) {
  auto _rules =
      m.getRuleBook(outputSize, inputSize, filterSize, filterStride, true);
  Int nActiveOut = m.getNActive(outputSize);
  if (not nActiveOut)
    return 0; // no active output sites: nothing to compute
  Int nPlanesIn = weight.size(1);
  Int nPlanesOut = weight.size(2);
  output_features.resize_({nActiveOut, nPlanesOut});
  auto inPtr = input_features.data<T>();
  auto outPtr = output_features.data<T>();
  auto wPtr = weight.data<T>();
  // Pre-fill the output: bias rows if a bias is present, zeros otherwise.
  // The GEMM below accumulates on top of this.
  if (bias.numel())
    Convolution_fp_bias(outPtr, bias.data<T>(), nPlanesOut, nActiveOut);
  else
    output_features.zero_();
  return dDeconvolution_forward2<T>(inPtr, outPtr, wPtr, _rules, nPlanesIn,
                                    nPlanesIn, nPlanesOut, nPlanesOut);
}
// Deconvolution backward: build the rulebook (output/input sizes swapped, as
// in the forward pass), then compute the input gradient, weight gradient and
// (optionally) bias gradient.
//
// Fix: d_input_features is now resized and zeroed unconditionally. The
// original only did so inside the nActiveOut branch, leaving the input
// gradient stale/unsized when the output had no active sites — inconsistent
// with cuda_FullConvolution_backward, which zeroes the input gradient before
// its nActive check. With no active outputs the correct input gradient is
// all zeros.
template <typename T, Int Dimension>
void cuda_Deconvolution_backward(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize,
    /*long*/ at::Tensor filterStride, Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features,
    /*cuda float*/ at::Tensor weight,
    /*cuda float*/ at::Tensor d_weight,
    /*cuda float*/ at::Tensor d_bias) {
  auto _rules =
      m.getRuleBook(outputSize, inputSize, filterSize, filterStride, true);
  Int nActiveIn = m.getNActive(inputSize);
  Int nActiveOut = m.getNActive(outputSize);
  Int ip = weight.size(1); // input planes
  Int op = weight.size(2); // output planes
  // Always produce a correctly-sized, zeroed input gradient, even when
  // there are no active output sites (in which case it stays all-zero).
  d_input_features.resize_({nActiveIn, ip});
  d_input_features.zero_();
  if (nActiveOut) {
    auto iF = input_features.data<T>();
    auto diF = d_input_features.data<T>();
    auto doF = d_output_features.data<T>();
    auto w = weight.data<T>();
    auto dw = d_weight.data<T>();
    dDeconvolution_backward_dW2<T>(iF, diF, doF, w, dw, _rules, ip, ip, op,
                                   op);
    if (d_bias.numel()) {
      auto db = d_bias.data<T>();
      // Bias gradient = column sums of d_output_features.
      Convolution_bp_bias(doF, db, op, op, nActiveOut);
    }
  }
}
sparseconvnet/SCN/CUDA/Deconvolution.cu
View file @
de3743f6
...
...
@@ -4,80 +4,600 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "Convolution.h"
#include "Deconvolution.h"
template
<
typename
T
,
Int
Dimension
>
double
cuda_Deconvolution_updateOutput
(
/*long*/
at
::
Tensor
inputSize
,
/*long*/
at
::
Tensor
outputSize
,
/*long*/
at
::
Tensor
filterSize
,
/*long*/
at
::
Tensor
filterStride
,
Metadata
<
Dimension
>
&
m
,
/*cuda float*/
at
::
Tensor
input_features
,
/*cuda float*/
at
::
Tensor
output_features
,
/*cuda float*/
at
::
Tensor
weight
,
/*cuda float*/
at
::
Tensor
bias
)
{
auto
_rules
=
m
.
getRuleBook
(
outputSize
,
inputSize
,
filterSize
,
filterStride
,
true
);
Int
nActive
=
m
.
getNActive
(
outputSize
);
output_features
.
resize_
({
nActive
,
weight
.
size
(
2
)});
if
(
not
bias
.
numel
())
output_features
.
zero_
();
auto
iF
=
input_features
.
data
<
T
>
();
auto
oF
=
output_features
.
data
<
T
>
();
Int
ip
=
input_features
.
size
(
1
);
Int
op
=
output_features
.
size
(
1
);
auto
w
=
weight
.
data
<
T
>
();
double
flops
=
0
;
template <typename T, Int K, Int V>
__global__ void dDeconvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures,
                                              T *w, Int *rules, Int nHot,
                                              Int input_nPlanes,
                                              Int input_stride,
                                              Int output_nPlanes,
                                              Int output_stride) {
  // nHot must be a multiple of K!! (fast path, no bounds checks)
  // Input x Weight -> Output, accumulated into outFeatures.
  // blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks; K multiple of V.
  // nHot x KM -> nHot x KN : parallel over N and nHot, serial loop over M.
  Int M = input_nPlanes / K; // number of K-wide input-plane tiles
  // N = gridDim.y == output_nPlanes / K; this block owns output tile n.
  Int n = blockIdx.y;
  outFeatures += n * K;
  w += n * K;
  T acc[V];                 // per-thread output accumulators
  __shared__ T wBuf[K][K];  // staged KxK weight tile
  __shared__ T inBuf[K][K]; // staged KxK input tile
  Int rOut[V];              // rules[2*i]   : output row
  Int rIn[V];               // rules[2*i+1] : input row
  const int tx = threadIdx.x;
  int ty[V]; // each thread covers V rows of the tile
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int m = 0; m < M; m++) {
    // Stage the weight tile for this (m,n) pair.
#pragma unroll
    for (int v = 0; v < V; v++)
      wBuf[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
    // Stride over K-row batches of rulebook entries.
    for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
      for (int v = 0; v < V; v++) {
        rOut[v] = rules[2 * (s + ty[v])];
        rIn[v] = rules[2 * (s + ty[v]) + 1];
      }
      __syncthreads();
      // Stage input rows; reset the accumulators.
#pragma unroll
      for (int v = 0; v < V; v++) {
        inBuf[ty[v]][tx] = inFeatures[rIn[v] * input_stride + tx];
        acc[v] = 0;
      }
      __syncthreads();
      // KxKxK tile multiply.
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          acc[v] += inBuf[ty[v]][k] * wBuf[k][tx];
      // Read-modify-write the output rows.
#pragma unroll
      for (int v = 0; v < V; v++)
        acc[v] += outFeatures[rOut[v] * output_stride + tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        outFeatures[rOut[v] * output_stride + tx] = acc[v];
      __syncthreads();
    }
    // Advance to the next K input planes.
    w += K * output_nPlanes;
    inFeatures += K;
  }
}
template <typename T, Int K, Int V>
__global__ void dDeconvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures,
                                              T *w, Int *rules, Int nHot,
                                              Int input_nPlanes,
                                              Int input_stride,
                                              Int output_nPlanes,
                                              Int output_stride) {
  // Remainder variant of forwardA: identical tiling, but every rulebook
  // access is guarded by s + ty[v] < nHot, so nHot may be arbitrary.
  // Input x Weight -> Output
  // blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks; K multiple of V.
  Int M = input_nPlanes / K;
  // N = gridDim.y == output_nPlanes / K
  Int n = blockIdx.y;
  outFeatures += n * K;
  w += n * K;
  T acc[V];
  __shared__ T wBuf[K][K];
  __shared__ T inBuf[K][K];
  Int rOut[V];
  Int rIn[V];
  const int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int m = 0; m < M; m++) {
    // Stage the KxK weight tile.
#pragma unroll
    for (int v = 0; v < V; v++)
      wBuf[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
    for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
      // Fetch rules only for in-range entries.
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (s + ty[v] < nHot) {
          rOut[v] = rules[2 * (s + ty[v])];
          rIn[v] = rules[2 * (s + ty[v]) + 1];
        }
      }
      __syncthreads();
      // Stage in-range input rows; reset accumulators.
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (s + ty[v] < nHot)
          inBuf[ty[v]][tx] = inFeatures[rIn[v] * input_stride + tx];
        acc[v] = 0;
      }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          acc[v] += inBuf[ty[v]][k] * wBuf[k][tx];
      // Read-modify-write only the in-range output rows.
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nHot)
          acc[v] += outFeatures[rOut[v] * output_stride + tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nHot)
          outFeatures[rOut[v] * output_stride + tx] = acc[v];
      __syncthreads();
    }
    w += K * output_nPlanes;
    inFeatures += K;
  }
}
// Forward dispatch helper: if both plane counts are exact multiples of K,
// launch the unguarded kernel (A) on the largest K-multiple prefix of the
// rulebook and the guarded kernel (B) on the remainder, then return.
#define FOO(T, K, V)                                                           \
  {                                                                            \
    if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {                  \
      Int o = (nHot / K) * K;                                                  \
      if (o >= K)                                                              \
        dDeconvolution_KMxKN_forwardA<                                         \
            T, K, V><<<dim3(std::min(o / K, (Int)512), output_nPlanes / K),    \
                       dim3(K, K / V)>>>(inFeatures, outFeatures, w, rules, o, \
                                         input_nPlanes, input_stride,          \
                                         output_nPlanes, output_stride);       \
      if (nHot > o)                                                            \
        dDeconvolution_KMxKN_forwardB<                                         \
            T, K, V><<<dim3(1, output_nPlanes / K), dim3(K, K / V)>>>(         \
            inFeatures, outFeatures, w, rules + 2 * o, nHot - o,               \
            input_nPlanes, input_stride, output_nPlanes, output_stride);       \
      return;                                                                  \
    }                                                                          \
  }
// Try tile sizes from largest to smallest; a caller only reaches this
// function when both plane counts are multiples of 8, so one FOO fires.
template <typename T>
void dDeconvolution_forward(T *inFeatures, T *outFeatures, T *w, Int *rules,
                            Int nHot, Int input_nPlanes, Int input_stride,
                            Int output_nPlanes, Int output_stride) {
  FOO(T, 64, 16)
  FOO(T, 32, 8)
  FOO(T, 16, 4)
  FOO(T, 8, 2)
  assert(false);
}
// double specialization: starts at K=32 (the 64-wide tile is omitted —
// presumably double-sized shared tiles would not fit; confirm before reusing).
template <>
void dDeconvolution_forward<double>(double *inFeatures, double *outFeatures,
                                    double *w, Int *rules, Int nHot,
                                    Int input_nPlanes, Int input_stride,
                                    Int output_nPlanes, Int output_stride) {
  FOO(double, 32, 8)
  FOO(double, 16, 4)
  FOO(double, 8, 2)
  assert(false);
}
#undef FOO
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template <typename T, Int K, Int V>
__global__ void dDeconvolution_KMxKN_backward_dW_A(
    T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw, Int *rules,
    Int nHot, Int input_nPlanes, Int input_stride, Int output_nPlanes,
    Int output_stride) {
  // Unguarded fast path (paired with _B for the remainder entries).
  // M = gridDim.y == input_nPlanes / K; this block owns input tile m.
  Int N = output_nPlanes / K;
  Int m = blockIdx.y;
  inFeatures += m * K;
  dInFeatures += m * K;
  w += m * K * output_nPlanes;
  dw += m * K * output_nPlanes;
  T gradIn[V]; // per-thread dInput accumulators
  T gradW[V];  // per-thread dW accumulators
  __shared__ T inBuf[K][K];
  __shared__ T gOutBuf[K][K];
  __shared__ T wBuf[K][K];
  Int rIn[V];
  Int rOut[V];
  const int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int n = 0; n < N; n++) {
    // Stage the weight tile; clear the dW accumulators.
#pragma unroll
    for (int v = 0; v < V; v++) {
      wBuf[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
      gradW[v] = 0;
    }
    for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
      for (int v = 0; v < V; v++) {
        rOut[v] = rules[2 * (s + ty[v])];
        rIn[v] = rules[2 * (s + ty[v]) + 1];
        gradIn[v] = 0;
      }
      __syncthreads();
      // Stage input and dOutput rows.
#pragma unroll
      for (int v = 0; v < V; v++) {
        inBuf[ty[v]][tx] = inFeatures[rIn[v] * input_stride + tx];
        gOutBuf[ty[v]][tx] = dOutFeatures[rOut[v] * output_stride + tx];
      }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++) {
          gradIn[v] += gOutBuf[ty[v]][k] * wBuf[tx][k]; // dO * W^T
          gradW[v] += inBuf[k][ty[v]] * gOutBuf[k][tx]; // I^T * dO
        }
      // Read-modify-write the dInput rows.
#pragma unroll
      for (int v = 0; v < V; v++)
        gradIn[v] += dInFeatures[rIn[v] * input_stride + tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        dInFeatures[rIn[v] * input_stride + tx] = gradIn[v];
      __syncthreads();
    }
    // Flush this tile's dW; blocks race on dw, hence atomicAdd.
#pragma unroll
    for (int v = 0; v < V; v++)
      atomicAdd(&dw[ty[v] * output_nPlanes + tx], gradW[v]);
    w += K;
    dw += K;
    dOutFeatures += K;
  }
}
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
// NOTE(review): the previous text of this kernel had host-side bias-launch
// code (Convolution_fp_bias<<<...>>> using undefined `bias`/`op`/`nActive`)
// spliced into the middle of the body, leaving unbalanced braces; this is the
// restored kernel: the bounds-checked remainder counterpart of _A.
template <typename T, Int K, Int V>
__global__ void dDeconvolution_KMxKN_backward_dW_B(
    T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw, Int *rules,
    Int nHot, Int input_nPlanes, Int input_stride, Int output_nPlanes,
    Int output_stride) {
  // M = gridDim.y == input_nPlanes / K
  Int N = output_nPlanes / K;
  Int m = blockIdx.y;
  inFeatures += m * K;
  dInFeatures += m * K;
  w += m * K * output_nPlanes;
  dw += m * K * output_nPlanes;
  T dI[V]; // per-thread dInput accumulators
  T dW[V]; // per-thread dW accumulators
  __shared__ T I[K][K];
  __shared__ T dO[K][K];
  __shared__ T W[K][K];
  Int R0[V];
  Int R1[V];
  const int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int n = 0; n < N; n++) {
    // Read w, reset dW
#pragma unroll
    for (int v = 0; v < V; v++) {
      W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
      dW[v] = 0;
    }
    for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (s + ty[v] < nHot) {
          R1[v] = rules[2 * (s + ty[v])];
          R0[v] = rules[2 * (s + ty[v]) + 1];
        }
        dI[v] = 0;
      }
      __syncthreads();
      // Read input and dOutput; zero-fill out-of-range rows so the
      // unguarded tile multiply below stays correct.
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nHot) {
          I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
          dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx];
        } else {
          I[ty[v]][tx] = 0;
          dO[ty[v]][tx] = 0;
        }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++) {
          dI[v] += dO[ty[v]][k] * W[tx][k]; // dO * W^T
          dW[v] += I[k][ty[v]] * dO[k][tx]; // I^T * dO
        }
      // Write back dInput for in-range rows only.
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nHot)
          dI[v] += dInFeatures[R0[v] * input_stride + tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nHot)
          dInFeatures[R0[v] * input_stride + tx] = dI[v];
      __syncthreads();
    }
    // Accumulate this tile's dW; blocks race on dw, hence atomicAdd.
#pragma unroll
    for (int v = 0; v < V; v++)
      atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
    w += K;
    dw += K;
    dOutFeatures += K;
  }
}
// Backward dispatch helper: when both plane counts divide K, run the
// unguarded dW kernel (A) on the K-multiple prefix and the guarded one (B)
// on the remainder, then return from the enclosing function.
#define FOO(T, K, V)                                                           \
  {                                                                            \
    if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {                  \
      Int o = (nHot / K) * K;                                                  \
      if (o >= K)                                                              \
        dDeconvolution_KMxKN_backward_dW_A<                                    \
            T, K, V><<<dim3(std::min(o / K, (Int)512), input_nPlanes / K),     \
                       dim3(K, K / V)>>>(                                      \
            inFeatures, dInFeatures, dOutFeatures, w, dw, rules, o,            \
            input_nPlanes, input_stride, output_nPlanes, output_stride);       \
      if (nHot > o)                                                            \
        dDeconvolution_KMxKN_backward_dW_B<                                    \
            T, K, V><<<dim3(1, input_nPlanes / K), dim3(K, K / V)>>>(          \
            inFeatures, dInFeatures, dOutFeatures, w, dw, rules + 2 * o,       \
            nHot - o, input_nPlanes, input_stride, output_nPlanes,             \
            output_stride);                                                    \
      return;                                                                  \
    }                                                                          \
  }
// Try tile sizes from largest to smallest; asserts if none divides the
// plane counts (callers route such cases to the *_dW2 kernel instead).
template <typename T>
void dDeconvolution_backward_dW(T *inFeatures, T *dInFeatures,
                                T *dOutFeatures, T *w, T *dw, Int *rules,
                                Int nHot, Int input_nPlanes, Int input_stride,
                                Int output_nPlanes, Int output_stride) {
  FOO(T, 32, 8)
  FOO(T, 16, 4)
  FOO(T, 8, 2)
  assert(false);
}
#undef FOO
// General-size forward kernel. Fix: a stray `Int c = ip * op;` had been
// spliced into the inner loop (ip/op are not defined in this kernel and the
// statement had no effect) — removed.
template <typename T, Int K, Int V>
__global__ void dDeconvolution_KMxKN_forward2(T *inFeatures, T *outFeatures,
                                              T *w, Int *rules, Int nHot,
                                              Int input_nPlanes,
                                              Int input_stride,
                                              Int output_nPlanes,
                                              Int output_stride) {
  // Input x Weight -> Output
  // blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
  // K is a multiple of V,
  // nHot x input_nPlanes<=KM -> nHot x output_nPlanes<=KN
  // - parallel over N,nHot - loop over M
  // Plane counts need not divide K: tile edges are clipped by KI/KO and
  // outputs are accumulated with +=.
  Int M = (input_nPlanes + K - 1) / K;
  // N = gridDim.y ~ output_nPlanes/K
  Int n = blockIdx.y;
  outFeatures += n * K;
  w += n * K;
  Int KO = min(K, output_nPlanes - K * n); // valid columns of this tile
  T O[V];
  __shared__ T W[K][K];
  __shared__ T I[K][K];
  __shared__ Int R[K * 2]; // rules for K sites: (outRow, inRow) interleaved
  const int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int m = 0; m < M; m++) {
    Int KI = min(K, input_nPlanes - K * m); // valid rows of this tile
    // Read w
#pragma unroll
    for (int v = 0; v < V; v++)
      if (ty[v] < KI and tx < KO)
        W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
    for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
      // Read rules for K input/output pairs
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (ty[v] < 2) {
          int q = ty[v] * K + tx;
          if (s + q / 2 < nHot)
            R[q] = rules[2 * s + q];
        }
      }
      __syncthreads();
      // Read input, reset O[]
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (tx < KI and s + ty[v] < nHot)
          I[ty[v]][tx] = inFeatures[R[2 * ty[v] + 1] * input_stride + tx];
        O[v] = 0;
      }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < KI; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          O[v] += I[ty[v]][k] * W[k][tx];
      __syncthreads();
      // Accumulate into the output rows (in-range threads only).
#pragma unroll
      for (int v = 0; v < V; v++)
        if (tx < KO and s + ty[v] < nHot)
          outFeatures[R[2 * ty[v]] * output_stride + tx] += O[v];
      __syncthreads();
    }
    w += K * output_nPlanes;
    inFeatures += K;
  }
}
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
// General-size backward kernel: plane counts need not divide K; tile edges
// are clipped by KI/KO and out-of-range shared cells are zero-filled.
template <typename T, Int K, Int V>
__global__ void dDeconvolution_KMxKN_backward_dW2(
    T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw, Int *rules,
    Int nHot, Int input_nPlanes, Int input_stride, Int output_nPlanes,
    Int output_stride) {
  // M = gridDim.y == input_nPlanes / K (rounded up); this block owns tile m.
  Int N = (output_nPlanes + K - 1) / K;
  Int m = blockIdx.y;
  inFeatures += m * K;
  dInFeatures += m * K;
  w += m * K * output_nPlanes;
  dw += m * K * output_nPlanes;
  Int KI = min(K, input_nPlanes - K * m); // valid input planes in this tile
  T dI[V];
  T dW[V];
  __shared__ T I[K][K];
  __shared__ T dO[K][K];
  __shared__ T W[K][K];
  __shared__ Int R[K * 2]; // rules for K sites: (outRow, inRow) interleaved
  const int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int n = 0; n < N; n++) {
    Int KO = min(K, output_nPlanes - K * n); // valid output planes
    // Read w, reset dW
#pragma unroll
    for (int v = 0; v < V; v++) {
      if (ty[v] < KI and tx < KO)
        W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
      dW[v] = 0;
    }
    for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
      // Read rules for K input/output pairs, reset dI[]
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (ty[v] < 2) {
          int q = ty[v] * K + tx;
          if (s + q / 2 < nHot)
            R[q] = rules[2 * s + q];
        }
        dI[v] = 0;
      }
      __syncthreads();
      // Read input and dOutput; zero-fill out-of-range cells so the
      // unguarded accumulation loops below stay correct.
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (tx < KI and s + ty[v] < nHot)
          I[ty[v]][tx] = inFeatures[R[2 * ty[v] + 1] * input_stride + tx];
        else
          I[ty[v]][tx] = 0;
        if (tx < KO and s + ty[v] < nHot)
          dO[ty[v]][tx] = dOutFeatures[R[2 * ty[v]] * output_stride + tx];
        else
          dO[ty[v]][tx] = 0;
      }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < KO; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          dI[v] += dO[ty[v]][k] * W[tx][k]; // dO * W^T
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          dW[v] += I[k][ty[v]] * dO[k][tx]; // I^T * dO
      __syncthreads();
      // Accumulate dInput for in-range threads.
#pragma unroll
      for (int v = 0; v < V; v++)
        if (tx < KI and s + ty[v] < nHot)
          dInFeatures[R[2 * ty[v] + 1] * input_stride + tx] += dI[v];
      __syncthreads();
    }
    // Flush dW for the valid sub-tile; blocks race on dw, hence atomicAdd.
#pragma unroll
    for (int v = 0; v < V; v++)
      if (ty[v] < KI and tx < KO)
        atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
    w += K;
    dw += K;
    dOutFeatures += K;
  }
}
// Launch the deconvolution forward pass over a whole rulebook and return the
// multiply-accumulate count (used by the caller for FLOP reporting).
// Fix: a fragment of the caller's own RULEBOOKITERATOR invocation
// (`dDeconvolution_forward2<T>(iF, oF, ...)`) had been spliced into this
// body, making it self-recursive and syntactically broken — removed.
// Dispatch: the fully-general forward2 kernel when either plane count is not
// a multiple of 8, otherwise the tiled fast-path dispatcher.
template <typename T>
double dDeconvolution_forward2(T *inFeatures, T *outFeatures, T *w,
                               RuleBook _rules, Int input_nPlanes,
                               Int input_stride, Int output_nPlanes,
                               Int output_stride) {
  Int c = input_nPlanes * output_nPlanes; // weight block size per filter site
  double flops = 0;
  if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) {
    const int K = 16;
    const int V = 4;
    RULEBOOKITERATOR((dDeconvolution_KMxKN_forward2<
                         T, K, V><<<dim3(128, (output_nPlanes + K - 1) / K),
                                    dim3(K, K / V)>>>(
                         inFeatures, outFeatures, w, rbB, nHotB, input_nPlanes,
                         input_stride, output_nPlanes, output_stride));
                     , w += c; flops += nHotB * c;)
  } else {
    RULEBOOKITERATOR(dDeconvolution_forward(inFeatures, outFeatures, w, rbB,
                                            nHotB, input_nPlanes, input_stride,
                                            output_nPlanes, output_stride);
                     , w += c; flops += nHotB * c;)
  }
  return flops;
}
template
<
typename
T
,
Int
Dimension
>
void
cuda_Deconvolution_backward
(
/*long*/
at
::
Tensor
inputSize
,
/*long*/
at
::
Tensor
outputSize
,
/*long*/
at
::
Tensor
filterSize
,
/*long*/
at
::
Tensor
filterStride
,
Metadata
<
Dimension
>
&
m
,
/*cuda float*/
at
::
Tensor
input_features
,
/*cuda float*/
at
::
Tensor
d_input_features
,
/*cuda float*/
at
::
Tensor
d_output_features
,
/*cuda float*/
at
::
Tensor
weight
,
/*cuda float*/
at
::
Tensor
d_weight
,
/*cuda float*/
at
::
Tensor
d_bias
)
{
auto
_rules
=
m
.
getRuleBook
(
outputSize
,
inputSize
,
filterSize
,
filterStride
,
true
);
Int
nActive
=
m
.
getNActive
(
outputSize
);
d_input_features
.
resize_as_
(
input_features
);
d_input_features
.
zero_
();
auto
iF
=
input_features
.
data
<
T
>
();
auto
diF
=
d_input_features
.
data
<
T
>
();
auto
doF
=
d_output_features
.
data
<
T
>
();
Int
ip
=
input_features
.
size
(
1
);
Int
op
=
d_output_features
.
size
(
1
);
auto
w
=
weight
.
data
<
T
>
();
auto
dw
=
d_weight
.
data
<
T
>
();
Int
c
=
ip
*
op
;
RULEBOOKITERATOR
(
dDeconvolution_backward_dW2
<
T
>
(
iF
,
diF
,
doF
,
w
,
dw
,
rbB
,
nHotB
,
ip
,
ip
,
op
,
op
);
template
<
typename
T
>
void
dDeconvolution_backward_dW2
(
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
T
*
w
,
T
*
dw
,
RuleBook
_rules
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
Int
c
=
input_nPlanes
*
output_nPlanes
;
if
(
input_nPlanes
%
8
!=
0
or
output_nPlanes
%
8
!=
0
)
{
const
int
K
=
16
;
const
int
V
=
4
;
RULEBOOKITERATOR
(
(
dDeconvolution_KMxKN_backward_dW2
<
T
,
K
,
V
><<<
dim3
(
128
,
(
input_nPlanes
+
K
-
1
)
/
K
),
dim3
(
K
,
K
/
V
)
>>>
(
inFeatures
,
dInFeatures
,
dOutFeatures
,
w
,
dw
,
rbB
,
nHotB
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
));
,
w
+=
c
;
dw
+=
c
;)
}
else
{
RULEBOOKITERATOR
(
dDeconvolution_backward_dW
(
inFeatures
,
dInFeatures
,
dOutFeatures
,
w
,
dw
,
rbB
,
nHotB
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
);
,
w
+=
c
;
dw
+=
c
;)
if
(
d_bias
.
numel
())
{
auto
db
=
d_bias
.
data
<
T
>
();
Convolution_bp_bias
(
doF
,
db
,
op
,
op
,
nActive
);
}
}
sparseconvnet/SCN/CUDA/Deconvolution.h
deleted
100644 → 0
View file @
f0407b36
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CUDA_DECONVOLUTION_H
#define CUDA_DECONVOLUTION_H
#include "Convolution.h"
template
<
typename
T
,
Int
K
,
Int
V
>
__global__
void
dDeconvolution_KMxKN_forwardA
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
// nHot must be a multiple of K!!
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// K is a multiple of V,
// nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
Int
M
=
input_nPlanes
/
K
;
// N = gridDim.y == output_nPlanes/K
Int
n
=
blockIdx
.
y
;
outFeatures
+=
n
*
K
;
w
+=
n
*
K
;
T
O
[
V
];
__shared__
T
W
[
K
][
K
];
__shared__
T
I
[
K
][
K
];
Int
R0
[
V
];
Int
R1
[
V
];
const
int
tx
=
threadIdx
.
x
;
int
ty
[
V
];
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
ty
[
v
]
=
threadIdx
.
y
+
v
*
(
K
/
V
);
for
(
int
m
=
0
;
m
<
M
;
m
++
)
{
// Read w
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
W
[
ty
[
v
]][
tx
]
=
w
[
ty
[
v
]
*
output_nPlanes
+
tx
];
for
(
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
R1
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])];
R0
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])
+
1
];
}
__syncthreads
();
// Read input, reset O[]
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
I
[
ty
[
v
]][
tx
]
=
inFeatures
[
R0
[
v
]
*
input_stride
+
tx
];
O
[
v
]
=
0
;
}
__syncthreads
();
#pragma unroll
for
(
int
k
=
0
;
k
<
K
;
k
++
)
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
O
[
v
]
+=
I
[
ty
[
v
]][
k
]
*
W
[
k
][
tx
];
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
O
[
v
]
+=
outFeatures
[
R1
[
v
]
*
output_stride
+
tx
];
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
outFeatures
[
R1
[
v
]
*
output_stride
+
tx
]
=
O
[
v
];
__syncthreads
();
}
w
+=
K
*
output_nPlanes
;
inFeatures
+=
K
;
}
}
template
<
typename
T
,
Int
K
,
Int
V
>
__global__
void
dDeconvolution_KMxKN_forwardB
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// K is a multiple of V,
// nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
Int
M
=
input_nPlanes
/
K
;
// N = gridDim.y == output_nPlanes/K
Int
n
=
blockIdx
.
y
;
outFeatures
+=
n
*
K
;
w
+=
n
*
K
;
T
O
[
V
];
__shared__
T
W
[
K
][
K
];
__shared__
T
I
[
K
][
K
];
Int
R0
[
V
];
Int
R1
[
V
];
const
int
tx
=
threadIdx
.
x
;
int
ty
[
V
];
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
ty
[
v
]
=
threadIdx
.
y
+
v
*
(
K
/
V
);
for
(
int
m
=
0
;
m
<
M
;
m
++
)
{
// Read w
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
W
[
ty
[
v
]][
tx
]
=
w
[
ty
[
v
]
*
output_nPlanes
+
tx
];
for
(
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
if
(
s
+
ty
[
v
]
<
nHot
)
{
R1
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])];
R0
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])
+
1
];
}
}
__syncthreads
();
// Read input, reset O[]
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
if
(
s
+
ty
[
v
]
<
nHot
)
I
[
ty
[
v
]][
tx
]
=
inFeatures
[
R0
[
v
]
*
input_stride
+
tx
];
O
[
v
]
=
0
;
}
__syncthreads
();
#pragma unroll
for
(
int
k
=
0
;
k
<
K
;
k
++
)
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
O
[
v
]
+=
I
[
ty
[
v
]][
k
]
*
W
[
k
][
tx
];
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
if
(
s
+
ty
[
v
]
<
nHot
)
O
[
v
]
+=
outFeatures
[
R1
[
v
]
*
output_stride
+
tx
];
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
if
(
s
+
ty
[
v
]
<
nHot
)
outFeatures
[
R1
[
v
]
*
output_stride
+
tx
]
=
O
[
v
];
__syncthreads
();
}
w
+=
K
*
output_nPlanes
;
inFeatures
+=
K
;
}
}
#define FOO(T, K, V) \
{ \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
Int o = (nHot / K) * K; \
if (o >= K) \
dDeconvolution_KMxKN_forwardA< \
T, K, V><<<dim3(std::min(o / K, (Int)512), output_nPlanes / K), \
dim3(K, K / V)>>>( \
inFeatures, outFeatures, w, rules, o, input_nPlanes, input_stride, \
output_nPlanes, output_stride); \
if (nHot > o) \
dDeconvolution_KMxKN_forwardB< \
T, K, \
V><<<dim3(1, output_nPlanes / K), dim3(K, K / V)>>>( \
inFeatures, outFeatures, w, rules + 2 * o, nHot - o, \
input_nPlanes, input_stride, output_nPlanes, output_stride); \
return; \
} \
}
template
<
typename
T
>
void
dDeconvolution_forward
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
FOO
(
T
,
64
,
16
)
FOO
(
T
,
32
,
8
)
FOO
(
T
,
16
,
4
)
FOO
(
T
,
8
,
2
)
assert
(
false
);
}
template
<
>
void
dDeconvolution_forward
<
double
>
(
double
*
inFeatures
,
double
*
outFeatures
,
double
*
w
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
FOO
(
double
,
32
,
8
)
FOO
(
double
,
16
,
4
)
FOO
(
double
,
8
,
2
)
assert
(
false
);
}
#undef FOO
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template
<
typename
T
,
Int
K
,
Int
V
>
__global__
void
dDeconvolution_KMxKN_backward_dW_A
(
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
T
*
w
,
T
*
dw
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
// M = gridDim.y == input_nPlanes / K
Int
N
=
output_nPlanes
/
K
;
Int
m
=
blockIdx
.
y
;
inFeatures
+=
m
*
K
;
dInFeatures
+=
m
*
K
;
w
+=
m
*
K
*
output_nPlanes
;
dw
+=
m
*
K
*
output_nPlanes
;
T
dI
[
V
];
T
dW
[
V
];
__shared__
T
I
[
K
][
K
];
__shared__
T
dO
[
K
][
K
];
__shared__
T
W
[
K
][
K
];
Int
R0
[
V
];
Int
R1
[
V
];
const
int
tx
=
threadIdx
.
x
;
int
ty
[
V
];
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
ty
[
v
]
=
threadIdx
.
y
+
v
*
(
K
/
V
);
for
(
int
n
=
0
;
n
<
N
;
n
++
)
{
// Read w, reset dW
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
W
[
ty
[
v
]][
tx
]
=
w
[
ty
[
v
]
*
output_nPlanes
+
tx
];
dW
[
v
]
=
0
;
}
for
(
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
R1
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])];
R0
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])
+
1
];
dI
[
v
]
=
0
;
}
__syncthreads
();
// Read input and dOutput
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
I
[
ty
[
v
]][
tx
]
=
inFeatures
[
R0
[
v
]
*
input_stride
+
tx
];
dO
[
ty
[
v
]][
tx
]
=
dOutFeatures
[
R1
[
v
]
*
output_stride
+
tx
];
}
__syncthreads
();
#pragma unroll
for
(
int
k
=
0
;
k
<
K
;
k
++
)
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
dI
[
v
]
+=
dO
[
ty
[
v
]][
k
]
*
W
[
tx
][
k
];
dW
[
v
]
+=
I
[
k
][
ty
[
v
]]
*
dO
[
k
][
tx
];
}
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
dI
[
v
]
+=
dInFeatures
[
R0
[
v
]
*
input_stride
+
tx
];
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
dInFeatures
[
R0
[
v
]
*
input_stride
+
tx
]
=
dI
[
v
];
__syncthreads
();
}
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
atomicAdd
(
&
dw
[
ty
[
v
]
*
output_nPlanes
+
tx
],
dW
[
v
]);
w
+=
K
;
dw
+=
K
;
dOutFeatures
+=
K
;
}
}
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template
<
typename
T
,
Int
K
,
Int
V
>
__global__
void
dDeconvolution_KMxKN_backward_dW_B
(
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
T
*
w
,
T
*
dw
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
// M = gridDim.y == input_nPlanes / K
Int
N
=
output_nPlanes
/
K
;
Int
m
=
blockIdx
.
y
;
inFeatures
+=
m
*
K
;
dInFeatures
+=
m
*
K
;
w
+=
m
*
K
*
output_nPlanes
;
dw
+=
m
*
K
*
output_nPlanes
;
T
dI
[
V
];
T
dW
[
V
];
__shared__
T
I
[
K
][
K
];
__shared__
T
dO
[
K
][
K
];
__shared__
T
W
[
K
][
K
];
Int
R0
[
V
];
Int
R1
[
V
];
const
int
tx
=
threadIdx
.
x
;
int
ty
[
V
];
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
ty
[
v
]
=
threadIdx
.
y
+
v
*
(
K
/
V
);
for
(
int
n
=
0
;
n
<
N
;
n
++
)
{
// Read w, reset dW
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
W
[
ty
[
v
]][
tx
]
=
w
[
ty
[
v
]
*
output_nPlanes
+
tx
];
dW
[
v
]
=
0
;
}
for
(
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
if
(
s
+
ty
[
v
]
<
nHot
)
{
R1
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])];
R0
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])
+
1
];
}
dI
[
v
]
=
0
;
}
__syncthreads
();
// Read input and dOutput
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
if
(
s
+
ty
[
v
]
<
nHot
)
{
I
[
ty
[
v
]][
tx
]
=
inFeatures
[
R0
[
v
]
*
input_stride
+
tx
];
dO
[
ty
[
v
]][
tx
]
=
dOutFeatures
[
R1
[
v
]
*
output_stride
+
tx
];
}
else
{
I
[
ty
[
v
]][
tx
]
=
0
;
dO
[
ty
[
v
]][
tx
]
=
0
;
}
__syncthreads
();
#pragma unroll
for
(
int
k
=
0
;
k
<
K
;
k
++
)
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
dI
[
v
]
+=
dO
[
ty
[
v
]][
k
]
*
W
[
tx
][
k
];
dW
[
v
]
+=
I
[
k
][
ty
[
v
]]
*
dO
[
k
][
tx
];
}
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
if
(
s
+
ty
[
v
]
<
nHot
)
dI
[
v
]
+=
dInFeatures
[
R0
[
v
]
*
input_stride
+
tx
];
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
if
(
s
+
ty
[
v
]
<
nHot
)
dInFeatures
[
R0
[
v
]
*
input_stride
+
tx
]
=
dI
[
v
];
__syncthreads
();
}
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
atomicAdd
(
&
dw
[
ty
[
v
]
*
output_nPlanes
+
tx
],
dW
[
v
]);
w
+=
K
;
dw
+=
K
;
dOutFeatures
+=
K
;
}
}
#define FOO(T, K, V) \
{ \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
Int o = (nHot / K) * K; \
if (o >= K) \
dDeconvolution_KMxKN_backward_dW_A< \
T, K, V><<<dim3(std::min(o / K, (Int)512), input_nPlanes / K), \
dim3(K, K / V)>>>( \
inFeatures, dInFeatures, dOutFeatures, w, dw, rules, o, \
input_nPlanes, input_stride, output_nPlanes, output_stride); \
if (nHot > o) \
dDeconvolution_KMxKN_backward_dW_B< \
T, K, \
V><<<dim3(1, input_nPlanes / K), dim3(K, K / V)>>>( \
inFeatures, dInFeatures, dOutFeatures, w, dw, rules + 2 * o, \
nHot - o, input_nPlanes, input_stride, output_nPlanes, \
output_stride); \
return; \
} \
}
template
<
typename
T
>
void
dDeconvolution_backward_dW
(
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
T
*
w
,
T
*
dw
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
FOO
(
T
,
32
,
8
)
FOO
(
T
,
16
,
4
)
FOO
(
T
,
8
,
2
)
assert
(
false
);
}
#undef FOO
template
<
typename
T
,
Int
K
,
Int
V
>
__global__
void
dDeconvolution_KMxKN_forward2
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// K is a multiple of V,
// nHot x input_nplanes<=KM -> nHot x output_nPlanes<=KN
// - parallel over N,nHot - loop over M
Int
M
=
(
input_nPlanes
+
K
-
1
)
/
K
;
// N = gridDim.y ~ output_nPlanes/K
Int
n
=
blockIdx
.
y
;
outFeatures
+=
n
*
K
;
w
+=
n
*
K
;
Int
KO
=
min
(
K
,
output_nPlanes
-
K
*
n
);
T
O
[
V
];
__shared__
T
W
[
K
][
K
];
__shared__
T
I
[
K
][
K
];
__shared__
Int
R
[
K
*
2
];
const
int
tx
=
threadIdx
.
x
;
int
ty
[
V
];
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
ty
[
v
]
=
threadIdx
.
y
+
v
*
(
K
/
V
);
for
(
int
m
=
0
;
m
<
M
;
m
++
)
{
Int
KI
=
min
(
K
,
input_nPlanes
-
K
*
m
);
// Read w
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
if
(
ty
[
v
]
<
KI
and
tx
<
KO
)
W
[
ty
[
v
]][
tx
]
=
w
[
ty
[
v
]
*
output_nPlanes
+
tx
];
for
(
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
// Read rules for K input/output pairs
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
if
(
ty
[
v
]
<
2
)
{
int
q
=
ty
[
v
]
*
K
+
tx
;
if
(
s
+
q
/
2
<
nHot
)
R
[
q
]
=
rules
[
2
*
s
+
q
];
}
}
__syncthreads
();
// Read input, reset O[]
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
if
(
tx
<
KI
and
s
+
ty
[
v
]
<
nHot
)
I
[
ty
[
v
]][
tx
]
=
inFeatures
[
R
[
2
*
ty
[
v
]
+
1
]
*
input_stride
+
tx
];
O
[
v
]
=
0
;
}
__syncthreads
();
#pragma unroll
for
(
int
k
=
0
;
k
<
KI
;
k
++
)
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
O
[
v
]
+=
I
[
ty
[
v
]][
k
]
*
W
[
k
][
tx
];
__syncthreads
();
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
if
(
tx
<
KO
and
s
+
ty
[
v
]
<
nHot
)
outFeatures
[
R
[
2
*
ty
[
v
]]
*
output_stride
+
tx
]
+=
O
[
v
];
__syncthreads
();
}
w
+=
K
*
output_nPlanes
;
inFeatures
+=
K
;
}
}
template
<
typename
T
>
void
dDeconvolution_forward2
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
if
(
input_nPlanes
%
8
!=
0
or
output_nPlanes
%
8
!=
0
)
{
const
int
K
=
16
;
const
int
V
=
4
;
dDeconvolution_KMxKN_forward2
<
T
,
K
,
V
><<<
dim3
(
128
,
(
output_nPlanes
+
K
-
1
)
/
K
),
dim3
(
K
,
K
/
V
)
>>>
(
inFeatures
,
outFeatures
,
w
,
rules
,
nHot
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
);
return
;
}
else
{
dDeconvolution_forward
(
inFeatures
,
outFeatures
,
w
,
rules
,
nHot
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
);
}
}
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
// Tiled deconvolution backward pass. Each block owns one K-row slice of the
// weight matrix (chosen by blockIdx.y) and sweeps the rulebook in chunks of
// K (output,input) row pairs. Rules layout, per the reads below:
// rules[2*s] is the output row and rules[2*s+1] the input row of pair s.
template <typename T, Int K, Int V>
__global__ void dDeconvolution_KMxKN_backward_dW2(
    T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw, Int *rules,
    Int nHot, Int input_nPlanes, Int input_stride, Int output_nPlanes,
    Int output_stride) {
  // M = gridDim.y == input_nPlanes / K
  Int N = (output_nPlanes + K - 1) / K; // number of K-wide output tiles
  Int m = blockIdx.y;                   // this block's input-plane tile index
  // Shift base pointers to this block's K input planes / weight slice.
  inFeatures += m * K;
  dInFeatures += m * K;
  w += m * K * output_nPlanes;
  dw += m * K * output_nPlanes;
  Int KI = min(K, input_nPlanes - K * m); // valid planes in a ragged last tile
  T dI[V]; // per-thread dInput accumulators, one per covered tile row
  T dW[V]; // per-thread dW accumulators
  __shared__ T I[K][K];    // input feature tile
  __shared__ T dO[K][K];   // output gradient tile
  __shared__ T W[K][K];    // weight tile
  __shared__ Int R[K * 2]; // rules for the current K row pairs
  const int tx = threadIdx.x;
  // Each thread covers V tile rows, spaced K/V apart.
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int n = 0; n < N; n++) {
    Int KO = min(K, output_nPlanes - K * n); // valid output planes this tile
    // Read w, reset dW
#pragma unroll
    for (int v = 0; v < V; v++) {
      if (ty[v] < KI and tx < KO)
        W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
      dW[v] = 0;
    }
    // Stride over the rulebook, K pairs per block per step.
    for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
      // Read rules for K input/output pairs, reset dI[]
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (ty[v] < 2) {
          int q = ty[v] * K + tx;
          if (s + q / 2 < nHot)
            R[q] = rules[2 * s + q];
        }
        dI[v] = 0;
      }
      __syncthreads();
      // Read input and dOutput; zero-pad rows past nHot and planes past
      // KI/KO so the unrolled products below need no bounds checks.
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (tx < KI and s + ty[v] < nHot)
          I[ty[v]][tx] = inFeatures[R[2 * ty[v] + 1] * input_stride + tx];
        else
          I[ty[v]][tx] = 0;
        if (tx < KO and s + ty[v] < nHot)
          dO[ty[v]][tx] = dOutFeatures[R[2 * ty[v]] * output_stride + tx];
        else
          dO[ty[v]][tx] = 0;
      }
      __syncthreads();
      // dI += dO x W^T (only the KO valid output planes contribute)
#pragma unroll
      for (int k = 0; k < KO; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          dI[v] += dO[ty[v]][k] * W[tx][k];
      // dW += I^T x dO (full K range; padded rows contribute zero)
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          dW[v] += I[k][ty[v]] * dO[k][tx];
      __syncthreads();
      // Scatter dI to the input-gradient rows named by the rules.
#pragma unroll
      for (int v = 0; v < V; v++)
        if (tx < KI and s + ty[v] < nHot)
          dInFeatures[R[2 * ty[v] + 1] * input_stride + tx] += dI[v];
      __syncthreads();
    }
    // Flush the accumulated dW tile; atomicAdd because all blocks along
    // gridDim.x accumulate into the same weight-gradient entries.
#pragma unroll
    for (int v = 0; v < V; v++)
      if (ty[v] < KI and tx < KO)
        atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
    // Advance to the next K-wide output tile.
    w += K;
    dw += K;
    dOutFeatures += K;
  }
}
// Deconvolution backward dispatch (computes both dInput and dW): plane
// counts that are multiples of 8 use dDeconvolution_backward_dW; anything
// else falls back to the generic K=16, V=4 tiled kernel.
template <typename T>
void dDeconvolution_backward_dW2(T *inFeatures, T *dInFeatures,
                                 T *dOutFeatures, T *w, T *dw, Int *rules,
                                 Int nHot, Int input_nPlanes, Int input_stride,
                                 Int output_nPlanes, Int output_stride) {
  const bool multipleOf8 =
      (input_nPlanes % 8 == 0) and (output_nPlanes % 8 == 0);
  if (multipleOf8) {
    dDeconvolution_backward_dW(inFeatures, dInFeatures, dOutFeatures, w, dw,
                               rules, nHot, input_nPlanes, input_stride,
                               output_nPlanes, output_stride);
  } else {
    constexpr int K = 16; // tile edge
    constexpr int V = 4;  // tile rows handled per thread
    dim3 grid(128, (input_nPlanes + K - 1) / K); // one weight slice per y
    dim3 block(K, K / V);
    dDeconvolution_KMxKN_backward_dW2<T, K, V><<<grid, block>>>(
        inFeatures, dInFeatures, dOutFeatures, w, dw, rules, nHot,
        input_nPlanes, input_stride, output_nPlanes, output_stride);
  }
}
#endif
/* CUDA_DECONVOLUTION_H */
sparseconvnet/SCN/CUDA/IOLayers.cpp
0 → 100644
View file @
de3743f6
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
template
<
typename
T
>
void
InputLayer_fp
(
T
*
input_features
,
T
*
output_features
,
Int
nRows
,
Int
maxActive
,
Int
nPlanes
,
Int
*
rules_cpu
,
Int
*
rules_gpu
,
bool
average
);
template
<
typename
T
>
void
InputLayer_bp
(
T
*
d_input_features
,
T
*
d_output_features
,
Int
nRows
,
Int
maxActive
,
Int
nPlanes
,
Int
*
rules_cpu
,
Int
*
rules_gpu
,
bool
average
);
// Input layer forward: builds the input-layer rulebook, then copies (mode 0)
// or gathers/accumulates (other modes; mode 4 averages) the input features
// into the sparse row layout via InputLayer_fp.
template <typename T, Int Dimension>
void cuda_InputLayer_updateOutput(Metadata<Dimension> &m,
                                  /*long*/ at::Tensor spatialSize,
                                  /*long*/ at::Tensor input_coords,
                                  /*cuda float*/ at::Tensor input_features,
                                  /*cuda float*/ at::Tensor output_features,
                                  long batchSize, long mode) {
  m.inputLayer(spatialSize, input_coords, batchSize, mode);
  auto &ruleBook = m.inputLayerRuleBook;
  const Int nPlanes = input_features.size(1);
  const Int maxActive = ruleBook[0][1]; // max duplicates per output row
  const Int nRows = ruleBook[0][3];     // nOutputRows
  if (mode == 0) {
    // Rows map one-to-one: a plain copy suffices.
    output_features.resize_as_(input_features);
    output_features.copy_(input_features);
    return;
  }
  output_features.resize_({*m.inputNActive, nPlanes});
  output_features.zero_();
  // Device-side staging buffer for the host-resident rulebook.
  auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)ruleBook[1].size()});
  InputLayer_fp<T>(input_features.data<T>(), output_features.data<T>(), nRows,
                   maxActive, nPlanes, &ruleBook[1][0], rulesBuffer.data<Int>(),
                   mode == 4);
}
// Input layer backward: routes output gradients back to the input rows
// recorded in the input-layer rulebook (InputLayer_bp scatters with the
// same averaging flag used in the forward pass).
template <typename T, Int Dimension>
void cuda_InputLayer_updateGradInput(
    Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features) {
  auto &ruleBook = m.inputLayerRuleBook;
  const Int nPlanes = d_output_features.size(1);
  const Int mode = ruleBook[0][0];
  const Int maxActive = ruleBook[0][1];
  const Int nRows = ruleBook[0][3];
  if (mode == 0) {
    // Identity forward pass => gradient passes straight through.
    d_input_features.resize_as_(d_output_features);
    d_input_features.copy_(d_output_features);
    return;
  }
  d_input_features.resize_({ruleBook[0][2], nPlanes}); // nInputRows x planes
  d_input_features.zero_();
  auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)ruleBook[1].size()});
  InputLayer_bp(d_input_features.data<T>(), d_output_features.data<T>(), nRows,
                maxActive, nPlanes, &ruleBook[1][0], rulesBuffer.data<Int>(),
                mode == 4);
}
// Output layer forward: scatters sparse rows back to the original input-row
// layout by reusing the input layer's rulebook in reverse.
template <typename T, Int Dimension>
void cuda_OutputLayer_updateOutput(Metadata<Dimension> &m,
                                   /*cuda float*/ at::Tensor input_features,
                                   /*cuda float*/ at::Tensor output_features) {
  auto &ruleBook = m.inputLayerRuleBook;
  const Int nPlanes = input_features.size(1);
  const auto mode = ruleBook[0][0];
  const auto maxActive = ruleBook[0][1];
  const auto nRows = ruleBook[0][3];
  if (mode == 0) {
    output_features.resize_as_(input_features);
    output_features.copy_(input_features);
    return;
  }
  output_features.resize_({ruleBook[0][2], nPlanes}); // nInputRows x planes
  output_features.zero_();
  auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)ruleBook[1].size()});
  // InputLayer_bp writes into its first argument, hence (out, in) order.
  InputLayer_bp(output_features.data<T>(), input_features.data<T>(), nRows,
                maxActive, nPlanes, &ruleBook[1][0], rulesBuffer.data<Int>(),
                false);
}
// Output layer backward: gathers gradients back into the sparse row layout
// (InputLayer_fp with (d_out, d_in) argument order, no averaging).
template <typename T, Int Dimension>
void cuda_OutputLayer_updateGradInput(
    Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features) {
  auto &ruleBook = m.inputLayerRuleBook;
  const Int nPlanes = d_output_features.size(1);
  const auto mode = ruleBook[0][0];
  const auto maxActive = ruleBook[0][1];
  const auto nRows = ruleBook[0][3];
  if (mode == 0) {
    d_input_features.resize_as_(d_output_features);
    d_input_features.copy_(d_output_features);
    return;
  }
  d_input_features.resize_({nRows, nPlanes});
  d_input_features.zero_();
  auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)ruleBook[1].size()});
  InputLayer_fp<T>(d_output_features.data<T>(), d_input_features.data<T>(),
                   nRows, maxActive, nPlanes, &ruleBook[1][0],
                   rulesBuffer.data<Int>(), false);
}
// Batch-length ("BL") input layer forward: builds the bl rulebook, then
// flattens the (batch, length, nPlanes) input into sparse rows.
template <typename T, Int Dimension>
void cuda_BLInputLayer_updateOutput(Metadata<Dimension> &m,
                                    /*long*/ at::Tensor spatialSize,
                                    /*long*/ at::Tensor input_coords,
                                    /*cuda float*/ at::Tensor input_features,
                                    /*cuda float*/ at::Tensor output_features,
                                    long mode) {
  m.blLayer(spatialSize, input_coords, mode);
  const Int nPlanes = input_features.size(2);
  output_features.resize_({*m.inputNActive, nPlanes});
  output_features.zero_();
  auto &ruleBook = m.blLayerRuleBook;
  const Int maxActive = ruleBook[0][1];
  const Int nRows = ruleBook[0][4];
  if (mode == 0) {
    // Straight copy, reshaped to the {nActive, nPlanes} sparse layout.
    output_features.resize_as_(input_features);
    output_features.copy_(input_features);
    output_features.resize_({*m.inputNActive, nPlanes});
    return;
  }
  // Stage the host rulebook on the device, then gather/accumulate rows.
  auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)ruleBook[1].size()});
  InputLayer_fp<T>(input_features.data<T>(), output_features.data<T>(), nRows,
                   maxActive, nPlanes, &ruleBook[1][0], rulesBuffer.data<Int>(),
                   mode == 4);
}
// BL input layer backward: scatters output gradients back into the
// (batch, length, nPlanes) input gradient via the bl rulebook.
template <typename T, Int Dimension>
void cuda_BLInputLayer_updateGradInput(
    Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features) {
  auto &ruleBook = m.blLayerRuleBook;
  const Int nPlanes = d_output_features.size(1);
  const Int mode = ruleBook[0][0];
  const Int maxActive = ruleBook[0][1];
  const Int nRows = ruleBook[0][4];
  if (mode == 0) {
    // Gradient passes through; reshape back to (batch, length, planes).
    d_input_features.resize_as_(d_output_features);
    d_input_features.copy_(d_output_features);
    d_input_features.resize_({ruleBook[0][2], ruleBook[0][3], nPlanes});
    return;
  }
  d_input_features.resize_({ruleBook[0][2], ruleBook[0][3], nPlanes});
  d_input_features.zero_();
  auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)ruleBook[1].size()});
  InputLayer_bp(d_input_features.data<T>(), d_output_features.data<T>(), nRows,
                maxActive, nPlanes, &ruleBook[1][0], rulesBuffer.data<Int>(),
                mode == 4);
}
// BL output layer forward: scatters sparse rows back out to the
// (batch, length, nPlanes) layout using the bl rulebook in reverse.
template <typename T, Int Dimension>
void cuda_BLOutputLayer_updateOutput(
    Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features) {
  auto &ruleBook = m.blLayerRuleBook;
  const Int nPlanes = input_features.size(1);
  const auto mode = ruleBook[0][0];
  const Int maxActive = ruleBook[0][1];
  const Int nRows = ruleBook[0][4];
  if (mode == 0) {
    output_features.resize_as_(input_features);
    output_features.copy_(input_features);
    output_features.resize_({ruleBook[0][2], ruleBook[0][3], nPlanes});
    return;
  }
  output_features.resize_({ruleBook[0][2], ruleBook[0][3], nPlanes});
  output_features.zero_();
  auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)ruleBook[1].size()});
  // InputLayer_bp writes into its first argument, hence (out, in) order.
  InputLayer_bp(output_features.data<T>(), input_features.data<T>(), nRows,
                maxActive, nPlanes, &ruleBook[1][0], rulesBuffer.data<Int>(),
                false);
}
// BL output layer backward: gathers (batch, length, nPlanes) gradients back
// into the sparse {nRows, nPlanes} layout (no averaging).
template <typename T, Int Dimension>
void cuda_BLOutputLayer_updateGradInput(
    Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features) {
  auto &ruleBook = m.blLayerRuleBook;
  const Int nPlanes = d_output_features.size(2);
  const Int mode = ruleBook[0][0];
  const Int maxActive = ruleBook[0][1];
  const Int nRows = ruleBook[0][4];
  if (mode == 0) {
    d_input_features.resize_as_(d_output_features);
    d_input_features.copy_(d_output_features);
    d_input_features.resize_({nRows, nPlanes});
    return;
  }
  d_input_features.resize_({nRows, nPlanes});
  d_input_features.zero_();
  auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)ruleBook[1].size()});
  InputLayer_fp<T>(d_output_features.data<T>(), d_input_features.data<T>(),
                   nRows, maxActive, nPlanes, &ruleBook[1][0],
                   rulesBuffer.data<Int>(), false);
}
sparseconvnet/SCN/CUDA/IOLayers.cu
View file @
de3743f6
...
...
@@ -4,241 +4,67 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "IOLayers.h"
// Rulebook Format
// rules[0][0] == mode
// rules[0][1] == maxActive per spatial location (==1 for modes 0,1,2)
// rules[0][2] == nInputRows
// rules[0][3] == nOutputRows
// rules[1] nOutputRows x (1+maxActive)
template
<
typename
T
,
Int
Dimension
>
void
cuda_InputLayer_updateOutput
(
Metadata
<
Dimension
>
&
m
,
/*long*/
at
::
Tensor
spatialSize
,
/*long*/
at
::
Tensor
input_coords
,
/*cuda float*/
at
::
Tensor
input_features
,
/*cuda float*/
at
::
Tensor
output_features
,
long
batchSize
,
long
mode
)
{
m
.
inputLayer
(
spatialSize
,
input_coords
,
batchSize
,
mode
);
Int
nPlanes
=
input_features
.
size
(
1
);
auto
&
rules
=
m
.
inputLayerRuleBook
;
Int
maxActive
=
rules
[
0
][
1
];
Int
nRows
=
rules
[
0
][
3
];
if
(
mode
==
0
)
{
output_features
.
resize_as_
(
input_features
);
output_features
.
copy_
(
input_features
);
}
else
{
output_features
.
resize_
({
*
m
.
inputNActive
,
nPlanes
});
output_features
.
zero_
();
auto
rulesBuffer
=
at
::
CUDA
(
at_kINT
).
tensor
({(
int
)
rules
[
1
].
size
()});
auto
iF
=
input_features
.
data
<
T
>
();
auto
oF
=
output_features
.
data
<
T
>
();
Int
*
rb
=
rulesBuffer
.
data
<
Int
>
();
cudaMemcpy
(
rb
,
&
rules
[
1
][
0
],
sizeof
(
Int
)
*
rules
[
1
].
size
(),
cudaMemcpyHostToDevice
);
InputLayer_fp
<
T
><<<
std
::
min
(
nRows
,
(
Int
)
32768
),
std
::
min
(
nPlanes
,
(
Int
)
32
)
>>>
(
iF
,
oF
,
nRows
,
maxActive
,
nPlanes
,
rb
,
mode
==
4
);
}
}
template
<
typename
T
,
Int
Dimension
>
void
cuda_InputLayer_updateGradInput
(
Metadata
<
Dimension
>
&
m
,
/*cuda float*/
at
::
Tensor
d_input_features
,
/*cuda float*/
at
::
Tensor
d_output_features
)
{
auto
&
rules
=
m
.
inputLayerRuleBook
;
Int
nPlanes
=
d_output_features
.
size
(
1
);
auto
mode
=
rules
[
0
][
0
];
Int
maxActive
=
rules
[
0
][
1
];
Int
nRows
=
rules
[
0
][
3
];
if
(
mode
==
0
)
{
d_input_features
.
resize_as_
(
d_output_features
);
d_input_features
.
copy_
(
d_output_features
);
}
else
{
d_input_features
.
resize_
({
rules
[
0
][
2
],
nPlanes
});
d_input_features
.
zero_
();
auto
rulesBuffer
=
at
::
CUDA
(
at_kINT
).
tensor
({(
int
)
rules
[
1
].
size
()});
auto
diF
=
d_input_features
.
data
<
T
>
();
auto
doF
=
d_output_features
.
data
<
T
>
();
Int
*
rb
=
rulesBuffer
.
data
<
Int
>
();
cudaMemcpy
(
rb
,
&
rules
[
1
][
0
],
sizeof
(
Int
)
*
rules
[
1
].
size
(),
cudaMemcpyHostToDevice
);
InputLayer_bp
<
T
><<<
std
::
min
(
nRows
,
(
Int
)
32768
),
std
::
min
(
nPlanes
,
(
Int
)
32
)
>>>
(
diF
,
doF
,
nRows
,
maxActive
,
nPlanes
,
rb
,
mode
==
4
);
template
<
typename
T
>
__global__
void
InputLayer_fp_
(
T
*
input_features
,
T
*
output_features
,
Int
nRows
,
Int
maxActive
,
Int
nPlanes
,
Int
*
rules
,
bool
average
)
{
for
(
int
row
=
blockIdx
.
x
;
row
<
nRows
;
row
+=
gridDim
.
x
)
{
T
*
out
=
output_features
+
row
*
nPlanes
;
Int
*
r
=
rules
+
row
*
(
1
+
maxActive
);
Int
nActive
=
r
[
0
];
T
multiplier
=
(
average
and
nActive
>
0
)
?
1.0
f
/
nActive
:
1.0
f
;
for
(
int
i
=
1
;
i
<=
nActive
;
i
++
)
{
T
*
inp
=
input_features
+
r
[
i
]
*
nPlanes
;
for
(
Int
plane
=
threadIdx
.
x
;
plane
<
nPlanes
;
plane
+=
blockDim
.
x
)
out
[
plane
]
+=
multiplier
*
inp
[
plane
];
}
}
template
<
typename
T
,
Int
Dimension
>
void
cuda_OutputLayer_updateOutput
(
Metadata
<
Dimension
>
&
m
,
/*cuda float*/
at
::
Tensor
input_features
,
/*cuda float*/
at
::
Tensor
output_features
)
{
auto
&
rules
=
m
.
inputLayerRuleBook
;
Int
nPlanes
=
input_features
.
size
(
1
);
auto
mode
=
rules
[
0
][
0
];
auto
maxActive
=
rules
[
0
][
1
];
auto
nRows
=
rules
[
0
][
3
];
if
(
mode
==
0
)
{
output_features
.
resize_as_
(
input_features
);
output_features
.
copy_
(
input_features
);
}
else
{
output_features
.
resize_
({
rules
[
0
][
2
],
nPlanes
});
output_features
.
zero_
();
auto
rulesBuffer
=
at
::
CUDA
(
at_kINT
).
tensor
({(
int
)
rules
[
1
].
size
()});
auto
iF
=
input_features
.
data
<
T
>
();
auto
oF
=
output_features
.
data
<
T
>
();
Int
*
rb
=
rulesBuffer
.
data
<
Int
>
();
cudaMemcpy
(
rb
,
&
rules
[
1
][
0
],
sizeof
(
Int
)
*
rules
[
1
].
size
(),
cudaMemcpyHostToDevice
);
InputLayer_bp
<
T
><<<
std
::
min
(
nRows
,
(
Int
)
32768
),
std
::
min
(
nPlanes
,
(
Int
)
32
)
>>>
(
oF
,
iF
,
nRows
,
maxActive
,
nPlanes
,
rb
,
false
);
}
}
template
<
typename
T
,
Int
Dimension
>
void
cuda_OutputLayer_updateGradInput
(
Metadata
<
Dimension
>
&
m
,
/*cuda float*/
at
::
Tensor
d_input_features
,
/*cuda float*/
at
::
Tensor
d_output_features
)
{
auto
&
rules
=
m
.
inputLayerRuleBook
;
Int
nPlanes
=
d_output_features
.
size
(
1
);
auto
mode
=
rules
[
0
][
0
];
auto
maxActive
=
rules
[
0
][
1
];
auto
nRows
=
rules
[
0
][
3
];
if
(
mode
==
0
)
{
d_input_features
.
resize_as_
(
d_output_features
);
d_input_features
.
copy_
(
d_output_features
);
}
else
{
d_input_features
.
resize_
({
nRows
,
nPlanes
});
d_input_features
.
zero_
();
auto
rulesBuffer
=
at
::
CUDA
(
at_kINT
).
tensor
({(
int
)
rules
[
1
].
size
()});
auto
diF
=
d_input_features
.
data
<
T
>
();
auto
doF
=
d_output_features
.
data
<
T
>
();
Int
*
rb
=
rulesBuffer
.
data
<
Int
>
();
cudaMemcpy
(
rb
,
&
rules
[
1
][
0
],
sizeof
(
Int
)
*
rules
[
1
].
size
(),
cudaMemcpyHostToDevice
);
InputLayer_fp
<
T
><<<
std
::
min
(
nRows
,
(
Int
)
32768
),
std
::
min
(
nPlanes
,
(
Int
)
32
)
>>>
(
doF
,
diF
,
nRows
,
maxActive
,
nPlanes
,
rb
,
false
);
}
}
template
<
typename
T
,
Int
Dimension
>
void
cuda_BLInputLayer_updateOutput
(
Metadata
<
Dimension
>
&
m
,
/*long*/
at
::
Tensor
spatialSize
,
/*long*/
at
::
Tensor
input_coords
,
/*cuda float*/
at
::
Tensor
input_features
,
/*cuda float*/
at
::
Tensor
output_features
,
long
mode
)
{
m
.
blLayer
(
spatialSize
,
input_coords
,
mode
);
Int
nPlanes
=
input_features
.
size
(
2
);
output_features
.
resize_
({
*
m
.
inputNActive
,
nPlanes
});
output_features
.
zero_
();
auto
&
rules
=
m
.
blLayerRuleBook
;
Int
maxActive
=
rules
[
0
][
1
];
Int
nRows
=
rules
[
0
][
4
];
if
(
mode
==
0
)
{
output_features
.
resize_as_
(
input_features
);
output_features
.
copy_
(
input_features
);
output_features
.
resize_
({
*
m
.
inputNActive
,
nPlanes
});
}
else
{
auto
rulesBuffer
=
at
::
CUDA
(
at_kINT
).
tensor
({(
int
)
rules
[
1
].
size
()});
auto
iF
=
input_features
.
data
<
T
>
();
auto
oF
=
output_features
.
data
<
T
>
();
Int
*
rb
=
rulesBuffer
.
data
<
Int
>
();
cudaMemcpy
(
rb
,
&
rules
[
1
][
0
],
sizeof
(
Int
)
*
rules
[
1
].
size
(),
template
<
typename
T
>
void
InputLayer_fp
(
T
*
input_features
,
T
*
output_features
,
Int
nRows
,
Int
maxActive
,
Int
nPlanes
,
Int
*
rules_cpu
,
Int
*
rules_gpu
,
bool
average
)
{
cudaMemcpy
(
rules_gpu
,
rules_cpu
,
sizeof
(
Int
)
*
nRows
*
(
1
+
maxActive
),
cudaMemcpyHostToDevice
);
InputLayer_fp
<
InputLayer_fp
_
<
T
><<<
std
::
min
(
nRows
,
(
Int
)
32768
),
std
::
min
(
nPlanes
,
(
Int
)
32
)
>>>
(
iF
,
oF
,
nRows
,
maxActive
,
nPlanes
,
r
b
,
mode
==
4
);
}
input_features
,
output_features
,
nRows
,
maxActive
,
nPlanes
,
r
ules_gpu
,
average
);
}
template
<
typename
T
,
Int
Dimension
>
void
cuda_BLInputLayer_updateGradInput
(
Metadata
<
Dimension
>
&
m
,
/*cuda float*/
at
::
Tensor
d_input_features
,
/*cuda float*/
at
::
Tensor
d_output_features
)
{
auto
&
rules
=
m
.
blLayerRuleBook
;
Int
nPlanes
=
d_output_features
.
size
(
1
);
Int
mode
=
rules
[
0
][
0
];
Int
maxActive
=
rules
[
0
][
1
];
Int
nRows
=
rules
[
0
][
4
];
if
(
mode
==
0
)
{
d_input_features
.
resize_as_
(
d_output_features
);
d_input_features
.
copy_
(
d_output_features
);
d_input_features
.
resize_
({
rules
[
0
][
2
],
rules
[
0
][
3
],
nPlanes
});
}
else
{
d_input_features
.
resize_
({
rules
[
0
][
2
],
rules
[
0
][
3
],
nPlanes
});
d_input_features
.
zero_
();
auto
rulesBuffer
=
at
::
CUDA
(
at_kINT
).
tensor
({(
int
)
rules
[
1
].
size
()});
auto
diF
=
d_input_features
.
data
<
T
>
();
auto
doF
=
d_output_features
.
data
<
T
>
();
Int
*
rb
=
rulesBuffer
.
data
<
Int
>
();
cudaMemcpy
(
rb
,
&
rules
[
1
][
0
],
sizeof
(
Int
)
*
rules
[
1
].
size
(),
cudaMemcpyHostToDevice
);
InputLayer_bp
<
T
><<<
std
::
min
(
nRows
,
(
Int
)
32768
),
std
::
min
(
nPlanes
,
(
Int
)
32
)
>>>
(
diF
,
doF
,
nRows
,
maxActive
,
nPlanes
,
rb
,
mode
==
4
);
template
<
typename
T
>
__global__
void
InputLayer_bp_
(
T
*
d_input_features
,
T
*
d_output_features
,
Int
nRows
,
Int
maxActive
,
Int
nPlanes
,
Int
*
rules
,
bool
average
)
{
for
(
int
row
=
blockIdx
.
x
;
row
<
nRows
;
row
+=
gridDim
.
x
)
{
T
*
out
=
d_output_features
+
row
*
nPlanes
;
Int
*
r
=
rules
+
row
*
(
1
+
maxActive
);
Int
nActive
=
r
[
0
];
T
multiplier
=
(
average
and
nActive
>
0
)
?
1.0
f
/
nActive
:
1.0
f
;
for
(
int
i
=
1
;
i
<=
nActive
;
i
++
)
{
T
*
inp
=
d_input_features
+
r
[
i
]
*
nPlanes
;
for
(
Int
plane
=
threadIdx
.
x
;
plane
<
nPlanes
;
plane
+=
blockDim
.
x
)
atomicAdd
(
&
inp
[
plane
],
multiplier
*
out
[
plane
]);
}
}
template
<
typename
T
,
Int
Dimension
>
void
cuda_BLOutputLayer_updateOutput
(
Metadata
<
Dimension
>
&
m
,
/*cuda float*/
at
::
Tensor
input_features
,
/*cuda float*/
at
::
Tensor
output_features
)
{
auto
&
rules
=
m
.
blLayerRuleBook
;
Int
nPlanes
=
input_features
.
size
(
1
);
auto
mode
=
rules
[
0
][
0
];
Int
maxActive
=
rules
[
0
][
1
];
Int
nRows
=
rules
[
0
][
4
];
if
(
mode
==
0
)
{
output_features
.
resize_as_
(
input_features
);
output_features
.
copy_
(
input_features
);
output_features
.
resize_
({
rules
[
0
][
2
],
rules
[
0
][
3
],
nPlanes
});
}
else
{
output_features
.
resize_
({
rules
[
0
][
2
],
rules
[
0
][
3
],
nPlanes
});
output_features
.
zero_
();
auto
rulesBuffer
=
at
::
CUDA
(
at_kINT
).
tensor
({(
int
)
rules
[
1
].
size
()});
auto
iF
=
input_features
.
data
<
T
>
();
auto
oF
=
output_features
.
data
<
T
>
();
Int
*
rb
=
rulesBuffer
.
data
<
Int
>
();
cudaMemcpy
(
rb
,
&
rules
[
1
][
0
],
sizeof
(
Int
)
*
rules
[
1
].
size
(),
cudaMemcpyHostToDevice
);
InputLayer_bp
<
T
><<<
std
::
min
(
nRows
,
(
Int
)
32768
),
std
::
min
(
nPlanes
,
(
Int
)
32
)
>>>
(
oF
,
iF
,
nRows
,
maxActive
,
nPlanes
,
rb
,
false
);
}
}
template
<
typename
T
,
Int
Dimension
>
void
cuda_BLOutputLayer_updateGradInput
(
Metadata
<
Dimension
>
&
m
,
/*cuda float*/
at
::
Tensor
d_input_features
,
/*cuda float*/
at
::
Tensor
d_output_features
)
{
auto
&
rules
=
m
.
blLayerRuleBook
;
Int
nPlanes
=
d_output_features
.
size
(
2
);
Int
mode
=
rules
[
0
][
0
];
Int
maxActive
=
rules
[
0
][
1
];
Int
nRows
=
rules
[
0
][
4
];
if
(
mode
==
0
)
{
d_input_features
.
resize_as_
(
d_output_features
);
d_input_features
.
copy_
(
d_output_features
);
d_input_features
.
resize_
({
nRows
,
nPlanes
});
}
else
{
d_input_features
.
resize_
({
nRows
,
nPlanes
});
d_input_features
.
zero_
();
auto
rulesBuffer
=
at
::
CUDA
(
at_kINT
).
tensor
({(
int
)
rules
[
1
].
size
()});
auto
diF
=
d_input_features
.
data
<
T
>
();
auto
doF
=
d_output_features
.
data
<
T
>
();
Int
*
rb
=
rulesBuffer
.
data
<
Int
>
();
cudaMemcpy
(
rb
,
&
rules
[
1
][
0
],
sizeof
(
Int
)
*
rules
[
1
].
size
(),
template
<
typename
T
>
void
InputLayer_bp
(
T
*
d_input_features
,
T
*
d_output_features
,
Int
nRows
,
Int
maxActive
,
Int
nPlanes
,
Int
*
rules_cpu
,
Int
*
rules_gpu
,
bool
average
)
{
cudaMemcpy
(
rules_gpu
,
rules_cpu
,
sizeof
(
Int
)
*
nRows
*
(
1
+
maxActive
),
cudaMemcpyHostToDevice
);
InputLayer_
fp
<
InputLayer_
bp_
<
T
><<<
std
::
min
(
nRows
,
(
Int
)
32768
),
std
::
min
(
nPlanes
,
(
Int
)
32
)
>>>
(
doF
,
diF
,
nRows
,
maxActive
,
nPlanes
,
r
b
,
false
);
}
d_input_features
,
d_output_features
,
nRows
,
maxActive
,
nPlanes
,
r
ules_gpu
,
average
);
}
sparseconvnet/SCN/CUDA/IOLayers.h
deleted
100644 → 0
View file @
f0407b36
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CUDA_IOLAYERS_H
#define CUDA_IOLAYERS_H
template
<
typename
T
>
__global__
void
InputLayer_fp
(
T
*
input_features
,
T
*
output_features
,
Int
nRows
,
Int
maxActive
,
Int
nPlanes
,
Int
*
rules
,
bool
average
)
{
for
(
int
row
=
blockIdx
.
x
;
row
<
nRows
;
row
+=
gridDim
.
x
)
{
T
*
out
=
output_features
+
row
*
nPlanes
;
Int
*
r
=
rules
+
row
*
(
1
+
maxActive
);
Int
nActive
=
r
[
0
];
T
multiplier
=
(
average
and
nActive
>
0
)
?
1.0
f
/
nActive
:
1.0
f
;
for
(
int
i
=
1
;
i
<=
nActive
;
i
++
)
{
T
*
inp
=
input_features
+
r
[
i
]
*
nPlanes
;
for
(
Int
plane
=
threadIdx
.
x
;
plane
<
nPlanes
;
plane
+=
blockDim
.
x
)
out
[
plane
]
+=
multiplier
*
inp
[
plane
];
}
}
}
template
<
typename
T
>
__global__
void
InputLayer_bp
(
T
*
d_input_features
,
T
*
d_output_features
,
Int
nRows
,
Int
maxActive
,
Int
nPlanes
,
Int
*
rules
,
bool
average
)
{
for
(
int
row
=
blockIdx
.
x
;
row
<
nRows
;
row
+=
gridDim
.
x
)
{
T
*
out
=
d_output_features
+
row
*
nPlanes
;
Int
*
r
=
rules
+
row
*
(
1
+
maxActive
);
Int
nActive
=
r
[
0
];
T
multiplier
=
(
average
and
nActive
>
0
)
?
1.0
f
/
nActive
:
1.0
f
;
for
(
int
i
=
1
;
i
<=
nActive
;
i
++
)
{
T
*
inp
=
d_input_features
+
r
[
i
]
*
nPlanes
;
for
(
Int
plane
=
threadIdx
.
x
;
plane
<
nPlanes
;
plane
+=
blockDim
.
x
)
atomicAdd
(
&
inp
[
plane
],
multiplier
*
out
[
plane
]);
}
}
}
#endif
/* CUDA_IOLAYERS_H */
sparseconvnet/SCN/CUDA/LeakyReLU.cpp
0 → 100644
View file @
de3743f6
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
template
<
typename
T
>
void
LeakyReLU_fp
(
T
*
input_features
,
T
*
output_features
,
Int
n
,
T
alpha
);
template
<
typename
T
>
void
LeakyReLU_bp
(
T
*
input_features
,
T
*
d_input_features
,
T
*
output_features
,
Int
n
,
T
alpha
);
// Elementwise leaky-ReLU forward over the whole feature tensor.
template <typename T>
void cuda_LeakyReLU_updateOutput(/*cuda float*/ at::Tensor input_features,
                                 /*cuda float*/ at::Tensor output_features,
                                 T alpha) {
  output_features.resize_as_(input_features);
  const auto nElements = input_features.numel();
  LeakyReLU_fp<T>(input_features.data<T>(), output_features.data<T>(),
                  nElements, alpha);
}
// Elementwise leaky-ReLU backward; the forward input decides which branch's
// slope each gradient element gets.
template <typename T>
void cuda_LeakyReLU_updateGradInput(
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features, T alpha) {
  d_input_features.resize_as_(d_output_features);
  const auto nElements = d_input_features.numel();
  LeakyReLU_bp<T>(input_features.data<T>(), d_input_features.data<T>(),
                  d_output_features.data<T>(), nElements, alpha);
}
sparseconvnet/SCN/CUDA/LeakyReLU.cu
View file @
de3743f6
...
...
@@ -4,26 +4,28 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "LeakyReLU.h"
template
<
typename
T
>
void
cuda_LeakyReLU_updateOutput
(
/*cuda float*/
at
::
Tensor
input_features
,
/*cuda float*/
at
::
Tensor
output_features
,
float
alpha
)
{
output_features
.
resize_as_
(
input_features
);
auto
n
=
input_features
.
numel
();
LeakyReLU_fp
<
T
><<<
16
,
1024
>>>
(
input_features
.
data
<
T
>
(),
output_features
.
data
<
T
>
(),
n
,
alpha
);
__global__
void
LeakyReLU_fp_
(
T
*
input_features
,
T
*
output_features
,
Int
n
,
T
alpha
)
{
for
(
Int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
i
<
n
;
i
+=
16
*
1024
)
output_features
[
i
]
=
(
input_features
[
i
]
>
0
)
?
input_features
[
i
]
:
(
input_features
[
i
]
*
alpha
);
}
template
<
typename
T
>
void
LeakyReLU_fp
(
T
*
input_features
,
T
*
output_features
,
Int
n
,
T
alpha
)
{
LeakyReLU_fp_
<
T
><<<
16
,
1024
>>>
(
input_features
,
output_features
,
n
,
alpha
);
}
template
<
typename
T
>
__global__
void
LeakyReLU_bp_
(
T
*
input_features
,
T
*
d_input_features
,
T
*
d_output_features
,
Int
n
,
T
alpha
)
{
for
(
Int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
i
<
n
;
i
+=
16
*
1024
)
d_input_features
[
i
]
=
(
input_features
[
i
]
>
0
)
?
d_output_features
[
i
]
:
(
d_output_features
[
i
]
*
alpha
);
}
template
<
typename
T
>
void
cuda_LeakyReLU_updateGradInput
(
/*cuda float*/
at
::
Tensor
input_features
,
/*cuda float*/
at
::
Tensor
d_input_features
,
/*cuda float*/
at
::
Tensor
d_output_features
,
float
alpha
)
{
d_input_features
.
resize_as_
(
d_output_features
);
auto
n
=
d_input_features
.
numel
();
LeakyReLU_bp
<
T
><<<
16
,
1024
>>>
(
input_features
.
data
<
T
>
(),
d_input_features
.
data
<
T
>
(),
d_output_features
.
data
<
T
>
(),
n
,
alpha
);
void
LeakyReLU_bp
(
T
*
input_features
,
T
*
d_input_features
,
T
*
output_features
,
Int
n
,
T
alpha
)
{
LeakyReLU_bp_
<
T
><<<
16
,
1024
>>>
(
input_features
,
d_input_features
,
output_features
,
n
,
alpha
);
}
sparseconvnet/SCN/CUDA/LeakyReLU.h
deleted
100644 → 0
View file @
f0407b36
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef LEAKYRELU_H
#define LEAKYRELU_H
// Elementwise leaky-ReLU forward kernel: out[i] = x[i] > 0 ? x[i] : alpha*x[i].
// Canonical grid-stride loop: the original stepped by a hard-coded 16*1024
// (matching its <<<16, 1024>>> launch), which silently skips elements if the
// kernel is ever launched with a different configuration. gridDim.x *
// blockDim.x is identical under the existing launch and correct for any.
template <typename T>
__global__ void LeakyReLU_fp(T *input_features, T *output_features, Int n,
                             T alpha) {
  for (Int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += gridDim.x * blockDim.x)
    output_features[i] = (input_features[i] > 0)
                             ? input_features[i]
                             : (input_features[i] * alpha);
}
// Elementwise leaky-ReLU backward kernel: dIn[i] gets dOut[i] scaled by 1 or
// alpha depending on the sign of the forward input.
// Canonical grid-stride loop: the original stepped by a hard-coded 16*1024
// (matching its <<<16, 1024>>> launch), which silently skips elements under
// any other configuration. gridDim.x * blockDim.x is identical under the
// existing launch and correct for any.
template <typename T>
__global__ void LeakyReLU_bp(T *input_features, T *d_input_features,
                             T *d_output_features, Int n, T alpha) {
  for (Int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += gridDim.x * blockDim.x)
    d_input_features[i] = (input_features[i] > 0)
                              ? d_output_features[i]
                              : (d_output_features[i] * alpha);
}
#endif
sparseconvnet/SCN/CUDA/MaxPooling.cpp
0 → 100644
View file @
de3743f6
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
template
<
typename
T
>
void
cuda_MaxPooling_ForwardPass
(
T
*
input_features
,
T
*
output_features
,
Int
nPlanes
,
Int
input_stride
,
Int
output_stride
,
RuleBook
_rules
);
template
<
typename
T
>
void
cuda_MaxPooling_BackwardPass
(
T
*
input_features
,
T
*
d_input_features
,
T
*
output_features
,
T
*
d_output_features
,
Int
nPlanes
,
Int
input_stride
,
Int
output_stride
,
RuleBook
_rules
);
// Sparse max-pooling forward. The first nFeaturesToDrop input planes are
// skipped via pointer offset; strides are the full row widths so indexing
// inside the kernel stays consistent with the underlying tensors.
template <typename T, Int Dimension>
void cuda_MaxPooling_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor poolSize, /*long*/ at::Tensor poolStride,
    Metadata<Dimension> &m, /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features, long nFeaturesToDrop) {
  const Int nPlanes = input_features.size(1) - nFeaturesToDrop;
  auto _rules =
      m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  const Int nActive = m.getNActive(outputSize);
  output_features.resize_({nActive, nPlanes});
  output_features.zero_();
  cuda_MaxPooling_ForwardPass<T>(
      input_features.data<T>() + nFeaturesToDrop, output_features.data<T>(),
      nPlanes, input_features.size(1), output_features.size(1), _rules);
}
// Sparse max-pooling backward.
// Fix: the forward pass pools input_features.data<T>() + nFeaturesToDrop,
// but the original backward pass passed unoffset iF/diF, so when
// nFeaturesToDrop > 0 the argmax comparison and the gradient write-back
// addressed the wrong feature columns. iF and diF are now offset to mirror
// the forward pass (oF/doF need no offset: the output has only nPlanes
// columns). Behavior is unchanged for nFeaturesToDrop == 0.
template <typename T, Int Dimension>
void cuda_MaxPooling_updateGradInput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor poolSize, /*long*/ at::Tensor poolStride,
    Metadata<Dimension> &m, /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor output_features,
    /*cuda float*/ at::Tensor d_output_features, long nFeaturesToDrop) {
  Int nPlanes = input_features.size(1) - nFeaturesToDrop;
  auto _rules =
      m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();
  auto iF = input_features.data<T>() + nFeaturesToDrop;
  auto oF = output_features.data<T>();
  auto diF = d_input_features.data<T>() + nFeaturesToDrop;
  auto doF = d_output_features.data<T>();
  cuda_MaxPooling_BackwardPass<T>(iF, diF, oF, doF, nPlanes,
                                  input_features.size(1),
                                  d_output_features.size(1), _rules);
}
// Forward max pooling with randomized strides: identical to
// cuda_MaxPooling_updateOutput except the rule book comes from
// getRandomizedStrideRuleBook.
template <typename T, Int Dimension>
void cuda_RandomizedStrideMaxPooling_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor poolSize, /*long*/ at::Tensor poolStride,
    Metadata<Dimension> &m, /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features, long nFeaturesToDrop) {
  Int planeCount = input_features.size(1) - nFeaturesToDrop;
  auto rb = m.getRandomizedStrideRuleBook(inputSize, outputSize, poolSize,
                                          poolStride, true);
  Int activeSites = m.getNActive(outputSize);
  // (active sites) x (kept planes), zero-filled before max-reduction.
  output_features.resize_({activeSites, planeCount});
  output_features.zero_();
  // Skip the dropped leading planes of the input.
  auto inPtr = input_features.data<T>() + nFeaturesToDrop;
  auto outPtr = output_features.data<T>();
  cuda_MaxPooling_ForwardPass<T>(inPtr, outPtr, planeCount,
                                 input_features.size(1),
                                 output_features.size(1), rb);
}
// Backward pass for randomized-stride max pooling: identical to
// cuda_MaxPooling_updateGradInput except for the rule-book source.
// NOTE(review): as in the non-randomized backward pass, the pointers are
// not offset by nFeaturesToDrop — confirm intended.
template <typename T, Int Dimension>
void cuda_RandomizedStrideMaxPooling_updateGradInput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor poolSize, /*long*/ at::Tensor poolStride,
    Metadata<Dimension> &m, /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor output_features,
    /*cuda float*/ at::Tensor d_output_features, long nFeaturesToDrop) {
  Int planeCount = input_features.size(1) - nFeaturesToDrop;
  auto rb = m.getRandomizedStrideRuleBook(inputSize, outputSize, poolSize,
                                          poolStride, true);
  // Input-gradient buffer mirrors the input and starts at zero.
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();
  auto inPtr = input_features.data<T>();
  auto outPtr = output_features.data<T>();
  auto dInPtr = d_input_features.data<T>();
  auto dOutPtr = d_output_features.data<T>();
  cuda_MaxPooling_BackwardPass<T>(inPtr, dInPtr, outPtr, dOutPtr, planeCount,
                                  input_features.size(1),
                                  d_output_features.size(1), rb);
}
sparseconvnet/SCN/CUDA/MaxPooling.cu
View file @
de3743f6
...
...
@@ -4,100 +4,74 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "MaxPooling.h"
#include "RuleBookIterator.h"
template
<
typename
T
,
Int
Dimension
>
void
cuda_MaxPooling_updateOutput
(
/*long*/
at
::
Tensor
inputSize
,
/*long*/
at
::
Tensor
outputSize
,
/*long*/
at
::
Tensor
poolSize
,
/*long*/
at
::
Tensor
poolStride
,
Metadata
<
Dimension
>
&
m
,
/*cuda float*/
at
::
Tensor
input_features
,
/*cuda float*/
at
::
Tensor
output_features
,
long
nFeaturesToDrop
)
{
Int
nPlanes
=
input_features
.
size
(
1
)
-
nFeaturesToDrop
;
auto
_rules
=
m
.
getRuleBook
(
inputSize
,
outputSize
,
poolSize
,
poolStride
,
true
);
Int
nActive
=
m
.
getNActive
(
outputSize
);
output_features
.
resize_
({
nActive
,
nPlanes
});
output_features
.
zero_
();
auto
iF
=
input_features
.
data
<
T
>
()
+
nFeaturesToDrop
;
auto
oF
=
output_features
.
data
<
T
>
();
RULEBOOKITERATOR
(
cuda_MaxPooling_ForwardPass
<
T
>
(
iF
,
oF
,
nPlanes
,
input_features
.
size
(
1
),
output_features
.
size
(
1
),
rbB
,
nHotB
);
,
)
// Max-pooling forward kernel. `rules` holds nHot (inputRow, outputRow)
// pairs; each NTX x NTY thread block handles NTY rules per iteration:
// thread row threadIdx.y owns one rule and its NTX threads sweep planes.
// NTX must be >=2 so r is filled properly
template <typename T, Int NTX, Int NTY>
__global__ void MaxPooling_fp(T *input_features, T *output_features,
                              Int nPlanes, Int input_stride, Int output_stride,
                              Int *rules, Int nHot) {
  // Shared staging area for the next NTY rule pairs.
  __shared__ Int r[NTY * 2];
  // Blocks stride through the rule list, NTY rules at a time.
  for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
    {
      // Flat thread index; the first NTY*2 threads copy the rule pairs,
      // bounded by the pairs actually remaining (2 * (nHot - n)).
      Int i = threadIdx.x + NTX * threadIdx.y;
      if (i < NTY * 2 and i < 2 * (nHot - n))
        r[i] = rules[2 * n + i];
    }
    __syncthreads(); // all rules staged before any row reads them
    if (n + threadIdx.y < nHot) {
      // Element offsets of this rule's input and output rows.
      Int i = r[2 * threadIdx.y] * input_stride;
      Int o = r[2 * threadIdx.y + 1] * output_stride;
      // Compare-and-store running max per plane (caller zeroes the output).
      // NOTE(review): distinct rules writing the same output row are not
      // synchronized here — confirm rule batches avoid output collisions.
      for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX) {
        T inp = input_features[i + plane];
        if (output_features[o + plane] < inp)
          output_features[o + plane] = inp;
      }
    }
    __syncthreads(); // keep r intact until every row has finished
  }
}
template
<
typename
T
,
Int
Dimension
>
void
cuda_MaxPooling_updateGradInput
(
/*long*/
at
::
Tensor
inputSize
,
/*long*/
at
::
Tensor
outputSize
,
/*long*/
at
::
Tensor
poolSize
,
/*long*/
at
::
Tensor
poolStride
,
Metadata
<
Dimension
>
&
m
,
/*cuda float*/
at
::
Tensor
input_features
,
/*cuda float*/
at
::
Tensor
d_input_features
,
/*cuda float*/
at
::
Tensor
output_features
,
/*cuda float*/
at
::
Tensor
d_output_features
,
long
nFeaturesToDrop
)
{
Int
nPlanes
=
input_features
.
size
(
1
)
-
nFeaturesToDrop
;
auto
_rules
=
m
.
getRuleBook
(
inputSize
,
outputSize
,
poolSize
,
poolStride
,
true
);
d_input_features
.
resize_as_
(
input_features
);
d_input_features
.
zero_
();
auto
iF
=
input_features
.
data
<
T
>
();
auto
oF
=
output_features
.
data
<
T
>
();
auto
diF
=
d_input_features
.
data
<
T
>
();
auto
doF
=
d_output_features
.
data
<
T
>
();
RULEBOOKITERATOR
(
cuda_
MaxPooling_
BackwardPass
<
T
>
(
iF
,
diF
,
oF
,
doF
,
nPlanes
,
input_features
.
size
(
1
)
,
d_output_features
.
size
(
1
),
rbB
,
nHotB
);
// Host-side launcher for the forward max-pooling kernel.
// RULEBOOKITERATOR (defined in RuleBookIterator.h, not visible here) walks
// the rule book; inside the macro body, `rbB` and `nHotB` are presumably
// the current rule-batch pointer and its rule count — confirm against the
// macro definition. Second macro argument (per-batch epilogue) is empty.
// Launch shape: 32 blocks of 32x32 threads (planes x rules).
template <typename T>
void cuda_MaxPooling_ForwardPass(T *input_features, T *output_features,
                                 Int nPlanes, Int input_stride,
                                 Int output_stride, RuleBook _rules) {
  RULEBOOKITERATOR((MaxPooling_fp<T, 32, 32><<<32, dim3(32, 32)>>>(
                       input_features, output_features, nPlanes, input_stride,
                       output_stride, rbB, nHotB));
                   , )
}
template
<
typename
T
,
Int
Dimension
>
void
cuda_RandomizedStrideMaxPooling_updateOutput
(
/*long*/
at
::
Tensor
inputSize
,
/*long*/
at
::
Tensor
outputSize
,
/*long*/
at
::
Tensor
poolSize
,
/*long*/
at
::
Tensor
poolStride
,
Metadata
<
Dimension
>
&
m
,
/*cuda float*/
at
::
Tensor
input_features
,
/*cuda float*/
at
::
Tensor
output_features
,
long
nFeaturesToDrop
)
{
Int
nPlanes
=
input_features
.
size
(
1
)
-
nFeaturesToDrop
;
auto
_rules
=
m
.
getRandomizedStrideRuleBook
(
inputSize
,
outputSize
,
poolSize
,
poolStride
,
true
);
Int
nActive
=
m
.
getNActive
(
outputSize
);
output_features
.
resize_
({
nActive
,
nPlanes
});
output_features
.
zero_
();
auto
iF
=
input_features
.
data
<
T
>
()
+
nFeaturesToDrop
;
auto
oF
=
output_features
.
data
<
T
>
();
RULEBOOKITERATOR
(
cuda_MaxPooling_ForwardPass
<
T
>
(
iF
,
oF
,
nPlanes
,
input_features
.
size
(
1
),
output_features
.
size
(
1
),
rbB
,
nHotB
);
,
)
// Max-pooling backward kernel. For each rule (inputRow, outputRow), a
// plane's output gradient is added to every input position whose value
// equals the pooled maximum (on ties, all matching inputs receive it).
template <typename T, Int NTX, Int NTY>
__global__ void MaxPooling_bp(T *input_features, T *d_input_features,
                              T *output_features, T *d_output_features,
                              Int nPlanes, Int input_stride, Int output_stride,
                              Int *rules, Int nHot) {
  // Shared staging area for the next NTY (input, output) row pairs.
  __shared__ Int r[NTY * 2];
  for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
    {
      // First NTY*2 threads copy the remaining rule pairs to shared memory.
      Int i = threadIdx.x + NTX * threadIdx.y;
      if (i < NTY * 2 and i < 2 * (nHot - n))
        r[i] = rules[2 * n + i];
    }
    __syncthreads();
    if (n + threadIdx.y < nHot) {
      Int i = r[2 * threadIdx.y] * input_stride;
      Int o = r[2 * threadIdx.y + 1] * output_stride;
      // Equality test identifies the argmax position(s) of the forward pass.
      for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX)
        if (output_features[o + plane] == input_features[i + plane])
          d_input_features[i + plane] += d_output_features[o + plane];
    }
    __syncthreads(); // protect r until all rows of this batch are done
  }
}
template
<
typename
T
,
Int
Dimension
>
void
cuda_RandomizedStrideMaxPooling_updateGradInput
(
/*long*/
at
::
Tensor
inputSize
,
/*long*/
at
::
Tensor
outputSize
,
/*long*/
at
::
Tensor
poolSize
,
/*long*/
at
::
Tensor
poolStride
,
Metadata
<
Dimension
>
&
m
,
/*cuda float*/
at
::
Tensor
input_features
,
/*cuda float*/
at
::
Tensor
d_input_features
,
/*cuda float*/
at
::
Tensor
output_features
,
/*cuda float*/
at
::
Tensor
d_output_features
,
long
nFeaturesToDrop
)
{
Int
nPlanes
=
input_features
.
size
(
1
)
-
nFeaturesToDrop
;
auto
_rules
=
m
.
getRandomizedStrideRuleBook
(
inputSize
,
outputSize
,
poolSize
,
poolStride
,
true
);
d_input_features
.
resize_as_
(
input_features
);
d_input_features
.
zero_
();
auto
iF
=
input_features
.
data
<
T
>
();
auto
oF
=
output_features
.
data
<
T
>
();
auto
diF
=
d_input_features
.
data
<
T
>
();
auto
doF
=
d_output_features
.
data
<
T
>
();
RULEBOOKITERATOR
(
cuda_MaxPooling_BackwardPass
<
T
>
(
iF
,
diF
,
oF
,
doF
,
nPlanes
,
input_features
.
size
(
1
),
d_output_features
.
size
(
1
),
rbB
,
nHotB
);
// Host-side launcher for the backward max-pooling kernel.
// RULEBOOKITERATOR walks the rule book; `rbB`/`nHotB` are presumably the
// per-batch rule pointer and count supplied by the macro (defined in
// RuleBookIterator.h — confirm there). Epilogue argument is empty.
template <typename T>
void cuda_MaxPooling_BackwardPass(T *input_features, T *d_input_features,
                                  T *output_features, T *d_output_features,
                                  Int nPlanes, Int input_stride,
                                  Int output_stride, RuleBook _rules) {
  RULEBOOKITERATOR((MaxPooling_bp<T, 32, 32><<<32, dim3(32, 32)>>>(
                       input_features, d_input_features, output_features,
                       d_output_features, nPlanes, input_stride, output_stride,
                       rbB, nHotB));
                   , )
}
sparseconvnet/SCN/CUDA/MaxPooling.h
deleted
100644 → 0
View file @
f0407b36
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CUDA_MAXPOOLING_H
#define CUDA_MAXPOOLING_H
// (Deleted header version.) Max-pooling forward kernel: `rules` holds nHot
// (inputRow, outputRow) pairs; each NTX x NTY block handles NTY rules per
// iteration, one rule per thread row, NTX threads sweeping the planes.
// NTX must be >=2 so r is filled properly
template <typename T, Int NTX, Int NTY>
__global__ void MaxPooling_fp(T *input_features, T *output_features,
                              Int nPlanes, Int input_stride, Int output_stride,
                              Int *rules, Int nHot) {
  // Shared staging area for the next NTY rule pairs.
  __shared__ Int r[NTY * 2];
  for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
    {
      // First NTY*2 threads copy the remaining rule pairs to shared memory.
      Int i = threadIdx.x + NTX * threadIdx.y;
      if (i < NTY * 2 and i < 2 * (nHot - n))
        r[i] = rules[2 * n + i];
    }
    __syncthreads(); // rules staged before use
    if (n + threadIdx.y < nHot) {
      Int i = r[2 * threadIdx.y] * input_stride;
      Int o = r[2 * threadIdx.y + 1] * output_stride;
      // Compare-and-store running max per plane (output pre-zeroed by host).
      for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX) {
        T inp = input_features[i + plane];
        if (output_features[o + plane] < inp)
          output_features[o + plane] = inp;
      }
    }
    __syncthreads(); // keep r intact until the batch completes
  }
}
// (Deleted header version.) Launch the forward max-pooling kernel over the
// whole rule list in one call. Fixed launch shape: 32 blocks of 32x32
// threads; the kernel grid-strides over the nHot rules.
template <typename T>
void cuda_MaxPooling_ForwardPass(T *input_features, T *output_features,
                                 Int nPlanes, Int input_stride,
                                 Int output_stride, Int *rules, Int nHot) {
  MaxPooling_fp<T, 32, 32><<<32, dim3(32, 32)>>>(input_features,
                                                 output_features, nPlanes,
                                                 input_stride, output_stride,
                                                 rules, nHot);
}
// (Deleted header version.) Max-pooling backward kernel: adds each output
// gradient to every input position whose value equals the pooled maximum.
template <typename T, Int NTX, Int NTY>
__global__ void MaxPooling_bp(T *input_features, T *d_input_features,
                              T *output_features, T *d_output_features,
                              Int nPlanes, Int input_stride, Int output_stride,
                              Int *rules, Int nHot) {
  // Shared staging area for the next NTY (input, output) row pairs.
  __shared__ Int r[NTY * 2];
  for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
    {
      // First NTY*2 threads copy the remaining rule pairs to shared memory.
      Int i = threadIdx.x + NTX * threadIdx.y;
      if (i < NTY * 2 and i < 2 * (nHot - n))
        r[i] = rules[2 * n + i];
    }
    __syncthreads();
    if (n + threadIdx.y < nHot) {
      Int i = r[2 * threadIdx.y] * input_stride;
      Int o = r[2 * threadIdx.y + 1] * output_stride;
      // Equality identifies the forward argmax; ties all get the gradient.
      for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX)
        if (output_features[o + plane] == input_features[i + plane])
          d_input_features[i + plane] += d_output_features[o + plane];
    }
    __syncthreads();
  }
}
// (Deleted header version.) Launch the backward max-pooling kernel over the
// whole rule list. Fixed launch shape: 32 blocks of 32x32 threads.
template <typename T>
void cuda_MaxPooling_BackwardPass(T *input_features, T *d_input_features,
                                  T *output_features, T *d_output_features,
                                  Int nPlanes, Int input_stride,
                                  Int output_stride, Int *rules, Int nHot) {
  MaxPooling_bp<T, 32, 32><<<32, dim3(32, 32)>>>(
      input_features, d_input_features, output_features, d_output_features,
      nPlanes, input_stride, output_stride, rules, nHot);
}
#endif
/* CUDA_MAXPOOLING_H */
sparseconvnet/SCN/CUDA/NetworkInNetwork.c
u
→
sparseconvnet/SCN/CUDA/NetworkInNetwork.c
pp
View file @
de3743f6
...
...
@@ -4,8 +4,6 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "Convolution.h"
#include <algorithm>
template
<
typename
T
>
...
...
sparseconvnet/SCN/CUDA/SparseToDense.cpp
0 → 100644
View file @
de3743f6
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
template
<
typename
T
>
void
cuda_SparseToDense_ForwardPass
(
T
*
input_features
,
T
*
output_features
,
Int
nPlanes
,
Int
spatialVolume
,
RuleBook
_rules
);
template
<
typename
T
>
void
cuda_SparseToDense_BackwardPass
(
T
*
d_input_features
,
T
*
d_output_features
,
Int
nPlanes
,
Int
spatialVolume
,
RuleBook
_rules
);
// Scatter a sparse tensor's features into a freshly-sized dense tensor of
// shape (batch, nPlanes, spatial dims...), zero-filled first.
template <typename T, Int Dimension>
void cuda_SparseToDense_updateOutput(/*long*/ at::Tensor inputSize,
                                     Metadata<Dimension> &m,
                                     /*cuda float*/ at::Tensor input_features,
                                     /*cuda float*/ at::Tensor output_features,
                                     long nPlanes) {
  { // Shape the dense output: batch x planes x spatial dimensions.
    std::array<long, Dimension + 2> denseShape;
    denseShape[0] = m.grids.begin()->second.size(); // batch size
    denseShape[1] = nPlanes;
    long *spatialExtent = inputSize.data<long>();
    for (Int d = 0; d < Dimension; ++d)
      denseShape[d + 2] = spatialExtent[d];
    output_features.resize_(denseShape);
    output_features.zero_();
  }
  // Only scatter when the features form a 2-D (sites x planes) matrix.
  if (input_features.ndimension() == 2) {
    auto rb = m.getSparseToDenseRuleBook(inputSize, true);
    Int featCount = input_features.size(1);
    auto inPtr = input_features.data<T>();
    auto outPtr = output_features.data<T>();
    // Number of cells per plane in one dense sample.
    long spatialVolume = inputSize.prod().data<long>()[0];
    cuda_SparseToDense_ForwardPass<T>(inPtr, outPtr, featCount, spatialVolume,
                                      rb);
  }
}
// Gather gradients from the dense tensor back into sparse layout.
// d_input_features is sized like input_features and cleared before the copy.
template <typename T, Int Dimension>
void cuda_SparseToDense_updateGradInput(
    /*long*/ at::Tensor inputSize, Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features) {
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();
  // Only gather when the features form a 2-D (sites x planes) matrix.
  if (input_features.ndimension() == 2) {
    auto rb = m.getSparseToDenseRuleBook(inputSize, true);
    // Number of cells per plane in one dense sample.
    long spatialVolume = inputSize.prod().data<long>()[0];
    Int featCount = d_input_features.size(1);
    auto dInPtr = d_input_features.data<T>();
    auto dOutPtr = d_output_features.data<T>();
    cuda_SparseToDense_BackwardPass<T>(dInPtr, dOutPtr, featCount,
                                       spatialVolume, rb);
  }
}
sparseconvnet/SCN/CUDA/SparseToDense.cu
View file @
de3743f6
...
...
@@ -4,53 +4,66 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "SparseToDense.h"
template
<
typename
T
,
Int
Dimension
>
void
cuda_SparseToDense_updateOutput
(
/*long*/
at
::
Tensor
inputSize
,
Metadata
<
Dimension
>
&
m
,
/*cuda float*/
at
::
Tensor
input_features
,
/*cuda float*/
at
::
Tensor
output_features
,
long
nPlanes
)
{
// NTX must be >=2 so r is filled properly
template
<
typename
T
,
Int
NTX
,
Int
NTY
>
__global__
void
SparseToDense_fp
(
T
*
input_features
,
T
*
output_features
,
Int
nPlanes
,
Int
spatialVolume
,
Int
*
rules
,
Int
nHot
)
{
__shared__
Int
r
[
NTY
*
2
];
for
(
Int
n
=
blockIdx
.
x
*
NTY
;
n
<
nHot
;
n
+=
gridDim
.
x
*
NTY
)
{
{
std
::
array
<
long
,
Dimension
+
2
>
sz
;
sz
[
0
]
=
m
.
grids
.
begin
()
->
second
.
size
();
// batch size
sz
[
1
]
=
nPlanes
;
long
*
in_sz
=
inputSize
.
data
<
long
>
();
for
(
Int
i
=
0
;
i
<
Dimension
;
++
i
)
sz
[
i
+
2
]
=
in_sz
[
i
];
output_features
.
resize_
(
sz
);
output_features
.
zero_
();
Int
i
=
threadIdx
.
x
+
NTX
*
threadIdx
.
y
;
if
(
i
<
NTY
*
2
and
i
<
2
*
(
nHot
-
n
))
r
[
i
]
=
rules
[
2
*
n
+
i
];
}
__syncthreads
();
if
(
n
+
threadIdx
.
y
<
nHot
)
{
T
*
i
=
input_features
+
r
[
2
*
threadIdx
.
y
]
*
nPlanes
;
T
*
o
=
output_features
+
r
[
2
*
threadIdx
.
y
+
1
];
for
(
Int
plane
=
threadIdx
.
x
;
plane
<
nPlanes
;
plane
+=
NTX
)
o
[
plane
*
spatialVolume
]
=
i
[
plane
];
}
if
(
input_features
.
ndimension
()
==
2
)
{
auto
_rules
=
m
.
getSparseToDenseRuleBook
(
inputSize
,
true
);
Int
_nPlanes
=
input_features
.
size
(
1
);
auto
iF
=
input_features
.
data
<
T
>
();
auto
oF
=
output_features
.
data
<
T
>
();
long
spatialVolume
=
inputSize
.
prod
().
data
<
long
>
()[
0
];
RULEBOOKITERATOR
(
SparseToDense_ForwardPass
<
T
>
(
iF
,
oF
,
_nPlanes
,
spatialVolume
,
rbB
,
nHotB
);
,
oF
+=
_nPlanes
*
spatialVolume
;)
__syncthreads
();
}
}
template
<
typename
T
,
Int
Dimension
>
void
cuda_SparseToDense_updateGradInput
(
/*long*/
at
::
Tensor
inputSize
,
Metadata
<
Dimension
>
&
m
,
/*cuda float*/
at
::
Tensor
input_features
,
/*cuda float*/
at
::
Tensor
d_input_features
,
/*cuda float*/
at
::
Tensor
d_output_features
)
{
d_input_features
.
resize_as_
(
input_features
);
d_input_features
.
zero_
();
template
<
typename
T
>
void
cuda_SparseToDense_ForwardPass
(
T
*
input_features
,
T
*
output_features
,
Int
nPlanes
,
Int
spatialVolume
,
RuleBook
_rules
)
{
RULEBOOKITERATOR
((
SparseToDense_fp
<
T
,
32
,
32
><<<
32
,
dim3
(
32
,
32
)
>>>
(
input_features
,
output_features
,
nPlanes
,
spatialVolume
,
rbB
,
nHotB
));
,
output_features
+=
nPlanes
*
spatialVolume
;)
}
if
(
input_features
.
ndimension
()
==
2
)
{
auto
_rules
=
m
.
getSparseToDenseRuleBook
(
inputSize
,
true
);
long
spatialVolume
=
inputSize
.
prod
().
data
<
long
>
()[
0
];
Int
_nPlanes
=
d_input_features
.
size
(
1
);
auto
diF
=
d_input_features
.
data
<
T
>
();
auto
doF
=
d_output_features
.
data
<
T
>
();
RULEBOOKITERATOR
(
SparseToDense_BackwardPass
<
T
>
(
diF
,
doF
,
_nPlanes
,
spatialVolume
,
rbB
,
nHotB
);
,
doF
+=
_nPlanes
*
spatialVolume
;)
// Sparse-to-dense backward kernel. Each rule pairs (sparse row, dense
// offset): the sparse side is row-major (row * nPlanes) while in the dense
// side consecutive planes are spatialVolume elements apart, so plane p is
// read from d_o[p * spatialVolume].
// NTX must be >=2 so r is filled properly
template <typename T, Int NTX, Int NTY>
__global__ void SparseToDense_bp(T *d_input_features, T *d_output_features,
                                 Int nPlanes, Int spatialVolume, Int *rules,
                                 Int nHot) {
  // Shared staging area for the next NTY rule pairs.
  __shared__ Int r[NTY * 2];
  for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
    {
      // First NTY*2 threads copy the remaining rule pairs to shared memory.
      Int i = threadIdx.x + NTX * threadIdx.y;
      if (i < NTY * 2 and i < 2 * (nHot - n))
        r[i] = rules[2 * n + i];
    }
    __syncthreads();
    if (n + threadIdx.y < nHot) {
      T *d_i = d_input_features + r[2 * threadIdx.y] * nPlanes;
      T *d_o = d_output_features + r[2 * threadIdx.y + 1];
      // Straight copy: dense (strided) gradient -> sparse (contiguous) row.
      for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX)
        d_i[plane] = d_o[plane * spatialVolume];
    }
    __syncthreads();
  }
}
// Host-side launcher for the sparse-to-dense backward kernel.
// RULEBOOKITERATOR walks the rule book (`rbB`/`nHotB` are presumably the
// per-batch rule pointer and count — confirm in RuleBookIterator.h). The
// epilogue advances the dense-gradient pointer by one sample's worth of
// data (nPlanes * spatialVolume) after each batch, suggesting one rule
// batch per sample — confirm against the rule-book construction.
template <typename T>
void cuda_SparseToDense_BackwardPass(T *d_input_features,
                                     T *d_output_features, Int nPlanes,
                                     Int spatialVolume, RuleBook _rules) {
  RULEBOOKITERATOR((SparseToDense_bp<T, 32, 32><<<32, dim3(32, 32)>>>(
                       d_input_features, d_output_features, nPlanes,
                       spatialVolume, rbB, nHotB));
                   , d_output_features += nPlanes * spatialVolume;)
}
sparseconvnet/SCN/CUDA/SparseToDense.h
deleted
100644 → 0
View file @
f0407b36
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CUDA_SPARSETODENSE_H
#define CUDA_SPARSETODENSE_H
// (Deleted header version.) Sparse-to-dense forward kernel: each rule pairs
// (sparse row, dense offset). Sparse features are row-major (row * nPlanes);
// in the dense tensor, plane p lands at o[p * spatialVolume].
// NTX must be >=2 so r is filled properly
template <typename T, Int NTX, Int NTY>
__global__ void SparseToDense_fp(T *input_features, T *output_features,
                                 Int nPlanes, Int spatialVolume, Int *rules,
                                 Int nHot) {
  // Shared staging area for the next NTY rule pairs.
  __shared__ Int r[NTY * 2];
  for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
    {
      // First NTY*2 threads copy the remaining rule pairs to shared memory.
      Int i = threadIdx.x + NTX * threadIdx.y;
      if (i < NTY * 2 and i < 2 * (nHot - n))
        r[i] = rules[2 * n + i];
    }
    __syncthreads();
    if (n + threadIdx.y < nHot) {
      T *i = input_features + r[2 * threadIdx.y] * nPlanes;
      T *o = output_features + r[2 * threadIdx.y + 1];
      // Straight copy: sparse (contiguous) row -> dense (strided) cells.
      for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX)
        o[plane * spatialVolume] = i[plane];
    }
    __syncthreads();
  }
}
// (Deleted header version.) Launch the sparse-to-dense forward kernel over
// the whole rule list. Fixed launch shape: 32 blocks of 32x32 threads.
template <typename T>
void SparseToDense_ForwardPass(T *input_features, T *output_features,
                               Int nPlanes, Int spatialVolume, Int *rules,
                               Int nHot) {
  SparseToDense_fp<T, 32, 32><<<32, dim3(32, 32)>>>(
      input_features, output_features, nPlanes, spatialVolume, rules, nHot);
}
// (Deleted header version.) Sparse-to-dense backward kernel: copies each
// dense gradient cell (plane p at offset p * spatialVolume) back into the
// corresponding contiguous sparse row.
// NTX must be >=2 so r is filled properly
template <typename T, Int NTX, Int NTY>
__global__ void SparseToDense_bp(T *d_input_features, T *d_output_features,
                                 Int nPlanes, Int spatialVolume, Int *rules,
                                 Int nHot) {
  // Shared staging area for the next NTY rule pairs.
  __shared__ Int r[NTY * 2];
  for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
    {
      // First NTY*2 threads copy the remaining rule pairs to shared memory.
      Int i = threadIdx.x + NTX * threadIdx.y;
      if (i < NTY * 2 and i < 2 * (nHot - n))
        r[i] = rules[2 * n + i];
    }
    __syncthreads();
    if (n + threadIdx.y < nHot) {
      T *d_i = d_input_features + r[2 * threadIdx.y] * nPlanes;
      T *d_o = d_output_features + r[2 * threadIdx.y + 1];
      for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX)
        d_i[plane] = d_o[plane * spatialVolume];
    }
    __syncthreads();
  }
}
// (Deleted header version.) Launch the sparse-to-dense backward kernel over
// the whole rule list. Fixed launch shape: 32 blocks of 32x32 threads.
template <typename T>
void SparseToDense_BackwardPass(T *d_input_features, T *d_output_features,
                                Int nPlanes, Int spatialVolume, Int *rules,
                                Int nHot) {
  SparseToDense_bp<T, 32, 32><<<32, dim3(32, 32)>>>(
      d_input_features, d_output_features, nPlanes, spatialVolume, rules,
      nHot);
}
#endif
/* CUDA_SPARSETODENSE_H */
Prev
1
2
3
4
5
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment