OpenDAS / SparseConvNet
Commit f9552033 authored Jul 16, 2017 by Benjamin Thomas Graham

initial commit
Changes: 168
Showing 20 changed files with 2602 additions and 0 deletions (+2602 -0)
PyTorch/sparseconvnet/SCN/generic/CPU/MaxPooling.cpp                     +61   -0
PyTorch/sparseconvnet/SCN/generic/CPU/MaxPooling.h                       +36   -0
PyTorch/sparseconvnet/SCN/generic/CPU/NetworkInNetwork.cpp               +124  -0
PyTorch/sparseconvnet/SCN/generic/CPU/NetworkInNetwork.h                 +128  -0
PyTorch/sparseconvnet/SCN/generic/CPU/SparseToDense.cpp                  +64   -0
PyTorch/sparseconvnet/SCN/generic/CPU/SparseToDense.h                    +36   -0
PyTorch/sparseconvnet/SCN/generic/CPU/THGenerateDimFloatTypes.h          +63   -0
PyTorch/sparseconvnet/SCN/generic/CPU/THGenerateFloatTypes.h             +37   -0
PyTorch/sparseconvnet/SCN/generic/GPU/ActivePooling.cu                   +69   -0
PyTorch/sparseconvnet/SCN/generic/GPU/ActivePooling.h                    +57   -0
PyTorch/sparseconvnet/SCN/generic/GPU/AffineReluTrivialConvolution.cu    +51   -0
PyTorch/sparseconvnet/SCN/generic/GPU/AffineReluTrivialConvolution.h     +532  -0
PyTorch/sparseconvnet/SCN/generic/GPU/AveragePooling.cu                  +58   -0
PyTorch/sparseconvnet/SCN/generic/GPU/AveragePooling.h                   +76   -0
PyTorch/sparseconvnet/SCN/generic/GPU/BatchNormalization.cu              +92   -0
PyTorch/sparseconvnet/SCN/generic/GPU/BatchNormalization.h               +213  -0
PyTorch/sparseconvnet/SCN/generic/GPU/BatchwiseMultiplicativeDropout.cu  +75   -0
PyTorch/sparseconvnet/SCN/generic/GPU/BatchwiseMultiplicativeDropout.h   +53   -0
PyTorch/sparseconvnet/SCN/generic/GPU/Convolution.cu                     +153  -0
PyTorch/sparseconvnet/SCN/generic/GPU/Convolution.h                      +624  -0

PyTorch/sparseconvnet/SCN/generic/CPU/MaxPooling.cpp  0 → 100644

// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#ifndef TH_GENERIC_FILE_
#define TH_GENERIC_FILE_ "generic/CPU/MaxPooling.cpp"
#else
#include "MaxPooling.h"

extern "C" void scn_DR_(MaxPooling_updateOutput)(
    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *poolSize,
    THLongTensor *poolStride, void **m, THTensor *input_features,
    THTensor *output_features, long nFeaturesToDrop, void *rulesBuffer) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  uInt nPlanes = input_features->size[1] - nFeaturesToDrop;
  auto _rules =
      _m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  uInt nActive = _m.getNActive(outputSize);
  THTensor_(resize2d)(output_features, nActive,
                      input_features->size[1] - nFeaturesToDrop);
  THTensor_(zero)(output_features);
  auto iF = THTensor_(data)(input_features) + nFeaturesToDrop;
  auto oF = THTensor_(data)(output_features);
  for (auto &r : _rules) {
    uInt nHot = r.size() / 2;
    MaxPooling_ForwardPass<real>(iF, oF, nPlanes, input_features->stride[0],
                                 output_features->stride[0], &r[0], nHot);
  }
}

extern "C" void scn_DR_(MaxPooling_updateGradInput)(
    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *poolSize,
    THLongTensor *poolStride, void **m, THTensor *input_features,
    THTensor *d_input_features, THTensor *output_features,
    THTensor *d_output_features, long nFeaturesToDrop, void *rulesBuffer) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  uInt nPlanes = input_features->size[1] - nFeaturesToDrop;
  auto _rules =
      _m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  uInt nActive = _m.getNActive(outputSize);
  THTensor_(resizeAs)(d_input_features, input_features);
  THTensor_(zero)(d_input_features);
  auto iF = THTensor_(data)(input_features);
  auto oF = THTensor_(data)(output_features);
  auto diF = THTensor_(data)(d_input_features);
  auto doF = THTensor_(data)(d_output_features);
  for (auto &r : _rules) {
    uInt nHot = r.size() / 2;
    MaxPooling_BackwardPass<real>(iF, diF, oF, doF, nPlanes,
                                  input_features->stride[0],
                                  output_features->stride[0], &r[0], nHot);
  }
}
#endif

PyTorch/sparseconvnet/SCN/generic/CPU/MaxPooling.h  0 → 100644

// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#ifndef CPU_MAXPOOLING_H
#define CPU_MAXPOOLING_H
#include "../SparseConvNet.h"

template <typename T>
void MaxPooling_ForwardPass(T *input_features, T *output_features,
                            uInt nPlanes, uInt input_stride,
                            uInt output_stride, uInt *rules, uInt nHot) {
  for (uInt outSite = 0; outSite < nHot; outSite++) {
    uInt i = rules[2 * outSite] * input_stride;
    uInt o = rules[2 * outSite + 1] * output_stride;
    for (uInt plane = 0; plane < nPlanes; plane++)
      if (output_features[o + plane] < input_features[i + plane])
        output_features[o + plane] = input_features[i + plane];
  }
}

template <typename T>
void MaxPooling_BackwardPass(T *input_features, T *d_input_features,
                             T *output_features, T *d_output_features,
                             uInt nPlanes, uInt input_stride,
                             uInt output_stride, uInt *rules, uInt nHot) {
  for (uInt outSite = 0; outSite < nHot; outSite++) {
    uInt i = rules[2 * outSite] * input_stride;
    uInt o = rules[2 * outSite + 1] * output_stride;
    for (uInt plane = 0; plane < nPlanes; plane++)
      if (output_features[o + plane] == input_features[i + plane])
        d_input_features[i + plane] += d_output_features[o + plane];
  }
}
#endif /* CPU_MAXPOOLING_H */
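
The rulebook contract driving these loops is easiest to see in isolation: each rule list is a flat array of (inputSite, outputSite) index pairs, hence nHot = r.size() / 2 and the rules[2*k], rules[2*k+1] indexing. A minimal standalone sketch (toy sizes and values are assumptions, not part of the commit; like the library, it relies on the output being zero-initialized first):

#include <cstdio>

typedef unsigned int uInt;

// Hypothetical stand-in for MaxPooling_ForwardPass above, same contract.
template <typename T>
void maxpool_fp(T *in, T *out, uInt nPlanes, uInt inStride, uInt outStride,
                uInt *rules, uInt nHot) {
  for (uInt s = 0; s < nHot; s++) {
    uInt i = rules[2 * s] * inStride, o = rules[2 * s + 1] * outStride;
    for (uInt p = 0; p < nPlanes; p++)
      if (out[o + p] < in[i + p]) out[o + p] = in[i + p];
  }
}

int main() {
  const uInt nPlanes = 2;
  float in[3 * nPlanes] = {1, 5, 4, 2, 3, 9}; // 3 active input sites
  float out[1 * nPlanes] = {0, 0};            // 1 output site, zeroed first
  uInt rules[] = {0, 0, 1, 0, 2, 0};          // all three pool into site 0
  maxpool_fp(in, out, nPlanes, nPlanes, nPlanes, rules, 3);
  printf("%g %g\n", out[0], out[1]); // prints: 4 9 (per-plane maxima)
  return 0;
}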

PyTorch/sparseconvnet/SCN/generic/CPU/NetworkInNetwork.cpp  0 → 100644

// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/CPU/NetworkInNetwork.cpp"
#else

extern "C" double scn_R_(NetworkInNetwork_updateOutput)(
    THTensor *input_features_, THTensor *output_features_, THTensor *weight_,
    THTensor *bias_) {
  auto nActive = input_features_->size[0];
  auto input_nPlanes = weight_->size[0];
  auto output_nPlanes = weight_->size[1];
  THTensor_(resize2d)(output_features_, nActive, output_nPlanes);
  auto input_features = THTensor_(data)(input_features_);
  auto output_features = THTensor_(data)(output_features_);
  auto weight = THTensor_(data)(weight_);
  if (bias_ != nullptr) {
    // Set bias
    auto bias = THTensor_(data)(bias_);
    for (uInt row = 0; row < nActive; row++)
      for (uInt column = 0; column < output_nPlanes; column++)
        output_features[row * output_nPlanes + column] = bias[column];
    // Do GEMM (note: gemm assumes column-major matrices)
    // buffer is l*m (row-major)
    // weight is r*m (row-major)
    // output_features is l*r (row-major)
    // buffer * T(weights) + bias -> output_features
    THBlas_(gemm)('n', 'n',
                  output_nPlanes,                  // r
                  nActive,                         // l
                  input_nPlanes,                   // m
                  1,                               // alpha
                  weight, output_nPlanes,          // r
                  input_features, input_nPlanes,   // m
                  1,                               // beta
                  output_features, output_nPlanes  // r
                  );
  } else {
    THTensor_(zero)(output_features_);
    THBlas_(gemm)('n', 'n',
                  output_nPlanes,                  // r
                  nActive,                         // l
                  input_nPlanes,                   // m
                  1,                               // alpha
                  weight, output_nPlanes,          // r
                  input_features, input_nPlanes,   // m
                  0,                               // beta
                  output_features, output_nPlanes  // r
                  );
  }
  return nActive * input_nPlanes * output_nPlanes;
}

extern "C" void scn_R_(NetworkInNetwork_updateGradInput)(
    THTensor *d_input_features_, THTensor *d_output_features_,
    THTensor *weight_) {
  auto nActive = d_output_features_->size[0];
  auto input_nPlanes = weight_->size[0];
  auto output_nPlanes = weight_->size[1];
  THTensor_(resize2d)(d_input_features_, nActive, input_nPlanes);
  THTensor_(zero)(d_input_features_);
  auto d_input_features = THTensor_(data)(d_input_features_);
  auto d_output_features = THTensor_(data)(d_output_features_);
  auto weight = THTensor_(data)(weight_);
  // Do GEMM (note: gemm assumes column-major matrices)
  // d_output_features is l*m (row-major)
  // weights is m*r (row-major)
  // d_buffer is l*r (row-major)
  // d_output_features * weight -> d_buffer
  THBlas_(gemm)('t', 'n',
                input_nPlanes,                      // r
                nActive,                            // l
                output_nPlanes,                     // m
                1,                                  // alpha
                weight, output_nPlanes,             // m
                d_output_features, output_nPlanes,  // m
                0,                                  // beta
                d_input_features, input_nPlanes     // r
                );
}

extern "C" void scn_R_(NetworkInNetwork_accGradParameters)(
    THTensor *input_features_, THTensor *d_output_features_,
    THTensor *d_weight_, THTensor *d_bias_) {
  auto nActive = input_features_->size[0];
  auto input_nPlanes = d_weight_->size[0];
  auto output_nPlanes = d_weight_->size[1];
  auto input_features = THTensor_(data)(input_features_);
  auto d_output_features = THTensor_(data)(d_output_features_);
  auto d_weight = THTensor_(data)(d_weight_);
  auto d_bias = d_bias_ and THTensor_(data)(d_bias_);
  // Do GEMM (note: gemm assumes column-major matrices)
  // d_output_features is m*l (row-major)
  // buffer is m*r (row-major)
  // weights is l*r (row-major)
  // T(d_output_features) * buffer -> d_weight
  THBlas_(gemm)('n', 't',
                output_nPlanes,                     // r
                input_nPlanes,                      // l
                nActive,                            // m
                1,                                  // alpha
                d_output_features, output_nPlanes,  // r
                input_features, input_nPlanes,      // l
                1,                                  // beta
                d_weight, output_nPlanes            // r
                );
  if (d_bias_) {
    auto d_bias = THTensor_(data)(d_bias_);
    for (uInt row = 0; row < nActive; row++)
      for (uInt i = 0; i < output_nPlanes; i++)
        d_bias[i] += d_output_features[row * output_nPlanes + i];
  }
}
#endif
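
The 'n','n' GEMM calls above work because a row-major l x m array is bitwise identical to a column-major m x l array: asking a column-major GEMM for the r x l product weight^T (r x m) times input^T (m x l) leaves exactly the row-major l x r product input * weight in memory. A sketch verifying this with a hand-rolled column-major GEMM (gemm_nn is a hypothetical stand-in for THBlas_(gemm); toy sizes assumed):

#include <cstdio>

// Column-major C(m x n) = alpha * A(m x k) * B(k x n) + beta * C, 'n','n' only.
void gemm_nn(long m, long n, long k, float alpha, const float *A, long lda,
             const float *B, long ldb, float beta, float *C, long ldc) {
  for (long j = 0; j < n; j++)
    for (long i = 0; i < m; i++) {
      float acc = 0;
      for (long p = 0; p < k; p++) acc += A[i + p * lda] * B[p + j * ldb];
      C[i + j * ldc] = alpha * acc + beta * C[i + j * ldc];
    }
}

int main() {
  // input: 2 active sites x 3 planes (row-major); weight: 3 x 2 (row-major)
  float in[2 * 3] = {1, 2, 3, 4, 5, 6};
  float w[3 * 2] = {1, 0, 0, 1, 1, 1};
  float out[2 * 2] = {0, 0, 0, 0};
  // Same argument order as the THBlas_(gemm) call in updateOutput:
  gemm_nn(/*r=*/2, /*l=*/2, /*m=*/3, 1, w, 2, in, 3, 0, out, 2);
  // Row-major out = in * w: rows (1+3, 2+3) and (4+6, 5+6)
  printf("%g %g / %g %g\n", out[0], out[1], out[2], out[3]); // 4 5 / 10 11
  return 0;
}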

PyTorch/sparseconvnet/SCN/generic/CPU/NetworkInNetwork.h  0 → 100644

// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#ifndef CPU_NetworkInNetwork_H
#define CPU_NetworkInNetwork_H
#include "../SparseConvNet.h"
#include "Convolution.h"

// buffer must have size >= output_nActive * filterVolume * input_nPlanes
// (buffer holds the pre-gathered input rows, one per (output site, filter
// offset) pair, as filled by the gather loop in NetworkInNetwork_GradWeights)
template <typename T>
void NetworkInNetwork_ForwardPass(
    T *buffer, uInt input_nPlanes, T *output_features, uInt output_nPlanes,
    T *weight, T *bias, uInt filterVolume, uInt output_nActive,
    void (*gemm)(char transa, char transb, long m, long n, long k, T alpha,
                 T *a, long lda, T *b, long ldb, T beta, T *c, long ldc)) {
  if (bias != nullptr) {
    // Set bias
    for (uInt row = 0; row < output_nActive; row++)
      for (uInt column = 0; column < output_nPlanes; column++)
        output_features[row * output_nPlanes + column] = bias[column];
    // Do GEMM (note: gemm assumes column-major matrices)
    // buffer is l*m (row-major)
    // weight is r*m (row-major)
    // output_features is l*r (row-major)
    // buffer * T(weights) + bias -> output_features
    (*gemm)('n', 'n',
            output_nPlanes,                        // r
            output_nActive,                        // l
            input_nPlanes * filterVolume,          // m
            1,                                     // alpha
            weight, output_nPlanes,                // r
            buffer, input_nPlanes * filterVolume,  // m
            1,                                     // beta
            output_features, output_nPlanes        // r
            );
  } else {
    (*gemm)('n', 'n',
            output_nPlanes,                        // r
            output_nActive,                        // l
            input_nPlanes * filterVolume,          // m
            1,                                     // alpha
            weight, output_nPlanes,                // r
            buffer, input_nPlanes * filterVolume,  // m
            0,                                     // beta
            output_features, output_nPlanes        // r
            );
  }
}

template <typename T>
void NetworkInNetwork_BackwardPass(
    T *d_input_features, uInt input_nPlanes, T *d_output_features,
    uInt output_nPlanes, T *weight, uInt *rules, uInt filterVolume,
    uInt output_nActive, T *d_buffer,
    void (*gemm)(char transa, char transb, long m, long n, long k, T alpha,
                 T *a, long lda, T *b, long ldb, T beta, T *c, long ldc)) {
  // Do GEMM (note: gemm assumes column-major matrices)
  // d_output_features is l*m (row-major)
  // weights is m*r (row-major)
  // d_buffer is l*r (row-major)
  // d_output_features * weight -> d_buffer
  (*gemm)('t', 'n',
          input_nPlanes * filterVolume,            // r
          output_nActive,                          // l
          output_nPlanes,                          // m
          1,                                       // alpha
          weight, output_nPlanes,                  // m
          d_output_features, output_nPlanes,       // m
          0,                                       // beta
          d_buffer, input_nPlanes * filterVolume   // r
          );
  // Use rules and d_buffer to accumulate gradient information into d_input
  for (uInt row = 0; row < output_nActive * filterVolume; row++) {
    auto r = rules[row];
    if (r != uInt_MAX) // 2^32-1
      for (uInt i = 0; i < input_nPlanes; i++)
        d_input_features[r * input_nPlanes + i] +=
            d_buffer[row * input_nPlanes + i];
  }
}

template <typename T>
void NetworkInNetwork_GradWeights(
    T *input_features, uInt input_nPlanes, T *d_output_features,
    uInt output_nPlanes, T *d_weight, T *d_bias, uInt *rules,
    uInt filterVolume, uInt output_nActive, T *buffer,
    void (*gemm)(char transa, char transb, long m, long n, long k, T alpha,
                 T *a, long lda, T *b, long ldb, T beta, T *c, long ldc)) {
  // d_weight
  // Use input_features and rules to fill buffer
  for (uInt row = 0; row < output_nActive * filterVolume; row++) {
    if (rules[row] == uInt_MAX) { // 2^32-1
      std::memset(buffer + row * input_nPlanes, 0, sizeof(T) * input_nPlanes);
    } else {
      std::memcpy(buffer + row * input_nPlanes,
                  input_features + rules[row] * input_nPlanes,
                  sizeof(T) * input_nPlanes);
    }
  }
  // Do GEMM (note: gemm assumes column-major matrices)
  // d_output_features is m*l (row-major)
  // buffer is m*r (row-major)
  // weights is l*r (row-major)
  // T(d_output_features) * buffer -> d_weight
  (*gemm)('n', 't',
          output_nPlanes,                          // r
          input_nPlanes * filterVolume,            // l
          output_nActive,                          // m
          1,                                       // alpha
          d_output_features, output_nPlanes,       // r
          buffer, input_nPlanes * filterVolume,    // l
          1,                                       // beta
          d_weight, output_nPlanes                 // r
          );
  if (d_bias)
    for (uInt row = 0; row < output_nActive; row++)
      for (uInt i = 0; i < output_nPlanes; i++)
        d_bias[i] += d_output_features[row * output_nPlanes + i];
}
#endif /* CPU_NetworkInNetwork_H */
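
The backward and gradient functions above share one convention worth spelling out: rules is a flat array of output_nActive * filterVolume slots, each holding either an input row index or uInt_MAX (2^32 - 1) for a pool position with no active input. GradWeights turns that into a dense buffer by copying rows and zero-filling the holes before its GEMM. A standalone sketch of just that gather step (toy sizes are assumptions):

#include <cstdint>
#include <cstdio>
#include <cstring>

typedef unsigned int uInt;
const uInt uInt_MAX = UINT32_MAX; // 2^32 - 1 marks an inactive slot

int main() {
  const uInt input_nPlanes = 2, output_nActive = 2, filterVolume = 2;
  float input[3 * input_nPlanes] = {1, 2, 3, 4, 5, 6}; // 3 active input rows
  uInt rules[output_nActive * filterVolume] = {0, uInt_MAX, 2, 1};
  float buffer[output_nActive * filterVolume * input_nPlanes];
  for (uInt row = 0; row < output_nActive * filterVolume; row++) {
    if (rules[row] == uInt_MAX)
      std::memset(buffer + row * input_nPlanes, 0,
                  sizeof(float) * input_nPlanes); // hole: zero-filled row
    else
      std::memcpy(buffer + row * input_nPlanes,
                  input + rules[row] * input_nPlanes,
                  sizeof(float) * input_nPlanes); // gather input row
  }
  for (uInt row = 0; row < output_nActive * filterVolume; row++)
    printf("%g %g\n", buffer[row * 2], buffer[row * 2 + 1]);
  // prints: 1 2 / 0 0 / 5 6 / 3 4
  return 0;
}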

PyTorch/sparseconvnet/SCN/generic/CPU/SparseToDense.cpp  0 → 100644

// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#ifndef TH_GENERIC_FILE_
#define TH_GENERIC_FILE_ "generic/CPU/SparseToDense.cpp"
#else
#include "SparseToDense.h"

extern "C" void scn_DR_(SparseToDense_updateOutput)(
    THLongTensor *inputSize, void **m, THTensor *input_features,
    THTensor *output_features, void *rulesBuffer) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  {
    long sz[Dimension + 2];
    sz[0] = _m.inputSGs->size();
    sz[1] = input_features->size[1];
    for (int i = 0; i < Dimension; i++) {
      auto x = THLongTensor_data(inputSize)[i];
      sz[i + 2] = x;
    }
    THTensor_(resizeNd)(output_features, Dimension + 2, sz, NULL);
    THTensor_(zero)(output_features);
  }
  auto _rules = _m.getSparseToDenseRuleBook(inputSize, true);
  auto spatialVolume = _rules.size();
  uInt nPlanes = input_features->size[1];
  auto iF = THTensor_(data)(input_features);
  auto oF = THTensor_(data)(output_features);
  for (auto &r : _rules) {
    uInt nHot = r.size() / 2;
    SparseToDense_ForwardPass<real>(iF, oF, nPlanes, spatialVolume, &r[0],
                                    nHot);
    oF += spatialVolume;
  }
}

extern "C" void scn_DR_(SparseToDense_updateGradInput)(
    THLongTensor *inputSize, void **m, THTensor *input_features,
    THTensor *d_input_features, THTensor *d_output_features,
    void *rulesBuffer) {
  THTensor_(resizeAs)(d_input_features, input_features);
  THTensor_(zero)(d_input_features);
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto _rules = _m.getSparseToDenseRuleBook(inputSize, true);
  auto spatialVolume = _rules.size();
  uInt nPlanes = d_input_features->size[1];
  auto diF = THTensor_(data)(d_input_features);
  auto doF = THTensor_(data)(d_output_features);
  for (auto &r : _rules) {
    uInt nHot = r.size() / 2;
    SparseToDense_BackwardPass<real>(diF, doF, nPlanes, spatialVolume, &r[0],
                                     nHot);
    doF += spatialVolume;
  }
}
#endif

PyTorch/sparseconvnet/SCN/generic/CPU/SparseToDense.h  0 → 100644

// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#ifndef CPU_SPARSETODENSE_H
#define CPU_SPARSETODENSE_H
#include "../SparseConvNet.h"

template <typename T>
void SparseToDense_ForwardPass(T *input_features, T *output_features,
                               uInt nPlanes, uInt spatialVolume, uInt *rules,
                               int nHot) {
  for (uInt outSite = 0; outSite < nHot; outSite++) {
    T *i = &input_features[rules[2 * outSite] * nPlanes];
    uInt sample = rules[2 * outSite + 1];
    for (uInt plane = 0; plane < nPlanes; plane++)
      output_features[(sample * nPlanes + plane) * spatialVolume] = i[plane];
  }
}

template <typename T>
void SparseToDense_BackwardPass(T *d_input_features, T *d_output_features,
                                uInt nPlanes, uInt spatialVolume, uInt *rules,
                                int nHot) {
  for (uInt outSite = 0; outSite < nHot; outSite++) {
    T *di = &d_input_features[rules[2 * outSite] * nPlanes];
    uInt sample = rules[2 * outSite + 1];
    for (uInt plane = 0; plane < nPlanes; plane++)
      di[plane] = d_output_features[(sample * nPlanes + plane) * spatialVolume];
  }
}
#endif /* CPU_SPARSETODENSE_H */
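
The addressing here follows from the dense output layout (batch, nPlanes, spatial...) in row-major order: element (sample, plane, spatialOffset) lives at flat index (sample * nPlanes + plane) * spatialVolume + spatialOffset, and the callers in SparseToDense.cpp supply the spatialOffset term by advancing the base pointer with oF += spatialVolume, one rule list per spatial location. A small index-arithmetic check (toy sizes assumed):

#include <cstdio>

int main() {
  const unsigned batch = 2, nPlanes = 3, spatialVolume = 4; // e.g. a 2x2 grid
  float dense[batch * nPlanes * spatialVolume] = {0};

  unsigned sample = 1, plane = 2, spatialOffset = 3;
  // What the "oF += spatialVolume" loop builds: a base pointer already
  // carrying the spatial offset, then the kernels' (sample, plane) stride.
  float *base = dense + spatialOffset;
  base[(sample * nPlanes + plane) * spatialVolume] = 7.0f;

  // Same element via the full row-major formula:
  unsigned flat = (sample * nPlanes + plane) * spatialVolume + spatialOffset;
  printf("%g\n", dense[flat]); // prints: 7
  return 0;
}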

PyTorch/sparseconvnet/SCN/generic/CPU/THGenerateDimFloatTypes.h  0 → 100644

// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE_
#error "Define TH_GENERIC_FILE_ before including THGenerateDimFloatTypes.h"
#endif
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#define Dimension 1
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateFloatTypes.h"
#undef Dimension
#define Dimension 2
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateFloatTypes.h"
#undef Dimension
#define Dimension 3
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateFloatTypes.h"
#undef Dimension
#define Dimension 4
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateFloatTypes.h"
#undef Dimension
#define Dimension 5
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateFloatTypes.h"
#undef Dimension
#define Dimension 6
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateFloatTypes.h"
#undef Dimension
#define Dimension 7
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateFloatTypes.h"
#undef Dimension
#define Dimension 8
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateFloatTypes.h"
#undef Dimension
#define Dimension 9
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateFloatTypes.h"
#undef Dimension
#define Dimension 10
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateFloatTypes.h"
#undef Dimension
#undef TH_GENERIC_FILE_
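
These two headers implement the TH "generic file" pattern: the same source body is re-included once per (Dimension, real) binding, stamping out one concrete function per combination, and macros such as scn_DR_ presumably splice those bindings into the exported symbol names. A self-contained miniature of the idea (hypothetical names throughout; the real mechanism re-includes a file rather than expanding a function-defining macro):

#include <cstdio>

#define CONCAT_(a, b, c) a##_##b##_##c
#define CONCAT(a, b, c) CONCAT_(a, b, c)

// The "generic body": in the real pattern this lives in its own file and is
// re-#include'd with real/Dimension rebound between inclusions.
#define DEFINE_VOLUME(real, Dimension)                                       \
  real CONCAT(volume, real, Dimension)(const real *sides) {                  \
    real v = 1;                                                              \
    for (int i = 0; i < Dimension; i++) v *= sides[i];                       \
    return v;                                                                \
  }

// The THGenerateDimFloatTypes.h role: expand every combination.
DEFINE_VOLUME(float, 2)
DEFINE_VOLUME(double, 3)

int main() {
  float s2[2] = {3, 4};
  double s3[3] = {2, 3, 4};
  printf("%g %g\n", volume_float_2(s2), volume_double_3(s3)); // 12 24
  return 0;
}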

PyTorch/sparseconvnet/SCN/generic/CPU/THGenerateFloatTypes.h  0 → 100644

// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE
#error "You must define TH_GENERIC_FILE before including THGenerateFloatTypes.h"
#endif
#define real float
#define accreal double
#define Real Float
#define TH_REAL_IS_FLOAT
#line 1 TH_GENERIC_FILE
#include TH_GENERIC_FILE
#undef accreal
#undef real
#undef Real
#undef TH_REAL_IS_FLOAT
#define real double
#define accreal double
#define Real Double
#define TH_REAL_IS_DOUBLE
#line 1 TH_GENERIC_FILE
#include TH_GENERIC_FILE
#undef accreal
#undef real
#undef Real
#undef TH_REAL_IS_DOUBLE
#undef TH_GENERIC_FILE

PyTorch/sparseconvnet/SCN/generic/GPU/ActivePooling.cu  0 → 100644

// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#ifndef TH_GENERIC_FILE_
#define TH_GENERIC_FILE_ "generic/GPU/ActivePooling.cu"
#else
#include "ActivePooling.h"

extern "C" void scn_DR_(ActivePooling_updateOutput)(
    THLongTensor *inputSize, void **m, THCTensor *input_features,
    THCTensor *output_features, THCITensor *rulesBuffer, bool average) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  uInt nPlanes = input_features->size[1];
  auto _rules = _m.getActivePoolingRuleBook(inputSize);
  uInt batchSize = _rules[1][0];
  uInt maxActive = _rules[1][1];
  THCTensor_(resize2d)(state, output_features, batchSize, nPlanes);
  THCTensor_(zero)(state, output_features);
  if (THCITensor_nElement(state, rulesBuffer) < 1 << 22)
    THCITensor_resize1d(state, rulesBuffer, 1 << 22);
  uInt *rb = (uInt *)THCITensor_data(state, rulesBuffer);
  uInt rowBatchSize = std::min((uInt)32768, (1 << 22) / (maxActive + 1));
  THAssert(rowBatchSize > 0);
  auto iF = THCTensor_(data)(state, input_features);
  auto oF = THCTensor_(data)(state, output_features);
  for (uInt o = 0; o < batchSize; o += rowBatchSize) {
    uInt batchSize_ = std::min(rowBatchSize, (uInt)(batchSize - o));
    cudaMemcpy(rb, &_rules[0][o * (maxActive + 1)],
               sizeof(uInt) * (maxActive + 1) * batchSize_,
               cudaMemcpyHostToDevice);
    ActivePooling_ForwardPass<real>(iF, oF + o * nPlanes, batchSize_,
                                    maxActive, nPlanes, rb, average);
  }
}

extern "C" void scn_DR_(ActivePooling_updateGradInput)(
    THLongTensor *inputSize, void **m, THCTensor *input_features,
    THCTensor *d_input_features, THCTensor *d_output_features,
    THCITensor *rulesBuffer, bool average) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  uInt nPlanes = input_features->size[1];
  auto _rules = _m.getActivePoolingRuleBook(inputSize);
  uInt batchSize = _rules[1][0];
  uInt maxActive = _rules[1][1];
  THCTensor_(resizeAs)(state, d_input_features, input_features);
  THCTensor_(zero)(state, d_input_features);
  if (THCITensor_nElement(state, rulesBuffer) < 1 << 22)
    THCITensor_resize1d(state, rulesBuffer, 1 << 22);
  uInt *rb = (uInt *)THCITensor_data(state, rulesBuffer);
  uInt rowBatchSize = std::min((uInt)32768, (1 << 22) / (maxActive + 1));
  THAssert(rowBatchSize > 0);
  auto diF = THCTensor_(data)(state, d_input_features);
  auto doF = THCTensor_(data)(state, d_output_features);
  for (uInt o = 0; o < batchSize; o += rowBatchSize) {
    uInt batchSize_ = std::min(rowBatchSize, (uInt)(batchSize - o));
    cudaMemcpy(rb, &_rules[0][o * (maxActive + 1)],
               sizeof(uInt) * (maxActive + 1) * batchSize_,
               cudaMemcpyHostToDevice);
    ActivePooling_BackwardPass<real>(diF, doF + o * nPlanes, batchSize_,
                                     maxActive, nPlanes, rb, average);
  }
}
#endif

PyTorch/sparseconvnet/SCN/generic/GPU/ActivePooling.h  0 → 100644

// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#ifndef GPU_ACTIVEPOOLING_H
#define GPU_ACTIVEPOOLING_H

template <typename T>
__global__ void ActivePooling_fp(T *input_features, T *output_features,
                                 uInt maxActive, uInt nPlanes, uInt *rules,
                                 bool average) {
  T *out = &output_features[blockIdx.x * nPlanes];
  uInt *r = &rules[blockIdx.x * (maxActive + 1)];
  uInt nActive = *r++;
  T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
  while (nActive-- > 0) {
    T *inp = &input_features[(*r++) * nPlanes];
    for (uInt plane = threadIdx.x; plane < nPlanes; plane += 32)
      out[plane] += inp[plane] * multiplier;
  }
}

template <typename T>
void ActivePooling_ForwardPass(T *input_features, T *output_features,
                               uInt batchSize, uInt maxActive, uInt nPlanes,
                               uInt *rules, bool average) {
  uInt kernelBlockDim = std::min(nPlanes, (uInt)32);
  ActivePooling_fp<T><<<batchSize, kernelBlockDim, 0,
                        THCState_getCurrentStream(state)>>>(
      input_features, output_features, maxActive, nPlanes, rules, average);
}

template <typename T>
__global__ void ActivePooling_bp(T *d_input_features, T *d_output_features,
                                 uInt maxActive, uInt nPlanes, uInt *rules,
                                 bool average) {
  T *out = &d_output_features[blockIdx.x * nPlanes];
  uInt *r = &rules[blockIdx.x * (maxActive + 1)];
  uInt nActive = *r++;
  T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
  while (nActive-- > 0) {
    T *inp = &d_input_features[(*r++) * nPlanes];
    for (uInt plane = threadIdx.x; plane < nPlanes; plane += 32)
      inp[plane] = out[plane] * multiplier;
  }
}

template <typename T>
void ActivePooling_BackwardPass(T *d_input_features, T *d_output_features,
                                uInt batchSize, uInt maxActive, uInt nPlanes,
                                uInt *rules, bool average) {
  uInt kernelBlockDim = std::min(nPlanes, (uInt)32);
  ActivePooling_bp<T><<<batchSize, kernelBlockDim, 0,
                        THCState_getCurrentStream(state)>>>(
      d_input_features, d_output_features, maxActive, nPlanes, rules, average);
}
#endif /* GPU_ActivePOOLING_H */
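
The rules layout used by ActivePooling differs from the (input, output) pair format of the other pooling layers: each batch item owns maxActive + 1 slots, a count followed by up to maxActive input row indices, and *r++ pops the count. A CPU sketch of the forward pass under that layout (toy data assumed; the real kernel parallelizes over batch items and planes):

#include <cstdio>

typedef unsigned int uInt;

int main() {
  const uInt nPlanes = 2, maxActive = 3, batchSize = 2;
  float in[4 * nPlanes] = {1, 2, 3, 4, 5, 6, 7, 8}; // 4 active sites in total
  float out[batchSize * nPlanes] = {0};
  uInt rules[batchSize * (maxActive + 1)] = {3, 0, 1, 2,  // item 0: 3 sites
                                             1, 3, 0, 0}; // item 1: 1 site
  for (uInt b = 0; b < batchSize; b++) {
    const uInt *r = &rules[b * (maxActive + 1)];
    uInt nActive = *r++; // first slot is the count
    float mult = nActive > 0 ? 1.0f / nActive : 1.0f; // average=true case
    while (nActive-- > 0) {
      const float *inp = &in[(*r++) * nPlanes];
      for (uInt p = 0; p < nPlanes; p++) out[b * nPlanes + p] += inp[p] * mult;
    }
  }
  printf("%g %g / %g %g\n", out[0], out[1], out[2], out[3]); // 3 4 / 7 8
  return 0;
}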

PyTorch/sparseconvnet/SCN/generic/GPU/AffineReluTrivialConvolution.cu  0 → 100644

// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/GPU/AffineReluTrivialConvolution.cu"
#else
#include "AffineReluTrivialConvolution.h"
#include <algorithm>
#include <iostream>

extern "C" void scn_R_(AffineReluTrivialConvolution_updateOutput)(
    THCTensor *input_features, THCTensor *output_features,
    THCTensor *affineWeight, THCTensor *affineBias, THCTensor *convWeight) {
  THCTensor_(resize2d)(state, output_features, input_features->size[0],
                       convWeight->size[1]);
  dAffineReluTrivialConvolution_forward<real>(
      THCTensor_(data)(state, input_features),
      THCTensor_(data)(state, output_features),
      THCTensor_(data)(state, affineWeight),
      THCTensor_(data)(state, affineBias),
      THCTensor_(data)(state, convWeight), convWeight->size[0],
      input_features->stride[0], convWeight->size[1],
      output_features->size[1], input_features->size[0]);
}

extern "C" void scn_R_(AffineReluTrivialConvolution_backward)(
    THCTensor *input_features, THCTensor *d_input_features,
    THCTensor *d_output_features, THCTensor *affineWeight,
    THCTensor *d_affineWeight, THCTensor *affineBias, THCTensor *d_affineBias,
    THCTensor *convWeight, THCTensor *d_convWeight, bool additiveGrad) {
  THCTensor_(resizeAs)(state, d_input_features, input_features);
  dAffineReluTrivialConvolution_backward_dW<real>(
      THCTensor_(data)(state, input_features),
      THCTensor_(data)(state, d_input_features),
      THCTensor_(data)(state, d_output_features),
      THCTensor_(data)(state, affineWeight),
      THCTensor_(data)(state, d_affineWeight),
      THCTensor_(data)(state, affineBias),
      THCTensor_(data)(state, d_affineBias),
      THCTensor_(data)(state, convWeight),
      THCTensor_(data)(state, d_convWeight), convWeight->size[0],
      input_features->stride[0], convWeight->size[1],
      d_output_features->stride[0], input_features->size[0], additiveGrad);
}
#endif

PyTorch/sparseconvnet/SCN/generic/GPU/AffineReluTrivialConvolution.h  0 → 100644

// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#ifndef GPU_AFFINERELUTRIVIALCONVOLUTION_H
#define GPU_AFFINERELUTRIVIALCONVOLUTION_H

// check if A+B is faster than just B
// check if loading affineBias into shared memory is faster than loading
// multiple times (if not try 64,16 backwards case)

template <typename T, uInt K, uInt V>
__global__ void dAffineReluTrivialConvolution_forwardA(
    T *inFeatures, T *outFeatures, T *affineWeight, T *affineBias,
    T *convWeight, uInt input_nPlanes, uInt input_stride, uInt output_nPlanes,
    uInt output_stride, uInt nActive) {
  // nActive must be a multiple of K!!
  // Input x Weight -> Output
  // blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
  // K is a multiple of V,
  // nActive x KM -> nActive x KN - parallel over N,nActive - loop over M
  uInt M = input_nPlanes / K;
  // N = gridDim.y == output_nPlanes/K
  uInt n = blockIdx.y;
  outFeatures += n * K;
  convWeight += n * K;
  T O[V];
  __shared__ T I[K][K];
  __shared__ T AW[K];
  __shared__ T AB[K];
  __shared__ T CW[K][K];
  const uInt tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int m = 0; m < M; m++) {
    // Read affineWeight, affineBias and convWeight
    if (ty[0] == 0) {
      AW[tx] = affineWeight[tx];
      AB[tx] = affineBias[tx];
    }
#pragma unroll
    for (int v = 0; v < V; v++)
      CW[ty[v]][tx] = convWeight[ty[v] * output_nPlanes + tx];
    __syncthreads();
    for (uInt s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
      // Read input, do affine + relu, set O[]
#pragma unroll
      for (int v = 0; v < V; v++) {
        T i = inFeatures[(s + ty[v]) * input_stride + tx] * AW[tx] + AB[tx];
        I[ty[v]][tx] = (i > 0) ? i : 0;
        if (m == 0) {
          O[v] = 0;
        } else {
          O[v] = outFeatures[(s + ty[v]) * output_stride + tx];
        }
      }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          O[v] += I[ty[v]][k] * CW[k][tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        outFeatures[(s + ty[v]) * output_stride + tx] = O[v];
      __syncthreads();
    }
    affineWeight += K;
    affineBias += K;
    convWeight += K * output_nPlanes;
    inFeatures += K;
  }
}

template <typename T, uInt K, uInt V>
__global__ void dAffineReluTrivialConvolution_forwardB(
    T *inFeatures, T *outFeatures, T *affineWeight, T *affineBias,
    T *convWeight, uInt input_nPlanes, uInt input_stride, uInt output_nPlanes,
    uInt output_stride, uInt nActive) {
  // Input x Weight -> Output
  // blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
  // K is a multiple of V,
  // nActive x KM -> nActive x KN - parallel over N,nActive - loop over M
  uInt M = input_nPlanes / K;
  // N = gridDim.y == output_nPlanes/K
  uInt n = blockIdx.y;
  outFeatures += n * K;
  convWeight += n * K;
  T O[V];
  __shared__ T I[K][K]; // zz try K+1 trick A+B+backwards
  __shared__ T AW[K];
  __shared__ T AB[K];
  __shared__ T CW[K][K];
  const uInt tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int m = 0; m < M; m++) {
    // Read affineWeight, affineBias and convWeight
    if (ty[0] == 0) {
      AW[tx] = affineWeight[tx];
      AB[tx] = affineBias[tx];
    }
#pragma unroll
    for (int v = 0; v < V; v++)
      CW[ty[v]][tx] = convWeight[ty[v] * output_nPlanes + tx];
    __syncthreads();
    for (uInt s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
      // Read input, do affine + relu, set O[]
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (s + ty[v] < nActive) {
          T i = inFeatures[(s + ty[v]) * input_stride + tx] * AW[tx] + AB[tx];
          I[ty[v]][tx] = (i > 0) ? i : 0;
          if (m == 0) {
            O[v] = 0;
          } else {
            O[v] = outFeatures[(s + ty[v]) * output_stride + tx];
          }
        }
      }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          O[v] += I[ty[v]][k] * CW[k][tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nActive)
          outFeatures[(s + ty[v]) * output_stride + tx] = O[v];
      __syncthreads();
    }
    affineWeight += K;
    affineBias += K;
    convWeight += K * output_nPlanes;
    inFeatures += K;
  }
}

template <typename T>
void dAffineReluTrivialConvolution_forward(
    T *inFeatures, T *outFeatures, T *affineWeight, T *affineBias,
    T *convWeight, uInt input_nPlanes, uInt input_stride, uInt output_nPlanes,
    uInt output_stride, uInt nActive) {
  {
    const uInt K = 64;
    const uInt V = 16;
    if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {
      uInt o = (nActive / K) * K;
      if (o > 0)
        dAffineReluTrivialConvolution_forwardA<T, K, V>
            <<<dim3(std::min(o / K, (uInt)512), output_nPlanes / K),
               dim3(K, K / V), 0, THCState_getCurrentStream(state)>>>(
                inFeatures, outFeatures, affineWeight, affineBias, convWeight,
                input_nPlanes, input_stride, output_nPlanes, output_stride, o);
      if (nActive > o)
        dAffineReluTrivialConvolution_forwardB<T, K, V>
            <<<dim3(1, output_nPlanes / K), dim3(K, K / V), 0,
               THCState_getCurrentStream(state)>>>(
                inFeatures + o * input_stride, outFeatures + o * output_stride,
                affineWeight, affineBias, convWeight, input_nPlanes,
                input_stride, output_nPlanes, output_stride, nActive - o);
      return;
    }
  }
  {
    const uInt K = 32;
    const uInt V = 4;
    if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {
      uInt o = (nActive / K) * K;
      if (o > 0)
        dAffineReluTrivialConvolution_forwardA<T, K, V>
            <<<dim3(std::min(o / K, (uInt)512), output_nPlanes / K),
               dim3(K, K / V), 0, THCState_getCurrentStream(state)>>>(
                inFeatures, outFeatures, affineWeight, affineBias, convWeight,
                input_nPlanes, input_stride, output_nPlanes, output_stride, o);
      if (nActive > o)
        dAffineReluTrivialConvolution_forwardB<T, K, V>
            <<<dim3(1, output_nPlanes / K), dim3(K, K / V), 0,
               THCState_getCurrentStream(state)>>>(
                inFeatures + o * input_stride, outFeatures + o * output_stride,
                affineWeight, affineBias, convWeight, input_nPlanes,
                input_stride, output_nPlanes, output_stride, nActive - o);
      return;
    }
  }
  {
    const uInt K = 16;
    const uInt V = 4;
    if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {
      uInt o = (nActive / K) * K;
      if (o > 0)
        dAffineReluTrivialConvolution_forwardA<T, K, V>
            <<<dim3(std::min(o / K, (uInt)512), output_nPlanes / K),
               dim3(K, K / V), 0, THCState_getCurrentStream(state)>>>(
                inFeatures, outFeatures, affineWeight, affineBias, convWeight,
                input_nPlanes, input_stride, output_nPlanes, output_stride, o);
      if (nActive > o)
        dAffineReluTrivialConvolution_forwardB<T, K, V>
            <<<dim3(1, output_nPlanes / K), dim3(K, K / V), 0,
               THCState_getCurrentStream(state)>>>(
                inFeatures + o * input_stride, outFeatures + o * output_stride,
                affineWeight, affineBias, convWeight, input_nPlanes,
                input_stride, output_nPlanes, output_stride, nActive - o);
      return;
    }
  }
  {
    const uInt K = 8;
    const uInt V = 2;
    if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {
      uInt o = (nActive / K) * K;
      if (o > 0)
        dAffineReluTrivialConvolution_forwardA<T, K, V>
            <<<dim3(std::min(o / K, (uInt)512), output_nPlanes / K),
               dim3(K, K / V), 0, THCState_getCurrentStream(state)>>>(
                inFeatures, outFeatures, affineWeight, affineBias, convWeight,
                input_nPlanes, input_stride, output_nPlanes, output_stride, o);
      if (nActive > o)
        dAffineReluTrivialConvolution_forwardB<T, K, V>
            <<<dim3(1, output_nPlanes / K), dim3(K, K / V), 0,
               THCState_getCurrentStream(state)>>>(
                inFeatures + o * input_stride, outFeatures + o * output_stride,
                affineWeight, affineBias, convWeight, input_nPlanes,
                input_stride, output_nPlanes, output_stride, nActive - o);
      return;
    }
  }
  assert(false);
}

// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template <typename T, uInt K, uInt V>
__global__ void dAffineReluTrivialConvolution_backward_dW_A(
    T *inFeatures, T *dInFeatures, T *dOutFeatures, T *affineWeight,
    T *dAffineWeight, T *affineBias, T *dAffineBias, T *convWeight,
    T *dConvWeight, uInt input_nPlanes, uInt input_stride,
    uInt output_nPlanes, uInt output_stride, uInt nActive, bool additiveGrad) {
  // M = gridDim.y == input_nPlanes / K
  uInt N = output_nPlanes / K;
  uInt m = blockIdx.y;
  inFeatures += m * K;
  dInFeatures += m * K;
  convWeight += m * K * output_nPlanes;
  dConvWeight += m * K * output_nPlanes;
  affineWeight += m * K;
  dAffineWeight += m * K;
  affineBias += m * K;
  dAffineBias += m * K;
  T dI[V];
  T dCW[V];
  T i[V];
  T dAW = 0;
  T dAB = 0;
  __shared__ T I[K][K];
  __shared__ T dO[K][K];
  __shared__ T AW[K];
  __shared__ T AB[K];
  __shared__ T CW[K][K];
  const uInt tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  if (ty[0] == 0) {
    AW[tx] = affineWeight[tx];
    AB[tx] = affineBias[tx];
  }
  for (int n = 0; n < N; n++) {
    // Read w, reset dW
#pragma unroll
    for (int v = 0; v < V; v++) {
      CW[ty[v]][tx] = convWeight[ty[v] * output_nPlanes + tx];
      dCW[v] = 0;
    }
    __syncthreads();
    for (uInt s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
#pragma unroll
      for (int v = 0; v < V; v++)
        dI[v] = 0;
      __syncthreads();
      // Read input and dOutput
#pragma unroll
      for (int v = 0; v < V; v++) {
        T i_ = inFeatures[(s + ty[v]) * input_stride + tx];
        i[v] = i_;
        i_ = i_ * AW[tx] + AB[tx];
        I[ty[v]][tx] = (i_ > 0) ? i_ : 0;
        dO[ty[v]][tx] = dOutFeatures[(s + ty[v]) * output_stride + tx];
      }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++) {
          dI[v] += dO[ty[v]][k] * CW[tx][k];
          dCW[v] += I[k][ty[v]] * dO[k][tx];
        }
#pragma unroll
      for (int v = 0; v < V; v++) {
        dI[v] = (I[ty[v]][tx] > 0) ? dI[v] : 0;
        dAW += i[v] * dI[v];
        dAB += dI[v];
        if (additiveGrad)
          dInFeatures[(s + ty[v]) * input_stride + tx] += dI[v];
        else
          dInFeatures[(s + ty[v]) * input_stride + tx] = dI[v];
      }
      __syncthreads();
    }
#pragma unroll
    for (int v = 0; v < V; v++)
      atomicAdd(&dConvWeight[ty[v] * output_nPlanes + tx], dCW[v]);
    convWeight += K;
    dConvWeight += K;
    dOutFeatures += K;
    __syncthreads();
  }
  atomicAdd(&dAffineWeight[tx], dAW);
  atomicAdd(&dAffineBias[tx], dAB);
}

// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template <typename T, uInt K, uInt V>
__global__ void dAffineReluTrivialConvolution_backward_dW_B(
    T *inFeatures, T *dInFeatures, T *dOutFeatures, T *affineWeight,
    T *dAffineWeight, T *affineBias, T *dAffineBias, T *convWeight,
    T *dConvWeight, uInt input_nPlanes, uInt input_stride,
    uInt output_nPlanes, uInt output_stride, uInt nActive, bool additiveGrad) {
  // M = gridDim.y == input_nPlanes / K
  uInt N = output_nPlanes / K;
  uInt m = blockIdx.y;
  inFeatures += m * K;
  dInFeatures += m * K;
  convWeight += m * K * output_nPlanes;
  dConvWeight += m * K * output_nPlanes;
  affineWeight += m * K;
  dAffineWeight += m * K;
  affineBias += m * K;
  dAffineBias += m * K;
  T dI[V];
  T dCW[V];
  T i[V];
  T dAW = 0;
  T dAB = 0;
  __shared__ T I[K][K];
  __shared__ T dO[K][K];
  __shared__ T AW[K];
  __shared__ T AB[K];
  __shared__ T CW[K][K];
  const uInt tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  if (ty[0] == 0) {
    AW[tx] = affineWeight[tx];
    AB[tx] = affineBias[tx];
  }
  for (int n = 0; n < N; n++) {
    // Read w, reset dW
#pragma unroll
    for (int v = 0; v < V; v++) {
      CW[ty[v]][tx] = convWeight[ty[v] * output_nPlanes + tx];
      dCW[v] = 0;
    }
    __syncthreads();
    for (uInt s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
#pragma unroll
      for (int v = 0; v < V; v++)
        dI[v] = 0;
      __syncthreads();
      // Read input and dOutput
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nActive) {
          T i_ = inFeatures[(s + ty[v]) * input_stride + tx];
          i[v] = i_;
          i_ = i_ * AW[tx] + AB[tx];
          I[ty[v]][tx] = (i_ > 0) ? i_ : 0;
          dO[ty[v]][tx] = dOutFeatures[(s + ty[v]) * output_stride + tx];
        } else {
          i[v] = 0;
          I[ty[v]][tx] = 0;
          dO[ty[v]][tx] = 0;
        }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++) {
          dI[v] += dO[ty[v]][k] * CW[tx][k];
          dCW[v] += I[k][ty[v]] * dO[k][tx];
        }
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nActive) {
          dI[v] = (I[ty[v]][tx] > 0) ? dI[v] : 0;
          dAW += i[v] * dI[v];
          dAB += dI[v];
          if (additiveGrad)
            dInFeatures[(s + ty[v]) * input_stride + tx] += dI[v];
          else
            dInFeatures[(s + ty[v]) * input_stride + tx] = dI[v];
        }
      __syncthreads();
    }
#pragma unroll
    for (int v = 0; v < V; v++)
      atomicAdd(&dConvWeight[ty[v] * output_nPlanes + tx], dCW[v]);
    convWeight += K;
    dConvWeight += K;
    dOutFeatures += K;
    __syncthreads();
  }
  atomicAdd(&dAffineWeight[tx], dAW);
  atomicAdd(&dAffineBias[tx], dAB);
}

template <typename T>
void dAffineReluTrivialConvolution_backward_dW(
    T *inFeatures, T *dInFeatures, T *dOutFeatures, T *affineWeight,
    T *dAffineWeight, T *affineBias, T *dAffineBias, T *convWeight,
    T *dConvWeight, uInt input_nPlanes, uInt input_stride,
    uInt output_nPlanes, uInt output_stride, uInt nActive, bool additiveGrad) {
  {
    const uInt K = 32;
    const uInt V = 8;
    if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {
      uInt o = (nActive / K) * K;
      if (o > 0)
        dAffineReluTrivialConvolution_backward_dW_A<T, K, V>
            <<<dim3(std::min(o / K, (uInt)512), input_nPlanes / K),
               dim3(K, K / V), 0, THCState_getCurrentStream(state)>>>(
                inFeatures, dInFeatures, dOutFeatures, affineWeight,
                dAffineWeight, affineBias, dAffineBias, convWeight,
                dConvWeight, input_nPlanes, input_stride, output_nPlanes,
                output_stride, o, additiveGrad);
      if (nActive > o)
        dAffineReluTrivialConvolution_backward_dW_B<T, K, V>
            <<<dim3(1, input_nPlanes / K), dim3(K, K / V), 0,
               THCState_getCurrentStream(state)>>>(
                inFeatures + o * input_stride, dInFeatures + o * input_stride,
                dOutFeatures + o * output_stride, affineWeight, dAffineWeight,
                affineBias, dAffineBias, convWeight, dConvWeight,
                input_nPlanes, input_stride, output_nPlanes, output_stride,
                nActive - o, additiveGrad);
      return;
    }
  }
  {
    const uInt K = 16;
    const uInt V = 4;
    if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {
      uInt o = (nActive / K) * K;
      if (o > 0)
        dAffineReluTrivialConvolution_backward_dW_A<T, K, V>
            <<<dim3(std::min(o / K, (uInt)512), input_nPlanes / K),
               dim3(K, K / V), 0, THCState_getCurrentStream(state)>>>(
                inFeatures, dInFeatures, dOutFeatures, affineWeight,
                dAffineWeight, affineBias, dAffineBias, convWeight,
                dConvWeight, input_nPlanes, input_stride, output_nPlanes,
                output_stride, o, additiveGrad);
      if (nActive > o)
        dAffineReluTrivialConvolution_backward_dW_B<T, K, V>
            <<<dim3(1, input_nPlanes / K), dim3(K, K / V), 0,
               THCState_getCurrentStream(state)>>>(
                inFeatures + o * input_stride, dInFeatures + o * input_stride,
                dOutFeatures + o * output_stride, affineWeight, dAffineWeight,
                affineBias, dAffineBias, convWeight, dConvWeight,
                input_nPlanes, input_stride, output_nPlanes, output_stride,
                nActive - o, additiveGrad);
      return;
    }
  }
  {
    const uInt K = 8;
    const uInt V = 2;
    if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {
      uInt o = (nActive / K) * K;
      if (o > 0)
        dAffineReluTrivialConvolution_backward_dW_A<T, K, V>
            <<<dim3(std::min(o / K, (uInt)512), input_nPlanes / K),
               dim3(K, K / V), 0, THCState_getCurrentStream(state)>>>(
                inFeatures, dInFeatures, dOutFeatures, affineWeight,
                dAffineWeight, affineBias, dAffineBias, convWeight,
                dConvWeight, input_nPlanes, input_stride, output_nPlanes,
                output_stride, o, additiveGrad);
      if (nActive > o)
        dAffineReluTrivialConvolution_backward_dW_B<T, K, V>
            <<<dim3(1, input_nPlanes / K), dim3(K, K / V), 0,
               THCState_getCurrentStream(state)>>>(
                inFeatures + o * input_stride, dInFeatures + o * input_stride,
                dOutFeatures + o * output_stride, affineWeight, dAffineWeight,
                affineBias, dAffineBias, convWeight, dConvWeight,
                input_nPlanes, input_stride, output_nPlanes, output_stride,
                nActive - o, additiveGrad);
      return;
    }
  }
}
#endif
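
Stripped of the tiling, what the kernels in this header compute is relu(input * affineWeight + affineBias), applied per input plane, fed into a 1x1 ("trivial") convolution with convWeight, using K x K shared-memory tiles and V register-resident partial outputs per thread (the Volkov-style blocking the comments mention). A scalar reference sketch of just the semantics (toy sizes assumed; this is not the library API):

#include <algorithm>
#include <cstdio>

typedef unsigned int uInt;

// Hypothetical reference: out[s][o] = sum_i relu(in[s][i]*aw[i]+ab[i]) * cw[i][o]
void reference_forward(const float *in, float *out, const float *aw,
                       const float *ab, const float *cw, uInt inPlanes,
                       uInt outPlanes, uInt nActive) {
  for (uInt s = 0; s < nActive; s++)
    for (uInt o = 0; o < outPlanes; o++) {
      float acc = 0;
      for (uInt i = 0; i < inPlanes; i++) {
        float a = in[s * inPlanes + i] * aw[i] + ab[i];   // per-plane affine
        acc += std::max(a, 0.0f) * cw[i * outPlanes + o]; // relu + 1x1 conv
      }
      out[s * outPlanes + o] = acc;
    }
}

int main() {
  float in[1 * 2] = {1, -1}, aw[2] = {2, 2}, ab[2] = {0, 1};
  float cw[2 * 1] = {1, 1}, out[1];
  reference_forward(in, out, aw, ab, cw, 2, 1, 1);
  // relu(1*2+0) = 2, relu(-1*2+1) = 0, so out = 2*1 + 0*1 = 2
  printf("%g\n", out[0]); // prints: 2
  return 0;
}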

PyTorch/sparseconvnet/SCN/generic/GPU/AveragePooling.cu  0 → 100644

// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#ifndef TH_GENERIC_FILE_
#define TH_GENERIC_FILE_ "generic/GPU/AveragePooling.cu"
#else
#include "AveragePooling.h"
#include "RuleBookIterator.h"

extern "C" void scn_DR_(AveragePooling_updateOutput)(
    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *poolSize,
    THLongTensor *poolStride, void **m, THCTensor *input_features,
    THCTensor *output_features, long nFeaturesToDrop,
    THCITensor *rulesBuffer) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  uInt nPlanes = input_features->size[1] - nFeaturesToDrop;
  auto _rules =
      _m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  uInt nActive = _m.getNActive(outputSize);
  THCTensor_(resize2d)(state, output_features, nActive,
                       input_features->size[1] - nFeaturesToDrop);
  THCTensor_(zero)(state, output_features);
  auto iF = THCTensor_(data)(state, input_features) + nFeaturesToDrop;
  auto oF = THCTensor_(data)(state, output_features);
  RULEBOOKITERATOR(AveragePooling_ForwardPass<real>(
                       THCState_getCurrentStream(state), iF, oF, nPlanes,
                       input_features->size[1], output_features->size[1], rbB,
                       nHotB, _rules.size());
                   , )
}

extern "C" void scn_DR_(AveragePooling_updateGradInput)(
    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *poolSize,
    THLongTensor *poolStride, void **m, THCTensor *input_features,
    THCTensor *d_input_features, THCTensor *d_output_features,
    long nFeaturesToDrop, THCITensor *rulesBuffer) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  uInt nPlanes = input_features->size[1] - nFeaturesToDrop;
  auto _rules =
      _m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  uInt nActive = _m.getNActive(outputSize);
  THCTensor_(resizeAs)(state, d_input_features, input_features);
  THCTensor_(zero)(state, d_input_features);
  auto diF = THCTensor_(data)(state, d_input_features) + nFeaturesToDrop;
  auto doF = THCTensor_(data)(state, d_output_features);
  RULEBOOKITERATOR(AveragePooling_BackwardPass<real>(
                       THCState_getCurrentStream(state), diF, doF, nPlanes,
                       input_features->size[1], d_output_features->size[1],
                       rbB, nHotB, _rules.size());
                   , )
}
#endif

PyTorch/sparseconvnet/SCN/generic/GPU/AveragePooling.h  0 → 100644

// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#ifndef GPU_AVERAGEPOOLING_H
#define GPU_AVERAGEPOOLING_H

// NTX must be >=2 so r is filled properly
template <typename T, uInt NTX, uInt NTY>
__global__ void AveragePooling_fp(T *input_features, T *output_features,
                                  uInt nPlanes, uInt input_stride,
                                  uInt output_stride, uInt *rules, uInt nHot,
                                  T alpha) {
  __shared__ uInt r[NTY * 2];
  for (uInt n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
    {
      uInt i = threadIdx.x + NTX * threadIdx.y;
      if (i < NTY * 2 and i < 2 * (nHot - n))
        r[i] = rules[2 * n + i];
    }
    __syncthreads();
    if (n + threadIdx.y < nHot) {
      uInt i = r[2 * threadIdx.y] * input_stride;
      uInt o = r[2 * threadIdx.y + 1] * output_stride;
      for (uInt plane = threadIdx.x; plane < nPlanes; plane += NTX)
        atomicAdd(&output_features[o + plane],
                  alpha * input_features[i + plane]);
    }
    __syncthreads();
  }
}

template <typename T>
void AveragePooling_ForwardPass(cudaStream_t stream, T *input_features,
                                T *output_features, uInt nPlanes,
                                uInt input_stride, uInt output_stride,
                                uInt *rules, uInt nHot, uInt filterVolume) {
  AveragePooling_fp<T, 32, 32><<<32, dim3(32, 32), 0, stream>>>(
      input_features, output_features, nPlanes, input_stride, output_stride,
      rules, nHot, 1.0 / filterVolume);
}

template <typename T, uInt NTX, uInt NTY>
__global__ void AveragePooling_bp(T *d_input_features, T *d_output_features,
                                  uInt nPlanes, uInt input_stride,
                                  uInt output_stride, uInt *rules, uInt nHot,
                                  T alpha) {
  __shared__ uInt r[NTY * 2];
  for (uInt n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
    {
      uInt i = threadIdx.x + NTX * threadIdx.y;
      if (i < NTY * 2 and i < 2 * (nHot - n))
        r[i] = rules[2 * n + i];
    }
    __syncthreads();
    if (n + threadIdx.y < nHot) {
      uInt i = r[2 * threadIdx.y] * input_stride;
      uInt o = r[2 * threadIdx.y + 1] * output_stride;
      for (uInt plane = threadIdx.x; plane < nPlanes; plane += NTX)
        d_input_features[i + plane] += alpha * d_output_features[o + plane];
    }
    __syncthreads();
  }
}

template <typename T>
void AveragePooling_BackwardPass(cudaStream_t stream, T *d_input_features,
                                 T *d_output_features, uInt nPlanes,
                                 uInt input_stride, uInt output_stride,
                                 uInt *rules, uInt nHot, uInt filterVolume) {
  AveragePooling_bp<T, 32, 32><<<32, dim3(32, 32), 0, stream>>>(
      d_input_features, d_output_features, nPlanes, input_stride,
      output_stride, rules, nHot, 1.0 / filterVolume);
}
#endif /* GPU_AVERAGEPOOLING_H */
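
One behavioral detail worth noting: the wrappers above pass alpha = 1.0 / filterVolume, so each (input, output) rule pair contributes input/filterVolume and the "average" is taken over the full pool volume, not over the number of active sites that actually fall inside the pool. A toy sketch of the resulting scaling (assumed data):

#include <cstdio>

typedef unsigned int uInt;

int main() {
  const uInt nPlanes = 1, filterVolume = 4; // e.g. 2x2 pooling
  float in[2] = {8, 4};                     // only 2 of the 4 pool sites active
  float out[1] = {0};                       // 1 output site
  uInt rules[] = {0, 0, 1, 0};              // both inputs map to output site 0
  float alpha = 1.0f / filterVolume;
  for (uInt n = 0; n < 2; n++)
    for (uInt p = 0; p < nPlanes; p++)
      out[rules[2 * n + 1] * nPlanes + p] +=
          alpha * in[rules[2 * n] * nPlanes + p];
  printf("%g\n", out[0]); // (8+4)/4 = 3, not (8+4)/2 = 6
  return 0;
}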

PyTorch/sparseconvnet/SCN/generic/GPU/BatchNormalization.cu  0 → 100644

// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/GPU/BatchNormalization.cu"
#else
#include "BatchNormalization.h"

#define BN_F_MACRO(N) \
  if (nPlanes % N == 0) { \
    BatchNormalization_ForwardPass<real, N, 64>( \
        THCTensor_(data)(state, input_features), \
        THCTensor_(data)(state, output_features), nPlanes, input_stride, \
        output_stride, nActive, THCTensor_(data)(state, saveMean), \
        THCTensor_(data)(state, saveInvStd), \
        THCTensor_(data)(state, runningMean), \
        THCTensor_(data)(state, runningVar), \
        weight ? THCTensor_(data)(state, weight) : 0, \
        bias ? THCTensor_(data)(state, bias) : 0, eps, momentum, train, \
        leakiness); \
  }

extern "C" void scn_R_(BatchNormalization_updateOutput)(
    THCTensor *input_features, THCTensor *output_features,
    THCTensor *saveMean, THCTensor *saveInvStd, THCTensor *runningMean,
    THCTensor *runningVar, THCTensor *weight, THCTensor *bias, real eps,
    real momentum, bool train, real leakiness) {
  THCTensor_(resizeAs)(state, output_features, input_features);
  auto nActive = input_features->size[0];
  auto nPlanes = input_features->size[1];
  auto input_stride = input_features->stride[0];
  auto output_stride = output_features->stride[0];
  BN_F_MACRO(16)
  else BN_F_MACRO(12) else BN_F_MACRO(8) else BN_F_MACRO(4) else BN_F_MACRO(1)
}

extern "C" void scn_R_(BatchNormalizationInTensor_updateOutput)(
    THCTensor *input_features, THCTensor *output_features,
    THCTensor *saveMean, THCTensor *saveInvStd, THCTensor *runningMean,
    THCTensor *runningVar, THCTensor *weight, THCTensor *bias, real eps,
    real momentum, bool train, real leakiness) {
  auto nActive = input_features->size[0];
  auto nPlanes = input_features->size[1];
  auto input_stride = input_features->stride[0];
  auto output_stride = output_features->stride[0];
  BN_F_MACRO(16)
  else BN_F_MACRO(12) else BN_F_MACRO(8) else BN_F_MACRO(4) else BN_F_MACRO(1)
}
#undef BN_F_MACRO

#define BN_B_MACRO(N) \
  if (nPlanes % N == 0) { \
    BatchNormalization_BackwardPass<real, N, 64>( \
        THCTensor_(data)(state, input_features), \
        THCTensor_(data)(state, d_input_features), \
        THCTensor_(data)(state, output_features), \
        THCTensor_(data)(state, d_output_features), nPlanes, input_stride, \
        output_stride, nActive, THCTensor_(data)(state, saveMean), \
        THCTensor_(data)(state, saveInvStd), \
        THCTensor_(data)(state, runningMean), \
        THCTensor_(data)(state, runningVar), \
        weight ? THCTensor_(data)(state, weight) : 0, \
        bias ? THCTensor_(data)(state, bias) : 0, \
        d_weight ? THCTensor_(data)(state, d_weight) : 0, \
        d_bias ? THCTensor_(data)(state, d_bias) : 0, leakiness); \
  }

extern "C" void scn_R_(BatchNormalization_backward)(
    THCTensor *input_features, THCTensor *d_input_features,
    THCTensor *output_features, THCTensor *d_output_features,
    THCTensor *saveMean, THCTensor *saveInvStd, THCTensor *runningMean,
    THCTensor *runningVar, THCTensor *weight, THCTensor *bias,
    THCTensor *d_weight, THCTensor *d_bias, real leakiness) {
  THCTensor_(resizeAs)(state, d_input_features, d_output_features);
  auto nActive = input_features->size[0];
  auto nPlanes = input_features->size[1];
  auto input_stride = input_features->stride[0];
  auto output_stride = output_features->stride[0];
  BN_B_MACRO(16)
  else BN_B_MACRO(12) else BN_B_MACRO(8) else BN_B_MACRO(4) else BN_B_MACRO(1)
}
#endif
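
BN_F_MACRO and BN_B_MACRO lean on a small trick: each expansion is a bare if (nPlanes % N == 0) { ... } block, so the macro calls can be chained with else into a single if/else-if ladder that picks the largest usable NTX first. A miniature with hypothetical names:

#include <cstdio>

// Each expansion is an if-block, so "PICK(16) else PICK(12) ..." chains.
#define PICK(N) if (nPlanes % N == 0) { best = N; }

int main() {
  int nPlanes = 24, best = 0;
  PICK(16) else PICK(12) else PICK(8) else PICK(4) else PICK(1)
  printf("%d\n", best); // 24 % 16 != 0 but 24 % 12 == 0, prints: 12
  return 0;
}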
PyTorch/sparseconvnet/SCN/generic/GPU/BatchNormalization.h
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef GPU_BATCHNORMALIZATION_H
#define GPU_BATCHNORMALIZATION_H
#include "../SparseConvNet.h"
#include <cassert>
// input_stride and output_stride are normally the same as nPlanes; allow larger
// values to act on a subset of columns, i.e. an inplace DenseNet blocks
// NTX ~ 16 - nPlanes must be a multiple of this
// NTY ~ 64 - at least 4
template <typename T, uInt NTX, uInt NTY>
__global__ void BatchNormalization_f_train(
    T *input_features, T *output_features, uInt nPlanes, uInt input_stride,
    uInt output_stride, uInt nActive, T *saveMean, T *saveInvStd,
    T *runningMean, T *runningVar, T *weight, T *bias, T eps, T momentum,
    T leakiness) {
  __shared__ T t[NTY][NTX];
  __shared__ T t2[NTY][NTX];
  for (uInt plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
       plane += gridDim.x * NTX) {
    t[threadIdx.y][threadIdx.x] = 0;
    t2[threadIdx.y][threadIdx.x] = 0;
    for (uInt row = threadIdx.y, c = plane + threadIdx.y * input_stride;
         row < nActive; row += NTY, c += input_stride * NTY) {
      T i = input_features[c];
      t[threadIdx.y][threadIdx.x] += i;
      t2[threadIdx.y][threadIdx.x] += i * i;
    }
    __syncthreads();
    T _saveMean = 0;
    T _saveInvStd = 0;
    for (uInt row = 0; row < NTY; row++) {
      _saveMean += t[row][threadIdx.x];
      _saveInvStd += t2[row][threadIdx.x];
    }
    _saveMean /= nActive;
    _saveInvStd = _saveInvStd - _saveMean * _saveMean * nActive;
    if (threadIdx.y == 0) {
      saveMean[plane] = _saveMean;
      runningMean[plane] =
          momentum * runningMean[plane] + (1 - momentum) * _saveMean;
      runningVar[plane] = momentum * runningVar[plane] +
                          (1 - momentum) * _saveInvStd / (nActive - 1);
    }
    _saveInvStd = pow(_saveInvStd / nActive + eps, -0.5);
    if (threadIdx.y == 0)
      saveInvStd[plane] = _saveInvStd;
    __syncthreads();
    if (threadIdx.y == 0) {
      t[0][threadIdx.x] = _saveInvStd * (weight ? weight[plane] : 1);
      t[1][threadIdx.x] =
          -_saveMean * t[0][threadIdx.x] + (bias ? bias[plane] : 0);
    }
    __syncthreads();
    T W = t[0][threadIdx.x];
    T B = t[1][threadIdx.x];
    for (uInt row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
              co = plane + threadIdx.y * output_stride;
         row < nActive;
         row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
      T out = W * input_features[ci] + B;
      output_features[co] = (out > 0) ? out : (out * leakiness);
    }
    __syncthreads();
  }
}
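For reference, the statistics the kernel gathers reduce to the standard sum / sum-of-squares identities: it accumulates sum(x) and sum(x*x) per plane, derives mean = sum/n and invStd = (SS/n + eps)^(-1/2) with SS = sum(x*x) - n*mean^2, and folds SS/(n-1) into runningVar. A scalar sketch of the same arithmetic (plain C++; the function name is chosen here for illustration):

#include <cmath>
#include <cstddef>

// Scalar sketch of the per-plane statistics in BatchNormalization_f_train.
void bnTrainStats(const float *x, std::size_t n, float eps, float &mean,
                  float &invStd, float &unbiasedVar) {
  float sum = 0, sum2 = 0;
  for (std::size_t i = 0; i < n; i++) {
    sum += x[i];
    sum2 += x[i] * x[i];
  }
  mean = sum / n;
  float ss = sum2 - mean * mean * n; // sum of squared deviations
  invStd = std::pow(ss / n + eps, -0.5f);
  unbiasedVar = ss / (n - 1); // what gets folded into runningVar
}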
template <typename T, uInt NTX, uInt NTY>
__global__ void BatchNormalization_f_test(
    T *input_features, T *output_features, uInt nPlanes, uInt input_stride,
    uInt output_stride, uInt nActive, T *saveMean, T *saveInvStd,
    T *runningMean, T *runningVar, T *weight, T *bias, T eps, T momentum,
    T leakiness) {
  __shared__ T W[NTX];
  __shared__ T B[NTX];
  for (uInt plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
       plane += gridDim.x * NTX) {
    if (threadIdx.y == 0) {
      W[threadIdx.x] =
          pow(runningVar[plane] + eps, -0.5) * (weight ? weight[plane] : 1);
      B[threadIdx.x] =
          (bias ? bias[plane] : 0) - runningMean[plane] * W[threadIdx.x];
    }
    __syncthreads();
    float w = W[threadIdx.x], b = B[threadIdx.x];
    for (uInt row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
              co = plane + threadIdx.y * output_stride;
         row < nActive;
         row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
      T out = w * input_features[ci] + b;
      output_features[co] = (out > 0) ? out : (out * leakiness);
    }
    __syncthreads();
  }
}
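In test mode the whole normalization collapses into a single affine map per plane, w = (runningVar + eps)^(-1/2) * gamma and b = beta - runningMean * w, which is exactly what the kernel caches in W[] and B[]. A scalar sketch of one element (illustrative name; leakiness fuses a leaky ReLU into the same pass):

#include <cmath>

// Scalar sketch of the per-plane affine fold used by BatchNormalization_f_test.
float bnInferOne(float x, float runningMean, float runningVar, float weight,
                 float bias, float eps, float leakiness) {
  float w = std::pow(runningVar + eps, -0.5f) * weight;
  float b = bias - runningMean * w;
  float out = w * x + b;
  return out > 0 ? out : out * leakiness;
}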
template <typename T, uInt NTX, uInt NTY>
void BatchNormalization_ForwardPass(
    T *input_features, T *output_features, uInt nPlanes, uInt input_stride,
    uInt output_stride, uInt nActive, T *saveMean, T *saveInvStd,
    T *runningMean, T *runningVar, T *weight, T *bias, T eps, T momentum,
    bool train, T leakiness) {
  if (train) {
    BatchNormalization_f_train<T, NTX, NTY><<<
        std::min((uInt)16, nPlanes / NTX), dim3(NTX, NTY), 0,
        THCState_getCurrentStream(state)>>>(
        input_features, output_features, nPlanes, input_stride, output_stride,
        nActive, saveMean, saveInvStd, runningMean, runningVar, weight, bias,
        eps, momentum, leakiness);
  } else {
    BatchNormalization_f_test<T, NTX, NTY><<<
        std::min((uInt)16, nPlanes / NTX), dim3(NTX, NTY), 0,
        THCState_getCurrentStream(state)>>>(
        input_features, output_features, nPlanes, input_stride, output_stride,
        nActive, saveMean, saveInvStd, runningMean, runningVar, weight, bias,
        eps, momentum, leakiness);
  }
}
template <typename T, uInt NTX, uInt NTY>
__global__ void BatchNormalization_b(
    T *input_features, T *d_input_features, T *output_features,
    T *d_output_features, uInt nPlanes, uInt input_stride, uInt output_stride,
    uInt nActive, T *saveMean, T *saveInvStd, T *runningMean, T *runningVar,
    T *weight, T *bias, T *d_weight, T *d_bias, T leakiness) {
  __shared__ T t[NTY][NTX];
  __shared__ T t2[NTY][NTX];
  for (uInt plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
       plane += gridDim.x * NTX) {
    if (threadIdx.y == 0) {
      t[0][threadIdx.x] = saveMean[plane];
      t[1][threadIdx.x] = saveInvStd[plane];
      t[2][threadIdx.x] = (weight ? weight[plane] : 1);
    }
    __syncthreads();
    T _saveMean = t[0][threadIdx.x];
    T _saveInvStd = t[1][threadIdx.x];
    T _weight = t[2][threadIdx.x];
    __syncthreads();
    t[threadIdx.y][threadIdx.x] = 0;
    t2[threadIdx.y][threadIdx.x] = 0;
    for (uInt row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
              co = plane + threadIdx.y * output_stride;
         row < nActive;
         row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
      T d = d_output_features[co];
      d = (output_features[co] > 0) ? d : (d * leakiness);
      d_output_features[co] = d;
      t[threadIdx.y][threadIdx.x] += d;
      t2[threadIdx.y][threadIdx.x] += (input_features[ci] - _saveMean) * d;
    }
    __syncthreads();
    T gradMean = 0;
    T dotp = 0;
    for (int row = 0; row < NTY; row++) {
      gradMean += t[row][threadIdx.x];
      dotp += t2[row][threadIdx.x];
    }
    __syncthreads();
    if (d_weight)
      d_weight[plane] = dotp * _saveInvStd;
    if (d_bias)
      d_bias[plane] = gradMean; // sum really
    gradMean /= nActive;
    T k = dotp * _saveInvStd * _saveInvStd / nActive;
    for (uInt row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
              co = plane + threadIdx.y * output_stride;
         row < nActive;
         row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
      d_input_features[ci] = (d_output_features[co] - gradMean -
                              (input_features[ci] - _saveMean) * k) *
                             _saveInvStd * _weight;
    }
    __syncthreads();
  }
}
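Written out, with mu = saveMean, s = saveInvStd, gamma = weight, and g_j the post-leaky-ReLU output gradient, the input gradient computed in the final loop is the standard batch-norm backward expression (gradMean and dotp are the two shared-memory reductions above, so this matches the kernel's arithmetic term for term):

\frac{\partial L}{\partial x_i} = \gamma\, s \left( g_i \;-\; \frac{1}{n}\sum_j g_j \;-\; (x_i-\mu)\,\frac{s^2}{n}\sum_j (x_j-\mu)\, g_j \right)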
template <typename T, uInt NTX, uInt NTY>
void BatchNormalization_BackwardPass(
    T *input_features, T *d_input_features, T *output_features,
    T *d_output_features, uInt nPlanes, uInt input_stride, uInt output_stride,
    uInt nActive, T *saveMean, T *saveInvStd, T *runningMean, T *runningVar,
    T *weight, T *bias, T *d_weight, T *d_bias, T leakiness) {
  BatchNormalization_b<T, NTX, NTY><<<
      std::min((uInt)16, nPlanes / NTX), dim3(NTX, NTY), 0,
      THCState_getCurrentStream(state)>>>(
      input_features, d_input_features, output_features, d_output_features,
      nPlanes, input_stride, output_stride, nActive, saveMean, saveInvStd,
      runningMean, runningVar, weight, bias, d_weight, d_bias, leakiness);
}
#undef NTX
#undef NTY
#endif /* GPU_BATCHNORMALIZATION_H */
PyTorch/sparseconvnet/SCN/generic/GPU/BatchwiseMultiplicativeDropout.cu
0 → 100644
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/GPU/BatchwiseMultiplicativeDropout.cu"
#else
#include "BatchwiseMultiplicativeDropout.h"
#define SPARSECONVNET_FOO(NTX, NTY) \
{ \
if (nPlanes % NTX == 0) { \
BatchwiseMultiplicativeDropout_fp<real, NTX, NTY> << < \
dim3(std::min(16L, nPlanes / NTX), 16), dim3(NTX, NTY), 0, \
THCState_getCurrentStream(state)>>> \
(THCTensor_(data)(state, input_features), \
THCTensor_(data)(state, output_features), \
THCTensor_(data)(state, noise), nActive, nPlanes, nPlanes, nPlanes, \
alpha); \
return; \
} \
}
extern "C" void scn_R_(BatchwiseMultiplicativeDropout_updateOutput)(
    THCTensor *input_features, THCTensor *output_features, THCTensor *noise,
    float alpha) {
  if (input_features != output_features)
    THCTensor_(resizeAs)(state, output_features, input_features);
  auto nActive = input_features->size[0];
  auto nPlanes = input_features->size[1];
  SPARSECONVNET_FOO(32, 32)
  SPARSECONVNET_FOO(24, 32)
  SPARSECONVNET_FOO(16, 64)
  SPARSECONVNET_FOO(12, 64)
  SPARSECONVNET_FOO(8, 64)
  SPARSECONVNET_FOO(4, 64)
  SPARSECONVNET_FOO(1, 64)
}
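The kernels dispatched above apply one multiplicative noise value per feature plane and fuse a leaky ReLU with slope alpha into the same pass: out = in * noise[plane] * (in > 0 ? 1 : alpha). A dense CPU sketch of the forward computation (illustrative name; assumes contiguous nActive x nPlanes features):

#include <cstddef>

// Scalar reference for BatchwiseMultiplicativeDropout_fp: one noise value per
// plane scales every active row, with leaky-ReLU slope alpha for x <= 0.
void bmdForward(const float *in, float *out, const float *noise,
                std::size_t nActive, std::size_t nPlanes, float alpha) {
  for (std::size_t row = 0; row < nActive; row++)
    for (std::size_t p = 0; p < nPlanes; p++) {
      float x = in[row * nPlanes + p];
      out[row * nPlanes + p] = x * noise[p] * (x > 0 ? 1.0f : alpha);
    }
}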
#undef SPARSECONVNET_FOO
#define SPARSECONVNET_FOO(NTX, NTY) \
{ \
if (nPlanes % NTX == 0) { \
BatchwiseMultiplicativeDropout_bp<real, NTX, NTY> << < \
dim3(std::min(16L, nPlanes / NTX), 16), dim3(NTX, NTY), 0, \
THCState_getCurrentStream(state)>>> \
(THCTensor_(data)(state, input_features), \
THCTensor_(data)(state, d_input_features), \
THCTensor_(data)(state, d_output_features), \
THCTensor_(data)(state, noise), nActive, nPlanes, nPlanes, nPlanes, \
alpha); \
return; \
} \
}
extern "C" void scn_R_(BatchwiseMultiplicativeDropout_updateGradInput)(
    THCTensor *input_features, THCTensor *d_input_features,
    THCTensor *d_output_features, THCTensor *noise, float alpha) {
  if (d_input_features != d_output_features)
    THCTensor_(resizeAs)(state, d_input_features, d_output_features);
  auto nActive = input_features->size[0];
  auto nPlanes = input_features->size[1];
  SPARSECONVNET_FOO(32, 32)
  SPARSECONVNET_FOO(24, 32)
  SPARSECONVNET_FOO(16, 64)
  SPARSECONVNET_FOO(12, 64)
  SPARSECONVNET_FOO(8, 64)
  SPARSECONVNET_FOO(4, 64)
  SPARSECONVNET_FOO(1, 64)
}
#undef SPARSECONVNET_FOO
#endif
PyTorch/sparseconvnet/SCN/generic/GPU/BatchwiseMultiplicativeDropout.h
0 → 100644
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef GPU_BATCHWISEMULTIPLICATIVEDROPOUT_H
#define GPU_BATCHWISEMULTIPLICATIVEDROPOUT_H
template <typename T, uInt NTX, uInt NTY>
__global__ void BatchwiseMultiplicativeDropout_fp(
    T *input_features, T *output_features, T *noise, uInt nActive,
    uInt nPlanes, uInt input_stride, uInt output_stride, T alpha) {
  __shared__ T nz[NTX];
  for (uInt plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
       plane += gridDim.x * NTX) {
    if (threadIdx.y == 0)
      nz[threadIdx.x] = noise[plane];
    __syncthreads();
    for (uInt row = threadIdx.y + blockIdx.y * NTY; row < nActive;
         row += gridDim.y * NTY) {
      uInt i = row * input_stride + plane;
      uInt o = row * output_stride + plane;
      output_features[o] = input_features[i] * nz[threadIdx.x] *
                           ((input_features[i] > 0) ? 1 : alpha);
    }
    __syncthreads();
  }
}
template <typename T, uInt NTX, uInt NTY>
__global__ void BatchwiseMultiplicativeDropout_bp(
    T *input_features, T *d_input_features, T *d_output_features, T *noise,
    uInt nActive, uInt nPlanes, uInt input_stride, uInt output_stride,
    T alpha) {
  __shared__ T nz[NTX];
  for (uInt plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
       plane += gridDim.x * NTX) {
    if (threadIdx.y == 0)
      nz[threadIdx.x] = noise[plane];
    __syncthreads();
    for (uInt row = threadIdx.y + blockIdx.y * NTY; row < nActive;
         row += gridDim.y * NTY) {
      uInt i = row * input_stride + plane;
      uInt o = row * output_stride + plane;
      d_input_features[i] = d_output_features[o] * nz[threadIdx.x] *
                            ((input_features[i] > 0) ? 1 : alpha);
    }
    __syncthreads();
  }
}
#endif /* GPU_BATCHWISEMULTIPLICATIVEDROPOUT_H */
PyTorch/sparseconvnet/SCN/generic/GPU/Convolution.cu
0 → 100644
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE_
#define TH_GENERIC_FILE_ "generic/GPU/Convolution.cu"
#else
#include "Convolution.h"
#include "RuleBookIterator.h"
#include <algorithm>
#include <cstring>
extern "C" double scn_DR_(Convolution_updateOutput)(
    THLongTensor *inputSize, THLongTensor *outputSize,
    THLongTensor *filterSize, THLongTensor *filterStride, void **m,
    THCTensor *input_features, THCTensor *output_features, THCTensor *weight,
    THCTensor *bias, long filterVolume, THCITensor *rulesBuffer) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto _rules =
      _m.getRuleBook(inputSize, outputSize, filterSize, filterStride, true);
  uInt nActive = _m.getNActive(outputSize);
  THCTensor_(resize2d)(state, output_features, nActive, weight->size[1]);
  if (not bias)
    THCTensor_(zero)(state, output_features);
  auto iF = THCTensor_(data)(state, input_features);
  auto oF = THCTensor_(data)(state, output_features);
  auto ip = input_features->size[1];
  auto op = output_features->size[1];
  auto w = THCTensor_(data)(state, weight);
  double flops = 0;
  if (bias) {
    auto b = THCTensor_(data)(state, bias);
    for (uInt i = 0; i < op; i += 32) {
      uInt blockDim = min(32L, op - i);
      uInt gridDim = min(4096, nActive);
      Convolution_fp_bias<<<gridDim, blockDim, 0,
                            THCState_getCurrentStream(state)>>>(
          oF + i, b + i, op, op, nActive);
    }
  }
  uInt c = ip * op;
  RULEBOOKITERATOR(dConvolution_forward2<real>(
                       iF, oF, w, rbB, nHotB, ip, ip, op, op,
                       THCState_getCurrentStream(state));
                   , w += c; flops += nHotB * c;)
  return flops;
}
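The RULEBOOKITERATOR macro walks one list of (inputRow, outputRow) pairs per filter offset, applying the corresponding ip x op slice of the weight tensor to each pair and advancing w (and the flop count) between slices. A dense CPU reference for a single slice (the names and signature here are illustrative, not the library's API; the GPU path uses dConvolution_forward2):

#include <cstddef>
#include <vector>

// CPU sketch of one rulebook slice: for each rule pair (inRow, outRow),
// accumulate inFeatures[inRow] * W into outFeatures[outRow].
void applyRuleSlice(const std::vector<float> &inF, std::vector<float> &outF,
                    const std::vector<float> &w,       // ip x op, row-major
                    const std::vector<unsigned> &rules, // 2 * nHot entries
                    std::size_t ip, std::size_t op) {
  for (std::size_t r = 0; r + 1 < rules.size(); r += 2) {
    const float *in = &inF[rules[r] * ip];
    float *out = &outF[rules[r + 1] * op];
    for (std::size_t j = 0; j < op; j++)
      for (std::size_t i = 0; i < ip; i++)
        out[j] += in[i] * w[i * op + j];
  }
}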
extern "C" void scn_DR_(Convolution_backward)(
    THLongTensor *inputSize, THLongTensor *outputSize,
    THLongTensor *filterSize, THLongTensor *filterStride, void **m,
    THCTensor *input_features, THCTensor *d_input_features,
    THCTensor *d_output_features, THCTensor *weight, THCTensor *d_weight,
    THCTensor *d_bias, long filterVolume, THCITensor *rulesBuffer) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto _rules =
      _m.getRuleBook(inputSize, outputSize, filterSize, filterStride, true);
  uInt nActive = _m.getNActive(outputSize);
  THCTensor_(resizeAs)(state, d_input_features, input_features);
  THCTensor_(zero)(state, d_input_features);
  auto iF = THCTensor_(data)(state, input_features);
  auto diF = THCTensor_(data)(state, d_input_features);
  auto doF = THCTensor_(data)(state, d_output_features);
  auto ip = input_features->size[1];
  auto op = d_output_features->size[1];
  auto w = THCTensor_(data)(state, weight);
  auto dw = THCTensor_(data)(state, d_weight);
  uInt c = ip * op;
  RULEBOOKITERATOR(dConvolution_backward_dW2<real>(
                       iF, diF, doF, w, dw, rbB, nHotB, ip, ip, op, op,
                       THCState_getCurrentStream(state));
                   , w += c; dw += c;)
  if (d_bias) {
    auto db = THCTensor_(data)(state, d_bias);
    Convolution_bp_bias(doF, db, op, op, nActive,
                        THCState_getCurrentStream(state));
  }
}
extern "C" double scn_DR_(ValidConvolution_updateOutput)(
    THLongTensor *inputSize, THLongTensor *filterSize, void **m,
    THCTensor *input_features, THCTensor *output_features, THCTensor *weight,
    THCTensor *bias, long filterVolume, THCITensor *rulesBuffer) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto _rules = _m.getValidRuleBook(inputSize, filterSize, true);
  uInt nActive = input_features->size[0];
  THCTensor_(resize2d)(state, output_features, nActive, weight->size[1]);
  if (not bias)
    THCTensor_(zero)(state, output_features);
  auto iF = THCTensor_(data)(state, input_features);
  auto oF = THCTensor_(data)(state, output_features);
  auto ip = input_features->size[1];
  auto op = output_features->size[1];
  auto w = THCTensor_(data)(state, weight);
  double flops = 0;
  if (bias) {
    auto b = THCTensor_(data)(state, bias);
    for (uInt i = 0; i < op; i += 32) {
      uInt blockDim = min(32L, op - i);
      uInt gridDim = min(4096, nActive);
      Convolution_fp_bias<<<gridDim, blockDim, 0,
                            THCState_getCurrentStream(state)>>>(
          oF + i, b + i, op, op, nActive);
    }
  }
  uInt c = ip * op;
  RULEBOOKITERATOR(dConvolution_forward2<real>(
                       iF, oF, w, rbB, nHotB, ip, ip, op, op,
                       THCState_getCurrentStream(state));
                   , w += c; flops += nHotB * c;)
  return flops;
}
extern "C" void scn_DR_(ValidConvolution_backward)(
    THLongTensor *inputSize, THLongTensor *filterSize, void **m,
    THCTensor *input_features, THCTensor *d_input_features,
    THCTensor *d_output_features, THCTensor *weight, THCTensor *d_weight,
    THCTensor *d_bias, long filterVolume, THCITensor *rulesBuffer) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto _rules = _m.getValidRuleBook(inputSize, filterSize, true);
  uInt nActive = input_features->size[0];
  THCTensor_(resizeAs)(state, d_input_features, input_features);
  THCTensor_(zero)(state, d_input_features);
  auto iF = THCTensor_(data)(state, input_features);
  auto diF = THCTensor_(data)(state, d_input_features);
  auto doF = THCTensor_(data)(state, d_output_features);
  auto ip = input_features->size[1];
  auto op = d_output_features->size[1];
  auto w = THCTensor_(data)(state, weight);
  auto dw = THCTensor_(data)(state, d_weight);
  uInt c = ip * op;
  RULEBOOKITERATOR(dConvolution_backward_dW2<real>(
                       iF, diF, doF, w, dw, rbB, nHotB, ip, ip, op, op,
                       THCState_getCurrentStream(state));
                   , w += c; dw += c;)
  if (d_bias) {
    auto db = THCTensor_(data)(state, d_bias);
    Convolution_bp_bias(doF, db, op, op, nActive,
                        THCState_getCurrentStream(state));
  }
}
#endif
PyTorch/sparseconvnet/SCN/generic/GPU/Convolution.h
0 → 100644
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef GPU_CONVOLUTION_H
#define GPU_CONVOLUTION_H
#include "../SparseConvNet.h"
template <typename T>
__global__ void Convolution_fp_bias(T *output_features, T *bias, uInt nPlanes,
                                    uInt output_stride, uInt nActive) {
  __shared__ T b[32];
  b[threadIdx.x] = bias[threadIdx.x];
  for (uInt row = blockIdx.x; row < nActive; row += 1 << 12) {
    output_features[row * output_stride + threadIdx.x] = b[threadIdx.x];
  }
}
template <typename T>
__global__ void dColumnSum(T *matrix, T *target, uInt nRows, uInt nColumns,
                           uInt nCOLUMNS) {
  uInt i = blockIdx.x * 32 + threadIdx.x;
  T t = 0;
  for (uInt j = blockIdx.y; j < nRows; j += 32)
    t += matrix[j * nCOLUMNS + i];
  atomicAdd(&target[i], t);
}
template <typename T>
void Convolution_bp_bias(T *matrix, T *target, uInt nRows, uInt nColumns,
                         uInt nCOLUMNS, cudaStream_t stream) {
  if (nColumns / 32 > 0)
    dColumnSum<<<dim3(nColumns / 32, 32), 32, 0, stream>>>(
        matrix, target, nRows, nColumns, nCOLUMNS);
  if (nColumns % 32 > 0) {
    uInt o = nColumns / 32 * 32;
    dColumnSum<<<dim3(1, 32), nColumns - o, 0, stream>>>(
        matrix + o, target + o, nRows, nColumns, nCOLUMNS);
  }
}
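dColumnSum splits the columns across blocks of 32 threads and the rows across 32 y-blocks, each thread accumulating a private partial sum before a single atomicAdd per column. The net effect is a strided column sum, which is all the bias gradient needs; a CPU reference (illustrative name):

#include <cstddef>

// CPU sketch of Convolution_bp_bias: the bias gradient is the column-wise sum
// of d_output_features over all active rows (rowStride between rows).
void columnSumRef(const float *matrix, float *target, std::size_t nRows,
                  std::size_t nColumns, std::size_t rowStride) {
  for (std::size_t j = 0; j < nColumns; j++)
    for (std::size_t i = 0; i < nRows; i++)
      target[j] += matrix[i * rowStride + j];
}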
template <typename T, uInt K, uInt V>
__global__ void dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures,
                                            T *w, uInt *rules, uInt nHot,
                                            uInt input_nPlanes,
                                            uInt input_stride,
                                            uInt output_nPlanes,
                                            uInt output_stride) {
  // nHot must be a multiple of K!!
  // Input x Weight -> Output
  // blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
  // K is a multiple of V,
  // nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
  uInt M = input_nPlanes / K;
  // N = gridDim.y == output_nPlanes/K
  uInt n = blockIdx.y;
  outFeatures += n * K;
  w += n * K;
  T O[V];
  __shared__ T W[K][K];
  __shared__ T I[K][K];
  uInt R0[V];
  uInt R1[V];
  const int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int m = 0; m < M; m++) {
    // Read w
#pragma unroll
    for (int v = 0; v < V; v++)
      W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
    for (uInt s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
      for (int v = 0; v < V; v++) {
        R0[v] = rules[2 * (s + ty[v])];
        R1[v] = rules[2 * (s + ty[v]) + 1];
      }
      __syncthreads();
      // Read input, reset O[]
#pragma unroll
      for (int v = 0; v < V; v++) {
        I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
        O[v] = 0;
      }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          O[v] += I[ty[v]][k] * W[k][tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        O[v] += outFeatures[R1[v] * output_stride + tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        outFeatures[R1[v] * output_stride + tx] = O[v];
      __syncthreads();
    }
    w += K * output_nPlanes;
    inFeatures += K;
  }
}
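Stripped of the rulebook gather/scatter, the inner loops of forwardA compute a K x K tile product in which each thread owns one output column (tx) and V of the K rows, keeping its partial sums in registers while the I and W tiles sit in shared memory. A serial sketch of that register blocking (illustrative name; this is the arithmetic, not the kernel itself):

// Sketch of the Volkov-style blocking in dConvolution_KMxKN_forwardA: a block
// of K x (K/V) threads computes a K x K tile product, each thread holding V
// partial outputs in registers.
template <int K, int V>
void tileMultiplyRef(const float (&I)[K][K], const float (&W)[K][K],
                     float (&O)[K][K]) {
  for (int tx = 0; tx < K; tx++)       // one "thread column"
    for (int ty = 0; ty < K / V; ty++) // one thread per K/V rows
      for (int v = 0; v < V; v++) {
        int row = ty + v * (K / V); // the V rows this thread owns
        float acc = 0;
        for (int k = 0; k < K; k++)
          acc += I[row][k] * W[k][tx];
        O[row][tx] += acc;
      }
}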
template <typename T, uInt K, uInt V>
__global__ void dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures,
                                            T *w, uInt *rules, uInt nHot,
                                            uInt input_nPlanes,
                                            uInt input_stride,
                                            uInt output_nPlanes,
                                            uInt output_stride) {
  // Input x Weight -> Output
  // blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
  // K is a multiple of V,
  // nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
  uInt M = input_nPlanes / K;
  // N = gridDim.y == output_nPlanes/K
  uInt n = blockIdx.y;
  outFeatures += n * K;
  w += n * K;
  T O[V];
  __shared__ T W[K][K];
  __shared__ T I[K][K];
  uInt R0[V];
  uInt R1[V];
  const int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int m = 0; m < M; m++) {
    // Read w
#pragma unroll
    for (int v = 0; v < V; v++)
      W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
    for (uInt s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (s + ty[v] < nHot) {
          R0[v] = rules[2 * (s + ty[v])];
          R1[v] = rules[2 * (s + ty[v]) + 1];
        }
      }
      __syncthreads();
      // Read input, reset O[]
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (s + ty[v] < nHot)
          I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
        O[v] = 0;
      }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          O[v] += I[ty[v]][k] * W[k][tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nHot)
          O[v] += outFeatures[R1[v] * output_stride + tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nHot)
          outFeatures[R1[v] * output_stride + tx] = O[v];
      __syncthreads();
    }
    w += K * output_nPlanes;
    inFeatures += K;
  }
}
#define FOO(K, V) \
{ \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
uInt o = (nHot / K) * K; \
if (o >= K) \
dConvolution_KMxKN_forwardA<T, K, V> << < \
dim3(std::min(o / K, (uInt)512), output_nPlanes / K), \
dim3(K, K / V), 0, stream>>> \
(inFeatures, outFeatures, w, rules, o, input_nPlanes, \
input_stride, output_nPlanes, output_stride); \
if (nHot > o) \
dConvolution_KMxKN_forwardB<T, K, V> << <dim3(1, output_nPlanes / K), \
dim3(K, K / V), 0, stream>>> \
(inFeatures, outFeatures, w, rules + 2 * o, nHot - o, \
input_nPlanes, input_stride, output_nPlanes, output_stride); \
return; \
} \
}
template <typename T>
void dConvolution_forward(T *inFeatures, T *outFeatures, T *w, uInt *rules,
                          uInt nHot, uInt input_nPlanes, uInt input_stride,
                          uInt output_nPlanes, uInt output_stride,
                          cudaStream_t stream) {
  FOO(64, 16)
  FOO(32, 8)
  FOO(16, 4)
  FOO(8, 2)
  assert(false);
}
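The FOO dispatch above first sends o = (nHot / K) * K rows to the fully unrolled forwardA and the ragged remainder to the bounds-checked forwardB; only if no supported K divides both plane counts does the assert fire. A sketch of the split arithmetic (illustrative names):

// Sketch of the nHot split performed by the FOO dispatch macro: forwardA gets
// the largest multiple of K, forwardB mops up the ragged tail.
struct HotSplit {
  unsigned bulk; // rows for the unchecked kernel (multiple of K)
  unsigned tail; // rows for the bounds-checked kernel
};

HotSplit splitHot(unsigned nHot, unsigned K) {
  unsigned o = (nHot / K) * K;
  return {o, nHot - o};
}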
#undef FOO
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template <typename T, uInt K, uInt V>
__global__ void dConvolution_KMxKN_backward_dW_A(
    T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw, uInt *rules,
    uInt nHot, uInt input_nPlanes, uInt input_stride, uInt output_nPlanes,
    uInt output_stride) {
  // M = gridDim.y == input_nPlanes / K
  uInt N = output_nPlanes / K;
  uInt m = blockIdx.y;
  inFeatures += m * K;
  dInFeatures += m * K;
  w += m * K * output_nPlanes;
  dw += m * K * output_nPlanes;
  T dI[V];
  T dW[V];
  __shared__ T I[K][K];
  __shared__ T dO[K][K];
  __shared__ T W[K][K];
  uInt R0[V];
  uInt R1[V];
  const int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int n = 0; n < N; n++) {
    // Read w, reset dW
#pragma unroll
    for (int v = 0; v < V; v++) {
      W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
      dW[v] = 0;
    }
    for (uInt s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
      for (int v = 0; v < V; v++) {
        R0[v] = rules[2 * (s + ty[v])];
        R1[v] = rules[2 * (s + ty[v]) + 1];
        dI[v] = 0;
      }
      __syncthreads();
      // Read input and dOutput
#pragma unroll
      for (int v = 0; v < V; v++) {
        I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
        dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx];
      }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++) {
          dI[v] += dO[ty[v]][k] * W[tx][k];
          dW[v] += I[k][ty[v]] * dO[k][tx];
        }
#pragma unroll
      for (int v = 0; v < V; v++)
        dI[v] += dInFeatures[R0[v] * input_stride + tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        dInFeatures[R0[v] * input_stride + tx] = dI[v];
      __syncthreads();
    }
#pragma unroll
    for (int v = 0; v < V; v++)
      atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
    w += K;
    dw += K;
    dOutFeatures += K;
  }
}
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template <typename T, uInt K, uInt V>
__global__ void dConvolution_KMxKN_backward_dW_B(
    T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw, uInt *rules,
    uInt nHot, uInt input_nPlanes, uInt input_stride, uInt output_nPlanes,
    uInt output_stride) {
  // M = gridDim.y == input_nPlanes / K
  uInt N = output_nPlanes / K;
  uInt m = blockIdx.y;
  inFeatures += m * K;
  dInFeatures += m * K;
  w += m * K * output_nPlanes;
  dw += m * K * output_nPlanes;
  T dI[V];
  T dW[V];
  __shared__ T I[K][K];
  __shared__ T dO[K][K];
  __shared__ T W[K][K];
  uInt R0[V];
  uInt R1[V];
  const int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int n = 0; n < N; n++) {
    // Read w, reset dW
#pragma unroll
    for (int v = 0; v < V; v++) {
      W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
      dW[v] = 0;
    }
    for (uInt s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (s + ty[v] < nHot) {
          R0[v] = rules[2 * (s + ty[v])];
          R1[v] = rules[2 * (s + ty[v]) + 1];
        }
        dI[v] = 0;
      }
      __syncthreads();
      // Read input and dOutput
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nHot) {
          I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
          dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx];
        } else {
          I[ty[v]][tx] = 0;
          dO[ty[v]][tx] = 0;
        }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++) {
          dI[v] += dO[ty[v]][k] * W[tx][k];
          dW[v] += I[k][ty[v]] * dO[k][tx];
        }
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nHot)
          dI[v] += dInFeatures[R0[v] * input_stride + tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nHot)
          dInFeatures[R0[v] * input_stride + tx] = dI[v];
      __syncthreads();
    }
#pragma unroll
    for (int v = 0; v < V; v++)
      atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
    w += K;
    dw += K;
    dOutFeatures += K;
  }
}
#define FOO(K, V) \
{ \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
uInt o = (nHot / K) * K; \
if (o >= K) \
dConvolution_KMxKN_backward_dW_A<T, K, V> << < \
dim3(std::min(o / K, (uInt)512), input_nPlanes / K), \
dim3(K, K / V), 0, stream>>> \
(inFeatures, dInFeatures, dOutFeatures, w, dw, rules, o, \
input_nPlanes, input_stride, output_nPlanes, output_stride); \
if (nHot > o) \
dConvolution_KMxKN_backward_dW_B<T, K, V> << < \
dim3(1, input_nPlanes / K), dim3(K, K / V), 0, stream>>> \
(inFeatures, dInFeatures, dOutFeatures, w, dw, rules + 2 * o, \
nHot - o, input_nPlanes, input_stride, output_nPlanes, \
output_stride); \
return; \
} \
}
template <typename T>
void dConvolution_backward_dW(T *inFeatures, T *dInFeatures, T *dOutFeatures,
                              T *w, T *dw, uInt *rules, uInt nHot,
                              uInt input_nPlanes, uInt input_stride,
                              uInt output_nPlanes, uInt output_stride,
                              cudaStream_t stream) {
  FOO(32, 8)
  FOO(16, 4)
  FOO(8, 2)
  assert(false);
}
#undef FOO
template <typename T, uInt K, uInt V>
__global__ void dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures,
                                            T *w, uInt *rules, uInt nHot,
                                            uInt input_nPlanes,
                                            uInt input_stride,
                                            uInt output_nPlanes,
                                            uInt output_stride) {
  // Input x Weight -> Output
  // blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
  // K is a multiple of V,
  // nHot x input_nplanes<=KM -> nHot x output_nPlanes<=KN
  // - parallel over N,nHot - loop over M
  uInt M = (input_nPlanes + K - 1) / K;
  // N = gridDim.y ~ output_nPlanes/K
  uInt n = blockIdx.y;
  outFeatures += n * K;
  w += n * K;
  uInt KO = min(K, output_nPlanes - K * n);
  T O[V];
  __shared__ T W[K][K];
  __shared__ T I[K][K];
  __shared__ uInt R[K * 2];
  const int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int m = 0; m < M; m++) {
    uInt KI = min(K, input_nPlanes - K * m);
    // Read w
#pragma unroll
    for (int v = 0; v < V; v++)
      if (ty[v] < KI and tx < KO)
        W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
    for (uInt s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
      // Read rules for K input/output pairs
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (ty[v] < 2) {
          int q = ty[v] * K + tx;
          if (s + q / 2 < nHot)
            R[q] = rules[2 * s + q];
        }
      }
      __syncthreads();
      // Read input, reset O[]
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (tx < KI and s + ty[v] < nHot)
          I[ty[v]][tx] = inFeatures[R[2 * ty[v]] * input_stride + tx];
        O[v] = 0;
      }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < KI; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          O[v] += I[ty[v]][k] * W[k][tx];
      __syncthreads();
#pragma unroll
      for (int v = 0; v < V; v++)
        if (tx < KO and s + ty[v] < nHot)
          outFeatures[R[2 * ty[v] + 1] * output_stride + tx] += O[v];
      __syncthreads();
    }
    w += K * output_nPlanes;
    inFeatures += K;
  }
}
template <typename T>
void dConvolution_forward2(T *inFeatures, T *outFeatures, T *w, uInt *rules,
                           uInt nHot, uInt input_nPlanes, uInt input_stride,
                           uInt output_nPlanes, uInt output_stride,
                           cudaStream_t stream) {
  if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) {
    const int K = 16;
    const int V = 4;
    dConvolution_KMxKN_forward2<T, K, V><<<
        dim3(128, (output_nPlanes + K - 1) / K), dim3(K, K / V), 0, stream>>>(
        inFeatures, outFeatures, w, rules, nHot, input_nPlanes, input_stride,
        output_nPlanes, output_stride);
    return;
  } else {
    dConvolution_forward(inFeatures, outFeatures, w, rules, nHot,
                         input_nPlanes, input_stride, output_nPlanes,
                         output_stride, stream);
  }
}
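The *2 kernels differ from the plain versions by clipping the last tile in each direction to KI = min(K, nPlanes - K*m) instead of requiring divisibility, which is why dConvolution_forward2 can serve as the general fallback while multiples of 8 take the faster unpadded path. A sketch of the ragged-edge tile widths (illustrative name):

#include <algorithm>
#include <cstdio>

// Sketch of the ragged-edge tiling used by the *2 kernels: the last tile in
// each direction is clipped to min(K, nPlanes - K*m) rather than padded.
void printTileWidths(int nPlanes, int K) {
  int M = (nPlanes + K - 1) / K; // number of K-wide tiles, last one partial
  for (int m = 0; m < M; m++)
    std::printf("tile %d covers %d planes\n", m, std::min(K, nPlanes - K * m));
}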
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template <typename T, uInt K, uInt V>
__global__ void dConvolution_KMxKN_backward_dW2(
    T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw, uInt *rules,
    uInt nHot, uInt input_nPlanes, uInt input_stride, uInt output_nPlanes,
    uInt output_stride) {
  // M = gridDim.y == input_nPlanes / K
  uInt N = (output_nPlanes + K - 1) / K;
  uInt m = blockIdx.y;
  inFeatures += m * K;
  dInFeatures += m * K;
  w += m * K * output_nPlanes;
  dw += m * K * output_nPlanes;
  uInt KI = min(K, input_nPlanes - K * m);
  T dI[V];
  T dW[V];
  __shared__ T I[K][K];
  __shared__ T dO[K][K];
  __shared__ T W[K][K];
  __shared__ uInt R[K * 2];
  const int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int n = 0; n < N; n++) {
    uInt KO = min(K, output_nPlanes - K * n);
    // Read w, reset dW
#pragma unroll
    for (int v = 0; v < V; v++) {
      if (ty[v] < KI and tx < KO)
        W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
      dW[v] = 0;
    }
    for (uInt s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
      // Read rules for K input/output pairs, reset dI[]
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (ty[v] < 2) {
          int q = ty[v] * K + tx;
          if (s + q / 2 < nHot)
            R[q] = rules[2 * s + q];
        }
        dI[v] = 0;
      }
      __syncthreads();
      // Read input and dOutput
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (tx < KI and s + ty[v] < nHot)
          I[ty[v]][tx] = inFeatures[R[2 * ty[v]] * input_stride + tx];
        else
          I[ty[v]][tx] = 0;
        if (tx < KO and s + ty[v] < nHot)
          dO[ty[v]][tx] = dOutFeatures[R[2 * ty[v] + 1] * output_stride + tx];
        else
          dO[ty[v]][tx] = 0;
      }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < KO; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          dI[v] += dO[ty[v]][k] * W[tx][k];
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          dW[v] += I[k][ty[v]] * dO[k][tx];
      __syncthreads();
#pragma unroll
      for (int v = 0; v < V; v++)
        if (tx < KI and s + ty[v] < nHot)
          dInFeatures[R[2 * ty[v]] * input_stride + tx] += dI[v];
      __syncthreads();
    }
#pragma unroll
    for (int v = 0; v < V; v++)
      if (ty[v] < KI and tx < KO)
        atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
    w += K;
    dw += K;
    dOutFeatures += K;
  }
}
template <typename T>
void dConvolution_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
                               T *w, T *dw, uInt *rules, uInt nHot,
                               uInt input_nPlanes, uInt input_stride,
                               uInt output_nPlanes, uInt output_stride,
                               cudaStream_t stream) {
  if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) {
    const int K = 16;
    const int V = 4;
    dConvolution_KMxKN_backward_dW2<T, K, V><<<
        dim3(128, (input_nPlanes + K - 1) / K), dim3(K, K / V), 0, stream>>>(
        inFeatures, dInFeatures, dOutFeatures, w, dw, rules, nHot,
        input_nPlanes, input_stride, output_nPlanes, output_stride);
    return;
  } else {
    dConvolution_backward_dW(inFeatures, dInFeatures, dOutFeatures, w, dw,
                             rules, nHot, input_nPlanes, input_stride,
                             output_nPlanes, output_stride, stream);
  }
}
#endif /* GPU_CONVOLUTION_H */