Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
SparseConvNet
Commits
f9552033
Commit
f9552033
authored
Jul 16, 2017
by
Benjamin Thomas Graham
Browse files
initial commit
parents
Changes
168
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1978 additions
and
0 deletions
+1978
-0
PyTorch/sparseconvnet/SCN/generic/CPU/MaxPooling.cpp
PyTorch/sparseconvnet/SCN/generic/CPU/MaxPooling.cpp
+61
-0
PyTorch/sparseconvnet/SCN/generic/CPU/MaxPooling.h
PyTorch/sparseconvnet/SCN/generic/CPU/MaxPooling.h
+36
-0
PyTorch/sparseconvnet/SCN/generic/CPU/NetworkInNetwork.cpp
PyTorch/sparseconvnet/SCN/generic/CPU/NetworkInNetwork.cpp
+124
-0
PyTorch/sparseconvnet/SCN/generic/CPU/NetworkInNetwork.h
PyTorch/sparseconvnet/SCN/generic/CPU/NetworkInNetwork.h
+128
-0
PyTorch/sparseconvnet/SCN/generic/CPU/SparseToDense.cpp
PyTorch/sparseconvnet/SCN/generic/CPU/SparseToDense.cpp
+64
-0
PyTorch/sparseconvnet/SCN/generic/CPU/SparseToDense.h
PyTorch/sparseconvnet/SCN/generic/CPU/SparseToDense.h
+36
-0
PyTorch/sparseconvnet/SCN/generic/CPU/THGenerateDimFloatTypes.h
...h/sparseconvnet/SCN/generic/CPU/THGenerateDimFloatTypes.h
+63
-0
PyTorch/sparseconvnet/SCN/generic/CPU/THGenerateFloatTypes.h
PyTorch/sparseconvnet/SCN/generic/CPU/THGenerateFloatTypes.h
+37
-0
PyTorch/sparseconvnet/SCN/generic/GPU/ActivePooling.cu
PyTorch/sparseconvnet/SCN/generic/GPU/ActivePooling.cu
+69
-0
PyTorch/sparseconvnet/SCN/generic/GPU/ActivePooling.h
PyTorch/sparseconvnet/SCN/generic/GPU/ActivePooling.h
+57
-0
PyTorch/sparseconvnet/SCN/generic/GPU/AffineReluTrivialConvolution.cu
...seconvnet/SCN/generic/GPU/AffineReluTrivialConvolution.cu
+51
-0
PyTorch/sparseconvnet/SCN/generic/GPU/AffineReluTrivialConvolution.h
...rseconvnet/SCN/generic/GPU/AffineReluTrivialConvolution.h
+532
-0
PyTorch/sparseconvnet/SCN/generic/GPU/AveragePooling.cu
PyTorch/sparseconvnet/SCN/generic/GPU/AveragePooling.cu
+58
-0
PyTorch/sparseconvnet/SCN/generic/GPU/AveragePooling.h
PyTorch/sparseconvnet/SCN/generic/GPU/AveragePooling.h
+76
-0
PyTorch/sparseconvnet/SCN/generic/GPU/BatchNormalization.cu
PyTorch/sparseconvnet/SCN/generic/GPU/BatchNormalization.cu
+92
-0
PyTorch/sparseconvnet/SCN/generic/GPU/BatchNormalization.h
PyTorch/sparseconvnet/SCN/generic/GPU/BatchNormalization.h
+213
-0
PyTorch/sparseconvnet/SCN/generic/GPU/BatchwiseMultiplicativeDropout.cu
...convnet/SCN/generic/GPU/BatchwiseMultiplicativeDropout.cu
+75
-0
PyTorch/sparseconvnet/SCN/generic/GPU/BatchwiseMultiplicativeDropout.h
...econvnet/SCN/generic/GPU/BatchwiseMultiplicativeDropout.h
+53
-0
PyTorch/sparseconvnet/SCN/generic/GPU/Convolution.cu
PyTorch/sparseconvnet/SCN/generic/GPU/Convolution.cu
+153
-0
PyTorch/sparseconvnet/SCN/generic/GPU/Convolution.h
PyTorch/sparseconvnet/SCN/generic/GPU/Convolution.h
+0
-0
No files found.
PyTorch/sparseconvnet/SCN/generic/CPU/MaxPooling.cpp
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE_
#define TH_GENERIC_FILE_ "generic/CPU/MaxPooling.cpp"
#else
#include "MaxPooling.h"
// Forward pass of sparse max pooling (CPU; generic over `real` via the TH
// macro system). Resizes `output_features` to (nActive, nPlanes) and fills it
// with the per-rule element-wise maxima of `input_features`, skipping the
// first `nFeaturesToDrop` feature planes of the input.
extern "C" void scn_DR_(MaxPooling_updateOutput)(
    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *poolSize,
    THLongTensor *poolStride, void **m, THTensor *input_features,
    THTensor *output_features, long nFeaturesToDrop, void *rulesBuffer) {
  // Binds *m to a Metadata<Dimension> reference named _m.
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  // Planes actually pooled (input planes minus the dropped prefix).
  uInt nPlanes = input_features->size[1] - nFeaturesToDrop;
  // Rule book: one rule list per pooling-filter offset; each list is a flat
  // sequence of (input row, output row) index pairs.
  auto _rules =
      _m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  uInt nActive = _m.getNActive(outputSize);
  // Resize/zero the output BEFORE taking its data pointer below.
  // NOTE: because the output starts at zero and MaxPooling_ForwardPass only
  // overwrites when input > output, all-negative inputs pool to 0.
  THTensor_(resize2d)(output_features, nActive,
                      input_features->size[1] - nFeaturesToDrop);
  THTensor_(zero)(output_features);
  // Skip the dropped planes by offsetting the input pointer.
  auto iF = THTensor_(data)(input_features) + nFeaturesToDrop;
  auto oF = THTensor_(data)(output_features);
  for (auto &r : _rules) {
    uInt nHot = r.size() / 2; // rules come in (input, output) pairs
    MaxPooling_ForwardPass<real>(iF, oF, nPlanes, input_features->stride[0],
                                 output_features->stride[0], &r[0], nHot);
  }
}
// Backward pass of sparse max pooling: routes each output gradient back to
// every input site whose value equals the pooled maximum.
extern "C" void scn_DR_(MaxPooling_updateGradInput)(
    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *poolSize,
    THLongTensor *poolStride, void **m, THTensor *input_features,
    THTensor *d_input_features, THTensor *output_features,
    THTensor *d_output_features, long nFeaturesToDrop, void *rulesBuffer) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  uInt nPlanes = input_features->size[1] - nFeaturesToDrop;
  // Re-fetches the (cached) rule book built in the forward pass.
  auto _rules =
      _m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  // NOTE(review): nActive is unused here; kept in case getNActive has a
  // side effect of populating metadata — confirm before removing.
  uInt nActive = _m.getNActive(outputSize);
  THTensor_(resizeAs)(d_input_features, input_features);
  THTensor_(zero)(d_input_features);
  // NOTE(review): unlike the forward pass, iF/diF are NOT offset by
  // nFeaturesToDrop here — verify this is intended when nFeaturesToDrop > 0.
  auto iF = THTensor_(data)(input_features);
  auto oF = THTensor_(data)(output_features);
  auto diF = THTensor_(data)(d_input_features);
  auto doF = THTensor_(data)(d_output_features);
  for (auto &r : _rules) {
    uInt nHot = r.size() / 2;
    MaxPooling_BackwardPass<real>(iF, diF, oF, doF, nPlanes,
                                  input_features->stride[0],
                                  output_features->stride[0], &r[0], nHot);
  }
}
#endif
PyTorch/sparseconvnet/SCN/generic/CPU/MaxPooling.h
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CPU_MAXPOOLING_H
#define CPU_MAXPOOLING_H
#include "../SparseConvNet.h"
template
<
typename
T
>
void
MaxPooling_ForwardPass
(
T
*
input_features
,
T
*
output_features
,
uInt
nPlanes
,
uInt
input_stride
,
uInt
output_stride
,
uInt
*
rules
,
uInt
nHot
)
{
for
(
uInt
outSite
=
0
;
outSite
<
nHot
;
outSite
++
)
{
uInt
i
=
rules
[
2
*
outSite
]
*
input_stride
;
uInt
o
=
rules
[
2
*
outSite
+
1
]
*
output_stride
;
for
(
uInt
plane
=
0
;
plane
<
nPlanes
;
plane
++
)
if
(
output_features
[
o
+
plane
]
<
input_features
[
i
+
plane
])
output_features
[
o
+
plane
]
=
input_features
[
i
+
plane
];
}
}
template
<
typename
T
>
void
MaxPooling_BackwardPass
(
T
*
input_features
,
T
*
d_input_features
,
T
*
output_features
,
T
*
d_output_features
,
uInt
nPlanes
,
uInt
input_stride
,
uInt
output_stride
,
uInt
*
rules
,
uInt
nHot
)
{
for
(
uInt
outSite
=
0
;
outSite
<
nHot
;
outSite
++
)
{
uInt
i
=
rules
[
2
*
outSite
]
*
input_stride
;
uInt
o
=
rules
[
2
*
outSite
+
1
]
*
output_stride
;
for
(
uInt
plane
=
0
;
plane
<
nPlanes
;
plane
++
)
if
(
output_features
[
o
+
plane
]
==
input_features
[
i
+
plane
])
d_input_features
[
i
+
plane
]
+=
d_output_features
[
o
+
plane
];
}
}
#endif
/* CPU_MAXPOOLING_H */
PyTorch/sparseconvnet/SCN/generic/CPU/NetworkInNetwork.cpp
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/CPU/NetworkInNetwork.cpp"
#else
// NetworkInNetwork (1x1 convolution) forward pass on CPU:
//   output[nActive x output_nPlanes] = input[nActive x input_nPlanes] * W (+ b)
// Returns the multiply count nActive*input_nPlanes*output_nPlanes —
// presumably a FLOP estimate for profiling; confirm with callers.
extern "C" double scn_R_(NetworkInNetwork_updateOutput)(
    THTensor *input_features_, THTensor *output_features_, THTensor *weight_,
    THTensor *bias_) {
  auto nActive = input_features_->size[0];
  auto input_nPlanes = weight_->size[0];  // weight is input x output, row-major
  auto output_nPlanes = weight_->size[1];
  // Resize BEFORE taking the data pointer below.
  THTensor_(resize2d)(output_features_, nActive, output_nPlanes);
  auto input_features = THTensor_(data)(input_features_);
  auto output_features = THTensor_(data)(output_features_);
  auto weight = THTensor_(data)(weight_);
  if (bias_ != nullptr) {
    // Broadcast the bias into every output row, then accumulate the GEMM on
    // top of it (beta = 1).
    auto bias = THTensor_(data)(bias_);
    for (uInt row = 0; row < nActive; row++)
      for (uInt column = 0; column < output_nPlanes; column++)
        output_features[row * output_nPlanes + column] = bias[column];
    // Do GEMM (note: gemm assumes column-major matrices)
    // buffer is l*m (row-major)
    // weight is r*m (row-major)
    // output_features is l*r (row-major)
    // buffer * T(weights) + bias -> output_features
    THBlas_(gemm)('n', 'n', output_nPlanes, // r
                  nActive,                  // l
                  input_nPlanes,            // m
                  1,                        // alpha
                  weight, output_nPlanes,   // r
                  input_features, input_nPlanes, // m
                  1,                        // beta
                  output_features, output_nPlanes // r
                  );
  } else {
    // No bias: zero then overwrite with beta = 0 (the zero also clears any
    // stale values such as NaNs before the GEMM writes).
    THTensor_(zero)(output_features_);
    THBlas_(gemm)('n', 'n', output_nPlanes, // r
                  nActive,                  // l
                  input_nPlanes,            // m
                  1,                        // alpha
                  weight, output_nPlanes,   // r
                  input_features, input_nPlanes, // m
                  0,                        // beta
                  output_features, output_nPlanes // r
                  );
  }
  return nActive * input_nPlanes * output_nPlanes;
}
// NetworkInNetwork backward pass w.r.t. the input:
//   d_input[nActive x input_nPlanes] = d_output * W^T
extern "C" void scn_R_(NetworkInNetwork_updateGradInput)(
    THTensor *d_input_features_, THTensor *d_output_features_,
    THTensor *weight_) {
  auto nActive = d_output_features_->size[0];
  auto input_nPlanes = weight_->size[0];
  auto output_nPlanes = weight_->size[1];
  // Resize/zero BEFORE taking data pointers.
  THTensor_(resize2d)(d_input_features_, nActive, input_nPlanes);
  THTensor_(zero)(d_input_features_);
  auto d_input_features = THTensor_(data)(d_input_features_);
  auto d_output_features = THTensor_(data)(d_output_features_);
  auto weight = THTensor_(data)(weight_);
  // Do GEMM (note: gemm assumes column-major matrices)
  // d_output_features is l*m (row-major)
  // weights is m*r (row-major)
  // d_buffer is l*r (row-major)
  // d_output_features * weight -> d_buffer
  THBlas_(gemm)('t', 'n', input_nPlanes, // r
                nActive,                 // l
                output_nPlanes,          // m
                1,                       // alpha
                weight, output_nPlanes,  // m
                d_output_features, output_nPlanes, // m
                0,                       // beta
                d_input_features, input_nPlanes // r
                );
}
// NetworkInNetwork parameter-gradient accumulation:
//   d_weight += input^T * d_output   (beta = 1, so gradients accumulate)
//   d_bias   += column sums of d_output (if d_bias_ is provided)
extern "C" void scn_R_(NetworkInNetwork_accGradParameters)(
    THTensor *input_features_, THTensor *d_output_features_,
    THTensor *d_weight_, THTensor *d_bias_) {
  auto nActive = input_features_->size[0];
  auto input_nPlanes = d_weight_->size[0];
  auto output_nPlanes = d_weight_->size[1];
  auto input_features = THTensor_(data)(input_features_);
  auto d_output_features = THTensor_(data)(d_output_features_);
  auto d_weight = THTensor_(data)(d_weight_);
  // (Removed a dead local here: `auto d_bias = d_bias_ and
  // THTensor_(data)(d_bias_);` produced a bool from `and`-ing two pointers
  // and was shadowed by the declaration inside the if-block below.)
  // Do GEMM (note: gemm assumes column-major matrices)
  // d_output_features is m*l (row-major)
  // buffer is m*r (row-major)
  // weights is l*r (row-major)
  // T(d_output_features) * buffer -> d_weight
  THBlas_(gemm)('n', 't', output_nPlanes, // r
                input_nPlanes,            // l
                nActive,                  // m
                1,                        // alpha
                d_output_features, output_nPlanes, // r
                input_features, input_nPlanes,     // l
                1,                        // beta (accumulate)
                d_weight, output_nPlanes  // r
                );
  if (d_bias_) {
    auto d_bias = THTensor_(data)(d_bias_);
    // Bias gradient = sum of d_output_features over the active rows.
    for (uInt row = 0; row < nActive; row++)
      for (uInt i = 0; i < output_nPlanes; i++)
        d_bias[i] += d_output_features[row * output_nPlanes + i];
  }
}
#endif
PyTorch/sparseconvnet/SCN/generic/CPU/NetworkInNetwork.h
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CPU_NetworkInNetwork_H
#define CPU_NetworkInNetwork_H
#include "../SparseConvNet.h"
#include "Convolution.h"
// buffer must have size >= output_nActive * filterVolume * input_nPlanes
// NetworkInNetwork (1x1 convolution) forward pass helper:
//   output[output_nActive x output_nPlanes] =
//       input[output_nActive x input_nPlanes] * W (+ b)
// using the supplied BLAS-style column-major `gemm`.
//
// FIX: the original body referenced `buffer` and `filterVolume`, which are
// not parameters of this function (apparently copy-pasted from the general
// Convolution helper) and would fail to compile on instantiation. For a 1x1
// convolution the filter volume is 1 and no gather buffer is needed, so the
// GEMM reads `input_features` directly — matching the working TH
// implementation in NetworkInNetwork.cpp.
template <typename T>
void NetworkInNetwork_ForwardPass(
    T *input_features, uInt input_nPlanes, T *output_features,
    uInt output_nPlanes, T *weight, T *bias, uInt output_nActive,
    void (*gemm)(char transa, char transb, long m, long n, long k, T alpha,
                 T *a, long lda, T *b, long ldb, T beta, T *c, long ldc)) {
  if (bias != nullptr) {
    // Broadcast bias into every output row, then accumulate (beta = 1).
    for (uInt row = 0; row < output_nActive; row++)
      for (uInt column = 0; column < output_nPlanes; column++)
        output_features[row * output_nPlanes + column] = bias[column];
    // Do GEMM (note: gemm assumes column-major matrices)
    // input_features is l*m (row-major)
    // weight is r*m (row-major)
    // output_features is l*r (row-major)
    // input_features * T(weights) + bias -> output_features
    (*gemm)('n', 'n', output_nPlanes, // r
            output_nActive,           // l
            input_nPlanes,            // m
            1,                        // alpha
            weight, output_nPlanes,   // r
            input_features, input_nPlanes, // m
            1,                        // beta
            output_features, output_nPlanes // r
            );
  } else {
    (*gemm)('n', 'n', output_nPlanes, // r
            output_nActive,           // l
            input_nPlanes,            // m
            1,                        // alpha
            weight, output_nPlanes,   // r
            input_features, input_nPlanes, // m
            0,                        // beta
            output_features, output_nPlanes // r
            );
  }
}
// Backward pass w.r.t. the input: d_output * W -> d_buffer, then scatter
// d_buffer rows into d_input_features via `rules` (entries equal to
// uInt_MAX mark inactive positions and are skipped).
// d_buffer must hold output_nActive * filterVolume * input_nPlanes values.
template <typename T>
void NetworkInNetwork_BackwardPass(
    T *d_input_features, uInt input_nPlanes, T *d_output_features,
    uInt output_nPlanes, T *weight, uInt *rules, uInt filterVolume,
    uInt output_nActive, T *d_buffer,
    void (*gemm)(char transa, char transb, long m, long n, long k, T alpha,
                 T *a, long lda, T *b, long ldb, T beta, T *c, long ldc)) {
  // Do GEMM (note: gemm assumes column-major matrices)
  // d_output_features is l*m (row-major)
  // weights is m*r (row-major)
  // d_buffer is l*r (row-major)
  // d_output_features * weight -> d_buffer
  (*gemm)('t', 'n', input_nPlanes * filterVolume, // r
          output_nActive,                         // l
          output_nPlanes,                         // m
          1,                                      // alpha
          weight, output_nPlanes,                 // m
          d_output_features, output_nPlanes,      // m
          0,                                      // beta
          d_buffer, input_nPlanes * filterVolume  // r
          );
  // Use rules and d_buffer to accumulate gradient information into d_input
  for (uInt row = 0; row < output_nActive * filterVolume; row++) {
    auto r = rules[row];
    if (r != uInt_MAX) // 2^32-1 sentinel: no input at this position
      for (uInt i = 0; i < input_nPlanes; i++)
        d_input_features[r * input_nPlanes + i] +=
            d_buffer[row * input_nPlanes + i];
  }
}
// Parameter gradients: gathers input rows into `buffer` according to
// `rules` (zero-filling inactive uInt_MAX positions), then
//   d_weight += d_output^T * buffer   (beta = 1, accumulating)
//   d_bias   += column sums of d_output (if d_bias is non-null)
// buffer must hold output_nActive * filterVolume * input_nPlanes values.
template <typename T>
void NetworkInNetwork_GradWeights(
    T *input_features, uInt input_nPlanes, T *d_output_features,
    uInt output_nPlanes, T *d_weight, T *d_bias, uInt *rules,
    uInt filterVolume, uInt output_nActive, T *buffer,
    void (*gemm)(char transa, char transb, long m, long n, long k, T alpha,
                 T *a, long lda, T *b, long ldb, T beta, T *c, long ldc)) {
  // d_weight
  // Use input_features and rules to fill buffer
  for (uInt row = 0; row < output_nActive * filterVolume; row++) {
    if (rules[row] == uInt_MAX) {
      // 2^32-1 sentinel: inactive position, contribute zeros.
      std::memset(buffer + row * input_nPlanes, 0,
                  sizeof(T) * input_nPlanes);
    } else {
      std::memcpy(buffer + row * input_nPlanes,
                  input_features + rules[row] * input_nPlanes,
                  sizeof(T) * input_nPlanes);
    }
  }
  // Do GEMM (note: gemm assumes column-major matrices)
  // d_output_features is m*l (row-major)
  // buffer is m*r (row-major)
  // weights is l*r (row-major)
  // T(d_output_features) * buffer -> d_weight
  (*gemm)('n', 't', output_nPlanes,          // r
          input_nPlanes * filterVolume,      // l
          output_nActive,                    // m
          1,                                 // alpha
          d_output_features, output_nPlanes, // r
          buffer, input_nPlanes * filterVolume, // l
          1,                                 // beta (accumulate)
          d_weight, output_nPlanes           // r
          );
  if (d_bias)
    // Bias gradient = sum of d_output_features over active rows.
    for (uInt row = 0; row < output_nActive; row++)
      for (uInt i = 0; i < output_nPlanes; i++)
        d_bias[i] += d_output_features[row * output_nPlanes + i];
}
#endif
/* CPU_NetworkInNetwork_H */
PyTorch/sparseconvnet/SCN/generic/CPU/SparseToDense.cpp
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE_
#define TH_GENERIC_FILE_ "generic/CPU/SparseToDense.cpp"
#else
#include "SparseToDense.h"
// Converts a sparse feature tensor into a dense tensor of shape
// (batch, nPlanes, spatial...), zero-filled except at active sites.
extern "C" void scn_DR_(SparseToDense_updateOutput)(
    THLongTensor *inputSize, void **m, THTensor *input_features,
    THTensor *output_features, void *rulesBuffer) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  {
    // Build the dense output shape: (batch, planes, spatial dims...).
    long sz[Dimension + 2];
    sz[0] = _m.inputSGs->size(); // number of sparse grids = batch size
    sz[1] = input_features->size[1];
    for (int i = 0; i < Dimension; i++) {
      auto x = THLongTensor_data(inputSize)[i];
      sz[i + 2] = x;
    }
    THTensor_(resizeNd)(output_features, Dimension + 2, sz, NULL);
    THTensor_(zero)(output_features);
  }
  auto _rules = _m.getSparseToDenseRuleBook(inputSize, true);
  // One rule list per spatial site; _rules.size() is the spatial volume.
  auto spatialVolume = _rules.size();
  uInt nPlanes = input_features->size[1];
  auto iF = THTensor_(data)(input_features);
  auto oF = THTensor_(data)(output_features);
  for (auto &r : _rules) {
    uInt nHot = r.size() / 2;
    SparseToDense_ForwardPass<real>(iF, oF, nPlanes, spatialVolume, &r[0],
                                    nHot);
    // NOTE(review): oF advances by spatialVolume per rule while the kernel
    // indexes (sample*nPlanes+plane)*spatialVolume relative to oF — confirm
    // this matches the intended dense memory layout.
    oF += spatialVolume;
  }
}
// Backward of SparseToDense: gathers gradients from the dense tensor back
// into the sparse layout (mirror of the forward scatter).
extern "C" void scn_DR_(SparseToDense_updateGradInput)(
    THLongTensor *inputSize, void **m, THTensor *input_features,
    THTensor *d_input_features, THTensor *d_output_features,
    void *rulesBuffer) {
  // Resize/zero BEFORE taking the data pointer below.
  THTensor_(resizeAs)(d_input_features, input_features);
  THTensor_(zero)(d_input_features);
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto _rules = _m.getSparseToDenseRuleBook(inputSize, true);
  auto spatialVolume = _rules.size();
  uInt nPlanes = d_input_features->size[1];
  auto diF = THTensor_(data)(d_input_features);
  auto doF = THTensor_(data)(d_output_features);
  for (auto &r : _rules) {
    uInt nHot = r.size() / 2;
    SparseToDense_BackwardPass<real>(diF, doF, nPlanes, spatialVolume, &r[0],
                                     nHot);
    // Advance to the next spatial site's slice (see forward-pass NOTE).
    doF += spatialVolume;
  }
}
#endif
PyTorch/sparseconvnet/SCN/generic/CPU/SparseToDense.h
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CPU_SPARSETODENSE_H
#define CPU_SPARSETODENSE_H
#include "../SparseConvNet.h"
template
<
typename
T
>
void
SparseToDense_ForwardPass
(
T
*
input_features
,
T
*
output_features
,
uInt
nPlanes
,
uInt
spatialVolume
,
uInt
*
rules
,
int
nHot
)
{
for
(
uInt
outSite
=
0
;
outSite
<
nHot
;
outSite
++
)
{
T
*
i
=
&
input_features
[
rules
[
2
*
outSite
]
*
nPlanes
];
uInt
sample
=
rules
[
2
*
outSite
+
1
];
for
(
uInt
plane
=
0
;
plane
<
nPlanes
;
plane
++
)
output_features
[(
sample
*
nPlanes
+
plane
)
*
spatialVolume
]
=
i
[
plane
];
}
}
template
<
typename
T
>
void
SparseToDense_BackwardPass
(
T
*
d_input_features
,
T
*
d_output_features
,
uInt
nPlanes
,
uInt
spatialVolume
,
uInt
*
rules
,
int
nHot
)
{
for
(
uInt
outSite
=
0
;
outSite
<
nHot
;
outSite
++
)
{
T
*
di
=
&
d_input_features
[
rules
[
2
*
outSite
]
*
nPlanes
];
uInt
sample
=
rules
[
2
*
outSite
+
1
];
for
(
uInt
plane
=
0
;
plane
<
nPlanes
;
plane
++
)
di
[
plane
]
=
d_output_features
[(
sample
*
nPlanes
+
plane
)
*
spatialVolume
];
}
}
#endif
/* CPU_SPARSETODENSE_H */
PyTorch/sparseconvnet/SCN/generic/CPU/THGenerateDimFloatTypes.h
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
// Instantiates TH_GENERIC_FILE_ once for every Dimension in 1..10 and, via
// THGenerateFloatTypes.h, for both float and double. TH_GENERIC_FILE must be
// re-defined before each include because THGenerateFloatTypes.h #undefs it.
#ifndef TH_GENERIC_FILE_
#error "Define TH_GENERIC_FILE_ before including THGenerateDimFloatTypes.h"
#endif
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#define Dimension 1
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateFloatTypes.h"
#undef Dimension
#define Dimension 2
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateFloatTypes.h"
#undef Dimension
#define Dimension 3
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateFloatTypes.h"
#undef Dimension
#define Dimension 4
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateFloatTypes.h"
#undef Dimension
#define Dimension 5
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateFloatTypes.h"
#undef Dimension
#define Dimension 6
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateFloatTypes.h"
#undef Dimension
#define Dimension 7
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateFloatTypes.h"
#undef Dimension
#define Dimension 8
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateFloatTypes.h"
#undef Dimension
#define Dimension 9
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateFloatTypes.h"
#undef Dimension
#define Dimension 10
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateFloatTypes.h"
#undef Dimension
#undef TH_GENERIC_FILE_
PyTorch/sparseconvnet/SCN/generic/CPU/THGenerateFloatTypes.h
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
// Expands TH_GENERIC_FILE twice: once with real=float (Real=Float) and once
// with real=double (Real=Double), following the standard TH code-generation
// convention. Undefines TH_GENERIC_FILE afterwards, so callers that include
// this header repeatedly must re-define it each time.
#ifndef TH_GENERIC_FILE
#error "You must define TH_GENERIC_FILE before including THGenerateFloatTypes.h"
#endif
#define real float
#define accreal double
#define Real Float
#define TH_REAL_IS_FLOAT
#line 1 TH_GENERIC_FILE
#include TH_GENERIC_FILE
#undef accreal
#undef real
#undef Real
#undef TH_REAL_IS_FLOAT
#define real double
#define accreal double
#define Real Double
#define TH_REAL_IS_DOUBLE
#line 1 TH_GENERIC_FILE
#include TH_GENERIC_FILE
#undef accreal
#undef real
#undef Real
#undef TH_REAL_IS_DOUBLE
#undef TH_GENERIC_FILE
PyTorch/sparseconvnet/SCN/generic/GPU/ActivePooling.cu
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE_
#define TH_GENERIC_FILE_ "generic/GPU/ActivePooling.cu"
#else
#include "ActivePooling.h"
// GPU active pooling forward: pools every active site of each batch sample
// into a single (batchSize x nPlanes) output row (sum, or mean when
// `average` is true). Rules are staged host->device through rulesBuffer in
// chunks of at most rowBatchSize rows.
extern "C" void scn_DR_(ActivePooling_updateOutput)(
    THLongTensor *inputSize, void **m, THCTensor *input_features,
    THCTensor *output_features, THCITensor *rulesBuffer, bool average) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  uInt nPlanes = input_features->size[1];
  auto _rules = _m.getActivePoolingRuleBook(inputSize);
  // _rules[1] carries the metadata pair; _rules[0] the flat rule data.
  uInt batchSize = _rules[1][0];
  uInt maxActive = _rules[1][1];
  THCTensor_(resize2d)(state, output_features, batchSize, nPlanes);
  THCTensor_(zero)(state, output_features);
  // Ensure the device-side staging buffer holds at least 2^22 entries.
  if (THCITensor_nElement(state, rulesBuffer) < 1 << 22)
    THCITensor_resize1d(state, rulesBuffer, 1 << 22);
  uInt *rb = (uInt *)THCITensor_data(state, rulesBuffer);
  // Each sample's rules occupy (maxActive + 1) entries (count + indices);
  // cap the chunk so it fits in the 2^22-entry buffer (and at 32768 rows).
  uInt rowBatchSize = std::min((uInt)32768, (1 << 22) / (maxActive + 1));
  THAssert(rowBatchSize > 0);
  auto iF = THCTensor_(data)(state, input_features);
  auto oF = THCTensor_(data)(state, output_features);
  for (uInt o = 0; o < batchSize; o += rowBatchSize) {
    uInt batchSize_ = std::min(rowBatchSize, (uInt)(batchSize - o));
    // Copy this chunk's rules to the device before launching the kernel.
    cudaMemcpy(rb, &_rules[0][o * (maxActive + 1)],
               sizeof(uInt) * (maxActive + 1) * batchSize_,
               cudaMemcpyHostToDevice);
    ActivePooling_ForwardPass<real>(iF, oF + o * nPlanes, batchSize_,
                                    maxActive, nPlanes, rb, average);
  }
}
// GPU active pooling backward: distributes each sample's output gradient to
// its active input sites (scaled by 1/nActive when `average` is true).
// Mirrors the chunked host->device rule staging of the forward pass.
extern "C" void scn_DR_(ActivePooling_updateGradInput)(
    THLongTensor *inputSize, void **m, THCTensor *input_features,
    THCTensor *d_input_features, THCTensor *d_output_features,
    THCITensor *rulesBuffer, bool average) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  uInt nPlanes = input_features->size[1];
  auto _rules = _m.getActivePoolingRuleBook(inputSize);
  uInt batchSize = _rules[1][0];
  uInt maxActive = _rules[1][1];
  // Resize/zero BEFORE taking data pointers.
  THCTensor_(resizeAs)(state, d_input_features, input_features);
  THCTensor_(zero)(state, d_input_features);
  if (THCITensor_nElement(state, rulesBuffer) < 1 << 22)
    THCITensor_resize1d(state, rulesBuffer, 1 << 22);
  uInt *rb = (uInt *)THCITensor_data(state, rulesBuffer);
  uInt rowBatchSize = std::min((uInt)32768, (1 << 22) / (maxActive + 1));
  THAssert(rowBatchSize > 0);
  auto diF = THCTensor_(data)(state, d_input_features);
  auto doF = THCTensor_(data)(state, d_output_features);
  for (uInt o = 0; o < batchSize; o += rowBatchSize) {
    uInt batchSize_ = std::min(rowBatchSize, (uInt)(batchSize - o));
    cudaMemcpy(rb, &_rules[0][o * (maxActive + 1)],
               sizeof(uInt) * (maxActive + 1) * batchSize_,
               cudaMemcpyHostToDevice);
    ActivePooling_BackwardPass<real>(diF, doF + o * nPlanes, batchSize_,
                                     maxActive, nPlanes, rb, average);
  }
}
#endif
PyTorch/sparseconvnet/SCN/generic/GPU/ActivePooling.h
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef GPU_ACTIVEPOOLING_H
#define GPU_ACTIVEPOOLING_H
// CUDA kernel: one block per batch sample. Each sample's rule record is
// (nActive, idx_0, ..., idx_{nActive-1}) laid out with stride maxActive+1.
// Accumulates (or averages, when `average`) the active input rows into the
// sample's output row; threads stride the planes by 32 (the launcher caps
// blockDim.x at 32).
template <typename T>
__global__ void ActivePooling_fp(T *input_features, T *output_features,
                                 uInt maxActive, uInt nPlanes, uInt *rules,
                                 bool average) {
  T *out = &output_features[blockIdx.x * nPlanes];
  uInt *r = &rules[blockIdx.x * (maxActive + 1)];
  uInt nActive = *r++; // first entry is the count, the rest are row indices
  T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
  while (nActive-- > 0) {
    T *inp = &input_features[(*r++) * nPlanes];
    for (uInt plane = threadIdx.x; plane < nPlanes; plane += 32)
      out[plane] += inp[plane] * multiplier;
  }
}
// Launcher for ActivePooling_fp: one block per sample, up to 32 threads
// (fewer when nPlanes < 32), on the current THC stream.
template <typename T>
void ActivePooling_ForwardPass(T *input_features, T *output_features,
                               uInt batchSize, uInt maxActive, uInt nPlanes,
                               uInt *rules, bool average) {
  uInt kernelBlockDim = std::min(nPlanes, (uInt)32);
  ActivePooling_fp<T><<<batchSize, kernelBlockDim, 0,
                        THCState_getCurrentStream(state)>>>(
      input_features, output_features, maxActive, nPlanes, rules, average);
}
// CUDA kernel: backward of ActivePooling_fp. Writes each active input row's
// gradient as (output gradient * multiplier).
// NOTE(review): this assigns (=) rather than accumulates (+=) into
// d_input_features — correct only if every input site belongs to exactly
// one pooling output; confirm that invariant holds for the rule book.
template <typename T>
__global__ void ActivePooling_bp(T *d_input_features, T *d_output_features,
                                 uInt maxActive, uInt nPlanes, uInt *rules,
                                 bool average) {
  T *out = &d_output_features[blockIdx.x * nPlanes];
  uInt *r = &rules[blockIdx.x * (maxActive + 1)];
  uInt nActive = *r++; // first entry is the count, the rest are row indices
  T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
  while (nActive-- > 0) {
    T *inp = &d_input_features[(*r++) * nPlanes];
    for (uInt plane = threadIdx.x; plane < nPlanes; plane += 32)
      inp[plane] = out[plane] * multiplier;
  }
}
// Launcher for ActivePooling_bp: one block per sample, up to 32 threads,
// on the current THC stream (mirrors ActivePooling_ForwardPass).
template <typename T>
void ActivePooling_BackwardPass(T *d_input_features, T *d_output_features,
                                uInt batchSize, uInt maxActive, uInt nPlanes,
                                uInt *rules, bool average) {
  uInt kernelBlockDim = std::min(nPlanes, (uInt)32);
  ActivePooling_bp<T><<<batchSize, kernelBlockDim, 0,
                        THCState_getCurrentStream(state)>>>(
      d_input_features, d_output_features, maxActive, nPlanes, rules,
      average);
}
#endif
/* GPU_ActivePOOLING_H */
PyTorch/sparseconvnet/SCN/generic/GPU/AffineReluTrivialConvolution.cu
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/GPU/AffineReluTrivialConvolution.cu"
#else
#include "AffineReluTrivialConvolution.h"
#include <algorithm>
#include <iostream>
// Fused affine transform + ReLU + 1x1 ("trivial") convolution, GPU forward.
// convWeight is (input_nPlanes x output_nPlanes); output is resized to
// (nActive, output_nPlanes) before the kernel dispatch.
extern "C" void scn_R_(AffineReluTrivialConvolution_updateOutput)(
    THCTensor *input_features, THCTensor *output_features,
    THCTensor *affineWeight, THCTensor *affineBias, THCTensor *convWeight) {
  // Resize BEFORE taking data pointers / strides below.
  THCTensor_(resize2d)(state, output_features, input_features->size[0],
                       convWeight->size[1]);
  dAffineReluTrivialConvolution_forward<real>(
      THCTensor_(data)(state, input_features),
      THCTensor_(data)(state, output_features),
      THCTensor_(data)(state, affineWeight),
      THCTensor_(data)(state, affineBias),
      THCTensor_(data)(state, convWeight),
      convWeight->size[0],        // input_nPlanes
      input_features->stride[0],  // input_stride
      convWeight->size[1],        // output_nPlanes
      output_features->size[1],   // output_stride (contiguous after resize2d)
      input_features->size[0]);   // nActive
}
// Backward of the fused affine+ReLU+1x1 convolution: computes d_input and
// the gradients of both parameter sets in one kernel dispatch.
// `additiveGrad` is forwarded to the kernel (presumably selects += vs = for
// d_input — confirm in dAffineReluTrivialConvolution_backward_dW).
extern "C" void scn_R_(AffineReluTrivialConvolution_backward)(
    THCTensor *input_features, THCTensor *d_input_features,
    THCTensor *d_output_features, THCTensor *affineWeight,
    THCTensor *d_affineWeight, THCTensor *affineBias, THCTensor *d_affineBias,
    THCTensor *convWeight, THCTensor *d_convWeight, bool additiveGrad) {
  // Resize BEFORE taking data pointers below.
  THCTensor_(resizeAs)(state, d_input_features, input_features);
  dAffineReluTrivialConvolution_backward_dW<real>(
      THCTensor_(data)(state, input_features),
      THCTensor_(data)(state, d_input_features),
      THCTensor_(data)(state, d_output_features),
      THCTensor_(data)(state, affineWeight),
      THCTensor_(data)(state, d_affineWeight),
      THCTensor_(data)(state, affineBias),
      THCTensor_(data)(state, d_affineBias),
      THCTensor_(data)(state, convWeight),
      THCTensor_(data)(state, d_convWeight),
      convWeight->size[0],            // input_nPlanes
      input_features->stride[0],      // input_stride
      convWeight->size[1],            // output_nPlanes
      d_output_features->stride[0],   // output_stride
      input_features->size[0],        // nActive
      additiveGrad);
}
#endif
PyTorch/sparseconvnet/SCN/generic/GPU/AffineReluTrivialConvolution.h
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef GPU_AFFINERELUTRIVIALCONVOLUTION_H
#define GPU_AFFINERELUTRIVIALCONVOLUTION_H
// check if A+B is faster than just B
// check if loading affineBias into shared memory is faster than loading
// multiple times (if not try 64,16 backwards case)
// Fused affine+ReLU+1x1-convolution forward kernel, fast path: assumes
// nActive is a multiple of K (no bounds checks; forwardB is the checked
// variant). Tiles the K x K weight block and the affine parameters into
// shared memory and accumulates V outputs per thread.
template <typename T, uInt K, uInt V>
__global__ void dAffineReluTrivialConvolution_forwardA(
    T *inFeatures, T *outFeatures, T *affineWeight, T *affineBias,
    T *convWeight, uInt input_nPlanes, uInt input_stride,
    uInt output_nPlanes, uInt output_stride, uInt nActive) {
  // nActive must be a multiple of K!!
  // Input x Weight -> Output
  // blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
  // K is a multiple of V,
  // nActive x KM -> nActive x KN - parallel over N,nActive - loop over M
  uInt M = input_nPlanes / K;
  // N = gridDim.y == output_nPlanes/K
  uInt n = blockIdx.y;
  // Each y-block handles one K-wide column strip of the output/weights.
  outFeatures += n * K;
  convWeight += n * K;
  T O[V];               // per-thread output accumulators
  __shared__ T I[K][K]; // affine+ReLU'd input tile
  __shared__ T AW[K];   // affine scale for the current K input planes
  __shared__ T AB[K];   // affine bias for the current K input planes
  __shared__ T CW[K][K]; // convolution weight tile
  const uInt tx = threadIdx.x;
  int ty[V]; // the V rows this thread covers within the K x K tile
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int m = 0; m < M; m++) {
    // Read affineWeight, affineBias and convWeight
    if (ty[0] == 0) {
      AW[tx] = affineWeight[tx];
      AB[tx] = affineBias[tx];
    }
#pragma unroll
    for (int v = 0; v < V; v++)
      CW[ty[v]][tx] = convWeight[ty[v] * output_nPlanes + tx];
    __syncthreads();
    for (uInt s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
      // Read input, do affine + relu, set O[]
#pragma unroll
      for (int v = 0; v < V; v++) {
        T i = inFeatures[(s + ty[v]) * input_stride + tx] * AW[tx] + AB[tx];
        I[ty[v]][tx] = (i > 0) ? i : 0; // ReLU
        if (m == 0) {
          O[v] = 0; // first input-plane block initializes the accumulator
        } else {
          // Later blocks accumulate onto what previous iterations wrote out.
          O[v] = outFeatures[(s + ty[v]) * output_stride + tx];
        }
      }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          O[v] += I[ty[v]][k] * CW[k][tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        outFeatures[(s + ty[v]) * output_stride + tx] = O[v];
      __syncthreads();
    }
    // Advance to the next K input planes.
    affineWeight += K;
    affineBias += K;
    convWeight += K * output_nPlanes;
    inFeatures += K;
  }
}
// Fused affine+ReLU+1x1-convolution forward kernel, bounds-checked variant
// of forwardA: identical tiling, but every input read and output write is
// guarded by `s + ty[v] < nActive`, so nActive need not be a multiple of K.
template <typename T, uInt K, uInt V>
__global__ void dAffineReluTrivialConvolution_forwardB(
    T *inFeatures, T *outFeatures, T *affineWeight, T *affineBias,
    T *convWeight, uInt input_nPlanes, uInt input_stride,
    uInt output_nPlanes, uInt output_stride, uInt nActive) {
  // Input x Weight -> Output
  // blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
  // K is a multiple of V,
  // nActive x KM -> nActive x KN - parallel over N,nActive - loop over M
  uInt M = input_nPlanes / K;
  // N = gridDim.y == output_nPlanes/K
  uInt n = blockIdx.y;
  outFeatures += n * K;
  convWeight += n * K;
  T O[V];               // per-thread output accumulators
  __shared__ T I[K][K]; // zz try K+1 trick A+B+backwards
  __shared__ T AW[K];
  __shared__ T AB[K];
  __shared__ T CW[K][K];
  const uInt tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int m = 0; m < M; m++) {
    // Read affineWeight, affineBias and convWeight
    if (ty[0] == 0) {
      AW[tx] = affineWeight[tx];
      AB[tx] = affineBias[tx];
    }
#pragma unroll
    for (int v = 0; v < V; v++)
      CW[ty[v]][tx] = convWeight[ty[v] * output_nPlanes + tx];
    __syncthreads();
    for (uInt s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
      // Read input, do affine + relu, set O[]
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (s + ty[v] < nActive) { // tail guard: partial final tile
          T i = inFeatures[(s + ty[v]) * input_stride + tx] * AW[tx] + AB[tx];
          I[ty[v]][tx] = (i > 0) ? i : 0; // ReLU
          if (m == 0) {
            O[v] = 0;
          } else {
            O[v] = outFeatures[(s + ty[v]) * output_stride + tx];
          }
        }
      }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          O[v] += I[ty[v]][k] * CW[k][tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nActive) // tail guard: only write valid rows
          outFeatures[(s + ty[v]) * output_stride + tx] = O[v];
      __syncthreads();
    }
    // Advance to the next K input planes.
    affineWeight += K;
    affineBias += K;
    convWeight += K * output_nPlanes;
    inFeatures += K;
  }
}
template
<
typename
T
>
void
dAffineReluTrivialConvolution_forward
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
affineWeight
,
T
*
affineBias
,
T
*
convWeight
,
uInt
input_nPlanes
,
uInt
input_stride
,
uInt
output_nPlanes
,
uInt
output_stride
,
uInt
nActive
)
{
{
const
uInt
K
=
64
;
const
uInt
V
=
16
;
if
(
input_nPlanes
%
K
==
0
and
output_nPlanes
%
K
==
0
)
{
uInt
o
=
(
nActive
/
K
)
*
K
;
if
(
o
>
0
)
dAffineReluTrivialConvolution_forwardA
<
T
,
K
,
V
><<<
dim3
(
std
::
min
(
o
/
K
,
(
uInt
)
512
),
output_nPlanes
/
K
),
dim3
(
K
,
K
/
V
),
0
,
THCState_getCurrentStream
(
state
)
>>>
(
inFeatures
,
outFeatures
,
affineWeight
,
affineBias
,
convWeight
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
,
o
);
if
(
nActive
>
o
)
dAffineReluTrivialConvolution_forwardB
<
T
,
K
,
V
><<<
dim3
(
1
,
output_nPlanes
/
K
),
dim3
(
K
,
K
/
V
),
0
,
THCState_getCurrentStream
(
state
)
>>>
(
inFeatures
+
o
*
input_stride
,
outFeatures
+
o
*
output_stride
,
affineWeight
,
affineBias
,
convWeight
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
,
nActive
-
o
);
return
;
}
}
{
const
uInt
K
=
32
;
const
uInt
V
=
4
;
if
(
input_nPlanes
%
K
==
0
and
output_nPlanes
%
K
==
0
)
{
uInt
o
=
(
nActive
/
K
)
*
K
;
if
(
o
>
0
)
dAffineReluTrivialConvolution_forwardA
<
T
,
K
,
V
><<<
dim3
(
std
::
min
(
o
/
K
,
(
uInt
)
512
),
output_nPlanes
/
K
),
dim3
(
K
,
K
/
V
),
0
,
THCState_getCurrentStream
(
state
)
>>>
(
inFeatures
,
outFeatures
,
affineWeight
,
affineBias
,
convWeight
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
,
o
);
if
(
nActive
>
o
)
dAffineReluTrivialConvolution_forwardB
<
T
,
K
,
V
><<<
dim3
(
1
,
output_nPlanes
/
K
),
dim3
(
K
,
K
/
V
),
0
,
THCState_getCurrentStream
(
state
)
>>>
(
inFeatures
+
o
*
input_stride
,
outFeatures
+
o
*
output_stride
,
affineWeight
,
affineBias
,
convWeight
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
,
nActive
-
o
);
return
;
}
}
{
const
uInt
K
=
16
;
const
uInt
V
=
4
;
if
(
input_nPlanes
%
K
==
0
and
output_nPlanes
%
K
==
0
)
{
uInt
o
=
(
nActive
/
K
)
*
K
;
if
(
o
>
0
)
dAffineReluTrivialConvolution_forwardA
<
T
,
K
,
V
><<<
dim3
(
std
::
min
(
o
/
K
,
(
uInt
)
512
),
output_nPlanes
/
K
),
dim3
(
K
,
K
/
V
),
0
,
THCState_getCurrentStream
(
state
)
>>>
(
inFeatures
,
outFeatures
,
affineWeight
,
affineBias
,
convWeight
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
,
o
);
if
(
nActive
>
o
)
dAffineReluTrivialConvolution_forwardB
<
T
,
K
,
V
><<<
dim3
(
1
,
output_nPlanes
/
K
),
dim3
(
K
,
K
/
V
),
0
,
THCState_getCurrentStream
(
state
)
>>>
(
inFeatures
+
o
*
input_stride
,
outFeatures
+
o
*
output_stride
,
affineWeight
,
affineBias
,
convWeight
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
,
nActive
-
o
);
return
;
}
}
{
const
uInt
K
=
8
;
const
uInt
V
=
2
;
if
(
input_nPlanes
%
K
==
0
and
output_nPlanes
%
K
==
0
)
{
uInt
o
=
(
nActive
/
K
)
*
K
;
if
(
o
>
0
)
dAffineReluTrivialConvolution_forwardA
<
T
,
K
,
V
><<<
dim3
(
std
::
min
(
o
/
K
,
(
uInt
)
512
),
output_nPlanes
/
K
),
dim3
(
K
,
K
/
V
),
0
,
THCState_getCurrentStream
(
state
)
>>>
(
inFeatures
,
outFeatures
,
affineWeight
,
affineBias
,
convWeight
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
,
o
);
if
(
nActive
>
o
)
dAffineReluTrivialConvolution_forwardB
<
T
,
K
,
V
><<<
dim3
(
1
,
output_nPlanes
/
K
),
dim3
(
K
,
K
/
V
),
0
,
THCState_getCurrentStream
(
state
)
>>>
(
inFeatures
+
o
*
input_stride
,
outFeatures
+
o
*
output_stride
,
affineWeight
,
affineBias
,
convWeight
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
,
nActive
-
o
);
return
;
}
}
assert
(
false
);
}
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
// Backward pass (unchecked variant): assumes nActive is a multiple of K.
// Computes, per K-row slice of input planes (one slice per blockIdx.y):
//   - dInFeatures (masked by the forward ReLU, chained through the affine
//     scale via dAW/dAB accumulation),
//   - dConvWeight (accumulated atomically across blocks),
//   - dAffineWeight / dAffineBias (accumulated atomically across blocks).
template <typename T, uInt K, uInt V>
__global__ void dAffineReluTrivialConvolution_backward_dW_A(
    T *inFeatures, T *dInFeatures, T *dOutFeatures, T *affineWeight,
    T *dAffineWeight, T *affineBias, T *dAffineBias, T *convWeight,
    T *dConvWeight, uInt input_nPlanes, uInt input_stride,
    uInt output_nPlanes, uInt output_stride, uInt nActive,
    bool additiveGrad) { // true: += into dInFeatures; false: overwrite
  // M = gridDim.y == input_nPlanes / K
  uInt N = output_nPlanes / K;
  uInt m = blockIdx.y;
  // Shift all pointers to this block's K-row slice of input planes.
  inFeatures += m * K;
  dInFeatures += m * K;
  convWeight += m * K * output_nPlanes;
  dConvWeight += m * K * output_nPlanes;
  affineWeight += m * K;
  dAffineWeight += m * K;
  affineBias += m * K;
  dAffineBias += m * K;
  T dI[V];  // per-thread input-gradient accumulators
  T dCW[V]; // per-thread conv-weight-gradient accumulators
  T i[V];   // raw (pre-affine) input values, needed for dAffineWeight
  T dAW = 0; // accumulated affine-weight gradient for column tx
  T dAB = 0; // accumulated affine-bias gradient for column tx
  __shared__ T I[K][K];  // post affine+ReLU input tile
  __shared__ T dO[K][K]; // output-gradient tile (after ReLU masking upstream)
  __shared__ T AW[K];
  __shared__ T AB[K];
  __shared__ T CW[K][K]; // conv weight tile
  const uInt tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  if (ty[0] == 0) {
    AW[tx] = affineWeight[tx];
    AB[tx] = affineBias[tx];
  }
  // Loop over the N blocks of K output planes.
  for (int n = 0; n < N; n++) {
    // Read w, reset dW
#pragma unroll
    for (int v = 0; v < V; v++) {
      CW[ty[v]][tx] = convWeight[ty[v] * output_nPlanes + tx];
      dCW[v] = 0;
    }
    __syncthreads();
    // Strided walk over sample tiles of K rows (no bounds checks).
    for (uInt s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
#pragma unroll
      for (int v = 0; v < V; v++)
        dI[v] = 0;
      __syncthreads();
      // Read input and dOutput
#pragma unroll
      for (int v = 0; v < V; v++) {
        T i_ = inFeatures[(s + ty[v]) * input_stride + tx];
        i[v] = i_; // keep raw input for the affine-weight gradient
        i_ = i_ * AW[tx] + AB[tx]; // re-apply the forward affine transform
        I[ty[v]][tx] = (i_ > 0) ? i_ : 0; // re-apply the forward ReLU
        dO[ty[v]][tx] = dOutFeatures[(s + ty[v]) * output_stride + tx];
      }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++) {
          dI[v] += dO[ty[v]][k] * CW[tx][k];  // dO x W^T
          dCW[v] += I[k][ty[v]] * dO[k][tx];  // I^T x dO
        }
#pragma unroll
      for (int v = 0; v < V; v++) {
        // ReLU mask: gradient passes only where the forward activation was > 0.
        dI[v] = (I[ty[v]][tx] > 0) ? dI[v] : 0;
        dAW += i[v] * dI[v];
        dAB += dI[v];
        if (additiveGrad)
          dInFeatures[(s + ty[v]) * input_stride + tx] += dI[v];
        else
          dInFeatures[(s + ty[v]) * input_stride + tx] = dI[v];
      }
      __syncthreads();
    }
#pragma unroll
    for (int v = 0; v < V; v++)
      atomicAdd(&dConvWeight[ty[v] * output_nPlanes + tx], dCW[v]);
    // Advance to the next K output planes.
    convWeight += K;
    dConvWeight += K;
    dOutFeatures += K;
    __syncthreads();
  }
  atomicAdd(&dAffineWeight[tx], dAW);
  atomicAdd(&dAffineBias[tx], dAB);
}
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
// Backward pass — bounds-checked tail variant of the "A" kernel above.
// Handles the final partial tile when nActive is not a multiple of K: every
// sample access is guarded by `s + ty[v] < nActive`, and out-of-range rows of
// the shared tiles are explicitly zeroed so the dCW accumulation (which sums
// over all K rows) stays correct.
template <typename T, uInt K, uInt V>
__global__ void dAffineReluTrivialConvolution_backward_dW_B(
    T *inFeatures, T *dInFeatures, T *dOutFeatures, T *affineWeight,
    T *dAffineWeight, T *affineBias, T *dAffineBias, T *convWeight,
    T *dConvWeight, uInt input_nPlanes, uInt input_stride,
    uInt output_nPlanes, uInt output_stride, uInt nActive,
    bool additiveGrad) { // true: += into dInFeatures; false: overwrite
  // M = gridDim.y == input_nPlanes / K
  uInt N = output_nPlanes / K;
  uInt m = blockIdx.y;
  // Shift all pointers to this block's K-row slice of input planes.
  inFeatures += m * K;
  dInFeatures += m * K;
  convWeight += m * K * output_nPlanes;
  dConvWeight += m * K * output_nPlanes;
  affineWeight += m * K;
  dAffineWeight += m * K;
  affineBias += m * K;
  dAffineBias += m * K;
  T dI[V];  // per-thread input-gradient accumulators
  T dCW[V]; // per-thread conv-weight-gradient accumulators
  T i[V];   // raw (pre-affine) input values
  T dAW = 0;
  T dAB = 0;
  __shared__ T I[K][K];
  __shared__ T dO[K][K];
  __shared__ T AW[K];
  __shared__ T AB[K];
  __shared__ T CW[K][K];
  const uInt tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  if (ty[0] == 0) {
    AW[tx] = affineWeight[tx];
    AB[tx] = affineBias[tx];
  }
  for (int n = 0; n < N; n++) {
    // Read w, reset dW
#pragma unroll
    for (int v = 0; v < V; v++) {
      CW[ty[v]][tx] = convWeight[ty[v] * output_nPlanes + tx];
      dCW[v] = 0;
    }
    __syncthreads();
    for (uInt s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
#pragma unroll
      for (int v = 0; v < V; v++)
        dI[v] = 0;
      __syncthreads();
      // Read input and dOutput
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nActive) {
          T i_ = inFeatures[(s + ty[v]) * input_stride + tx];
          i[v] = i_;
          i_ = i_ * AW[tx] + AB[tx]; // re-apply forward affine
          I[ty[v]][tx] = (i_ > 0) ? i_ : 0; // re-apply forward ReLU
          dO[ty[v]][tx] = dOutFeatures[(s + ty[v]) * output_stride + tx];
        } else {
          // Zero out-of-range rows so they contribute nothing to dCW below.
          i[v] = 0;
          I[ty[v]][tx] = 0;
          dO[ty[v]][tx] = 0;
        }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++) {
          dI[v] += dO[ty[v]][k] * CW[tx][k]; // dO x W^T
          dCW[v] += I[k][ty[v]] * dO[k][tx]; // I^T x dO
        }
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nActive) {
          // ReLU mask: pass gradient only where forward activation was > 0.
          dI[v] = (I[ty[v]][tx] > 0) ? dI[v] : 0;
          dAW += i[v] * dI[v];
          dAB += dI[v];
          if (additiveGrad)
            dInFeatures[(s + ty[v]) * input_stride + tx] += dI[v];
          else
            dInFeatures[(s + ty[v]) * input_stride + tx] = dI[v];
        }
      __syncthreads();
    }
#pragma unroll
    for (int v = 0; v < V; v++)
      atomicAdd(&dConvWeight[ty[v] * output_nPlanes + tx], dCW[v]);
    // Advance to the next K output planes.
    convWeight += K;
    dConvWeight += K;
    dOutFeatures += K;
    __syncthreads();
  }
  atomicAdd(&dAffineWeight[tx], dAW);
  atomicAdd(&dAffineBias[tx], dAB);
}
template
<
typename
T
>
void
dAffineReluTrivialConvolution_backward_dW
(
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
T
*
affineWeight
,
T
*
dAffineWeight
,
T
*
affineBias
,
T
*
dAffineBias
,
T
*
convWeight
,
T
*
dConvWeight
,
uInt
input_nPlanes
,
uInt
input_stride
,
uInt
output_nPlanes
,
uInt
output_stride
,
uInt
nActive
,
bool
additiveGrad
)
{
{
const
uInt
K
=
32
;
const
uInt
V
=
8
;
if
(
input_nPlanes
%
K
==
0
and
output_nPlanes
%
K
==
0
)
{
uInt
o
=
(
nActive
/
K
)
*
K
;
if
(
o
>
0
)
dAffineReluTrivialConvolution_backward_dW_A
<
T
,
K
,
V
><<<
dim3
(
std
::
min
(
o
/
K
,
(
uInt
)
512
),
input_nPlanes
/
K
),
dim3
(
K
,
K
/
V
),
0
,
THCState_getCurrentStream
(
state
)
>>>
(
inFeatures
,
dInFeatures
,
dOutFeatures
,
affineWeight
,
dAffineWeight
,
affineBias
,
dAffineBias
,
convWeight
,
dConvWeight
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
,
o
,
additiveGrad
);
if
(
nActive
>
o
)
dAffineReluTrivialConvolution_backward_dW_B
<
T
,
K
,
V
><<<
dim3
(
1
,
input_nPlanes
/
K
),
dim3
(
K
,
K
/
V
),
0
,
THCState_getCurrentStream
(
state
)
>>>
(
inFeatures
+
o
*
input_stride
,
dInFeatures
+
o
*
input_stride
,
dOutFeatures
+
o
*
output_stride
,
affineWeight
,
dAffineWeight
,
affineBias
,
dAffineBias
,
convWeight
,
dConvWeight
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
,
nActive
-
o
,
additiveGrad
);
return
;
}
}
{
const
uInt
K
=
16
;
const
uInt
V
=
4
;
if
(
input_nPlanes
%
K
==
0
and
output_nPlanes
%
K
==
0
)
{
uInt
o
=
(
nActive
/
K
)
*
K
;
if
(
o
>
0
)
dAffineReluTrivialConvolution_backward_dW_A
<
T
,
K
,
V
><<<
dim3
(
std
::
min
(
o
/
K
,
(
uInt
)
512
),
input_nPlanes
/
K
),
dim3
(
K
,
K
/
V
),
0
,
THCState_getCurrentStream
(
state
)
>>>
(
inFeatures
,
dInFeatures
,
dOutFeatures
,
affineWeight
,
dAffineWeight
,
affineBias
,
dAffineBias
,
convWeight
,
dConvWeight
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
,
o
,
additiveGrad
);
if
(
nActive
>
o
)
dAffineReluTrivialConvolution_backward_dW_B
<
T
,
K
,
V
><<<
dim3
(
1
,
input_nPlanes
/
K
),
dim3
(
K
,
K
/
V
),
0
,
THCState_getCurrentStream
(
state
)
>>>
(
inFeatures
+
o
*
input_stride
,
dInFeatures
+
o
*
input_stride
,
dOutFeatures
+
o
*
output_stride
,
affineWeight
,
dAffineWeight
,
affineBias
,
dAffineBias
,
convWeight
,
dConvWeight
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
,
nActive
-
o
,
additiveGrad
);
return
;
}
}
{
const
uInt
K
=
8
;
const
uInt
V
=
2
;
if
(
input_nPlanes
%
K
==
0
and
output_nPlanes
%
K
==
0
)
{
uInt
o
=
(
nActive
/
K
)
*
K
;
if
(
o
>
0
)
dAffineReluTrivialConvolution_backward_dW_A
<
T
,
K
,
V
><<<
dim3
(
std
::
min
(
o
/
K
,
(
uInt
)
512
),
input_nPlanes
/
K
),
dim3
(
K
,
K
/
V
),
0
,
THCState_getCurrentStream
(
state
)
>>>
(
inFeatures
,
dInFeatures
,
dOutFeatures
,
affineWeight
,
dAffineWeight
,
affineBias
,
dAffineBias
,
convWeight
,
dConvWeight
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
,
o
,
additiveGrad
);
if
(
nActive
>
o
)
dAffineReluTrivialConvolution_backward_dW_B
<
T
,
K
,
V
><<<
dim3
(
1
,
input_nPlanes
/
K
),
dim3
(
K
,
K
/
V
),
0
,
THCState_getCurrentStream
(
state
)
>>>
(
inFeatures
+
o
*
input_stride
,
dInFeatures
+
o
*
input_stride
,
dOutFeatures
+
o
*
output_stride
,
affineWeight
,
dAffineWeight
,
affineBias
,
dAffineBias
,
convWeight
,
dConvWeight
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
,
nActive
-
o
,
additiveGrad
);
return
;
}
}
}
#endif
PyTorch/sparseconvnet/SCN/generic/GPU/AveragePooling.cu
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE_
#define TH_GENERIC_FILE_ "generic/GPU/AveragePooling.cu"
#else
#include "AveragePooling.h"
#include "RuleBookIterator.h"
// Torch-facing entry point: sparse average-pooling forward pass.
// Resizes/zeroes output_features to (nActive x kept-planes), then runs the
// GPU forward pass over every rule-book batch via RULEBOOKITERATOR.
// nFeaturesToDrop skips that many leading input planes.
// NOTE(review): rbB / nHotB are presumably bound by the RULEBOOKITERATOR
// macro (per-batch rules pointer and rule count) — defined in
// "RuleBookIterator.h", not visible here.
extern "C" void scn_DR_(AveragePooling_updateOutput)(
    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *poolSize,
    THLongTensor *poolStride, void **m, THCTensor *input_features,
    THCTensor *output_features, long nFeaturesToDrop,
    THCITensor *rulesBuffer) {
  // Binds `_m` (metadata) and, presumably, `state`; macro from the SCN core.
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  uInt nPlanes = input_features->size[1] - nFeaturesToDrop;
  auto _rules =
      _m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  uInt nActive = _m.getNActive(outputSize);
  THCTensor_(resize2d)(state, output_features, nActive,
                       input_features->size[1] - nFeaturesToDrop);
  // Forward pass accumulates with atomicAdd, so start from zero.
  THCTensor_(zero)(state, output_features);
  // Offset the input pointer past the dropped leading planes.
  auto iF = THCTensor_(data)(state, input_features) + nFeaturesToDrop;
  auto oF = THCTensor_(data)(state, output_features);
  RULEBOOKITERATOR(AveragePooling_ForwardPass<real>(
                       THCState_getCurrentStream(state), iF, oF, nPlanes,
                       input_features->size[1], output_features->size[1], rbB,
                       nHotB, _rules.size());
                   , )
}
// Torch-facing entry point: sparse average-pooling backward pass.
// Resizes/zeroes d_input_features to match input_features, then accumulates
// input gradients from d_output_features over every rule-book batch.
// nFeaturesToDrop skips that many leading planes (their gradient stays zero).
// NOTE(review): rbB / nHotB are presumably bound by the RULEBOOKITERATOR
// macro — defined in "RuleBookIterator.h", not visible here.
extern "C" void scn_DR_(AveragePooling_updateGradInput)(
    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *poolSize,
    THLongTensor *poolStride, void **m, THCTensor *input_features,
    THCTensor *d_input_features, THCTensor *d_output_features,
    long nFeaturesToDrop, THCITensor *rulesBuffer) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  uInt nPlanes = input_features->size[1] - nFeaturesToDrop;
  auto _rules =
      _m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  uInt nActive = _m.getNActive(outputSize);
  THCTensor_(resizeAs)(state, d_input_features, input_features);
  // Backward pass accumulates (+=) into the input gradient, so zero it first.
  THCTensor_(zero)(state, d_input_features);
  auto diF = THCTensor_(data)(state, d_input_features) + nFeaturesToDrop;
  auto doF = THCTensor_(data)(state, d_output_features);
  RULEBOOKITERATOR(AveragePooling_BackwardPass<real>(
                       THCState_getCurrentStream(state), diF, doF, nPlanes,
                       input_features->size[1], d_output_features->size[1],
                       rbB, nHotB, _rules.size());
                   , )
}
#endif
PyTorch/sparseconvnet/SCN/generic/GPU/AveragePooling.h
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef GPU_AVERAGEPOOLING_H
#define GPU_AVERAGEPOOLING_H
// NTX must be >=2 so r is filled properly
// Average-pooling forward kernel. `rules` is a flat array of
// (input row, output row) pairs; each contributing input row is scaled by
// `alpha` (1/filterVolume at the call site) and atomically added into its
// output row. One NTY-row tile of rules is staged in shared memory per step;
// threadIdx.x strides over the nPlanes feature columns.
template <typename T, uInt NTX, uInt NTY>
__global__ void AveragePooling_fp(T *input_features, T *output_features,
                                  uInt nPlanes, uInt input_stride,
                                  uInt output_stride, uInt *rules, uInt nHot,
                                  T alpha) {
  __shared__ uInt r[NTY * 2];
  for (uInt n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
    {
      // Stage up to NTY (in,out) rule pairs into shared memory.
      uInt i = threadIdx.x + NTX * threadIdx.y;
      // Bug fix: the guard read "i < 2 * (n - nHot)". Since n < nHot inside
      // this loop and uInt is unsigned, n - nHot wrapped to a huge value, so
      // the guard was always true and the last partial tile read past the
      // end of rules[]. The correct bound on valid pairs is 2 * (nHot - n).
      if (i < NTY * 2 and i < 2 * (nHot - n))
        r[i] = rules[2 * n + i];
    }
    __syncthreads();
    if (n + threadIdx.y < nHot) {
      uInt i = r[2 * threadIdx.y] * input_stride;      // input row offset
      uInt o = r[2 * threadIdx.y + 1] * output_stride; // output row offset
      for (uInt plane = threadIdx.x; plane < nPlanes; plane += NTX)
        // atomicAdd: several rules may target the same output row.
        atomicAdd(&output_features[o + plane],
                  alpha * input_features[i + plane]);
    }
    __syncthreads();
  }
}
// Host-side launcher for the average-pooling forward kernel on `stream`.
// Fixed configuration: 32 blocks of 32x32 threads; the kernel grid-strides
// over the nHot rules. Each input contribution is weighted by 1/filterVolume.
template <typename T>
void AveragePooling_ForwardPass(cudaStream_t stream, T *input_features,
                                T *output_features, uInt nPlanes,
                                uInt input_stride, uInt output_stride,
                                uInt *rules, uInt nHot, uInt filterVolume) {
  T scale = 1.0 / filterVolume; // averaging weight per pooled element
  dim3 block(32, 32);
  AveragePooling_fp<T, 32, 32><<<32, block, 0, stream>>>(
      input_features, output_features, nPlanes, input_stride, output_stride,
      rules, nHot, scale);
}
// Average-pooling backward kernel. Mirror of AveragePooling_fp: for each
// (input row, output row) rule, adds alpha * d_output into d_input. Plain +=
// is safe here (no atomics): each input row appears in at most one rule per
// call, per the caller's rule-book layout — NOTE(review): assumed from the
// forward/backward asymmetry (fp uses atomicAdd, bp does not); confirm
// against the rule-book generator.
template <typename T, uInt NTX, uInt NTY>
__global__ void AveragePooling_bp(T *d_input_features, T *d_output_features,
                                  uInt nPlanes, uInt input_stride,
                                  uInt output_stride, uInt *rules, uInt nHot,
                                  T alpha) {
  __shared__ uInt r[NTY * 2];
  for (uInt n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
    {
      // Stage up to NTY (in,out) rule pairs into shared memory.
      uInt i = threadIdx.x + NTX * threadIdx.y;
      // Bug fix: the guard read "i < 2 * (n - nHot)". Since n < nHot inside
      // this loop and uInt is unsigned, n - nHot wrapped to a huge value, so
      // the guard was always true and the last partial tile read past the
      // end of rules[]. The correct bound on valid pairs is 2 * (nHot - n).
      if (i < NTY * 2 and i < 2 * (nHot - n))
        r[i] = rules[2 * n + i];
    }
    __syncthreads();
    if (n + threadIdx.y < nHot) {
      uInt i = r[2 * threadIdx.y] * input_stride;      // input row offset
      uInt o = r[2 * threadIdx.y + 1] * output_stride; // output row offset
      for (uInt plane = threadIdx.x; plane < nPlanes; plane += NTX)
        d_input_features[i + plane] += alpha * d_output_features[o + plane];
    }
    __syncthreads();
  }
}
// Host-side launcher for the average-pooling backward kernel on `stream`.
// Same fixed 32 x (32x32) configuration as the forward launcher; gradients
// are scaled by 1/filterVolume to match the forward averaging.
template <typename T>
void AveragePooling_BackwardPass(cudaStream_t stream, T *d_input_features,
                                 T *d_output_features, uInt nPlanes,
                                 uInt input_stride, uInt output_stride,
                                 uInt *rules, uInt nHot, uInt filterVolume) {
  T scale = 1.0 / filterVolume; // averaging weight per pooled element
  dim3 block(32, 32);
  AveragePooling_bp<T, 32, 32><<<32, block, 0, stream>>>(
      d_input_features, d_output_features, nPlanes, input_stride,
      output_stride, rules, nHot, scale);
}
#endif
/* GPU_AVERAGEPOOLING_H */
PyTorch/sparseconvnet/SCN/generic/GPU/BatchNormalization.cu
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/GPU/BatchNormalization.cu"
#else
#include "BatchNormalization.h"
// Dispatch helper for the batch-norm forward pass: if nPlanes is divisible
// by N, launch BatchNormalization_ForwardPass with NTX=N (NTY=64). Expands
// to a bare `if`, so call sites chain the instantiations with `else`.
// Relies on the caller's locals: input_features, output_features, nPlanes,
// input_stride, output_stride, nActive, saveMean, saveInvStd, runningMean,
// runningVar, weight, bias, eps, momentum, train, leakiness.
#define BN_F_MACRO(N) \
  if (nPlanes % N == 0) { \
    BatchNormalization_ForwardPass<real, N, 64>( \
        THCTensor_(data)(state, input_features), \
        THCTensor_(data)(state, output_features), nPlanes, input_stride, \
        output_stride, nActive, THCTensor_(data)(state, saveMean), \
        THCTensor_(data)(state, saveInvStd), \
        THCTensor_(data)(state, runningMean), \
        THCTensor_(data)(state, runningVar), \
        weight ? THCTensor_(data)(state, weight) : 0, \
        bias ? THCTensor_(data)(state, bias) : 0, eps, momentum, train, \
        leakiness); \
  }
// Torch-facing entry point: batch normalization forward pass (out-of-place).
// Resizes output_features to match input_features, then dispatches on the
// largest NTX in {16,12,8,4,1} that divides nPlanes via the BN_F_MACRO
// if/else chain (NTX=1 always matches, so exactly one branch runs).
extern "C" void scn_R_(BatchNormalization_updateOutput)(
    THCTensor *input_features, THCTensor *output_features, THCTensor *saveMean,
    THCTensor *saveInvStd, THCTensor *runningMean, THCTensor *runningVar,
    THCTensor *weight, THCTensor *bias, real eps, real momentum, bool train,
    real leakiness) {
  THCTensor_(resizeAs)(state, output_features, input_features);
  auto nActive = input_features->size[0];      // rows (active sites)
  auto nPlanes = input_features->size[1];      // feature planes
  auto input_stride = input_features->stride[0];
  auto output_stride = output_features->stride[0];
  BN_F_MACRO(16)
  else BN_F_MACRO(12)
  else BN_F_MACRO(8)
  else BN_F_MACRO(4)
  else BN_F_MACRO(1)
}
// Torch-facing entry point: batch normalization forward pass, "in tensor"
// variant. Identical to BatchNormalization_updateOutput except it does NOT
// resize output_features — the caller supplies a pre-shaped destination
// (output_stride may exceed nPlanes to write a sub-block of a wider tensor).
extern "C" void scn_R_(BatchNormalizationInTensor_updateOutput)(
    THCTensor *input_features, THCTensor *output_features, THCTensor *saveMean,
    THCTensor *saveInvStd, THCTensor *runningMean, THCTensor *runningVar,
    THCTensor *weight, THCTensor *bias, real eps, real momentum, bool train,
    real leakiness) {
  auto nActive = input_features->size[0];      // rows (active sites)
  auto nPlanes = input_features->size[1];      // feature planes
  auto input_stride = input_features->stride[0];
  auto output_stride = output_features->stride[0];
  BN_F_MACRO(16)
  else BN_F_MACRO(12)
  else BN_F_MACRO(8)
  else BN_F_MACRO(4)
  else BN_F_MACRO(1)
}
#undef BN_F_MACRO
// Dispatch helper for the batch-norm backward pass: if nPlanes is divisible
// by N, launch BatchNormalization_BackwardPass with NTX=N (NTY=64). Expands
// to a bare `if`, so call sites chain the instantiations with `else`.
// Relies on the caller's locals (features/gradients, strides, nActive,
// saved statistics, optional weight/bias and their gradients, leakiness).
#define BN_B_MACRO(N) \
  if (nPlanes % N == 0) { \
    BatchNormalization_BackwardPass<real, N, 64>( \
        THCTensor_(data)(state, input_features), \
        THCTensor_(data)(state, d_input_features), \
        THCTensor_(data)(state, output_features), \
        THCTensor_(data)(state, d_output_features), nPlanes, input_stride, \
        output_stride, nActive, THCTensor_(data)(state, saveMean), \
        THCTensor_(data)(state, saveInvStd), \
        THCTensor_(data)(state, runningMean), \
        THCTensor_(data)(state, runningVar), \
        weight ? THCTensor_(data)(state, weight) : 0, \
        bias ? THCTensor_(data)(state, bias) : 0, \
        d_weight ? THCTensor_(data)(state, d_weight) : 0, \
        d_bias ? THCTensor_(data)(state, d_bias) : 0, leakiness); \
  }
// Torch-facing entry point: batch normalization backward pass.
// Resizes d_input_features to match d_output_features, then dispatches on
// the largest NTX in {16,12,8,4,1} that divides nPlanes via BN_B_MACRO.
// Note: the underlying kernel rewrites d_output_features in place (applies
// the leaky-ReLU mask) — see BatchNormalization_b.
extern "C" void scn_R_(BatchNormalization_backward)(
    THCTensor *input_features, THCTensor *d_input_features,
    THCTensor *output_features, THCTensor *d_output_features,
    THCTensor *saveMean, THCTensor *saveInvStd, THCTensor *runningMean,
    THCTensor *runningVar, THCTensor *weight, THCTensor *bias,
    THCTensor *d_weight, THCTensor *d_bias, real leakiness) {
  THCTensor_(resizeAs)(state, d_input_features, d_output_features);
  auto nActive = input_features->size[0];      // rows (active sites)
  auto nPlanes = input_features->size[1];      // feature planes
  auto input_stride = input_features->stride[0];
  auto output_stride = output_features->stride[0];
  BN_B_MACRO(16)
  else BN_B_MACRO(12)
  else BN_B_MACRO(8)
  else BN_B_MACRO(4)
  else BN_B_MACRO(1)
}
#endif
PyTorch/sparseconvnet/SCN/generic/GPU/BatchNormalization.h
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef GPU_BATCHNORMALIZATION_H
#define GPU_BATCHNORMALIZATION_H
#include "../SparseConvNet.h"
#include <cassert>
// input_stride and output_stride are normally the same as nPlanes; allow larger
// values to act on a subset of columns, i.e. an inplace DenseNet blocks
// NTX ~ 16 - nPlanes must be a multiple of this
// NTY ~ 64 - at least 4
// Batch-norm forward kernel, training mode, fused with leaky ReLU.
// Each block handles NTX feature planes at a time (threadIdx.x = plane
// within the block's slice); threadIdx.y strides over the nActive rows.
// Per plane it: (1) reduces sum and sum-of-squares over all rows via shared
// memory, (2) updates saveMean/saveInvStd and the running statistics,
// (3) normalizes, applies the optional affine weight/bias, and the leaky
// ReLU (slope `leakiness` for negative outputs).
template <typename T, uInt NTX, uInt NTY>
__global__ void BatchNormalization_f_train(
    T *input_features, T *output_features, uInt nPlanes, uInt input_stride,
    uInt output_stride, uInt nActive, T *saveMean, T *saveInvStd,
    T *runningMean, T *runningVar, T *weight, T *bias, T eps, T momentum,
    T leakiness) {
  __shared__ T t[NTY][NTX];  // per-row partial sums (later reused for W/B)
  __shared__ T t2[NTY][NTX]; // per-row partial sums of squares
  for (uInt plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
       plane += gridDim.x * NTX) {
    // (1) Accumulate per-thread partial sums over this plane's column.
    t[threadIdx.y][threadIdx.x] = 0;
    t2[threadIdx.y][threadIdx.x] = 0;
    for (uInt row = threadIdx.y, c = plane + threadIdx.y * input_stride;
         row < nActive; row += NTY, c += input_stride * NTY) {
      T i = input_features[c];
      t[threadIdx.y][threadIdx.x] += i;
      t2[threadIdx.y][threadIdx.x] += i * i;
    }
    __syncthreads();
    // Every thread redundantly reduces the NTY partials for its plane.
    T _saveMean = 0;
    T _saveInvStd = 0;
    for (uInt row = 0; row < NTY; row++) {
      _saveMean += t[row][threadIdx.x];
      _saveInvStd += t2[row][threadIdx.x];
    }
    _saveMean /= nActive;
    // Here _saveInvStd temporarily holds the sum of squared deviations.
    _saveInvStd = _saveInvStd - _saveMean * _saveMean * nActive;
    // (2) One row of threads commits statistics to global memory.
    if (threadIdx.y == 0) {
      saveMean[plane] = _saveMean;
      runningMean[plane] =
          momentum * runningMean[plane] + (1 - momentum) * _saveMean;
      // Running variance uses the unbiased (n-1) estimator.
      runningVar[plane] = momentum * runningVar[plane] +
                          (1 - momentum) * _saveInvStd / (nActive - 1);
    }
    // Convert to inverse standard deviation: (var + eps)^(-1/2).
    _saveInvStd = pow(_saveInvStd / nActive + eps, -0.5);
    if (threadIdx.y == 0)
      saveInvStd[plane] = _saveInvStd;
    __syncthreads();
    // (3) Fold normalization + affine into a single scale (W) and shift (B),
    // broadcast through rows 0 and 1 of the shared buffer.
    if (threadIdx.y == 0) {
      t[0][threadIdx.x] = _saveInvStd * (weight ? weight[plane] : 1);
      t[1][threadIdx.x] =
          -_saveMean * t[0][threadIdx.x] + (bias ? bias[plane] : 0);
    }
    __syncthreads();
    T W = t[0][threadIdx.x];
    T B = t[1][threadIdx.x];
    for (uInt row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
              co = plane + threadIdx.y * output_stride;
         row < nActive;
         row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
      T out = W * input_features[ci] + B;
      // Leaky ReLU.
      output_features[co] = (out > 0) ? out : (out * leakiness);
    }
    __syncthreads();
  }
}
// Batch-norm forward kernel, inference mode, fused with leaky ReLU.
// Uses the stored running statistics instead of batch statistics: per plane
// it folds normalization + optional affine into one scale (W) and shift (B),
// then applies `out = W*in + B` with leaky ReLU (slope `leakiness` when
// negative). threadIdx.x indexes planes, threadIdx.y strides over rows.
template <typename T, uInt NTX, uInt NTY>
__global__ void BatchNormalization_f_test(
    T *input_features, T *output_features, uInt nPlanes, uInt input_stride,
    uInt output_stride, uInt nActive, T *saveMean, T *saveInvStd,
    T *runningMean, T *runningVar, T *weight, T *bias, T eps, T momentum,
    T leakiness) {
  __shared__ T W[NTX]; // folded scale: (runningVar+eps)^(-1/2) * weight
  __shared__ T B[NTX]; // folded shift: bias - runningMean * W
  for (uInt plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
       plane += gridDim.x * NTX) {
    if (threadIdx.y == 0) {
      W[threadIdx.x] =
          pow(runningVar[plane] + eps, -0.5) * (weight ? weight[plane] : 1);
      B[threadIdx.x] =
          (bias ? bias[plane] : 0) - runningMean[plane] * W[threadIdx.x];
    }
    __syncthreads();
    // Fix: these locals were declared `float`, silently truncating the
    // folded scale/shift when T is double; use T to keep full precision.
    T w = W[threadIdx.x], b = B[threadIdx.x];
    for (uInt row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
              co = plane + threadIdx.y * output_stride;
         row < nActive;
         row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
      T out = w * input_features[ci] + b;
      // Leaky ReLU.
      output_features[co] = (out > 0) ? out : (out * leakiness);
    }
    __syncthreads();
  }
}
template
<
typename
T
,
uInt
NTX
,
uInt
NTY
>
void
BatchNormalization_ForwardPass
(
T
*
input_features
,
T
*
output_features
,
uInt
nPlanes
,
uInt
input_stride
,
uInt
output_stride
,
uInt
nActive
,
T
*
saveMean
,
T
*
saveInvStd
,
T
*
runningMean
,
T
*
runningVar
,
T
*
weight
,
T
*
bias
,
T
eps
,
T
momentum
,
bool
train
,
T
leakiness
)
{
if
(
train
)
{
BatchNormalization_f_train
<
T
,
NTX
,
NTY
><<<
std
::
min
((
uInt
)
16
,
nPlanes
/
NTX
),
dim3
(
NTX
,
NTY
),
0
,
THCState_getCurrentStream
(
state
)
>>>
(
input_features
,
output_features
,
nPlanes
,
input_stride
,
output_stride
,
nActive
,
saveMean
,
saveInvStd
,
runningMean
,
runningVar
,
weight
,
bias
,
eps
,
momentum
,
leakiness
);
}
else
{
BatchNormalization_f_test
<
T
,
NTX
,
NTY
><<<
std
::
min
((
uInt
)
16
,
nPlanes
/
NTX
),
dim3
(
NTX
,
NTY
),
0
,
THCState_getCurrentStream
(
state
)
>>>
(
input_features
,
output_features
,
nPlanes
,
input_stride
,
output_stride
,
nActive
,
saveMean
,
saveInvStd
,
runningMean
,
runningVar
,
weight
,
bias
,
eps
,
momentum
,
leakiness
);
}
}
// Batch-norm backward kernel (training statistics), fused with the leaky
// ReLU backward. threadIdx.x indexes planes, threadIdx.y strides over rows.
// Per plane: (1) applies the leaky-ReLU mask to d_output_features IN PLACE,
// while reducing the gradient sum and the (input - mean) . grad dot product,
// (2) emits d_weight/d_bias, (3) writes the standard batch-norm input
// gradient. Requires NTY >= 3: rows 0..2 of `t` broadcast the saved
// statistics (matches the file's "NTY ... at least 4" note above).
template <typename T, uInt NTX, uInt NTY>
__global__ void BatchNormalization_b(
    T *input_features, T *d_input_features, T *output_features,
    T *d_output_features, uInt nPlanes, uInt input_stride, uInt output_stride,
    uInt nActive, T *saveMean, T *saveInvStd, T *runningMean, T *runningVar,
    T *weight, T *bias, T *d_weight, T *d_bias, T leakiness) {
  __shared__ T t[NTY][NTX];  // broadcast buffer, then per-row gradient sums
  __shared__ T t2[NTY][NTX]; // per-row partial dot products
  for (uInt plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
       plane += gridDim.x * NTX) {
    // Broadcast the saved statistics through shared memory rows 0..2.
    if (threadIdx.y == 0) {
      t[0][threadIdx.x] = saveMean[plane];
      t[1][threadIdx.x] = saveInvStd[plane];
      t[2][threadIdx.x] = (weight ? weight[plane] : 1);
    }
    __syncthreads();
    T _saveMean = t[0][threadIdx.x];
    T _saveInvStd = t[1][threadIdx.x];
    T _weight = t[2][threadIdx.x];
    __syncthreads();
    // (1) Leaky-ReLU backward + partial reductions.
    t[threadIdx.y][threadIdx.x] = 0;
    t2[threadIdx.y][threadIdx.x] = 0;
    for (uInt row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
              co = plane + threadIdx.y * output_stride;
         row < nActive;
         row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
      T d = d_output_features[co];
      // Leaky-ReLU mask, keyed on the sign of the forward output.
      d = (output_features[co] > 0) ? d : (d * leakiness);
      d_output_features[co] = d; // in-place: reused in the final loop below
      t[threadIdx.y][threadIdx.x] += d;
      t2[threadIdx.y][threadIdx.x] += (input_features[ci] - _saveMean) * d;
    }
    __syncthreads();
    // Every thread redundantly reduces the NTY partials for its plane.
    T gradMean = 0;
    T dotp = 0;
    for (int row = 0; row < NTY; row++) {
      gradMean += t[row][threadIdx.x];
      dotp += t2[row][threadIdx.x];
    }
    __syncthreads();
    // (2) Parameter gradients.
    if (d_weight)
      d_weight[plane] = dotp * _saveInvStd;
    if (d_bias)
      d_bias[plane] = gradMean;
    // sum really
    gradMean /= nActive;
    T k = dotp * _saveInvStd * _saveInvStd / nActive;
    // (3) Input gradient: (d - mean(d) - (x - mu) * k) * invStd * weight.
    for (uInt row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
              co = plane + threadIdx.y * output_stride;
         row < nActive;
         row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
      d_input_features[ci] =
          (d_output_features[co] - gradMean -
           (input_features[ci] - _saveMean) * k) *
          _saveInvStd * _weight;
    }
    __syncthreads();
  }
}
template
<
typename
T
,
uInt
NTX
,
uInt
NTY
>
void
BatchNormalization_BackwardPass
(
T
*
input_features
,
T
*
d_input_features
,
T
*
output_features
,
T
*
d_output_features
,
uInt
nPlanes
,
uInt
input_stride
,
uInt
output_stride
,
uInt
nActive
,
T
*
saveMean
,
T
*
saveInvStd
,
T
*
runningMean
,
T
*
runningVar
,
T
*
weight
,
T
*
bias
,
T
*
d_weight
,
T
*
d_bias
,
T
leakiness
)
{
BatchNormalization_b
<
T
,
NTX
,
NTY
><<<
std
::
min
((
uInt
)
16
,
nPlanes
/
NTX
),
dim3
(
NTX
,
NTY
),
0
,
THCState_getCurrentStream
(
state
)
>>>
(
input_features
,
d_input_features
,
output_features
,
d_output_features
,
nPlanes
,
input_stride
,
output_stride
,
nActive
,
saveMean
,
saveInvStd
,
runningMean
,
runningVar
,
weight
,
bias
,
d_weight
,
d_bias
,
leakiness
);
}
#undef NTX
#undef NTY
#endif
/* GPU_BATCHNORMALIZATION_H */
PyTorch/sparseconvnet/SCN/generic/GPU/BatchwiseMultiplicativeDropout.cu
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/GPU/BatchwiseMultiplicativeDropout.cu"
#else
#include "BatchwiseMultiplicativeDropout.h"
// Dispatch helper for the forward pass: if nPlanes is divisible by NTX,
// launch the fp kernel with an NTX x NTY thread block and return from the
// enclosing function.  The caller invokes this macro with decreasing NTX
// values, so the first divisible shape wins (NTX == 1 always matches).
// Relies on caller locals named input_features, output_features, noise,
// nActive, nPlanes and alpha.  nPlanes is passed for both the input and
// output stride, i.e. the tensors are assumed contiguous -- TODO confirm.
#define SPARSECONVNET_FOO(NTX, NTY)                                            \
  {                                                                            \
    if (nPlanes % NTX == 0) {                                                  \
      BatchwiseMultiplicativeDropout_fp<real, NTX, NTY> << <                   \
          dim3(std::min(16L, nPlanes / NTX), 16), dim3(NTX, NTY), 0,           \
          THCState_getCurrentStream(state)>>>                                  \
          (THCTensor_(data)(state, input_features),                            \
           THCTensor_(data)(state, output_features),                           \
           THCTensor_(data)(state, noise), nActive, nPlanes, nPlanes, nPlanes, \
           alpha);                                                             \
      return;                                                                  \
    }                                                                          \
  }
// Forward pass of batchwise multiplicative dropout: every feature plane is
// scaled by its per-plane noise value (negatives additionally by alpha, via
// the kernel).  Works in place when input_features == output_features;
// otherwise the output is resized to match the input.
extern "C" void
scn_R_(BatchwiseMultiplicativeDropout_updateOutput)(THCTensor *input_features,
                                                    THCTensor *output_features,
                                                    THCTensor *noise,
                                                    float alpha) {
  if (input_features != output_features)
    THCTensor_(resizeAs)(state, output_features, input_features);
  // Features are (nActive rows) x (nPlanes columns).
  auto nActive = input_features->size[0];
  auto nPlanes = input_features->size[1];
  // Try block shapes in decreasing width; each macro returns on the first
  // NTX that divides nPlanes.  The (1, 64) case always matches, so exactly
  // one kernel launch happens.
  SPARSECONVNET_FOO(32, 32)
  SPARSECONVNET_FOO(24, 32)
  SPARSECONVNET_FOO(16, 64)
  SPARSECONVNET_FOO(12, 64)
  SPARSECONVNET_FOO(8, 64)
  SPARSECONVNET_FOO(4, 64)
  SPARSECONVNET_FOO(1, 64)
}
#undef SPARSECONVNET_FOO
// Dispatch helper for the backward pass: same block-shape selection as the
// forward macro, but launches the bp kernel with the gradient tensors.
// Relies on caller locals named input_features, d_input_features,
// d_output_features, noise, nActive, nPlanes and alpha; nPlanes doubles as
// both strides (contiguous tensors assumed -- TODO confirm).
#define SPARSECONVNET_FOO(NTX, NTY)                                            \
  {                                                                            \
    if (nPlanes % NTX == 0) {                                                  \
      BatchwiseMultiplicativeDropout_bp<real, NTX, NTY> << <                   \
          dim3(std::min(16L, nPlanes / NTX), 16), dim3(NTX, NTY), 0,           \
          THCState_getCurrentStream(state)>>>                                  \
          (THCTensor_(data)(state, input_features),                            \
           THCTensor_(data)(state, d_input_features),                          \
           THCTensor_(data)(state, d_output_features),                         \
           THCTensor_(data)(state, noise), nActive, nPlanes, nPlanes, nPlanes, \
           alpha);                                                             \
      return;                                                                  \
    }                                                                          \
  }
// Backward pass of batchwise multiplicative dropout: d_input = d_output
// scaled by the same per-plane noise (and alpha on negative inputs, via the
// kernel).  Works in place when d_input_features == d_output_features.
// NOTE(review): sizes are read from input_features while d_input is resized
// from d_output -- this assumes all three tensors share the same shape;
// confirm against the callers.
extern "C" void scn_R_(BatchwiseMultiplicativeDropout_updateGradInput)(
    THCTensor *input_features, THCTensor *d_input_features,
    THCTensor *d_output_features, THCTensor *noise, float alpha) {
  if (d_input_features != d_output_features)
    THCTensor_(resizeAs)(state, d_input_features, d_output_features);
  // Features are (nActive rows) x (nPlanes columns).
  auto nActive = input_features->size[0];
  auto nPlanes = input_features->size[1];
  // First NTX dividing nPlanes wins; the (1, 64) case always matches.
  SPARSECONVNET_FOO(32, 32)
  SPARSECONVNET_FOO(24, 32)
  SPARSECONVNET_FOO(16, 64)
  SPARSECONVNET_FOO(12, 64)
  SPARSECONVNET_FOO(8, 64)
  SPARSECONVNET_FOO(4, 64)
  SPARSECONVNET_FOO(1, 64)
}
#undef SPARSECONVNET_FOO
#endif
PyTorch/sparseconvnet/SCN/generic/GPU/BatchwiseMultiplicativeDropout.h
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef GPU_BATCHWISEMULTIPLICATIVEDROPOUT_H
#define GPU_BATCHWISEMULTIPLICATIVEDROPOUT_H
// Forward kernel: multiplies every active row of each feature plane by that
// plane's noise value; inputs <= 0 are additionally scaled by alpha (a
// leaky-style gate fused into the dropout multiply).
// Thread layout: threadIdx.x selects the plane within an NTX-wide column
// group, threadIdx.y/blockIdx.y walk the active rows.
template <typename T, uInt NTX, uInt NTY>
__global__ void BatchwiseMultiplicativeDropout_fp(T *input_features,
                                                  T *output_features, T *noise,
                                                  uInt nActive, uInt nPlanes,
                                                  uInt input_stride,
                                                  uInt output_stride, T alpha) {
  // Per-plane noise values staged in shared memory, one per thread column.
  __shared__ T noiseCache[NTX];
  uInt plane = threadIdx.x + blockIdx.x * NTX;
  while (plane < nPlanes) {
    // A single thread per column loads this plane's noise value.
    if (threadIdx.y == 0)
      noiseCache[threadIdx.x] = noise[plane];
    __syncthreads();
    uInt row = threadIdx.y + blockIdx.y * NTY;
    while (row < nActive) {
      uInt inIdx = row * input_stride + plane;
      uInt outIdx = row * output_stride + plane;
      T v = input_features[inIdx];
      T gate = (v > 0) ? 1 : alpha;
      output_features[outIdx] = v * noiseCache[threadIdx.x] * gate;
      row += gridDim.y * NTY;
    }
    // Keep the block in lockstep before the cache entry is overwritten.
    __syncthreads();
    plane += gridDim.x * NTX;
  }
}
// Backward kernel: d_input = d_output * noise[plane], additionally scaled by
// alpha wherever the forward input was <= 0 (mirrors the forward gate).
// Thread layout matches the forward kernel: threadIdx.x picks the plane
// within an NTX-wide column group, threadIdx.y/blockIdx.y walk active rows.
template <typename T, uInt NTX, uInt NTY>
__global__ void BatchwiseMultiplicativeDropout_bp(
    T *input_features, T *d_input_features, T *d_output_features, T *noise,
    uInt nActive, uInt nPlanes, uInt input_stride, uInt output_stride,
    T alpha) {
  // Per-plane noise values staged in shared memory, one per thread column.
  __shared__ T noiseCache[NTX];
  uInt plane = threadIdx.x + blockIdx.x * NTX;
  while (plane < nPlanes) {
    // A single thread per column loads this plane's noise value.
    if (threadIdx.y == 0)
      noiseCache[threadIdx.x] = noise[plane];
    __syncthreads();
    uInt row = threadIdx.y + blockIdx.y * NTY;
    while (row < nActive) {
      uInt inIdx = row * input_stride + plane;
      uInt outIdx = row * output_stride + plane;
      T gate = (input_features[inIdx] > 0) ? 1 : alpha;
      d_input_features[inIdx] =
          d_output_features[outIdx] * noiseCache[threadIdx.x] * gate;
      row += gridDim.y * NTY;
    }
    // Keep the block in lockstep before the cache entry is overwritten.
    __syncthreads();
    plane += gridDim.x * NTX;
  }
}
#endif
/* GPU_BATCHWISEMULTIPLICATIVEDROPOUT_H */
PyTorch/sparseconvnet/SCN/generic/GPU/Convolution.cu
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE_
#define TH_GENERIC_FILE_ "generic/GPU/Convolution.cu"
#else
#include "Convolution.h"
#include "RuleBookIterator.h"
#include <algorithm>
#include <cstring>
// Forward pass of a strided sparse convolution.  Builds (or fetches) the rule
// book for this geometry, resizes the output to (nActive, outPlanes),
// optionally broadcasts the bias, then runs one dense multiply per filter
// offset via RULEBOOKITERATOR.  Returns the multiply-add count (flops).
// NOTE(review): filterVolume and rulesBuffer are accepted but never used in
// this body -- confirm they exist only for interface uniformity.
extern "C" double scn_DR_(Convolution_updateOutput)(
    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *filterSize,
    THLongTensor *filterStride, void **m, THCTensor *input_features,
    THCTensor *output_features, THCTensor *weight, THCTensor *bias,
    long filterVolume, THCITensor *rulesBuffer) {
  // Presumably binds _m to the Metadata<Dimension> carried through the
  // opaque void** handle -- confirm against the macro definition.
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto _rules =
      _m.getRuleBook(inputSize, outputSize, filterSize, filterStride, true);
  uInt nActive = _m.getNActive(outputSize);
  THCTensor_(resize2d)(state, output_features, nActive, weight->size[1]);
  // Without a bias the accumulation below needs a zeroed output; with a bias
  // the Convolution_fp_bias launches initialize every output row instead.
  if (not bias)
    THCTensor_(zero)(state, output_features);
  auto iF = THCTensor_(data)(state, input_features);
  auto oF = THCTensor_(data)(state, output_features);
  auto ip = input_features->size[1];  // input planes
  auto op = output_features->size[1]; // output planes
  auto w = THCTensor_(data)(state, weight);
  double flops = 0;
  if (bias) {
    auto b = THCTensor_(data)(state, bias);
    // Write the bias into the output, 32 output planes per kernel launch.
    for (uInt i = 0; i < op; i += 32) {
      uInt blockDim = min(32L, op - i);
      uInt gridDim = min(4096, nActive);
      Convolution_fp_bias<<<gridDim, blockDim, 0,
                            THCState_getCurrentStream(state)>>>(
          oF + i, b + i, op, op, nActive);
    }
  }
  uInt c = ip * op; // weights per filter offset
  // For each filter offset (rbB/nHotB supplied by the macro): multiply the
  // gathered rows by that offset's ip x op weight slice; after each offset
  // advance the weight pointer and count nHotB * ip * op flops.
  RULEBOOKITERATOR(
      dConvolution_forward2<real>(iF, oF, w, rbB, nHotB, ip, ip, op, op,
                                  THCState_getCurrentStream(state));
      , w += c; flops += nHotB * c;)
  return flops;
}
// Backward pass of a strided sparse convolution: computes d_input, d_weight
// and (optionally) d_bias from d_output.
// NOTE(review): filterVolume and rulesBuffer are unused here as well.
extern "C" void scn_DR_(Convolution_backward)(
    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *filterSize,
    THLongTensor *filterStride, void **m, THCTensor *input_features,
    THCTensor *d_input_features, THCTensor *d_output_features,
    THCTensor *weight, THCTensor *d_weight, THCTensor *d_bias,
    long filterVolume, THCITensor *rulesBuffer) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  // Same rule-book lookup as the forward pass (identical geometry).
  auto _rules =
      _m.getRuleBook(inputSize, outputSize, filterSize, filterStride, true);
  uInt nActive = _m.getNActive(outputSize);
  // Input gradients are accumulated, so d_input starts from zero.
  THCTensor_(resizeAs)(state, d_input_features, input_features);
  THCTensor_(zero)(state, d_input_features);
  auto iF = THCTensor_(data)(state, input_features);
  auto diF = THCTensor_(data)(state, d_input_features);
  auto doF = THCTensor_(data)(state, d_output_features);
  auto ip = input_features->size[1];    // input planes
  auto op = d_output_features->size[1]; // output planes
  auto w = THCTensor_(data)(state, weight);
  auto dw = THCTensor_(data)(state, d_weight);
  uInt c = ip * op; // weights per filter offset
  // One backward GEMM per filter offset; advance both weight pointers after
  // each.  NOTE(review): d_weight is not zeroed here -- presumably the
  // caller supplies a zeroed tensor; confirm.
  RULEBOOKITERATOR(
      dConvolution_backward_dW2<real>(iF, diF, doF, w, dw, rbB, nHotB, ip, ip,
                                      op, op, THCState_getCurrentStream(state));
      , w += c; dw += c;)
  if (d_bias) {
    auto db = THCTensor_(data)(state, d_bias);
    // Presumably reduces d_output over the active rows into d_bias; confirm
    // against Convolution_bp_bias.
    Convolution_bp_bias(doF, db, op, op, nActive,
                        THCState_getCurrentStream(state));
  }
}
// Forward pass of a "valid" sparse convolution (output active set equals the
// input active set, so nActive is simply the number of input rows).
// Otherwise identical in structure to Convolution_updateOutput; returns the
// multiply-add count (flops).
// NOTE(review): filterVolume and rulesBuffer are accepted but never used.
extern "C" double scn_DR_(ValidConvolution_updateOutput)(
    THLongTensor *inputSize, THLongTensor *filterSize, void **m,
    THCTensor *input_features, THCTensor *output_features, THCTensor *weight,
    THCTensor *bias, long filterVolume, THCITensor *rulesBuffer) {
  // Presumably binds _m to the Metadata<Dimension> behind the void** handle.
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto _rules = _m.getValidRuleBook(inputSize, filterSize, true);
  uInt nActive = input_features->size[0];
  THCTensor_(resize2d)(state, output_features, nActive, weight->size[1]);
  // Without a bias the accumulation below needs a zeroed output; with a bias
  // the Convolution_fp_bias launches initialize every output row instead.
  if (not bias)
    THCTensor_(zero)(state, output_features);
  auto iF = THCTensor_(data)(state, input_features);
  auto oF = THCTensor_(data)(state, output_features);
  auto ip = input_features->size[1];  // input planes
  auto op = output_features->size[1]; // output planes
  auto w = THCTensor_(data)(state, weight);
  double flops = 0;
  if (bias) {
    auto b = THCTensor_(data)(state, bias);
    // Write the bias into the output, 32 output planes per kernel launch.
    for (uInt i = 0; i < op; i += 32) {
      uInt blockDim = min(32L, op - i);
      uInt gridDim = min(4096, nActive);
      Convolution_fp_bias<<<gridDim, blockDim, 0,
                            THCState_getCurrentStream(state)>>>(
          oF + i, b + i, op, op, nActive);
    }
  }
  uInt c = ip * op; // weights per filter offset
  // One forward GEMM per filter offset; advance the weight pointer and add
  // nHotB * ip * op flops after each.
  RULEBOOKITERATOR(
      dConvolution_forward2<real>(iF, oF, w, rbB, nHotB, ip, ip, op, op,
                                  THCState_getCurrentStream(state));
      , w += c; flops += nHotB * c;)
  return flops;
}
// Backward pass of a "valid" sparse convolution: computes d_input, d_weight
// and (optionally) d_bias.  Mirrors Convolution_backward but uses the valid
// rule book and takes nActive from the input rows.
// NOTE(review): filterVolume and rulesBuffer are unused here as well.
extern "C" void scn_DR_(ValidConvolution_backward)(
    THLongTensor *inputSize, THLongTensor *filterSize, void **m,
    THCTensor *input_features, THCTensor *d_input_features,
    THCTensor *d_output_features, THCTensor *weight, THCTensor *d_weight,
    THCTensor *d_bias, long filterVolume, THCITensor *rulesBuffer) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto _rules = _m.getValidRuleBook(inputSize, filterSize, true);
  uInt nActive = input_features->size[0];
  // Input gradients are accumulated, so d_input starts from zero.
  THCTensor_(resizeAs)(state, d_input_features, input_features);
  THCTensor_(zero)(state, d_input_features);
  auto iF = THCTensor_(data)(state, input_features);
  auto diF = THCTensor_(data)(state, d_input_features);
  auto doF = THCTensor_(data)(state, d_output_features);
  auto ip = input_features->size[1];    // input planes
  auto op = d_output_features->size[1]; // output planes
  auto w = THCTensor_(data)(state, weight);
  auto dw = THCTensor_(data)(state, d_weight);
  uInt c = ip * op; // weights per filter offset
  // One backward GEMM per filter offset; advance both weight pointers after
  // each.  NOTE(review): d_weight is not zeroed here -- presumably the
  // caller supplies a zeroed tensor; confirm.
  RULEBOOKITERATOR(
      dConvolution_backward_dW2<real>(iF, diF, doF, w, dw, rbB, nHotB, ip, ip,
                                      op, op, THCState_getCurrentStream(state));
      , w += c; dw += c;)
  if (d_bias) {
    auto db = THCTensor_(data)(state, d_bias);
    // Presumably reduces d_output over the active rows into d_bias; confirm
    // against Convolution_bp_bias.
    Convolution_bp_bias(doF, db, op, op, nActive,
                        THCState_getCurrentStream(state));
  }
}
#endif
PyTorch/sparseconvnet/SCN/generic/GPU/Convolution.h
0 → 100644
View file @
f9552033
This diff is collapsed.
Click to expand it.
Prev
1
2
3
4
5
6
…
9
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment