OpenDAS / SparseConvNet · Commits
"lib/vscode:/vscode.git/clone" did not exist on "6ce428a5386a5bf5c775eb9e6124bcfd132c2b64"
Commit de3743f6, authored Jul 13, 2018 by Benjamin Thomas Graham

Factor out CUDA code

Parent: f0407b36

Changes (96): showing 20 changed files with 1157 additions and 1098 deletions.
sparseconvnet/SCN/CPU/MaxPooling.cpp                       +25   -1
sparseconvnet/SCN/CPU/MaxPooling.h                          +0   -36
sparseconvnet/SCN/CPU/SparseToDense.cpp                    +24   -1
sparseconvnet/SCN/CPU/SparseToDense.h                       +0   -35
sparseconvnet/SCN/CPU/UnPooling.cpp                        +22   -1
sparseconvnet/SCN/CPU/UnPooling.h                           +0   -33
sparseconvnet/SCN/CUDA/ActivePooling.cpp                   +54   -0
sparseconvnet/SCN/CUDA/ActivePooling.cu                    +46   -37
sparseconvnet/SCN/CUDA/ActivePooling.h                      +0   -55
sparseconvnet/SCN/CUDA/AffineReluTrivialConvolution.cpp    +63   -0
sparseconvnet/SCN/CUDA/AffineReluTrivialConvolution.cu    +423   -35
sparseconvnet/SCN/CUDA/AffineReluTrivialConvolution.h       +0   -441
sparseconvnet/SCN/CUDA/AveragePooling.cpp                  +61   -0
sparseconvnet/SCN/CUDA/AveragePooling.cu                   +62   -40
sparseconvnet/SCN/CUDA/AveragePooling.h                     +0   -76
sparseconvnet/SCN/CUDA/BatchNormalization.cpp              +71   -0
sparseconvnet/SCN/CUDA/BatchNormalization.cu              +213   -69
sparseconvnet/SCN/CUDA/BatchNormalization.h                 +0   -210
sparseconvnet/SCN/CUDA/BatchwiseMultiplicativeDropout.cpp  +37   -0
sparseconvnet/SCN/CUDA/BatchwiseMultiplicativeDropout.cu   +56   -28
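The pattern across all twenty files is the same: template kernels and their launch wrappers leave the headers and move into the .cu translation units, while new .cpp files keep the at::Tensor-level entry points and only forward-declare the launchers, so they no longer need nvcc. A minimal sketch of that split, with hypothetical names (foo_kernel, Foo_ForwardPass, cuda_Foo_updateOutput), not code from the commit:

// Foo.cu - compiled by nvcc; owns the kernel and its launcher.
template <typename T>
__global__ void foo_kernel(T *x, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    x[i] *= 2;  // placeholder work
}

template <typename T>
void Foo_ForwardPass(T *x, int n) {
  foo_kernel<T><<<(n + 31) / 32, 32>>>(x, n);
}

// Foo.cpp - tensor-level wrapper; sees only a forward declaration, so a
// plain C++ compiler can build it.
template <typename T>
void Foo_ForwardPass(T *x, int n);

template <typename T>
void cuda_Foo_updateOutput(/*cuda float*/ at::Tensor x) {
  Foo_ForwardPass<T>(x.data<T>(), x.size(0));
}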
sparseconvnet/SCN/CPU/MaxPooling.cpp · View file @ de3743f6

@@ -4,7 +4,31 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#include "MaxPooling.h"

template <typename T>
void MaxPooling_ForwardPass(T *input_features, T *output_features, Int nPlanes,
                            Int input_stride, Int output_stride, Int *rules,
                            Int nHot) {
  for (Int outSite = 0; outSite < nHot; outSite++) {
    Int i = rules[2 * outSite] * input_stride;
    Int o = rules[2 * outSite + 1] * output_stride;
    for (Int plane = 0; plane < nPlanes; plane++)
      if (output_features[o + plane] < input_features[i + plane])
        output_features[o + plane] = input_features[i + plane];
  }
}

template <typename T>
void MaxPooling_BackwardPass(T *input_features, T *d_input_features,
                             T *output_features, T *d_output_features,
                             Int nPlanes, Int input_stride, Int output_stride,
                             Int *rules, Int nHot) {
  for (Int outSite = 0; outSite < nHot; outSite++) {
    Int i = rules[2 * outSite] * input_stride;
    Int o = rules[2 * outSite + 1] * output_stride;
    for (Int plane = 0; plane < nPlanes; plane++)
      if (output_features[o + plane] == input_features[i + plane])
        d_input_features[i + plane] += d_output_features[o + plane];
  }
}

template <typename T, Int Dimension>
void cpu_MaxPooling_updateOutput(
...
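For orientation: in these CPU kernels, rules is a flat array of nHot (input row, output row) pairs, and max pooling compares against whatever the output already holds, so the output must be pre-initialized before the call (in a standalone check, a very negative fill makes the comparison behave as a true max). A minimal sketch under those assumptions; the function body is repeated verbatim from the diff above, the driver is hypothetical:

#include <iostream>
#include <limits>
using Int = int;  // stand-in for the library's Int typedef

template <typename T>
void MaxPooling_ForwardPass(T *input_features, T *output_features, Int nPlanes,
                            Int input_stride, Int output_stride, Int *rules,
                            Int nHot) {
  for (Int outSite = 0; outSite < nHot; outSite++) {
    Int i = rules[2 * outSite] * input_stride;
    Int o = rules[2 * outSite + 1] * output_stride;
    for (Int plane = 0; plane < nPlanes; plane++)
      if (output_features[o + plane] < input_features[i + plane])
        output_features[o + plane] = input_features[i + plane];
  }
}

int main() {
  const Int nPlanes = 2;
  float in[] = {1, 5, 2, 4, 3, 6};  // 3 active input sites x 2 planes
  float out[nPlanes];
  for (auto &v : out)
    v = std::numeric_limits<float>::lowest();
  Int rules[] = {0, 0, 1, 0, 2, 0};  // all three inputs pool into output 0
  MaxPooling_ForwardPass<float>(in, out, nPlanes, nPlanes, nPlanes, rules, 3);
  std::cout << out[0] << " " << out[1] << "\n";  // prints: 3 6
}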
sparseconvnet/SCN/CPU/MaxPooling.h · deleted (100644 → 0) · View file @ f0407b36

// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#ifndef CPU_MAXPOOLING_H
#define CPU_MAXPOOLING_H

// (deleted body: MaxPooling_ForwardPass and MaxPooling_BackwardPass, verbatim
// as they now appear in MaxPooling.cpp above)

#endif /* CPU_MAXPOOLING_H */
sparseconvnet/SCN/CPU/SparseToDense.cpp · View file @ de3743f6

@@ -4,7 +4,30 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#include "SparseToDense.h"

template <typename T>
void SparseToDense_ForwardPass(T *input_features, T *output_features,
                               Int nPlanes, Int spatialVolume, Int *rules,
                               int nHot) {
  for (Int outSite = 0; outSite < nHot; outSite++) {
    T *i = input_features + rules[2 * outSite] * nPlanes;
    T *o = output_features + rules[2 * outSite + 1];
    for (Int plane = 0; plane < nPlanes; plane++)
      o[plane * spatialVolume] = i[plane];
  }
}

template <typename T>
void SparseToDense_BackwardPass(T *d_input_features, T *d_output_features,
                                Int nPlanes, Int spatialVolume, Int *rules,
                                int nHot) {
  for (Int outSite = 0; outSite < nHot; outSite++) {
    T *d_i = d_input_features + rules[2 * outSite] * nPlanes;
    T *d_o = d_output_features + rules[2 * outSite + 1];
    for (Int plane = 0; plane < nPlanes; plane++)
      d_i[plane] = d_o[plane * spatialVolume];
  }
}

template <typename T, Int Dimension>
void cpu_SparseToDense_updateOutput(
...
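The layout being bridged above: sparse features are row-major (site, plane) while the dense output is planes-first, so rules[2k] selects a sparse row and rules[2k+1] a flattened spatial offset, with the plane axis strided by spatialVolume. A tiny self-contained illustration of that indexing, with hypothetical values:

#include <iostream>
using Int = int;

int main() {
  const Int nPlanes = 2, spatialVolume = 8;
  float sparse[] = {7, 9};                    // one active site, 2 planes
  float dense[nPlanes * spatialVolume] = {};  // dense (plane, site) volume
  Int rules[] = {0, 5};                       // sparse row 0 -> spatial offset 5
  float *o = dense + rules[1];                // as in SparseToDense_ForwardPass
  for (Int plane = 0; plane < nPlanes; plane++)
    o[plane * spatialVolume] = sparse[rules[0] * nPlanes + plane];
  std::cout << dense[5] << " " << dense[13] << "\n";  // prints: 7 9
}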
sparseconvnet/SCN/CPU/SparseToDense.h · deleted (100644 → 0) · View file @ f0407b36

// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#ifndef CPU_SPARSETODENSE_H
#define CPU_SPARSETODENSE_H

// (deleted body: SparseToDense_ForwardPass and SparseToDense_BackwardPass,
// verbatim as they now appear in SparseToDense.cpp above)

#endif /* CPU_SPARSETODENSE_H */
sparseconvnet/SCN/CPU/UnPooling.cpp · View file @ de3743f6

@@ -4,7 +4,28 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#include "UnPooling.h"

template <typename T>
void UnPooling_ForwardPass(T *input_features, T *output_features, Int nPlanes,
                           Int input_stride, Int output_stride, Int *rules,
                           Int nHot) {
  for (Int outSite = 0; outSite < nHot; outSite++) {
    Int i = rules[2 * outSite + 1] * input_stride;
    Int o = rules[2 * outSite] * output_stride;
    for (Int plane = 0; plane < nPlanes; plane++)
      output_features[o + plane] += input_features[i + plane];
  }
}

template <typename T>
void UnPooling_BackwardPass(T *d_input_features, T *d_output_features,
                            Int nPlanes, Int input_stride, Int output_stride,
                            Int *rules, Int nHot) {
  for (Int outSite = 0; outSite < nHot; outSite++) {
    Int i = rules[2 * outSite + 1] * input_stride;
    Int o = rules[2 * outSite] * output_stride;
    for (Int plane = 0; plane < nPlanes; plane++)
      d_input_features[i + plane] += d_output_features[o + plane];
  }
}

template <typename T, Int Dimension>
void cpu_UnPooling_updateOutput(
...
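Note the index roles are swapped relative to MaxPooling above: here rules[2k+1] addresses the pooled (coarse) side and rules[2k] the unpooled (fine) side, so a rulebook built once for pooling serves both directions, and the backward pass is simply the transpose. A minimal round-trip sketch of one rule pair, with hypothetical values:

#include <iostream>
using Int = int;

int main() {
  const Int nPlanes = 1;
  float fine[3] = {0, 0, 0};
  float coarse[1] = {42};
  Int rules[] = {2, 0};        // {fine site 2, coarse site 0}
  // UnPooling_ForwardPass body, unrolled for this single pair:
  Int i = rules[1] * nPlanes;  // input offset on the coarse side
  Int o = rules[0] * nPlanes;  // output offset on the fine side
  for (Int plane = 0; plane < nPlanes; plane++)
    fine[o + plane] += coarse[i + plane];
  std::cout << fine[2] << "\n";  // prints: 42
}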
sparseconvnet/SCN/CPU/UnPooling.h · deleted (100644 → 0) · View file @ f0407b36

// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#ifndef CPU_UNPOOLING_H
#define CPU_UNPOOLING_H

// (deleted body: UnPooling_ForwardPass and UnPooling_BackwardPass, verbatim
// as they now appear in UnPooling.cpp above)

#endif /* CPU_UNPOOLING_H */
sparseconvnet/SCN/CUDA/ActivePooling.cpp · new file (0 → 100644) · View file @ de3743f6

// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

template <typename T>
void ActivePooling_ForwardPass(T *input_features, T *output_features,
                               Int batchSize, Int maxActive, Int nPlanes,
                               Int *rules, bool average);

template <typename T>
void ActivePooling_BackwardPass(T *d_input_features, T *d_output_features,
                                Int batchSize, Int maxActive, Int nPlanes,
                                Int *rules, bool average);

template <typename T, Int Dimension>
void cuda_ActivePooling_updateOutput(/*long*/ at::Tensor inputSize,
                                     Metadata<Dimension> &m,
                                     /*cuda float*/ at::Tensor input_features,
                                     /*cuda float*/ at::Tensor output_features,
                                     bool average) {
  Int nPlanes = input_features.size(1);
  auto _rules = m.getActivePoolingRuleBook(inputSize);
  Int batchSize = _rules[1][0];
  Int maxActive = _rules[1][1];

  output_features.resize_({batchSize, nPlanes});
  output_features.zero_();

  auto iF = input_features.data<T>();
  auto oF = output_features.data<T>();

  ActivePooling_ForwardPass<T>(iF, oF, batchSize, maxActive, nPlanes,
                               &_rules[0][0], average);
}

template <typename T, Int Dimension>
void cuda_ActivePooling_updateGradInput(
    /*long*/ at::Tensor inputSize, Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features, bool average) {
  Int nPlanes = input_features.size(1);
  auto _rules = m.getActivePoolingRuleBook(inputSize);
  Int batchSize = _rules[1][0];
  Int maxActive = _rules[1][1];

  d_input_features.resize_as_(input_features);
  d_input_features.zero_();

  auto diF = d_input_features.data<T>();
  auto doF = d_output_features.data<T>();

  ActivePooling_BackwardPass<T>(diF, doF, batchSize, maxActive, nPlanes,
                                &_rules[0][0], average);
}
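_rules here is the active-pooling rulebook: _rules[1] carries {batchSize, maxActive}, and _rules[0] packs one row of maxActive + 1 integers per batch sample, a count followed by the input rows to pool (padding beyond the count is ignored). A small sketch of that layout, decoded the same way ActivePooling_fp walks it, with hypothetical values:

#include <iostream>
#include <vector>
using Int = int;

int main() {
  const Int maxActive = 3;
  // Row layout per sample: [nActive, idx0, idx1, ...], width maxActive + 1.
  std::vector<Int> rules = {2, 0, 1, 0,   // sample 0 pools input rows 0 and 1
                            1, 2, 0, 0};  // sample 1 pools input row 2
  for (Int b = 0; b < 2; b++) {
    const Int *r = &rules[b * (maxActive + 1)];
    Int nActive = *r++;  // first entry is the count
    std::cout << "sample " << b << ":";
    while (nActive-- > 0)
      std::cout << " row " << *r++;
    std::cout << "\n";
  }
}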
sparseconvnet/SCN/CUDA/ActivePooling.cu · View file @ de3743f6

@@ -4,64 +4,73 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

Removed: #include "ActivePooling.h" and the Metadata-based wrappers cuda_ActivePooling_updateOutput / cuda_ActivePooling_updateGradInput, which move to the new ActivePooling.cpp above; the rulebook chunking they contained moves into the launchers here. The file's new contents:

template <typename T>
__global__ void ActivePooling_fp(T *input_features, T *output_features,
                                 Int maxActive, Int nPlanes, Int *rules,
                                 bool average) {
  T *out = &output_features[blockIdx.x * nPlanes];
  Int *r = &rules[blockIdx.x * (maxActive + 1)];
  Int nActive = *r++;
  T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
  while (nActive-- > 0) {
    T *inp = &input_features[(*r++) * nPlanes];
    for (Int plane = threadIdx.x; plane < nPlanes; plane += 32)
      out[plane] += inp[plane] * multiplier;
  }
}

template <typename T>
void ActivePooling_ForwardPass(T *input_features, T *output_features,
                               Int batchSize, Int maxActive, Int nPlanes,
                               Int *rules, bool average) {
  auto rulesBuffer = at::CUDA(at_kINT).tensor({1 << 22});
  Int *rb = rulesBuffer.data<Int>();
  Int rowBatchSize = std::min((Int)32768, (1 << 22) / (maxActive + 1));
  assert(rowBatchSize > 0);
  Int kernelBlockDim = std::min(nPlanes, (Int)32);
  for (Int o = 0; o < batchSize; o += rowBatchSize) {
    Int batchSize_ = std::min(rowBatchSize, (Int)(batchSize - o));
    cudaMemcpy(rb, rules + o * (maxActive + 1),
               sizeof(Int) * (maxActive + 1) * batchSize_,
               cudaMemcpyHostToDevice);
    ActivePooling_fp<T><<<batchSize_, kernelBlockDim>>>(
        input_features, output_features + o * nPlanes, maxActive, nPlanes, rb,
        average);
  }
}

template <typename T>
__global__ void ActivePooling_bp(T *d_input_features, T *d_output_features,
                                 Int maxActive, Int nPlanes, Int *rules,
                                 bool average) {
  T *out = &d_output_features[blockIdx.x * nPlanes];
  Int *r = &rules[blockIdx.x * (maxActive + 1)];
  Int nActive = *r++;
  T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
  while (nActive-- > 0) {
    T *inp = &d_input_features[(*r++) * nPlanes];
    for (Int plane = threadIdx.x; plane < nPlanes; plane += 32)
      inp[plane] = out[plane] * multiplier;
  }
}

template <typename T>
void ActivePooling_BackwardPass(T *d_input_features, T *d_output_features,
                                Int batchSize, Int maxActive, Int nPlanes,
                                Int *rules, bool average) {
  auto rulesBuffer = at::CUDA(at_kINT).tensor({1 << 22});
  Int *rb = rulesBuffer.data<Int>();
  Int rowBatchSize = std::min((Int)32768, (1 << 22) / (maxActive + 1));
  assert(rowBatchSize > 0);
  Int kernelBlockDim = std::min(nPlanes, (Int)32);
  for (Int o = 0; o < batchSize; o += rowBatchSize) {
    Int batchSize_ = std::min(rowBatchSize, (Int)(batchSize - o));
    cudaMemcpy(rb, rules + o * (maxActive + 1),
               sizeof(Int) * (maxActive + 1) * batchSize_,
               cudaMemcpyHostToDevice);
    ActivePooling_bp<T><<<batchSize_, kernelBlockDim>>>(
        d_input_features, d_output_features + o * nPlanes, maxActive, nPlanes,
        rb, average);
  }
}
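The launchers above stage the host-side rulebook through a fixed 2^22-int device buffer, so each cudaMemcpy carries rowBatchSize = min(32768, 2^22 / (maxActive + 1)) rulebook rows. A worked instance of that arithmetic, with hypothetical sizes:

#include <algorithm>
#include <iostream>
using Int = int;

int main() {
  Int maxActive = 1023, batchSize = 10000;
  Int rowBatchSize = std::min((Int)32768, (1 << 22) / (maxActive + 1));
  std::cout << "rows per copy: " << rowBatchSize << "\n";  // 4194304/1024 = 4096
  Int copies = 0;
  for (Int o = 0; o < batchSize; o += rowBatchSize)
    copies++;  // each pass copies at most rowBatchSize rows
  std::cout << "cudaMemcpy calls: " << copies << "\n";  // ceil(10000/4096) = 3
}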
sparseconvnet/SCN/CUDA/ActivePooling.h · deleted (100644 → 0) · View file @ f0407b36

// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#ifndef CUDA_ACTIVEPOOLING_H
#define CUDA_ACTIVEPOOLING_H

template <typename T>
__global__ void ActivePooling_fp(T *input_features, T *output_features,
                                 Int maxActive, Int nPlanes, Int *rules,
                                 bool average) {
  T *out = &output_features[blockIdx.x * nPlanes];
  Int *r = &rules[blockIdx.x * (maxActive + 1)];
  Int nActive = *r++;
  T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
  while (nActive-- > 0) {
    T *inp = &input_features[(*r++) * nPlanes];
    for (Int plane = threadIdx.x; plane < nPlanes; plane += 32)
      out[plane] += inp[plane] * multiplier;
  }
}

template <typename T>
void ActivePooling_ForwardPass(T *input_features, T *output_features,
                               Int batchSize, Int maxActive, Int nPlanes,
                               Int *rules, bool average) {
  Int kernelBlockDim = std::min(nPlanes, (Int)32);
  ActivePooling_fp<T><<<batchSize, kernelBlockDim>>>(
      input_features, output_features, maxActive, nPlanes, rules, average);
}

template <typename T>
__global__ void ActivePooling_bp(T *d_input_features, T *d_output_features,
                                 Int maxActive, Int nPlanes, Int *rules,
                                 bool average) {
  T *out = &d_output_features[blockIdx.x * nPlanes];
  Int *r = &rules[blockIdx.x * (maxActive + 1)];
  Int nActive = *r++;
  T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
  while (nActive-- > 0) {
    T *inp = &d_input_features[(*r++) * nPlanes];
    for (Int plane = threadIdx.x; plane < nPlanes; plane += 32)
      inp[plane] = out[plane] * multiplier;
  }
}

template <typename T>
void ActivePooling_BackwardPass(T *d_input_features, T *d_output_features,
                                Int batchSize, Int maxActive, Int nPlanes,
                                Int *rules, bool average) {
  Int kernelBlockDim = std::min(nPlanes, (Int)32);
  ActivePooling_bp<T><<<batchSize, kernelBlockDim>>>(
      d_input_features, d_output_features, maxActive, nPlanes, rules, average);
}
#endif /* CUDA_ActivePOOLING_H */
sparseconvnet/SCN/CUDA/AffineReluTrivialConvolution.cpp · new file (0 → 100644) · View file @ de3743f6

// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

// check if A+B is faster than just B
// check if loading affineBias into shared memory is faster than loading
// multiple times (if not try 64,16 backwards case)

template <typename T>
void dAffineReluTrivialConvolution_forward(T *inFeatures, T *outFeatures,
                                           T *affineWeight, T *affineBias,
                                           T *convWeight, Int input_nPlanes,
                                           Int input_stride,
                                           Int output_nPlanes,
                                           Int output_stride, Int nActive);

template <typename T>
void dAffineReluTrivialConvolution_backward_dW(
    T *inFeatures, T *dInFeatures, T *dOutFeatures, T *affineWeight,
    T *dAffineWeight, T *affineBias, T *dAffineBias, T *convWeight,
    T *dConvWeight, Int input_nPlanes, Int input_stride, Int output_nPlanes,
    Int output_stride, Int nActive, bool additiveGrad);

template <typename T>
double cuda_AffineReluTrivialConvolution_updateOutput(
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features,
    /*cuda float*/ at::Tensor affineWeight,
    /*cuda float*/ at::Tensor affineBias,
    /*cuda float*/ at::Tensor convWeight) {
  output_features.resize_({input_features.size(0), convWeight.size(1)});
  dAffineReluTrivialConvolution_forward<T>(
      input_features.data<T>(), output_features.data<T>(),
      affineWeight.data<T>(), affineBias.data<T>(), convWeight.data<T>(),
      convWeight.size(0), input_features.stride(0), convWeight.size(1),
      output_features.size(1), input_features.size(0));
  return input_features.size(0) * input_features.size(1) *
         output_features.size(1);
}

template <typename T>
void cuda_AffineReluTrivialConvolution_backward(
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features,
    /*cuda float*/ at::Tensor affineWeight,
    /*cuda float*/ at::Tensor d_affineWeight,
    /*cuda float*/ at::Tensor affineBias,
    /*cuda float*/ at::Tensor d_affineBias,
    /*cuda float*/ at::Tensor convWeight,
    /*cuda float*/ at::Tensor d_convWeight, bool additiveGrad) {
  d_input_features.resize_as_(input_features);
  dAffineReluTrivialConvolution_backward_dW<T>(
      input_features.data<T>(), d_input_features.data<T>(),
      d_output_features.data<T>(), affineWeight.data<T>(),
      d_affineWeight.data<T>(), affineBias.data<T>(), d_affineBias.data<T>(),
      convWeight.data<T>(), d_convWeight.data<T>(), convWeight.size(0),
      input_features.stride(0), convWeight.size(1),
      d_output_features.stride(0), input_features.size(0), additiveGrad);
}
sparseconvnet/SCN/CUDA/AffineReluTrivialConvolution.cu · View file @ de3743f6

@@ -4,44 +4,432 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

Removed: #include "AffineReluTrivialConvolution.h" and the tensor-level wrappers cuda_AffineReluTrivialConvolution_updateOutput / cuda_AffineReluTrivialConvolution_backward, which move to AffineReluTrivialConvolution.cpp above. Added, from the deleted header:

// check if A+B is faster than just B
// check if loading affineBias into shared memory is faster than loading
// multiple times (if not try 64,16 backwards case)
template <typename T, Int K, Int V>
__global__ void dAffineReluTrivialConvolution_forwardA(
    T *inFeatures, T *outFeatures, T *affineWeight, T *affineBias,
    T *convWeight, Int input_nPlanes, Int input_stride, Int output_nPlanes,
    Int output_stride, Int nActive) {
  // nActive must be a multiple of K!!
  // Input x Weight -> Output
  // blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
  // K is a multiple of V,
  // nActive x KM -> nActive x KN - parallel over N,nActive - loop over M
  Int M = input_nPlanes / K;
  // N = gridDim.y == output_nPlanes/K
  Int n = blockIdx.y;
  outFeatures += n * K;
  convWeight += n * K;
  T O[V];
  __shared__ T I[K][K];
  __shared__ T AW[K];
  __shared__ T AB[K];
  __shared__ T CW[K][K];
  const Int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int m = 0; m < M; m++) {
    // Read affineWeight, affineBias and convWeight
    if (ty[0] == 0) {
      AW[tx] = affineWeight[tx];
      AB[tx] = affineBias[tx];
    }
#pragma unroll
    for (int v = 0; v < V; v++)
      CW[ty[v]][tx] = convWeight[ty[v] * output_nPlanes + tx];
    __syncthreads();
    for (Int s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
// Read input, do affine + relu, set O[]
#pragma unroll
      for (int v = 0; v < V; v++) {
        T i = inFeatures[(s + ty[v]) * input_stride + tx] * AW[tx] + AB[tx];
        I[ty[v]][tx] = (i > 0) ? i : 0;
        if (m == 0) {
          O[v] = 0;
        } else {
          O[v] = outFeatures[(s + ty[v]) * output_stride + tx];
        }
      }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          O[v] += I[ty[v]][k] * CW[k][tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        outFeatures[(s + ty[v]) * output_stride + tx] = O[v];
      __syncthreads();
    }
    affineWeight += K;
    affineBias += K;
    convWeight += K * output_nPlanes;
    inFeatures += K;
  }
}

template <typename T, Int K, Int V>
__global__ void dAffineReluTrivialConvolution_forwardB(
    T *inFeatures, T *outFeatures, T *affineWeight, T *affineBias,
    T *convWeight, Int input_nPlanes, Int input_stride, Int output_nPlanes,
    Int output_stride, Int nActive) {
  // Input x Weight -> Output
  // blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
  // K is a multiple of V,
  // nActive x KM -> nActive x KN - parallel over N,nActive - loop over M
  Int M = input_nPlanes / K;
  // N = gridDim.y == output_nPlanes/K
  Int n = blockIdx.y;
  outFeatures += n * K;
  convWeight += n * K;
  T O[V];
  __shared__ T I[K][K]; // zz try K+1 trick A+B+backwards
  __shared__ T AW[K];
  __shared__ T AB[K];
  __shared__ T CW[K][K];
  const Int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int m = 0; m < M; m++) {
    // Read affineWeight, affineBias and convWeight
    if (ty[0] == 0) {
      AW[tx] = affineWeight[tx];
      AB[tx] = affineBias[tx];
    }
#pragma unroll
    for (int v = 0; v < V; v++)
      CW[ty[v]][tx] = convWeight[ty[v] * output_nPlanes + tx];
    __syncthreads();
    for (Int s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
// Read input, do affine + relu, set O[]
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (s + ty[v] < nActive) {
          T i = inFeatures[(s + ty[v]) * input_stride + tx] * AW[tx] + AB[tx];
          I[ty[v]][tx] = (i > 0) ? i : 0;
          if (m == 0) {
            O[v] = 0;
          } else {
            O[v] = outFeatures[(s + ty[v]) * output_stride + tx];
          }
        }
      }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          O[v] += I[ty[v]][k] * CW[k][tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nActive)
          outFeatures[(s + ty[v]) * output_stride + tx] = O[v];
      __syncthreads();
    }
    affineWeight += K;
    affineBias += K;
    convWeight += K * output_nPlanes;
    inFeatures += K;
  }
}

#define FOO(T, K, V) \
{ \
  if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
    Int o = (nActive / K) * K; \
    if (o > 0) \
      dAffineReluTrivialConvolution_forwardA< \
          T, K, V><<<dim3(std::min(o / K, (Int)512), output_nPlanes / K), \
                     dim3(K, K / V)>>>( \
          inFeatures, outFeatures, affineWeight, affineBias, convWeight, \
          input_nPlanes, input_stride, output_nPlanes, output_stride, o); \
    if (nActive > o) \
      dAffineReluTrivialConvolution_forwardB< \
          T, K, V><<<dim3(1, output_nPlanes / K), dim3(K, K / V)>>>( \
          inFeatures + o * input_stride, outFeatures + o * output_stride, \
          affineWeight, affineBias, convWeight, input_nPlanes, input_stride, \
          output_nPlanes, output_stride, nActive - o); \
    return; \
  } \
}

template <typename T>
void dAffineReluTrivialConvolution_forward(T *inFeatures, T *outFeatures,
                                           T *affineWeight, T *affineBias,
                                           T *convWeight, Int input_nPlanes,
                                           Int input_stride,
                                           Int output_nPlanes,
                                           Int output_stride, Int nActive) {
  FOO(T, 64, 16)
  FOO(T, 32, 8)
  FOO(T, 16, 4)
  FOO(T, 8, 2)
  assert(false);
}

template <>
void dAffineReluTrivialConvolution_forward<double>(
    double *inFeatures, double *outFeatures, double *affineWeight,
    double *affineBias, double *convWeight, Int input_nPlanes,
    Int input_stride, Int output_nPlanes, Int output_stride, Int nActive) {
  FOO(double, 32, 8)
  FOO(double, 16, 4)
  FOO(double, 8, 2)
  assert(false);
}
#undef FOO

// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template <typename T, Int K, Int V>
__global__ void dAffineReluTrivialConvolution_backward_dW_A(
    T *inFeatures, T *dInFeatures, T *dOutFeatures, T *affineWeight,
    T *dAffineWeight, T *affineBias, T *dAffineBias, T *convWeight,
    T *dConvWeight, Int input_nPlanes, Int input_stride, Int output_nPlanes,
    Int output_stride, Int nActive, bool additiveGrad) {
  // M = gridDim.y == input_nPlanes / K
  Int N = output_nPlanes / K;
  Int m = blockIdx.y;
  inFeatures += m * K;
  dInFeatures += m * K;
  convWeight += m * K * output_nPlanes;
  dConvWeight += m * K * output_nPlanes;
  affineWeight += m * K;
  dAffineWeight += m * K;
  affineBias += m * K;
  dAffineBias += m * K;
  T dI[V];
  T dCW[V];
  T i[V];
  T dAW = 0;
  T dAB = 0;
  __shared__ T I[K][K];
  __shared__ T dO[K][K];
  __shared__ T AW[K];
  __shared__ T AB[K];
  __shared__ T CW[K][K];
  const Int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  if (ty[0] == 0) {
    AW[tx] = affineWeight[tx];
    AB[tx] = affineBias[tx];
  }
  for (int n = 0; n < N; n++) {
// Read w, reset dW
#pragma unroll
    for (int v = 0; v < V; v++) {
      CW[ty[v]][tx] = convWeight[ty[v] * output_nPlanes + tx];
      dCW[v] = 0;
    }
    __syncthreads();
    for (Int s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
#pragma unroll
      for (int v = 0; v < V; v++)
        dI[v] = 0;
      __syncthreads();
// Read input and dOutput
#pragma unroll
      for (int v = 0; v < V; v++) {
        T i_ = inFeatures[(s + ty[v]) * input_stride + tx];
        i[v] = i_;
        i_ = i_ * AW[tx] + AB[tx];
        I[ty[v]][tx] = (i_ > 0) ? i_ : 0;
        dO[ty[v]][tx] = dOutFeatures[(s + ty[v]) * output_stride + tx];
      }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++) {
          dI[v] += dO[ty[v]][k] * CW[tx][k];
          dCW[v] += I[k][ty[v]] * dO[k][tx];
        }
#pragma unroll
      for (int v = 0; v < V; v++) {
        dI[v] = (I[ty[v]][tx] > 0) ? dI[v] : 0;
        dAW += i[v] * dI[v];
        dAB += dI[v];
        if (additiveGrad)
          dInFeatures[(s + ty[v]) * input_stride + tx] += dI[v];
        else
          dInFeatures[(s + ty[v]) * input_stride + tx] = dI[v];
      }
      __syncthreads();
    }
#pragma unroll
    for (int v = 0; v < V; v++)
      atomicAdd(&dConvWeight[ty[v] * output_nPlanes + tx], dCW[v]);
    convWeight += K;
    dConvWeight += K;
    dOutFeatures += K;
    __syncthreads();
  }
  atomicAdd(&dAffineWeight[tx], dAW);
  atomicAdd(&dAffineBias[tx], dAB);
}

// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template <typename T, Int K, Int V>
__global__ void dAffineReluTrivialConvolution_backward_dW_B(
    T *inFeatures, T *dInFeatures, T *dOutFeatures, T *affineWeight,
    T *dAffineWeight, T *affineBias, T *dAffineBias, T *convWeight,
    T *dConvWeight, Int input_nPlanes, Int input_stride, Int output_nPlanes,
    Int output_stride, Int nActive, bool additiveGrad) {
  // M = gridDim.y == input_nPlanes / K
  Int N = output_nPlanes / K;
  Int m = blockIdx.y;
  inFeatures += m * K;
  dInFeatures += m * K;
  convWeight += m * K * output_nPlanes;
  dConvWeight += m * K * output_nPlanes;
  affineWeight += m * K;
  dAffineWeight += m * K;
  affineBias += m * K;
  dAffineBias += m * K;
  T dI[V];
  T dCW[V];
  T i[V];
  T dAW = 0;
  T dAB = 0;
  __shared__ T I[K][K];
  __shared__ T dO[K][K];
  __shared__ T AW[K];
  __shared__ T AB[K];
  __shared__ T CW[K][K];
  const Int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  if (ty[0] == 0) {
    AW[tx] = affineWeight[tx];
    AB[tx] = affineBias[tx];
  }
  for (int n = 0; n < N; n++) {
// Read w, reset dW
#pragma unroll
    for (int v = 0; v < V; v++) {
      CW[ty[v]][tx] = convWeight[ty[v] * output_nPlanes + tx];
      dCW[v] = 0;
    }
    __syncthreads();
    for (Int s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
#pragma unroll
      for (int v = 0; v < V; v++)
        dI[v] = 0;
      __syncthreads();
// Read input and dOutput
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nActive) {
          T i_ = inFeatures[(s + ty[v]) * input_stride + tx];
          i[v] = i_;
          i_ = i_ * AW[tx] + AB[tx];
          I[ty[v]][tx] = (i_ > 0) ? i_ : 0;
          dO[ty[v]][tx] = dOutFeatures[(s + ty[v]) * output_stride + tx];
        } else {
          i[v] = 0;
          I[ty[v]][tx] = 0;
          dO[ty[v]][tx] = 0;
        }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++) {
          dI[v] += dO[ty[v]][k] * CW[tx][k];
          dCW[v] += I[k][ty[v]] * dO[k][tx];
        }
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nActive) {
          dI[v] = (I[ty[v]][tx] > 0) ? dI[v] : 0;
          dAW += i[v] * dI[v];
          dAB += dI[v];
          if (additiveGrad)
            dInFeatures[(s + ty[v]) * input_stride + tx] += dI[v];
          else
            dInFeatures[(s + ty[v]) * input_stride + tx] = dI[v];
        }
      __syncthreads();
    }
#pragma unroll
    for (int v = 0; v < V; v++)
      atomicAdd(&dConvWeight[ty[v] * output_nPlanes + tx], dCW[v]);
    convWeight += K;
    dConvWeight += K;
    dOutFeatures += K;
    __syncthreads();
  }
  atomicAdd(&dAffineWeight[tx], dAW);
  atomicAdd(&dAffineBias[tx], dAB);
}

#define FOO(T, K, V) \
{ \
  if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
    Int o = (nActive / K) * K; \
    if (o > 0) \
      dAffineReluTrivialConvolution_backward_dW_A< \
          T, K, V><<<dim3(std::min(o / K, (Int)512), input_nPlanes / K), \
                     dim3(K, K / V)>>>( \
          inFeatures, dInFeatures, dOutFeatures, affineWeight, \
          dAffineWeight, affineBias, dAffineBias, convWeight, dConvWeight, \
          input_nPlanes, input_stride, output_nPlanes, output_stride, o, \
          additiveGrad); \
    if (nActive > o) \
      dAffineReluTrivialConvolution_backward_dW_B< \
          T, K, V><<<dim3(1, input_nPlanes / K), dim3(K, K / V)>>>( \
          inFeatures + o * input_stride, dInFeatures + o * input_stride, \
          dOutFeatures + o * output_stride, affineWeight, dAffineWeight, \
          affineBias, dAffineBias, convWeight, dConvWeight, input_nPlanes, \
          input_stride, output_nPlanes, output_stride, nActive - o, \
          additiveGrad); \
    return; \
  } \
}

template <typename T>
void dAffineReluTrivialConvolution_backward_dW(
    T *inFeatures, T *dInFeatures, T *dOutFeatures, T *affineWeight,
    T *dAffineWeight, T *affineBias, T *dAffineBias, T *convWeight,
    T *dConvWeight, Int input_nPlanes, Int input_stride, Int output_nPlanes,
    Int output_stride, Int nActive, bool additiveGrad) {
  FOO(T, 32, 8)
  FOO(T, 16, 4)
  FOO(T, 8, 2)
}
#undef FOO
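What all of the tiling above computes is compact to state: out = ReLU(in * affineWeight + affineBias) multiplied by convWeight, i.e. a per-plane affine and ReLU fused into a "trivial" (1x1) convolution, with input_nPlanes = convWeight.size(0) and output_nPlanes = convWeight.size(1). A plain CPU reference of that contract; referenceForward is a hypothetical checking helper, not part of the commit:

#include <algorithm>
#include <vector>
using Int = int;

// out[s][j] = sum_i relu(in[s][i] * aw[i] + ab[i]) * cw[i][j]
std::vector<float> referenceForward(const std::vector<float> &in, Int nActive,
                                    Int inPlanes, Int outPlanes,
                                    const std::vector<float> &aw,
                                    const std::vector<float> &ab,
                                    const std::vector<float> &cw) {
  std::vector<float> out(nActive * outPlanes, 0.0f);
  for (Int s = 0; s < nActive; s++)
    for (Int i = 0; i < inPlanes; i++) {
      // per-plane affine, then ReLU
      float a = std::max(in[s * inPlanes + i] * aw[i] + ab[i], 0.0f);
      // 1x1 convolution: every input plane feeds every output plane
      for (Int j = 0; j < outPlanes; j++)
        out[s * outPlanes + j] += a * cw[i * outPlanes + j];
    }
  return out;
}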
sparseconvnet/SCN/CUDA/AffineReluTrivialConvolution.h · deleted (100644 → 0) · View file @ f0407b36

// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#ifndef CUDA_AFFINERELUTRIVIALCONVOLUTION_H
#define CUDA_AFFINERELUTRIVIALCONVOLUTION_H

// (deleted body: the dAffineReluTrivialConvolution_forwardA/_forwardB and
// _backward_dW_A/_backward_dW_B kernels, the FOO dispatch macros,
// dAffineReluTrivialConvolution_forward with its <double> specialization, and
// dAffineReluTrivialConvolution_backward_dW, all verbatim as they now appear
// in AffineReluTrivialConvolution.cu above)

#endif
sparseconvnet/SCN/CUDA/AveragePooling.cpp · new file (0 → 100644) · View file @ de3743f6

// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

template <typename T>
void cuda_AveragePooling_ForwardPass(T *input_features, T *output_features,
                                     Int nPlanes, Int input_stride,
                                     Int output_stride, RuleBook _rules,
                                     Int filterVolume);

template <typename T>
void cuda_AveragePooling_BackwardPass(T *d_input_features,
                                      T *d_output_features, Int nPlanes,
                                      Int input_stride, Int output_stride,
                                      RuleBook _rules, Int filterVolume);

template <typename T, Int Dimension>
void cuda_AveragePooling_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor poolSize, /*long*/ at::Tensor poolStride,
    Metadata<Dimension> &m, /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features, long nFeaturesToDrop) {
  Int nPlanes = input_features.size(1) - nFeaturesToDrop;
  auto _rules =
      m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  Int nActive = m.getNActive(outputSize);
  output_features.resize_({nActive, input_features.size(1) - nFeaturesToDrop});
  output_features.zero_();
  auto iF = input_features.data<T>() + nFeaturesToDrop;
  auto oF = output_features.data<T>();
  cuda_AveragePooling_ForwardPass<T>(iF, oF, nPlanes, input_features.size(1),
                                     output_features.size(1), _rules,
                                     _rules.size());
}

template <typename T, Int Dimension>
void cuda_AveragePooling_updateGradInput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor poolSize, /*long*/ at::Tensor poolStride,
    Metadata<Dimension> &m, /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features, long nFeaturesToDrop) {
  Int nPlanes = input_features.size(1) - nFeaturesToDrop;
  auto _rules =
      m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();
  auto diF = d_input_features.data<T>() + nFeaturesToDrop;
  auto doF = d_output_features.data<T>();
  cuda_AveragePooling_BackwardPass<T>(diF, doF, nPlanes,
                                      input_features.size(1),
                                      d_output_features.size(1), _rules,
                                      _rules.size());
}
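One semantic worth noting in the wrappers above: filterVolume is passed as _rules.size(), one rulebook group per filter offset, and the forward kernel adds alpha * x per rule with alpha = 1 / filterVolume. Border sites that receive fewer than filterVolume contributions are therefore still divided by the full filter volume, not by their active-input count. A short numeric check of that convention:

#include <iostream>

int main() {
  // filterVolume = 4, but only two inputs {8, 4} reach this output site.
  double alpha = 1.0 / 4;  // 1.0 / filterVolume, as in AveragePooling_fp
  double out = 0;
  for (double x : {8.0, 4.0})
    out += alpha * x;      // one accumulation per rulebook entry
  std::cout << out << "\n";  // prints 3, i.e. (8+4)/4, not (8+4)/2
}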
sparseconvnet/SCN/CUDA/AveragePooling.cu
View file @
de3743f6
...
@@ -4,51 +4,73 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#include "RuleBookIterator.h"

// NTX must be >=2 so r is filled properly
template <typename T, Int NTX, Int NTY>
__global__ void AveragePooling_fp(T *input_features, T *output_features,
                                  Int nPlanes, Int input_stride,
                                  Int output_stride, Int *rules, Int nHot,
                                  T alpha) {
  __shared__ Int r[NTY * 2];
  for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
    {
      Int i = threadIdx.x + NTX * threadIdx.y;
      if (i < NTY * 2 and i < 2 * (nHot - n))
        r[i] = rules[2 * n + i];
    }
    __syncthreads();
    if (n + threadIdx.y < nHot) {
      Int i = r[2 * threadIdx.y] * input_stride;
      Int o = r[2 * threadIdx.y + 1] * output_stride;
      for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX)
        atomicAdd(&output_features[o + plane],
                  alpha * input_features[i + plane]);
    }
    __syncthreads();
  }
}

template <typename T>
void cuda_AveragePooling_ForwardPass(T *input_features, T *output_features,
                                     Int nPlanes, Int input_stride,
                                     Int output_stride, RuleBook _rules,
                                     Int filterVolume) {
  RULEBOOKITERATOR((AveragePooling_fp<T, 32, 32><<<32, dim3(32, 32)>>>(
                       input_features, output_features, nPlanes, input_stride,
                       output_stride, rbB, nHotB, 1.0 / filterVolume));
                   , )
}

template <typename T, Int NTX, Int NTY>
__global__ void AveragePooling_bp(T *d_input_features, T *d_output_features,
                                  Int nPlanes, Int input_stride,
                                  Int output_stride, Int *rules, Int nHot,
                                  T alpha) {
  __shared__ Int r[NTY * 2];
  for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
    {
      Int i = threadIdx.x + NTX * threadIdx.y;
      if (i < NTY * 2 and i < 2 * (nHot - n))
        r[i] = rules[2 * n + i];
    }
    __syncthreads();
    if (n + threadIdx.y < nHot) {
      Int i = r[2 * threadIdx.y] * input_stride;
      Int o = r[2 * threadIdx.y + 1] * output_stride;
      for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX)
        d_input_features[i + plane] += alpha * d_output_features[o + plane];
    }
    __syncthreads();
  }
}

template <typename T>
void cuda_AveragePooling_BackwardPass(T *d_input_features,
                                      T *d_output_features, Int nPlanes,
                                      Int input_stride, Int output_stride,
                                      RuleBook _rules, Int filterVolume) {
  RULEBOOKITERATOR((AveragePooling_bp<T, 32, 32><<<32, dim3(32, 32)>>>(
                       d_input_features, d_output_features, nPlanes,
                       input_stride, output_stride, rbB, nHotB,
                       1.0 / filterVolume));
                   , )
}
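The rulebook passed to these kernels is a flat list of nHot (inputRow, outputRow) index pairs, and alpha is 1.0 / filterVolume, so each output row accumulates the mean of its contributing input rows. A plain CPU reference of the forward pass, useful for checking the kernel; a sketch only, not library code (the backward pass mirrors it with the roles of the two rows swapped):

// CPU reference for the rulebook average-pooling kernels above. rules holds
// nHot (inputRow, outputRow) pairs; alpha = 1.0 / filterVolume.
void averagePooling_fp_reference(const float *input, float *output,
                                 int nPlanes, int input_stride,
                                 int output_stride, const int *rules, int nHot,
                                 float alpha) {
  for (int k = 0; k < nHot; ++k) {
    const float *in = input + rules[2 * k] * input_stride;
    float *out = output + rules[2 * k + 1] * output_stride;
    for (int plane = 0; plane < nPlanes; ++plane)
      out[plane] += alpha * in[plane]; // the kernel does this with atomicAdd
  }
}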
sparseconvnet/SCN/CUDA/AveragePooling.h
deleted 100644 → 0
View file @ f0407b36
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CUDA_AVERAGEPOOLING_H
#define CUDA_AVERAGEPOOLING_H
// NTX must be >=2 so r is filled properly
template <typename T, Int NTX, Int NTY>
__global__ void AveragePooling_fp(T *input_features, T *output_features,
                                  Int nPlanes, Int input_stride,
                                  Int output_stride, Int *rules, Int nHot,
                                  T alpha) {
  __shared__ Int r[NTY * 2];
  for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
    {
      Int i = threadIdx.x + NTX * threadIdx.y;
      if (i < NTY * 2 and i < 2 * (nHot - n))
        r[i] = rules[2 * n + i];
    }
    __syncthreads();
    if (n + threadIdx.y < nHot) {
      Int i = r[2 * threadIdx.y] * input_stride;
      Int o = r[2 * threadIdx.y + 1] * output_stride;
      for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX)
        atomicAdd(&output_features[o + plane],
                  alpha * input_features[i + plane]);
    }
    __syncthreads();
  }
}

template <typename T>
void cuda_AveragePooling_ForwardPass(T *input_features, T *output_features,
                                     Int nPlanes, Int input_stride,
                                     Int output_stride, Int *rules, Int nHot,
                                     Int filterVolume) {
  AveragePooling_fp<T, 32, 32><<<32, dim3(32, 32)>>>(
      input_features, output_features, nPlanes, input_stride, output_stride,
      rules, nHot, 1.0 / filterVolume);
}

template <typename T, Int NTX, Int NTY>
__global__ void AveragePooling_bp(T *d_input_features, T *d_output_features,
                                  Int nPlanes, Int input_stride,
                                  Int output_stride, Int *rules, Int nHot,
                                  T alpha) {
  __shared__ Int r[NTY * 2];
  for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
    {
      Int i = threadIdx.x + NTX * threadIdx.y;
      if (i < NTY * 2 and i < 2 * (nHot - n))
        r[i] = rules[2 * n + i];
    }
    __syncthreads();
    if (n + threadIdx.y < nHot) {
      Int i = r[2 * threadIdx.y] * input_stride;
      Int o = r[2 * threadIdx.y + 1] * output_stride;
      for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX)
        d_input_features[i + plane] += alpha * d_output_features[o + plane];
    }
    __syncthreads();
  }
}

template <typename T>
void cuda_AveragePooling_BackwardPass(T *d_input_features,
                                      T *d_output_features, Int nPlanes,
                                      Int input_stride, Int output_stride,
                                      Int *rules, Int nHot, Int filterVolume) {
  AveragePooling_bp<T, 32, 32><<<32, dim3(32, 32)>>>(
      d_input_features, d_output_features, nPlanes, input_stride,
      output_stride, rules, nHot, 1.0 / filterVolume);
}
#endif /* CUDA_AVERAGEPOOLING_H */
sparseconvnet/SCN/CUDA/BatchNormalization.cpp
0 → 100644
View file @ de3743f6
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
template <typename T>
void bn_f(T *iF, T *oF, Int nPlanes, Int input_stride, Int output_stride,
          Int nActive, T *saveMean, T *saveInvStd, T *runningMean,
          T *runningVar, T *weight, T *bias, T eps, T momentum, bool train,
          T leakiness);

template <typename T>
void bn_b(T *input_features, T *d_input_features, T *output_features,
          T *d_output_features, Int nPlanes, Int input_stride,
          Int output_stride, Int nActive, T *saveMean, T *saveInvStd,
          T *runningMean, T *runningVar, T *weight, T *bias, T *d_weight,
          T *d_bias, T leakiness);

template <typename T>
void cuda_BatchNormalization_updateOutput(
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features,
    /*cuda float*/ at::Tensor saveMean, /*cuda float*/ at::Tensor saveInvStd,
    /*cuda float*/ at::Tensor runningMean,
    /*cuda float*/ at::Tensor runningVar, /*cuda float*/ at::Tensor weight,
    /*cuda float*/ at::Tensor bias, T eps, T momentum, bool train,
    T leakiness) {
  output_features.resize_as_(input_features);
  if (input_features.ndimension() == 2) {
    auto nActive = input_features.size(0);
    auto nPlanes = input_features.size(1);
    auto input_stride = input_features.stride(0);
    auto output_stride = output_features.stride(0);
    bn_f(input_features.data<T>(), output_features.data<T>(), nPlanes,
         input_stride, output_stride, nActive, saveMean.data<T>(),
         saveInvStd.data<T>(), runningMean.data<T>(), runningVar.data<T>(),
         OptionalTensorData<T>(weight), OptionalTensorData<T>(bias), eps,
         momentum, train, leakiness);
  }
}

template <typename T>
void cuda_BatchNormalization_backward(
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor output_features,
    /*cuda float*/ at::Tensor d_output_features,
    /*cuda float*/ at::Tensor saveMean, /*cuda float*/ at::Tensor saveInvStd,
    /*cuda float*/ at::Tensor runningMean,
    /*cuda float*/ at::Tensor runningVar, /*cuda float*/ at::Tensor weight,
    /*cuda float*/ at::Tensor bias, /*cuda float*/ at::Tensor d_weight,
    /*cuda float*/ at::Tensor d_bias, T leakiness) {
  d_input_features.resize_as_(d_output_features);
  if (input_features.ndimension() == 2) {
    auto nActive = input_features.size(0);
    auto nPlanes = input_features.size(1);
    auto input_stride = input_features.stride(0);
    auto output_stride = output_features.stride(0);
    bn_b(input_features.data<T>(), d_input_features.data<T>(),
         output_features.data<T>(), d_output_features.data<T>(), nPlanes,
         input_stride, output_stride, nActive, saveMean.data<T>(),
         saveInvStd.data<T>(), runningMean.data<T>(), runningVar.data<T>(),
         OptionalTensorData<T>(weight), OptionalTensorData<T>(bias),
         OptionalTensorData<T>(d_weight), OptionalTensorData<T>(d_bias),
         leakiness);
  }
}
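OptionalTensorData is used here but not defined in this commit; the kernels null-check weight and bias (weight ? weight[plane] : 1), so it must map an absent tensor to nullptr. A plausible sketch under that assumption; the defined-ness test chosen below is a guess, not the library's actual helper:

#include <ATen/ATen.h>

// Hypothetical sketch of OptionalTensorData: return a raw data pointer when
// the tensor is defined and non-empty, nullptr otherwise, so the kernels can
// branch on it. The real helper lives elsewhere in the SCN sources.
template <typename T> T *OptionalTensorData(at::Tensor tensor) {
  return tensor.numel() > 0 ? tensor.data<T>() : nullptr;
}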
sparseconvnet/SCN/CUDA/BatchNormalization.cu
View file @ de3743f6
...
@@ -4,56 +4,215 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#include <cassert>

// input_stride and output_stride are normally the same as nPlanes; allow larger
// values to act on a subset of columns, i.e. an inplace DenseNet blocks
// NTX ~ 16 - nPlanes must be a multiple of this
// NTY ~ 64 - at least 4
template <typename T, Int NTX, Int NTY>
__global__ void BatchNormalization_f_train(
    T *input_features, T *output_features, Int nPlanes, Int input_stride,
    Int output_stride, Int nActive, T *saveMean, T *saveInvStd, T *runningMean,
    T *runningVar, T *weight, T *bias, T eps, T momentum, T leakiness) {
  __shared__ T t[NTY][NTX];
  __shared__ T t2[NTY][NTX];
  for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
       plane += gridDim.x * NTX) {
    t[threadIdx.y][threadIdx.x] = 0;
    t2[threadIdx.y][threadIdx.x] = 0;
    for (Int row = threadIdx.y, c = plane + threadIdx.y * input_stride;
         row < nActive; row += NTY, c += input_stride * NTY) {
      T i = input_features[c];
      t[threadIdx.y][threadIdx.x] += i;
      t2[threadIdx.y][threadIdx.x] += i * i;
    }
    __syncthreads();
    T _saveMean = 0;
    T _saveInvStd = 0;
    for (Int row = 0; row < NTY; row++) {
      _saveMean += t[row][threadIdx.x];
      _saveInvStd += t2[row][threadIdx.x];
    }
    _saveMean /= nActive;
    _saveInvStd = _saveInvStd - _saveMean * _saveMean * nActive;
    if (threadIdx.y == 0) {
      saveMean[plane] = _saveMean;
      runningMean[plane] =
          momentum * runningMean[plane] + (1 - momentum) * _saveMean;
      runningVar[plane] = momentum * runningVar[plane] +
                          (1 - momentum) * _saveInvStd / (nActive - 1);
    }
    _saveInvStd = pow(_saveInvStd / nActive + eps, -0.5);
    if (threadIdx.y == 0)
      saveInvStd[plane] = _saveInvStd;
    __syncthreads();
    if (threadIdx.y == 0) {
      t[0][threadIdx.x] = _saveInvStd * (weight ? weight[plane] : 1);
      t[1][threadIdx.x] =
          -_saveMean * t[0][threadIdx.x] + (bias ? bias[plane] : 0);
    }
    __syncthreads();
    T W = t[0][threadIdx.x];
    T B = t[1][threadIdx.x];
    for (Int row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
             co = plane + threadIdx.y * output_stride;
         row < nActive;
         row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
      T out = W * input_features[ci] + B;
      output_features[co] = (out > 0) ? out : (out * leakiness);
    }
    __syncthreads();
  }
}

template <typename T, Int NTX, Int NTY>
__global__ void BatchNormalization_f_test(
    T *input_features, T *output_features, Int nPlanes, Int input_stride,
    Int output_stride, Int nActive, T *saveMean, T *saveInvStd, T *runningMean,
    T *runningVar, T *weight, T *bias, T eps, T momentum, T leakiness) {
  __shared__ T W[NTX];
  __shared__ T B[NTX];
  for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
       plane += gridDim.x * NTX) {
    if (threadIdx.y == 0) {
      W[threadIdx.x] =
          pow(runningVar[plane] + eps, -0.5) * (weight ? weight[plane] : 1);
      B[threadIdx.x] =
          (bias ? bias[plane] : 0) - runningMean[plane] * W[threadIdx.x];
    }
    __syncthreads();
    float w = W[threadIdx.x], b = B[threadIdx.x];
    for (Int row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
             co = plane + threadIdx.y * output_stride;
         row < nActive;
         row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
      T out = w * input_features[ci] + b;
      output_features[co] = (out > 0) ? out : (out * leakiness);
    }
    __syncthreads();
  }
}

template <typename T, Int NTX, Int NTY>
void BatchNormalization_ForwardPass(T *input_features, T *output_features,
                                    Int nPlanes, Int input_stride,
                                    Int output_stride, Int nActive,
                                    T *saveMean, T *saveInvStd,
                                    T *runningMean, T *runningVar, T *weight,
                                    T *bias, T eps, T momentum, bool train,
                                    T leakiness) {
  if (train) {
    BatchNormalization_f_train<T, NTX, NTY>
        <<<std::min((Int)16, nPlanes / NTX), dim3(NTX, NTY)>>>(
            input_features, output_features, nPlanes, input_stride,
            output_stride, nActive, saveMean, saveInvStd, runningMean,
            runningVar, weight, bias, eps, momentum, leakiness);
  } else {
    BatchNormalization_f_test<T, NTX, NTY>
        <<<std::min((Int)16, nPlanes / NTX), dim3(NTX, NTY)>>>(
            input_features, output_features, nPlanes, input_stride,
            output_stride, nActive, saveMean, saveInvStd, runningMean,
            runningVar, weight, bias, eps, momentum, leakiness);
  }
}

template <typename T, Int NTX, Int NTY>
__global__ void BatchNormalization_b(
    T *input_features, T *d_input_features, T *output_features,
    T *d_output_features, Int nPlanes, Int input_stride, Int output_stride,
    Int nActive, T *saveMean, T *saveInvStd, T *runningMean, T *runningVar,
    T *weight, T *bias, T *d_weight, T *d_bias, T leakiness) {
  __shared__ T t[NTY][NTX];
  __shared__ T t2[NTY][NTX];
  for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
       plane += gridDim.x * NTX) {
    if (threadIdx.y == 0) {
      t[0][threadIdx.x] = saveMean[plane];
      t[1][threadIdx.x] = saveInvStd[plane];
      t[2][threadIdx.x] = (weight ? weight[plane] : 1);
    }
    __syncthreads();
    T _saveMean = t[0][threadIdx.x];
    T _saveInvStd = t[1][threadIdx.x];
    T _weight = t[2][threadIdx.x];
    __syncthreads();
    t[threadIdx.y][threadIdx.x] = 0;
    t2[threadIdx.y][threadIdx.x] = 0;
    for (Int row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
             co = plane + threadIdx.y * output_stride;
         row < nActive;
         row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
      T d = d_output_features[co];
      d = (output_features[co] > 0) ? d : (d * leakiness);
      d_output_features[co] = d;
      t[threadIdx.y][threadIdx.x] += d;
      t2[threadIdx.y][threadIdx.x] += (input_features[ci] - _saveMean) * d;
    }
    __syncthreads();
    T gradMean = 0;
    T dotp = 0;
    for (int row = 0; row < NTY; row++) {
      gradMean += t[row][threadIdx.x];
      dotp += t2[row][threadIdx.x];
    }
    __syncthreads();
    if (d_weight)
      d_weight[plane] = dotp * _saveInvStd;
    if (d_bias)
      d_bias[plane] = gradMean; // sum really
    gradMean /= nActive;
    T k = dotp * _saveInvStd * _saveInvStd / nActive;
    for (Int row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
             co = plane + threadIdx.y * output_stride;
         row < nActive;
         row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
      d_input_features[ci] = (d_output_features[co] - gradMean -
                              (input_features[ci] - _saveMean) * k) *
                             _saveInvStd * _weight;
    }
    __syncthreads();
  }
}

template <typename T, Int NTX, Int NTY>
void BatchNormalization_BackwardPass(T *input_features, T *d_input_features,
                                     T *output_features, T *d_output_features,
                                     Int nPlanes, Int input_stride,
                                     Int output_stride, Int nActive,
                                     T *saveMean, T *saveInvStd,
                                     T *runningMean, T *runningVar, T *weight,
                                     T *bias, T *d_weight, T *d_bias,
                                     T leakiness) {
  BatchNormalization_b<T, NTX, NTY>
      <<<std::min((Int)16, nPlanes / NTX), dim3(NTX, NTY)>>>(
          input_features, d_input_features, output_features,
          d_output_features, nPlanes, input_stride, output_stride, nActive,
          saveMean, saveInvStd, runningMean, runningVar, weight, bias,
          d_weight, d_bias, leakiness);
}

#define BN_F_MACRO(N)                                                         \
  if (nPlanes % N == 0) {                                                     \
    BatchNormalization_ForwardPass<T, N, 64>(                                 \
        iF, oF, nPlanes, input_stride, output_stride, nActive, saveMean,      \
        saveInvStd, runningMean, runningVar, weight, bias, eps, momentum,     \
        train, leakiness);                                                    \
  }

template <typename T>
void bn_f(T *iF, T *oF, Int nPlanes, Int input_stride, Int output_stride,
          Int nActive, T *saveMean, T *saveInvStd, T *runningMean,
          T *runningVar, T *weight, T *bias, T eps, T momentum, bool train,
          T leakiness) {
  BN_F_MACRO(16)
  else BN_F_MACRO(12) else BN_F_MACRO(8) else BN_F_MACRO(4) else BN_F_MACRO(1)
}
#undef BN_F_MACRO
...
@@ -61,34 +220,19 @@ void cuda_BatchNormalizationInTensor_updateOutput(
#define BN_B_MACRO(N)                                                         \
  if (nPlanes % N == 0) {                                                     \
    BatchNormalization_BackwardPass<T, N, 64>(                                \
        input_features, d_input_features, output_features,                    \
        d_output_features, nPlanes, input_stride, output_stride, nActive,     \
        saveMean, saveInvStd, runningMean, runningVar, weight, bias,          \
        d_weight, d_bias, leakiness);                                         \
  }

template <typename T>
void bn_b(T *input_features, T *d_input_features, T *output_features,
          T *d_output_features, Int nPlanes, Int input_stride,
          Int output_stride, Int nActive, T *saveMean, T *saveInvStd,
          T *runningMean, T *runningVar, T *weight, T *bias, T *d_weight,
          T *d_bias, T leakiness) {
  BN_B_MACRO(16)
  else BN_B_MACRO(12) else BN_B_MACRO(8) else BN_B_MACRO(4) else BN_B_MACRO(1)
}
#undef BN_B_MACRO
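Both forward kernels fold batch normalization and the leaky ReLU into one multiply-add per element: per plane, W = invStd * weight and B = bias - mean * W, then out = W * x + B, kept as-is when positive and scaled by leakiness otherwise. A scalar CPU sketch of the train-mode math for one plane, useful for checking the kernels; illustrative only, not library code:

#include <cmath>
#include <vector>

// CPU sketch of the fused BN + leaky-ReLU forward for one plane (train mode).
// x holds the nActive values of that plane; eps and leakiness as above.
void bnPlaneForward_reference(std::vector<float> &x, float weight, float bias,
                              float eps, float leakiness) {
  float mean = 0, sumsq = 0;
  for (float v : x) {
    mean += v;
    sumsq += v * v;
  }
  mean /= x.size();
  float var = sumsq / x.size() - mean * mean; // biased variance, as in the kernel
  float invStd = 1.0f / std::sqrt(var + eps); // pow(..., -0.5) in the kernel
  float W = invStd * weight;                  // folded scale
  float B = bias - mean * W;                  // folded shift
  for (float &v : x) {
    float out = W * v + B;
    v = out > 0 ? out : out * leakiness;      // leaky ReLU fused in
  }
}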
sparseconvnet/SCN/CUDA/BatchNormalization.h
deleted 100644 → 0
View file @ f0407b36
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CUDA_BATCHNORMALIZATION_H
#define CUDA_BATCHNORMALIZATION_H
#include <cassert>
// input_stride and output_stride are normally the same as nPlanes; allow larger
// values to act on a subset of columns, i.e. an inplace DenseNet blocks
// NTX ~ 16 - nPlanes must be a multiple of this
// NTY ~ 64 - at least 4
template <typename T, Int NTX, Int NTY>
__global__ void BatchNormalization_f_train(
    T *input_features, T *output_features, Int nPlanes, Int input_stride,
    Int output_stride, Int nActive, T *saveMean, T *saveInvStd, T *runningMean,
    T *runningVar, T *weight, T *bias, T eps, T momentum, T leakiness) {
  __shared__ T t[NTY][NTX];
  __shared__ T t2[NTY][NTX];
  for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
       plane += gridDim.x * NTX) {
    t[threadIdx.y][threadIdx.x] = 0;
    t2[threadIdx.y][threadIdx.x] = 0;
    for (Int row = threadIdx.y, c = plane + threadIdx.y * input_stride;
         row < nActive; row += NTY, c += input_stride * NTY) {
      T i = input_features[c];
      t[threadIdx.y][threadIdx.x] += i;
      t2[threadIdx.y][threadIdx.x] += i * i;
    }
    __syncthreads();
    T _saveMean = 0;
    T _saveInvStd = 0;
    for (Int row = 0; row < NTY; row++) {
      _saveMean += t[row][threadIdx.x];
      _saveInvStd += t2[row][threadIdx.x];
    }
    _saveMean /= nActive;
    _saveInvStd = _saveInvStd - _saveMean * _saveMean * nActive;
    if (threadIdx.y == 0) {
      saveMean[plane] = _saveMean;
      runningMean[plane] =
          momentum * runningMean[plane] + (1 - momentum) * _saveMean;
      runningVar[plane] = momentum * runningVar[plane] +
                          (1 - momentum) * _saveInvStd / (nActive - 1);
    }
    _saveInvStd = pow(_saveInvStd / nActive + eps, -0.5);
    if (threadIdx.y == 0)
      saveInvStd[plane] = _saveInvStd;
    __syncthreads();
    if (threadIdx.y == 0) {
      t[0][threadIdx.x] = _saveInvStd * (weight ? weight[plane] : 1);
      t[1][threadIdx.x] =
          -_saveMean * t[0][threadIdx.x] + (bias ? bias[plane] : 0);
    }
    __syncthreads();
    T W = t[0][threadIdx.x];
    T B = t[1][threadIdx.x];
    for (Int row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
             co = plane + threadIdx.y * output_stride;
         row < nActive;
         row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
      T out = W * input_features[ci] + B;
      output_features[co] = (out > 0) ? out : (out * leakiness);
    }
    __syncthreads();
  }
}

template <typename T, Int NTX, Int NTY>
__global__ void BatchNormalization_f_test(
    T *input_features, T *output_features, Int nPlanes, Int input_stride,
    Int output_stride, Int nActive, T *saveMean, T *saveInvStd, T *runningMean,
    T *runningVar, T *weight, T *bias, T eps, T momentum, T leakiness) {
  __shared__ T W[NTX];
  __shared__ T B[NTX];
  for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
       plane += gridDim.x * NTX) {
    if (threadIdx.y == 0) {
      W[threadIdx.x] =
          pow(runningVar[plane] + eps, -0.5) * (weight ? weight[plane] : 1);
      B[threadIdx.x] =
          (bias ? bias[plane] : 0) - runningMean[plane] * W[threadIdx.x];
    }
    __syncthreads();
    float w = W[threadIdx.x], b = B[threadIdx.x];
    for (Int row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
             co = plane + threadIdx.y * output_stride;
         row < nActive;
         row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
      T out = w * input_features[ci] + b;
      output_features[co] = (out > 0) ? out : (out * leakiness);
    }
    __syncthreads();
  }
}

template <typename T, Int NTX, Int NTY>
void BatchNormalization_ForwardPass(T *input_features, T *output_features,
                                    Int nPlanes, Int input_stride,
                                    Int output_stride, Int nActive,
                                    T *saveMean, T *saveInvStd,
                                    T *runningMean, T *runningVar, T *weight,
                                    T *bias, T eps, T momentum, bool train,
                                    T leakiness) {
  if (train) {
    BatchNormalization_f_train<T, NTX, NTY>
        <<<std::min((Int)16, nPlanes / NTX), dim3(NTX, NTY)>>>(
            input_features, output_features, nPlanes, input_stride,
            output_stride, nActive, saveMean, saveInvStd, runningMean,
            runningVar, weight, bias, eps, momentum, leakiness);
  } else {
    BatchNormalization_f_test<T, NTX, NTY>
        <<<std::min((Int)16, nPlanes / NTX), dim3(NTX, NTY)>>>(
            input_features, output_features, nPlanes, input_stride,
            output_stride, nActive, saveMean, saveInvStd, runningMean,
            runningVar, weight, bias, eps, momentum, leakiness);
  }
}

template <typename T, Int NTX, Int NTY>
__global__ void BatchNormalization_b(
    T *input_features, T *d_input_features, T *output_features,
    T *d_output_features, Int nPlanes, Int input_stride, Int output_stride,
    Int nActive, T *saveMean, T *saveInvStd, T *runningMean, T *runningVar,
    T *weight, T *bias, T *d_weight, T *d_bias, T leakiness) {
  __shared__ T t[NTY][NTX];
  __shared__ T t2[NTY][NTX];
  for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
       plane += gridDim.x * NTX) {
    if (threadIdx.y == 0) {
      t[0][threadIdx.x] = saveMean[plane];
      t[1][threadIdx.x] = saveInvStd[plane];
      t[2][threadIdx.x] = (weight ? weight[plane] : 1);
    }
    __syncthreads();
    T _saveMean = t[0][threadIdx.x];
    T _saveInvStd = t[1][threadIdx.x];
    T _weight = t[2][threadIdx.x];
    __syncthreads();
    t[threadIdx.y][threadIdx.x] = 0;
    t2[threadIdx.y][threadIdx.x] = 0;
    for (Int row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
             co = plane + threadIdx.y * output_stride;
         row < nActive;
         row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
      T d = d_output_features[co];
      d = (output_features[co] > 0) ? d : (d * leakiness);
      d_output_features[co] = d;
      t[threadIdx.y][threadIdx.x] += d;
      t2[threadIdx.y][threadIdx.x] += (input_features[ci] - _saveMean) * d;
    }
    __syncthreads();
    T gradMean = 0;
    T dotp = 0;
    for (int row = 0; row < NTY; row++) {
      gradMean += t[row][threadIdx.x];
      dotp += t2[row][threadIdx.x];
    }
    __syncthreads();
    if (d_weight)
      d_weight[plane] = dotp * _saveInvStd;
    if (d_bias)
      d_bias[plane] = gradMean; // sum really
    gradMean /= nActive;
    T k = dotp * _saveInvStd * _saveInvStd / nActive;
    for (Int row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
             co = plane + threadIdx.y * output_stride;
         row < nActive;
         row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
      d_input_features[ci] = (d_output_features[co] - gradMean -
                              (input_features[ci] - _saveMean) * k) *
                             _saveInvStd * _weight;
    }
    __syncthreads();
  }
}

template <typename T, Int NTX, Int NTY>
void BatchNormalization_BackwardPass(T *input_features, T *d_input_features,
                                     T *output_features, T *d_output_features,
                                     Int nPlanes, Int input_stride,
                                     Int output_stride, Int nActive,
                                     T *saveMean, T *saveInvStd,
                                     T *runningMean, T *runningVar, T *weight,
                                     T *bias, T *d_weight, T *d_bias,
                                     T leakiness) {
  BatchNormalization_b<T, NTX, NTY>
      <<<std::min((Int)16, nPlanes / NTX), dim3(NTX, NTY)>>>(
          input_features, d_input_features, output_features,
          d_output_features, nPlanes, input_stride, output_stride, nActive,
          saveMean, saveInvStd, runningMean, runningVar, weight, bias,
          d_weight, d_bias, leakiness);
}
#undef NTX
#undef NTY
#endif /* CUDA_BATCHNORMALIZATION_H */
sparseconvnet/SCN/CUDA/BatchwiseMultiplicativeDropout.cpp
0 → 100644
View file @ de3743f6
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
template <typename T>
void bmd_f(T *input_features, T *output_features, T *noise, Int nActive,
           Int nPlanes, T alpha);

template <typename T>
void bmd_b(T *input_features, T *d_input_features, T *d_output_features,
           T *noise, Int nActive, Int nPlanes, T alpha);

template <typename T>
void cuda_BatchwiseMultiplicativeDropout_updateOutput(
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features,
    /*cuda float*/ at::Tensor noise, T alpha) {
  output_features.resize_as_(input_features);
  auto nActive = input_features.size(0);
  auto nPlanes = input_features.size(1);
  bmd_f(input_features.data<T>(), output_features.data<T>(), noise.data<T>(),
        nActive, nPlanes, alpha);
}

template <typename T>
void cuda_BatchwiseMultiplicativeDropout_updateGradInput(
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features,
    /*cuda float*/ at::Tensor noise, T alpha) {
  d_input_features.resize_as_(d_output_features);
  auto nActive = input_features.size(0);
  auto nPlanes = input_features.size(1);
  bmd_b(input_features.data<T>(), d_input_features.data<T>(),
        d_output_features.data<T>(), noise.data<T>(), nActive, nPlanes,
        alpha);
}
sparseconvnet/SCN/CUDA/BatchwiseMultiplicativeDropout.cu
View file @ de3743f6
...
@@ -4,28 +4,63 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

template <typename T, Int NTX, Int NTY>
__global__ void BatchwiseMultiplicativeDropout_fp(T *input_features,
                                                  T *output_features,
                                                  T *noise, Int nActive,
                                                  Int nPlanes, Int input_stride,
                                                  Int output_stride, T alpha) {
  __shared__ T nz[NTX];
  for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
       plane += gridDim.x * NTX) {
    if (threadIdx.y == 0)
      nz[threadIdx.x] = noise[plane];
    __syncthreads();
    for (Int row = threadIdx.y + blockIdx.y * NTY; row < nActive;
         row += gridDim.y * NTY) {
      Int i = row * input_stride + plane;
      Int o = row * output_stride + plane;
      output_features[o] = input_features[i] * nz[threadIdx.x] *
                           ((input_features[i] > 0) ? 1 : alpha);
    }
    __syncthreads();
  }
}

template <typename T, Int NTX, Int NTY>
__global__ void BatchwiseMultiplicativeDropout_bp(
    T *input_features, T *d_input_features, T *d_output_features, T *noise,
    Int nActive, Int nPlanes, Int input_stride, Int output_stride, T alpha) {
  __shared__ T nz[NTX];
  for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
       plane += gridDim.x * NTX) {
    if (threadIdx.y == 0)
      nz[threadIdx.x] = noise[plane];
    __syncthreads();
    for (Int row = threadIdx.y + blockIdx.y * NTY; row < nActive;
         row += gridDim.y * NTY) {
      Int i = row * input_stride + plane;
      Int o = row * output_stride + plane;
      d_input_features[i] = d_output_features[o] * nz[threadIdx.x] *
                            ((input_features[i] > 0) ? 1 : alpha);
    }
    __syncthreads();
  }
}

#define SPARSECONVNET_FOO(NTX, NTY)                                           \
  {                                                                           \
    if (nPlanes % NTX == 0) {                                                 \
      BatchwiseMultiplicativeDropout_fp<T, NTX, NTY><<<                       \
          dim3(std::min((Int)16, nPlanes / NTX), 16), dim3(NTX, NTY)>>>(      \
          input_features, output_features, noise, nActive, nPlanes, nPlanes,  \
          nPlanes, alpha);                                                    \
      return;                                                                 \
    }                                                                         \
  }

template <typename T>
void bmd_f(T *input_features, T *output_features, T *noise, Int nActive,
           Int nPlanes, T alpha) {
  SPARSECONVNET_FOO(32, 32)
  SPARSECONVNET_FOO(24, 32)
  SPARSECONVNET_FOO(16, 64)
...
@@ -39,25 +74,17 @@ void cuda_BatchwiseMultiplicativeDropout_updateOutput(
#define SPARSECONVNET_FOO(NTX, NTY)                                           \
  {                                                                           \
    if (nPlanes % NTX == 0) {                                                 \
      BatchwiseMultiplicativeDropout_bp<T, NTX, NTY><<<                       \
          dim3(std::min((Int)16, nPlanes / NTX), 16), dim3(NTX, NTY)>>>(      \
          input_features, d_input_features, d_output_features, noise,         \
          nActive, nPlanes, nPlanes, nPlanes, alpha);                         \
      return;                                                                 \
    }                                                                         \
  }

template <typename T>
void bmd_b(T *input_features, T *d_input_features, T *d_output_features,
           T *noise, Int nActive, Int nPlanes, T alpha) {
  SPARSECONVNET_FOO(32, 32)
  SPARSECONVNET_FOO(24, 32)
  SPARSECONVNET_FOO(16, 64)
...
@@ -66,4 +93,5 @@ void cuda_BatchwiseMultiplicativeDropout_updateGradInput(
  SPARSECONVNET_FOO(4, 64)
  SPARSECONVNET_FOO(1, 64)
}
#undef SPARSECONVNET_FOO
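The dropout rule these kernels implement is one shared multiplicative factor per feature plane, with an extra slope alpha applied on the non-positive side of the input (a leaky-ReLU-style gate). A small CPU sketch of the forward rule, illustrative rather than part of the library:

#include <vector>

// CPU sketch of BatchwiseMultiplicativeDropout_fp: one multiplicative noise
// value shared by all rows of a plane, with slope alpha for v <= 0.
void bmd_forward_reference(const std::vector<float> &in,
                           std::vector<float> &out,
                           const std::vector<float> &noise, int nActive,
                           int nPlanes, float alpha) {
  out.resize(in.size());
  for (int row = 0; row < nActive; ++row)
    for (int p = 0; p < nPlanes; ++p) {
      float v = in[row * nPlanes + p];
      out[row * nPlanes + p] = v * noise[p] * (v > 0 ? 1.0f : alpha);
    }
}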