OpenDAS / SparseConvNet / Commits

Commit de3743f6, authored Jul 13, 2018 by Benjamin Thomas Graham

    Factor out CUDA code

Parent: f0407b36
Showing 20 changed files with 1157 additions and 1098 deletions (+1157, -1098).
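The pattern behind the commit, before the per-file diffs: template definitions leave the headers and land in the nvcc-compiled .cu (or the host .cpp) translation unit, while code that calls them keeps only a declaration. Below is a minimal single-file C++ sketch of why that still links; the names are invented, and the explicit-instantiation line is an assumption about how the split is wired up at build time (the diff itself only shows the moved definitions).

```cpp
// Sketch: callers compile against a bare template declaration; the
// definition (and an instantiation) can live elsewhere in the program.
#include <cstdio>

template <typename T>
void ForwardPass(T *in, T *out, int n); // declaration only, as in the new .cpp files

void update(float *in, float *out, int n) {
  ForwardPass<float>(in, out, n); // compiles against the declaration alone
}

template <typename T>
void ForwardPass(T *in, T *out, int n) { // definition, as in the new .cu files
  for (int k = 0; k < n; k++)
    out[k] = in[k] > 0 ? in[k] : 0;
}
// Assumed: an explicit instantiation makes the symbol visible to the linker.
template void ForwardPass<float>(float *, float *, int);

int main() {
  float in[4] = {-1, 2, -3, 4}, out[4];
  update(in, out, 4);
  for (float v : out)
    std::printf("%g ", v); // prints: 0 2 0 4
}
```

One consequence of the split: only the .cu translation unit pays the nvcc compile cost, at the price of the definition file having to cover every type the wrappers will request.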
sparseconvnet/SCN/CPU/MaxPooling.cpp                        +25    -1
sparseconvnet/SCN/CPU/MaxPooling.h                           +0   -36
sparseconvnet/SCN/CPU/SparseToDense.cpp                     +24    -1
sparseconvnet/SCN/CPU/SparseToDense.h                        +0   -35
sparseconvnet/SCN/CPU/UnPooling.cpp                         +22    -1
sparseconvnet/SCN/CPU/UnPooling.h                            +0   -33
sparseconvnet/SCN/CUDA/ActivePooling.cpp                    +54    -0
sparseconvnet/SCN/CUDA/ActivePooling.cu                     +46   -37
sparseconvnet/SCN/CUDA/ActivePooling.h                       +0   -55
sparseconvnet/SCN/CUDA/AffineReluTrivialConvolution.cpp     +63    -0
sparseconvnet/SCN/CUDA/AffineReluTrivialConvolution.cu     +423   -35
sparseconvnet/SCN/CUDA/AffineReluTrivialConvolution.h        +0  -441
sparseconvnet/SCN/CUDA/AveragePooling.cpp                   +61    -0
sparseconvnet/SCN/CUDA/AveragePooling.cu                    +62   -40
sparseconvnet/SCN/CUDA/AveragePooling.h                      +0   -76
sparseconvnet/SCN/CUDA/BatchNormalization.cpp               +71    -0
sparseconvnet/SCN/CUDA/BatchNormalization.cu               +213   -69
sparseconvnet/SCN/CUDA/BatchNormalization.h                  +0  -210
sparseconvnet/SCN/CUDA/BatchwiseMultiplicativeDropout.cpp   +37    -0
sparseconvnet/SCN/CUDA/BatchwiseMultiplicativeDropout.cu    +56   -28
sparseconvnet/SCN/CPU/MaxPooling.cpp  (modified: +25, -1)

The `#include "MaxPooling.h"` comes out (the header is deleted below) and the two function templates it used to provide move in above `cpu_MaxPooling_updateOutput`:

```diff
@@ -4,7 +4,31 @@
 // This source code is licensed under the license found in the
 // LICENSE file in the root directory of this source tree.
 
-#include "MaxPooling.h"
+template <typename T>
+void MaxPooling_ForwardPass(T *input_features, T *output_features,
+                            Int nPlanes, Int input_stride, Int output_stride,
+                            Int *rules, Int nHot) {
+  for (Int outSite = 0; outSite < nHot; outSite++) {
+    Int i = rules[2 * outSite] * input_stride;
+    Int o = rules[2 * outSite + 1] * output_stride;
+    for (Int plane = 0; plane < nPlanes; plane++)
+      if (output_features[o + plane] < input_features[i + plane])
+        output_features[o + plane] = input_features[i + plane];
+  }
+}
+
+template <typename T>
+void MaxPooling_BackwardPass(T *input_features, T *d_input_features,
+                             T *output_features, T *d_output_features,
+                             Int nPlanes, Int input_stride, Int output_stride,
+                             Int *rules, Int nHot) {
+  for (Int outSite = 0; outSite < nHot; outSite++) {
+    Int i = rules[2 * outSite] * input_stride;
+    Int o = rules[2 * outSite + 1] * output_stride;
+    for (Int plane = 0; plane < nPlanes; plane++)
+      if (output_features[o + plane] == input_features[i + plane])
+        d_input_features[i + plane] += d_output_features[o + plane];
+  }
+}
 
 template <typename T, Int Dimension>
 void cpu_MaxPooling_updateOutput(
 ...
```
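The rules layout these loops assume is worth pinning down: `rules` holds `nHot` pairs, `rules[2*k]` an input row and `rules[2*k+1]` an output row, with the strides converting row numbers to flat offsets. A toy, self-contained driver for the forward pass above (the sizes and rule values are made up; the library's `updateOutput` wrapper, not shown in this hunk, is what actually prepares `output_features`):

```cpp
#include <cstdio>
using Int = int;

template <typename T>
void MaxPooling_ForwardPass(T *input_features, T *output_features, Int nPlanes,
                            Int input_stride, Int output_stride, Int *rules,
                            Int nHot) {
  for (Int outSite = 0; outSite < nHot; outSite++) {
    Int i = rules[2 * outSite] * input_stride;
    Int o = rules[2 * outSite + 1] * output_stride;
    for (Int plane = 0; plane < nPlanes; plane++)
      if (output_features[o + plane] < input_features[i + plane])
        output_features[o + plane] = input_features[i + plane];
  }
}

int main() {
  float in[4] = {1, 5, 3, 2};             // 4 input rows, 1 plane
  float out[2] = {-1e9f, -1e9f};          // 2 output rows, pre-filled low
  Int rules[] = {0, 0, 1, 0, 2, 1, 3, 1}; // rows 0,1 -> out 0; rows 2,3 -> out 1
  MaxPooling_ForwardPass(in, out, 1, 1, 1, rules, 4);
  std::printf("%g %g\n", out[0], out[1]); // prints: 5 3
}
```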
sparseconvnet/SCN/CPU/MaxPooling.h  (deleted, 100644 → 0, -36)

The header is removed whole: its license banner, the `CPU_MAXPOOLING_H` include guards (closing with `#endif /* CPU_MAXPOOLING_H */`), and the `MaxPooling_ForwardPass` / `MaxPooling_BackwardPass` templates, whose bodies are line for line the definitions just added to MaxPooling.cpp above.
sparseconvnet/SCN/CPU/SparseToDense.cpp  (modified: +24, -1)

Same move as MaxPooling.cpp: the header include goes, the templates come in:

```diff
@@ -4,7 +4,30 @@
 // This source code is licensed under the license found in the
 // LICENSE file in the root directory of this source tree.
 
-#include "SparseToDense.h"
+template <typename T>
+void SparseToDense_ForwardPass(T *input_features, T *output_features,
+                               Int nPlanes, Int spatialVolume, Int *rules,
+                               int nHot) {
+  for (Int outSite = 0; outSite < nHot; outSite++) {
+    T *i = input_features + rules[2 * outSite] * nPlanes;
+    T *o = output_features + rules[2 * outSite + 1];
+    for (Int plane = 0; plane < nPlanes; plane++)
+      o[plane * spatialVolume] = i[plane];
+  }
+}
+
+template <typename T>
+void SparseToDense_BackwardPass(T *d_input_features, T *d_output_features,
+                                Int nPlanes, Int spatialVolume, Int *rules,
+                                int nHot) {
+  for (Int outSite = 0; outSite < nHot; outSite++) {
+    T *d_i = d_input_features + rules[2 * outSite] * nPlanes;
+    T *d_o = d_output_features + rules[2 * outSite + 1];
+    for (Int plane = 0; plane < nPlanes; plane++)
+      d_i[plane] = d_o[plane * spatialVolume];
+  }
+}
 
 template <typename T, Int Dimension>
 void cpu_SparseToDense_updateOutput(
 ...
```
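The pointer arithmetic above encodes the dense layout: the output pointer is offset by the flat spatial site `rules[2*k+1]`, and plane `p` then lives `spatialVolume` elements away, i.e. channel-major (NCHW-style) order. A standalone sketch of the same index math on a made-up 2-plane, 2x2 grid:

```cpp
#include <cstdio>
using Int = int;

int main() {
  const Int nPlanes = 2, spatialVolume = 4;   // 2 channels over a 2x2 grid
  float in[2 * nPlanes] = {10, 11, 20, 21};   // 2 active sites x 2 planes
  float dense[nPlanes * spatialVolume] = {0}; // zero background
  Int rules[] = {0, 0, 1, 3};                 // site 0 -> cell 0, site 1 -> cell 3
  for (Int outSite = 0; outSite < 2; outSite++) {
    const float *i = in + rules[2 * outSite] * nPlanes;
    float *o = dense + rules[2 * outSite + 1];
    for (Int plane = 0; plane < nPlanes; plane++)
      o[plane * spatialVolume] = i[plane]; // plane p of cell s at p*volume + s
  }
  for (Int k = 0; k < nPlanes * spatialVolume; k++)
    std::printf("%g ", dense[k]); // prints: 10 0 0 20 11 0 0 21
}
```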
sparseconvnet/SCN/CPU/SparseToDense.h  (deleted, 100644 → 0, -35)

Removed whole: license banner, `CPU_SPARSETODENSE_H` guards (closing with `#endif /* CPU_SPARSETODENSE_H */`), and the `SparseToDense_ForwardPass` / `SparseToDense_BackwardPass` templates, identical to the definitions now in SparseToDense.cpp above.
sparseconvnet/SCN/CPU/UnPooling.cpp  (modified: +22, -1)

```diff
@@ -4,7 +4,28 @@
 // This source code is licensed under the license found in the
 // LICENSE file in the root directory of this source tree.
 
-#include "UnPooling.h"
+template <typename T>
+void UnPooling_ForwardPass(T *input_features, T *output_features,
+                           Int nPlanes, Int input_stride, Int output_stride,
+                           Int *rules, Int nHot) {
+  for (Int outSite = 0; outSite < nHot; outSite++) {
+    Int i = rules[2 * outSite + 1] * input_stride;
+    Int o = rules[2 * outSite] * output_stride;
+    for (Int plane = 0; plane < nPlanes; plane++)
+      output_features[o + plane] += input_features[i + plane];
+  }
+}
+
+template <typename T>
+void UnPooling_BackwardPass(T *d_input_features, T *d_output_features,
+                            Int nPlanes, Int input_stride, Int output_stride,
+                            Int *rules, Int nHot) {
+  for (Int outSite = 0; outSite < nHot; outSite++) {
+    Int i = rules[2 * outSite + 1] * input_stride;
+    Int o = rules[2 * outSite] * output_stride;
+    for (Int plane = 0; plane < nPlanes; plane++)
+      d_input_features[i + plane] += d_output_features[o + plane];
+  }
+}
 
 template <typename T, Int Dimension>
 void cpu_UnPooling_updateOutput(
 ...
```
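Note the rule pair is read the other way around from MaxPooling above: here the *input* row comes from `rules[2*k+1]`. The same rulebook that pooled many sites into one row is replayed in reverse, fanning one pooled row out to every site it covered. A made-up miniature of that fan-out:

```cpp
#include <cstdio>
using Int = int;

int main() {
  float pooled[1] = {7};        // one pooled row, one plane
  float unpooled[3] = {0, 0, 0};
  // pairs are (output row, input row), matching UnPooling_ForwardPass
  Int rules[] = {0, 0, 1, 0, 2, 0};
  for (Int k = 0; k < 3; k++)
    unpooled[rules[2 * k]] += pooled[rules[2 * k + 1]];
  std::printf("%g %g %g\n", unpooled[0], unpooled[1], unpooled[2]); // 7 7 7
}
```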
sparseconvnet/SCN/CPU/UnPooling.h  (deleted, 100644 → 0, -33)

Removed whole: license banner, `CPU_UNPOOLING_H` guards (closing with `#endif /* CPU_UNPOOLING_H */`), and the `UnPooling_ForwardPass` / `UnPooling_BackwardPass` templates, identical to the definitions now in UnPooling.cpp above.
sparseconvnet/SCN/CUDA/ActivePooling.cpp  (new file, 0 → 100644, +54)

The host wrappers move here from ActivePooling.cu; the GPU entry points they call are forward-declared at the top:

```cpp
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

template <typename T>
void ActivePooling_ForwardPass(T *input_features, T *output_features,
                               Int batchSize, Int maxActive, Int nPlanes,
                               Int *rules, bool average);

template <typename T>
void ActivePooling_BackwardPass(T *d_input_features, T *d_output_features,
                                Int batchSize, Int maxActive, Int nPlanes,
                                Int *rules, bool average);

template <typename T, Int Dimension>
void cuda_ActivePooling_updateOutput(/*long*/ at::Tensor inputSize,
                                     Metadata<Dimension> &m,
                                     /*cuda float*/ at::Tensor input_features,
                                     /*cuda float*/ at::Tensor output_features,
                                     bool average) {
  Int nPlanes = input_features.size(1);
  auto _rules = m.getActivePoolingRuleBook(inputSize);
  Int batchSize = _rules[1][0];
  Int maxActive = _rules[1][1];
  output_features.resize_({batchSize, nPlanes});
  output_features.zero_();
  auto iF = input_features.data<T>();
  auto oF = output_features.data<T>();
  ActivePooling_ForwardPass<T>(iF, oF, batchSize, maxActive, nPlanes,
                               &_rules[0][0], average);
}

template <typename T, Int Dimension>
void cuda_ActivePooling_updateGradInput(
    /*long*/ at::Tensor inputSize, Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features, bool average) {
  Int nPlanes = input_features.size(1);
  auto _rules = m.getActivePoolingRuleBook(inputSize);
  Int batchSize = _rules[1][0];
  Int maxActive = _rules[1][1];
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();
  auto diF = d_input_features.data<T>();
  auto doF = d_output_features.data<T>();
  ActivePooling_BackwardPass<T>(diF, doF, batchSize, maxActive, nPlanes,
                                &_rules[0][0], average);
}
```
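The shape of the active-pooling rule book implied by this wrapper: `_rules[1]` carries `{batchSize, maxActive}`, and `_rules[0]` is `batchSize` rows of `maxActive + 1` ints each, laid out `[nActive, idx_0, ..., idx_{nActive-1}]` plus padding. A host-only sketch (sizes and indices made up) of the per-row reduction the GPU kernel performs:

```cpp
#include <cstdio>
#include <vector>
using Int = int;

int main() {
  const Int batchSize = 2, maxActive = 3;
  // sample 0 pools input rows {0,2}; sample 1 pools rows {1,3,4}
  std::vector<Int> rules = {2, 0, 2, -1,  // row for sample 0 (padded)
                            3, 1, 3, 4};  // row for sample 1
  float in[5] = {1, 2, 3, 4, 5}, out[batchSize] = {0};
  bool average = true;
  for (Int b = 0; b < batchSize; b++) {
    const Int *r = &rules[b * (maxActive + 1)];
    Int nActive = *r++;                    // leading count, then indices
    float mult = (average && nActive > 0) ? 1.0f / nActive : 1.0f;
    for (Int k = 0; k < nActive; k++)
      out[b] += in[r[k]] * mult;
  }
  std::printf("%g %g\n", out[0], out[1]); // (1+3)/2 = 2, (2+4+5)/3 = 3.66667
}
```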
sparseconvnet/SCN/CUDA/ActivePooling.cu  (modified: +46, -37)

Hunk @@ -4,64 +4,73 @@. Removed: `#include "ActivePooling.h"` and the host wrappers `cuda_ActivePooling_updateOutput` / `cuda_ActivePooling_updateGradInput`, which move to the new ActivePooling.cpp above. Added: the `ActivePooling_fp` / `ActivePooling_bp` kernels from the deleted header, plus `ActivePooling_ForwardPass` / `ActivePooling_BackwardPass`, which absorb the row-batched rule-book staging that used to sit in the wrappers. The file after the change, below the unchanged license banner:

```cpp
template <typename T>
__global__ void ActivePooling_fp(T *input_features, T *output_features,
                                 Int maxActive, Int nPlanes, Int *rules,
                                 bool average) {
  T *out = &output_features[blockIdx.x * nPlanes];
  Int *r = &rules[blockIdx.x * (maxActive + 1)];
  Int nActive = *r++;
  T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
  while (nActive-- > 0) {
    T *inp = &input_features[(*r++) * nPlanes];
    for (Int plane = threadIdx.x; plane < nPlanes; plane += 32)
      out[plane] += inp[plane] * multiplier;
  }
}

template <typename T>
void ActivePooling_ForwardPass(T *input_features, T *output_features,
                               Int batchSize, Int maxActive, Int nPlanes,
                               Int *rules, bool average) {
  auto rulesBuffer = at::CUDA(at_kINT).tensor({1 << 22});
  Int *rb = rulesBuffer.data<Int>();
  Int rowBatchSize = std::min((Int)32768, (1 << 22) / (maxActive + 1));
  assert(rowBatchSize > 0);
  Int kernelBlockDim = std::min(nPlanes, (Int)32);
  for (Int o = 0; o < batchSize; o += rowBatchSize) {
    Int batchSize_ = std::min(rowBatchSize, (Int(batchSize - o)));
    cudaMemcpy(rb, rules + o * (maxActive + 1),
               sizeof(Int) * (maxActive + 1) * batchSize_,
               cudaMemcpyHostToDevice);
    ActivePooling_fp<T><<<batchSize_, kernelBlockDim>>>(
        input_features, output_features + o * nPlanes, maxActive, nPlanes, rb,
        average);
  }
}

template <typename T>
__global__ void ActivePooling_bp(T *d_input_features, T *d_output_features,
                                 Int maxActive, Int nPlanes, Int *rules,
                                 bool average) {
  T *out = &d_output_features[blockIdx.x * nPlanes];
  Int *r = &rules[blockIdx.x * (maxActive + 1)];
  Int nActive = *r++;
  T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
  while (nActive-- > 0) {
    T *inp = &d_input_features[(*r++) * nPlanes];
    for (Int plane = threadIdx.x; plane < nPlanes; plane += 32)
      inp[plane] = out[plane] * multiplier;
  }
}

template <typename T>
void ActivePooling_BackwardPass(T *d_input_features, T *d_output_features,
                                Int batchSize, Int maxActive, Int nPlanes,
                                Int *rules, bool average) {
  auto rulesBuffer = at::CUDA(at_kINT).tensor({1 << 22});
  Int *rb = rulesBuffer.data<Int>();
  Int rowBatchSize = std::min((Int)32768, (1 << 22) / (maxActive + 1));
  assert(rowBatchSize > 0);
  Int kernelBlockDim = std::min(nPlanes, (Int)32);
  for (Int o = 0; o < batchSize; o += rowBatchSize) {
    Int batchSize_ = std::min(rowBatchSize, (Int(batchSize - o)));
    cudaMemcpy(rb, rules + o * (maxActive + 1),
               sizeof(Int) * (maxActive + 1) * batchSize_,
               cudaMemcpyHostToDevice);
    ActivePooling_bp<T><<<batchSize_, kernelBlockDim>>>(
        d_input_features, d_output_features + o * nPlanes, maxActive, nPlanes,
        rb, average);
  }
}
```
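The staging arithmetic in the new launchers, isolated (a sketch; the batch sizes are made up): the fixed 2^22-int `rulesBuffer` holds `floor(2^22 / (maxActive + 1))` rulebook rows per `cudaMemcpy`, capped at 32768, so the number of host-to-device copies grows with `maxActive`:

```cpp
#include <algorithm>
#include <cstdio>
using Int = int;

int main() {
  for (Int maxActive : {1, 100, 100000}) {
    Int rowBatchSize = std::min((Int)32768, (1 << 22) / (maxActive + 1));
    Int rows = 70000; // an assumed batch size
    Int copies = (rows + rowBatchSize - 1) / rowBatchSize;
    std::printf("maxActive=%d -> rowBatchSize=%d, %d copies for %d rows\n",
                maxActive, rowBatchSize, copies, rows);
  }
}
```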
sparseconvnet/SCN/CUDA/ActivePooling.h  (deleted, 100644 → 0, -55)

Removed whole: license banner, `CUDA_ACTIVEPOOLING_H` guards (the closing comment read `/* CUDA_ActivePOOLING_H */`), the `ActivePooling_fp` / `ActivePooling_bp` kernels now in ActivePooling.cu above, and thin launchers that fired the kernels directly, without the row-batched staging (that logic lived in the old .cu wrappers):

```cpp
template <typename T>
void ActivePooling_ForwardPass(T *input_features, T *output_features,
                               Int batchSize, Int maxActive, Int nPlanes,
                               Int *rules, bool average) {
  Int kernelBlockDim = std::min(nPlanes, (Int)32);
  ActivePooling_fp<T><<<batchSize, kernelBlockDim>>>(
      input_features, output_features, maxActive, nPlanes, rules, average);
}
```

`ActivePooling_BackwardPass` was the analogous one-launch wrapper around `ActivePooling_bp`.
sparseconvnet/SCN/CUDA/AffineReluTrivialConvolution.cpp  (new file, 0 → 100644, +63)

```cpp
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

// check if A+B is faster than just B
// check if loading affineBias into shared memory is faster than loading
// multiple times (if not try 64,16 backwards case)

template <typename T>
void dAffineReluTrivialConvolution_forward(T *inFeatures, T *outFeatures,
                                           T *affineWeight, T *affineBias,
                                           T *convWeight, Int input_nPlanes,
                                           Int input_stride,
                                           Int output_nPlanes,
                                           Int output_stride, Int nActive);

template <typename T>
void dAffineReluTrivialConvolution_backward_dW(
    T *inFeatures, T *dInFeatures, T *dOutFeatures, T *affineWeight,
    T *dAffineWeight, T *affineBias, T *dAffineBias, T *convWeight,
    T *dConvWeight, Int input_nPlanes, Int input_stride, Int output_nPlanes,
    Int output_stride, Int nActive, bool additiveGrad);

template <typename T>
double cuda_AffineReluTrivialConvolution_updateOutput(
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features,
    /*cuda float*/ at::Tensor affineWeight,
    /*cuda float*/ at::Tensor affineBias,
    /*cuda float*/ at::Tensor convWeight) {
  output_features.resize_({input_features.size(0), convWeight.size(1)});
  dAffineReluTrivialConvolution_forward<T>(
      input_features.data<T>(), output_features.data<T>(),
      affineWeight.data<T>(), affineBias.data<T>(), convWeight.data<T>(),
      convWeight.size(0), input_features.stride(0), convWeight.size(1),
      output_features.size(1), input_features.size(0));
  return input_features.size(0) * input_features.size(1) *
         output_features.size(1);
}

template <typename T>
void cuda_AffineReluTrivialConvolution_backward(
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features,
    /*cuda float*/ at::Tensor affineWeight,
    /*cuda float*/ at::Tensor d_affineWeight,
    /*cuda float*/ at::Tensor affineBias,
    /*cuda float*/ at::Tensor d_affineBias,
    /*cuda float*/ at::Tensor convWeight,
    /*cuda float*/ at::Tensor d_convWeight, bool additiveGrad) {
  d_input_features.resize_as_(input_features);
  dAffineReluTrivialConvolution_backward_dW<T>(
      input_features.data<T>(), d_input_features.data<T>(),
      d_output_features.data<T>(), affineWeight.data<T>(),
      d_affineWeight.data<T>(), affineBias.data<T>(), d_affineBias.data<T>(),
      convWeight.data<T>(), d_convWeight.data<T>(), convWeight.size(0),
      input_features.stride(0), convWeight.size(1),
      d_output_features.stride(0), input_features.size(0), additiveGrad);
}
```
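For reading the kernels that follow, a plain-CPU statement of the fused operation, out = ReLU(in * affineWeight + affineBias) x convWeight, with the returned `double` being the rows x input_nPlanes x output_nPlanes multiply-add count. This is a reference sketch on made-up sizes, not the library's API:

```cpp
#include <cstdio>
using Int = int;

int main() {
  const Int rows = 2, inP = 2, outP = 2;
  float in[rows * inP] = {1, -2, 3, 4};
  float aw[inP] = {1, 1}, ab[inP] = {0, 1}; // per-input-plane affine
  float w[inP * outP] = {1, 0, 0, 1};       // identity convWeight
  float out[rows * outP] = {0};
  for (Int r = 0; r < rows; r++)
    for (Int j = 0; j < outP; j++)
      for (Int i = 0; i < inP; i++) {
        float a = in[r * inP + i] * aw[i] + ab[i]; // affine...
        out[r * outP + j] += (a > 0 ? a : 0) * w[i * outP + j]; // ...ReLU, matmul
      }
  // updateOutput reports rows * inP * outP as its work estimate
  std::printf("out = %g %g %g %g; multiply-adds = %d\n", out[0], out[1],
              out[2], out[3], rows * inP * outP); // out = 1 0 3 5
}
```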
sparseconvnet/SCN/CUDA/AffineReluTrivialConvolution.cu  (modified: +423, -35)

Hunk @@ -4,44 +4,432 @@. Removed: `#include "AffineReluTrivialConvolution.h"` and the `cuda_AffineReluTrivialConvolution_updateOutput` / `_backward` wrappers, now in the .cpp above. Added, from the deleted header: the `forwardA`/`forwardB` and `backward_dW_A`/`backward_dW_B` kernels and their `FOO(T, K, V)` dispatchers. The file after the change, below the unchanged license banner:

```cpp
// check if A+B is faster than just B
// check if loading affineBias into shared memory is faster than loading
// multiple times (if not try 64,16 backwards case)

template <typename T, Int K, Int V>
__global__ void dAffineReluTrivialConvolution_forwardA(
    T *inFeatures, T *outFeatures, T *affineWeight, T *affineBias,
    T *convWeight, Int input_nPlanes, Int input_stride, Int output_nPlanes,
    Int output_stride, Int nActive) {
  // nActive must be a multiple of K!!
  // Input x Weight -> Output
  // blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
  // K is a multiple of V,
  // nActive x KM -> nActive x KN - parallel over N,nActive - loop over M
  Int M = input_nPlanes / K;
  // N = gridDim.y == output_nPlanes/K
  Int n = blockIdx.y;
  outFeatures += n * K;
  convWeight += n * K;
  T O[V];
  __shared__ T I[K][K];
  __shared__ T AW[K];
  __shared__ T AB[K];
  __shared__ T CW[K][K];
  const Int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int m = 0; m < M; m++) {
    // Read affineWeight, affineBias and convWeight
    if (ty[0] == 0) {
      AW[tx] = affineWeight[tx];
      AB[tx] = affineBias[tx];
    }
#pragma unroll
    for (int v = 0; v < V; v++)
      CW[ty[v]][tx] = convWeight[ty[v] * output_nPlanes + tx];
    __syncthreads();
    for (Int s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
      // Read input, do affine + relu, set O[]
#pragma unroll
      for (int v = 0; v < V; v++) {
        T i = inFeatures[(s + ty[v]) * input_stride + tx] * AW[tx] + AB[tx];
        I[ty[v]][tx] = (i > 0) ? i : 0;
        if (m == 0) {
          O[v] = 0;
        } else {
          O[v] = outFeatures[(s + ty[v]) * output_stride + tx];
        }
      }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          O[v] += I[ty[v]][k] * CW[k][tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        outFeatures[(s + ty[v]) * output_stride + tx] = O[v];
      __syncthreads();
    }
    affineWeight += K;
    affineBias += K;
    convWeight += K * output_nPlanes;
    inFeatures += K;
  }
}

template <typename T, Int K, Int V>
__global__ void dAffineReluTrivialConvolution_forwardB(
    T *inFeatures, T *outFeatures, T *affineWeight, T *affineBias,
    T *convWeight, Int input_nPlanes, Int input_stride, Int output_nPlanes,
    Int output_stride, Int nActive) {
  // Input x Weight -> Output
  // blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
  // K is a multiple of V,
  // nActive x KM -> nActive x KN - parallel over N,nActive - loop over M
  Int M = input_nPlanes / K;
  // N = gridDim.y == output_nPlanes/K
  Int n = blockIdx.y;
  outFeatures += n * K;
  convWeight += n * K;
  T O[V];
  __shared__ T I[K][K]; // zz try K+1 trick A+B+backwards
  __shared__ T AW[K];
  __shared__ T AB[K];
  __shared__ T CW[K][K];
  const Int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int m = 0; m < M; m++) {
    // Read affineWeight, affineBias and convWeight
    if (ty[0] == 0) {
      AW[tx] = affineWeight[tx];
      AB[tx] = affineBias[tx];
    }
#pragma unroll
    for (int v = 0; v < V; v++)
      CW[ty[v]][tx] = convWeight[ty[v] * output_nPlanes + tx];
    __syncthreads();
    for (Int s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
      // Read input, do affine + relu, set O[]
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (s + ty[v] < nActive) {
          T i = inFeatures[(s + ty[v]) * input_stride + tx] * AW[tx] + AB[tx];
          I[ty[v]][tx] = (i > 0) ? i : 0;
          if (m == 0) {
            O[v] = 0;
          } else {
            O[v] = outFeatures[(s + ty[v]) * output_stride + tx];
          }
        }
      }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          O[v] += I[ty[v]][k] * CW[k][tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nActive)
          outFeatures[(s + ty[v]) * output_stride + tx] = O[v];
      __syncthreads();
    }
    affineWeight += K;
    affineBias += K;
    convWeight += K * output_nPlanes;
    inFeatures += K;
  }
}

#define FOO(T, K, V)                                                           \
  {                                                                            \
    if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {                  \
      Int o = (nActive / K) * K;                                               \
      if (o > 0)                                                               \
        dAffineReluTrivialConvolution_forwardA<                                \
            T, K, V><<<dim3(std::min(o / K, (Int)512), output_nPlanes / K),    \
                       dim3(K, K / V)>>>(                                      \
            inFeatures, outFeatures, affineWeight, affineBias, convWeight,     \
            input_nPlanes, input_stride, output_nPlanes, output_stride, o);    \
      if (nActive > o)                                                         \
        dAffineReluTrivialConvolution_forwardB<                                \
            T, K, V><<<dim3(1, output_nPlanes / K), dim3(K, K / V)>>>(         \
            inFeatures + o * input_stride, outFeatures + o * output_stride,    \
            affineWeight, affineBias, convWeight, input_nPlanes, input_stride, \
            output_nPlanes, output_stride, nActive - o);                       \
      return;                                                                  \
    }                                                                          \
  }

template <typename T>
void dAffineReluTrivialConvolution_forward(T *inFeatures, T *outFeatures,
                                           T *affineWeight, T *affineBias,
                                           T *convWeight, Int input_nPlanes,
                                           Int input_stride,
                                           Int output_nPlanes,
                                           Int output_stride, Int nActive) {
  FOO(T, 64, 16)
  FOO(T, 32, 8)
  FOO(T, 16, 4)
  FOO(T, 8, 2)
  assert(false);
}

template <>
void dAffineReluTrivialConvolution_forward<double>(
    double *inFeatures, double *outFeatures, double *affineWeight,
    double *affineBias, double *convWeight, Int input_nPlanes,
    Int input_stride, Int output_nPlanes, Int output_stride, Int nActive) {
  FOO(double, 32, 8)
  FOO(double, 16, 4)
  FOO(double, 8, 2)
  assert(false);
}
#undef FOO

// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template <typename T, Int K, Int V>
__global__ void dAffineReluTrivialConvolution_backward_dW_A(
    T *inFeatures, T *dInFeatures, T *dOutFeatures, T *affineWeight,
    T *dAffineWeight, T *affineBias, T *dAffineBias, T *convWeight,
    T *dConvWeight, Int input_nPlanes, Int input_stride, Int output_nPlanes,
    Int output_stride, Int nActive, bool additiveGrad) {
  // M = gridDim.y == input_nPlanes / K
  Int N = output_nPlanes / K;
  Int m = blockIdx.y;
  inFeatures += m * K;
  dInFeatures += m * K;
  convWeight += m * K * output_nPlanes;
  dConvWeight += m * K * output_nPlanes;
  affineWeight += m * K;
  dAffineWeight += m * K;
  affineBias += m * K;
  dAffineBias += m * K;
  T dI[V];
  T dCW[V];
  T i[V];
  T dAW = 0;
  T dAB = 0;
  __shared__ T I[K][K];
  __shared__ T dO[K][K];
  __shared__ T AW[K];
  __shared__ T AB[K];
  __shared__ T CW[K][K];
  const Int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  if (ty[0] == 0) {
    AW[tx] = affineWeight[tx];
    AB[tx] = affineBias[tx];
  }
  for (int n = 0; n < N; n++) {
    // Read w, reset dW
#pragma unroll
    for (int v = 0; v < V; v++) {
      CW[ty[v]][tx] = convWeight[ty[v] * output_nPlanes + tx];
      dCW[v] = 0;
    }
    __syncthreads();
    for (Int s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
#pragma unroll
      for (int v = 0; v < V; v++)
        dI[v] = 0;
      __syncthreads();
      // Read input and dOutput
#pragma unroll
      for (int v = 0; v < V; v++) {
        T i_ = inFeatures[(s + ty[v]) * input_stride + tx];
        i[v] = i_;
        i_ = i_ * AW[tx] + AB[tx];
        I[ty[v]][tx] = (i_ > 0) ? i_ : 0;
        dO[ty[v]][tx] = dOutFeatures[(s + ty[v]) * output_stride + tx];
      }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++) {
          dI[v] += dO[ty[v]][k] * CW[tx][k];
          dCW[v] += I[k][ty[v]] * dO[k][tx];
        }
#pragma unroll
      for (int v = 0; v < V; v++) {
        dI[v] = (I[ty[v]][tx] > 0) ? dI[v] : 0;
        dAW += i[v] * dI[v];
        dAB += dI[v];
        if (additiveGrad)
          dInFeatures[(s + ty[v]) * input_stride + tx] += dI[v];
        else
          dInFeatures[(s + ty[v]) * input_stride + tx] = dI[v];
      }
      __syncthreads();
    }
#pragma unroll
    for (int v = 0; v < V; v++)
      atomicAdd(&dConvWeight[ty[v] * output_nPlanes + tx], dCW[v]);
    convWeight += K;
    dConvWeight += K;
    dOutFeatures += K;
    __syncthreads();
  }
  atomicAdd(&dAffineWeight[tx], dAW);
  atomicAdd(&dAffineBias[tx], dAB);
}

// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template <typename T, Int K, Int V>
__global__ void dAffineReluTrivialConvolution_backward_dW_B(
    T *inFeatures, T *dInFeatures, T *dOutFeatures, T *affineWeight,
    T *dAffineWeight, T *affineBias, T *dAffineBias, T *convWeight,
    T *dConvWeight, Int input_nPlanes, Int input_stride, Int output_nPlanes,
    Int output_stride, Int nActive, bool additiveGrad) {
  // M = gridDim.y == input_nPlanes / K
  Int N = output_nPlanes / K;
  Int m = blockIdx.y;
  inFeatures += m * K;
  dInFeatures += m * K;
  convWeight += m * K * output_nPlanes;
  dConvWeight += m * K * output_nPlanes;
  affineWeight += m * K;
  dAffineWeight += m * K;
  affineBias += m * K;
  dAffineBias += m * K;
  T dI[V];
  T dCW[V];
  T i[V];
  T dAW = 0;
  T dAB = 0;
  __shared__ T I[K][K];
  __shared__ T dO[K][K];
  __shared__ T AW[K];
  __shared__ T AB[K];
  __shared__ T CW[K][K];
  const Int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  if (ty[0] == 0) {
    AW[tx] = affineWeight[tx];
    AB[tx] = affineBias[tx];
  }
  for (int n = 0; n < N; n++) {
    // Read w, reset dW
#pragma unroll
    for (int v = 0; v < V; v++) {
      CW[ty[v]][tx] = convWeight[ty[v] * output_nPlanes + tx];
      dCW[v] = 0;
    }
    __syncthreads();
    for (Int s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
#pragma unroll
      for (int v = 0; v < V; v++)
        dI[v] = 0;
      __syncthreads();
      // Read input and dOutput
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nActive) {
          T i_ = inFeatures[(s + ty[v]) * input_stride + tx];
          i[v] = i_;
          i_ = i_ * AW[tx] + AB[tx];
          I[ty[v]][tx] = (i_ > 0) ? i_ : 0;
          dO[ty[v]][tx] = dOutFeatures[(s + ty[v]) * output_stride + tx];
        } else {
          i[v] = 0;
          I[ty[v]][tx] = 0;
          dO[ty[v]][tx] = 0;
        }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++) {
          dI[v] += dO[ty[v]][k] * CW[tx][k];
          dCW[v] += I[k][ty[v]] * dO[k][tx];
        }
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nActive) {
          dI[v] = (I[ty[v]][tx] > 0) ? dI[v] : 0;
          dAW += i[v] * dI[v];
          dAB += dI[v];
          if (additiveGrad)
            dInFeatures[(s + ty[v]) * input_stride + tx] += dI[v];
          else
            dInFeatures[(s + ty[v]) * input_stride + tx] = dI[v];
        }
      __syncthreads();
    }
#pragma unroll
    for (int v = 0; v < V; v++)
      atomicAdd(&dConvWeight[ty[v] * output_nPlanes + tx], dCW[v]);
    convWeight += K;
    dConvWeight += K;
    dOutFeatures += K;
    __syncthreads();
  }
  atomicAdd(&dAffineWeight[tx], dAW);
  atomicAdd(&dAffineBias[tx], dAB);
}

#define FOO(T, K, V)                                                           \
  {                                                                            \
    if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {                  \
      Int o = (nActive / K) * K;                                               \
      if (o > 0)                                                               \
        dAffineReluTrivialConvolution_backward_dW_A<                           \
            T, K, V><<<dim3(std::min(o / K, (Int)512), input_nPlanes / K),     \
                       dim3(K, K / V)>>>(                                      \
            inFeatures, dInFeatures, dOutFeatures, affineWeight,               \
            dAffineWeight, affineBias, dAffineBias, convWeight, dConvWeight,   \
            input_nPlanes, input_stride, output_nPlanes, output_stride, o,     \
            additiveGrad);                                                     \
      if (nActive > o)                                                         \
        dAffineReluTrivialConvolution_backward_dW_B<                           \
            T, K, V><<<dim3(1, input_nPlanes / K), dim3(K, K / V)>>>(          \
            inFeatures + o * input_stride, dInFeatures + o * input_stride,     \
            dOutFeatures + o * output_stride, affineWeight, dAffineWeight,     \
            affineBias, dAffineBias, convWeight, dConvWeight, input_nPlanes,   \
            input_stride, output_nPlanes, output_stride, nActive - o,          \
            additiveGrad);                                                     \
      return;                                                                  \
    }                                                                          \
  }

template <typename T>
void dAffineReluTrivialConvolution_backward_dW(
    T *inFeatures, T *dInFeatures, T *dOutFeatures, T *affineWeight,
    T *dAffineWeight, T *affineBias, T *dAffineBias, T *convWeight,
    T *dConvWeight, Int input_nPlanes, Int input_stride, Int output_nPlanes,
    Int output_stride, Int nActive, bool additiveGrad) {
  FOO(T, 32, 8)
  FOO(T, 16, 4)
  FOO(T, 8, 2)
}
#undef FOO
```
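The `FOO(T, K, V)` macros above dispatch on tile size: the largest `K` dividing both plane counts wins, kernel A covers the K-aligned prefix of `nActive`, and the bounds-checked kernel B mops up the remainder. The same control flow as host-only C++ (a sketch that just prints the split; sizes made up):

```cpp
#include <cstdio>
using Int = int;

void dispatch(Int input_nPlanes, Int output_nPlanes, Int nActive) {
  for (Int K : {64, 32, 16, 8}) {
    if (input_nPlanes % K == 0 && output_nPlanes % K == 0) {
      Int o = (nActive / K) * K; // K-aligned prefix
      if (o > 0)
        std::printf("kernel A: K=%d, rows [0, %d)\n", K, o);
      if (nActive > o)
        std::printf("kernel B: K=%d, rows [%d, %d)\n", K, o, nActive);
      return;
    }
  }
  std::printf("no tile fits (the real forward dispatcher asserts)\n");
}

int main() { dispatch(64, 128, 1000); } // A on [0,960), B on [960,1000)
```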
sparseconvnet/SCN/CUDA/AffineReluTrivialConvolution.h  (deleted, 100644 → 0, -441)

The header is removed in its entirety: the `CUDA_AFFINERELUTRIVIALCONVOLUTION_H` guards, the tuning comments, the `dAffineReluTrivialConvolution_forwardA` / `_forwardB` and `_backward_dW_A` / `_backward_dW_B` kernels, the two `FOO(T, K, V)` dispatch macros, the `dAffineReluTrivialConvolution_forward` dispatcher with its `double` specialization, and the `_backward_dW` dispatcher. All of it is line for line the code now living in AffineReluTrivialConvolution.cu above.
sparseconvnet/SCN/CUDA/AveragePooling.cpp  (new file, 0 → 100644, +61)

```cpp
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

template <typename T>
void cuda_AveragePooling_ForwardPass(T *input_features, T *output_features,
                                     Int nPlanes, Int input_stride,
                                     Int output_stride, RuleBook _rules,
                                     Int filterVolume);

template <typename T>
void cuda_AveragePooling_BackwardPass(T *d_input_features,
                                      T *d_output_features, Int nPlanes,
                                      Int input_stride, Int output_stride,
                                      RuleBook _rules, Int filterVolume);

template <typename T, Int Dimension>
void cuda_AveragePooling_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor poolSize, /*long*/ at::Tensor poolStride,
    Metadata<Dimension> &m, /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features, long nFeaturesToDrop) {
  Int nPlanes = input_features.size(1) - nFeaturesToDrop;
  auto _rules =
      m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  Int nActive = m.getNActive(outputSize);
  output_features.resize_({nActive, input_features.size(1) - nFeaturesToDrop});
  output_features.zero_();
  auto iF = input_features.data<T>() + nFeaturesToDrop;
  auto oF = output_features.data<T>();
  cuda_AveragePooling_ForwardPass<T>(iF, oF, nPlanes, input_features.size(1),
                                     output_features.size(1), _rules,
                                     _rules.size());
}

template <typename T, Int Dimension>
void cuda_AveragePooling_updateGradInput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor poolSize, /*long*/ at::Tensor poolStride,
    Metadata<Dimension> &m, /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features, long nFeaturesToDrop) {
  Int nPlanes = input_features.size(1) - nFeaturesToDrop;
  auto _rules =
      m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();
  auto diF = d_input_features.data<T>() + nFeaturesToDrop;
  auto doF = d_output_features.data<T>();
  cuda_AveragePooling_BackwardPass<T>(diF, doF, nPlanes,
                                      input_features.size(1),
                                      d_output_features.size(1), _rules,
                                      _rules.size());
}
```
sparseconvnet/SCN/CUDA/AveragePooling.cu
View file @
de3743f6
...
@@ -4,51 +4,73 @@
...
@@ -4,51 +4,73 @@
 // This source code is licensed under the license found in the
 // LICENSE file in the root directory of this source tree.
-#include "AveragePooling.h"
 #include "RuleBookIterator.h"

-template <typename T, Int Dimension>
-void cuda_AveragePooling_updateOutput(
-    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
-    /*long*/ at::Tensor poolSize, /*long*/ at::Tensor poolStride,
-    Metadata<Dimension> &m, /*cuda float*/ at::Tensor input_features,
-    /*cuda float*/ at::Tensor output_features, long nFeaturesToDrop) {
-  Int nPlanes = input_features.size(1) - nFeaturesToDrop;
-  auto _rules =
-      m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
-  Int nActive = m.getNActive(outputSize);
-  output_features.resize_({nActive, input_features.size(1) - nFeaturesToDrop});
-  output_features.zero_();
-  auto iF = input_features.data<T>() + nFeaturesToDrop;
-  auto oF = output_features.data<T>();
-  RULEBOOKITERATOR(cuda_AveragePooling_ForwardPass<T>(
-                       iF, oF, nPlanes, input_features.size(1),
-                       output_features.size(1), rbB, nHotB, _rules.size());
-                   , )
-}
-
-template <typename T, Int Dimension>
-void cuda_AveragePooling_updateGradInput(
-    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
-    /*long*/ at::Tensor poolSize, /*long*/ at::Tensor poolStride,
-    Metadata<Dimension> &m, /*cuda float*/ at::Tensor input_features,
-    /*cuda float*/ at::Tensor d_input_features,
-    /*cuda float*/ at::Tensor d_output_features, long nFeaturesToDrop) {
-  Int nPlanes = input_features.size(1) - nFeaturesToDrop;
-  auto _rules =
-      m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
-  d_input_features.resize_as_(input_features);
-  d_input_features.zero_();
-  auto diF = d_input_features.data<T>() + nFeaturesToDrop;
-  auto doF = d_output_features.data<T>();
-  RULEBOOKITERATOR(cuda_AveragePooling_BackwardPass<T>(
-                       diF, doF, nPlanes, input_features.size(1),
-                       d_output_features.size(1), rbB, nHotB, _rules.size());
-                   , )
-}
+// NTX must be >=2 so r is filled properly
+template <typename T, Int NTX, Int NTY>
+__global__ void AveragePooling_fp(T *input_features, T *output_features,
+                                  Int nPlanes, Int input_stride,
+                                  Int output_stride, Int *rules, Int nHot,
+                                  T alpha) {
+  __shared__ Int r[NTY * 2];
+  for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
+    {
+      Int i = threadIdx.x + NTX * threadIdx.y;
+      if (i < NTY * 2 and i < 2 * (nHot - n))
+        r[i] = rules[2 * n + i];
+    }
+    __syncthreads();
+    if (n + threadIdx.y < nHot) {
+      Int i = r[2 * threadIdx.y] * input_stride;
+      Int o = r[2 * threadIdx.y + 1] * output_stride;
+      for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX)
+        atomicAdd(&output_features[o + plane],
+                  alpha * input_features[i + plane]);
+    }
+    __syncthreads();
+  }
+}
+
+template <typename T>
+void cuda_AveragePooling_ForwardPass(T *input_features, T *output_features,
+                                     Int nPlanes, Int input_stride,
+                                     Int output_stride, RuleBook _rules,
+                                     Int filterVolume) {
+  RULEBOOKITERATOR((AveragePooling_fp<T, 32, 32><<<32, dim3(32, 32)>>>(
+                       input_features, output_features, nPlanes, input_stride,
+                       output_stride, rbB, nHotB, 1.0 / filterVolume));
+                   , )
+}
+
+template <typename T, Int NTX, Int NTY>
+__global__ void AveragePooling_bp(T *d_input_features, T *d_output_features,
+                                  Int nPlanes, Int input_stride,
+                                  Int output_stride, Int *rules, Int nHot,
+                                  T alpha) {
+  __shared__ Int r[NTY * 2];
+  for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
+    {
+      Int i = threadIdx.x + NTX * threadIdx.y;
+      if (i < NTY * 2 and i < 2 * (nHot - n))
+        r[i] = rules[2 * n + i];
+    }
+    __syncthreads();
+    if (n + threadIdx.y < nHot) {
+      Int i = r[2 * threadIdx.y] * input_stride;
+      Int o = r[2 * threadIdx.y + 1] * output_stride;
+      for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX)
+        d_input_features[i + plane] += alpha * d_output_features[o + plane];
+    }
+    __syncthreads();
+  }
+}
+
+template <typename T>
+void cuda_AveragePooling_BackwardPass(T *d_input_features,
+                                      T *d_output_features, Int nPlanes,
+                                      Int input_stride, Int output_stride,
+                                      RuleBook _rules, Int filterVolume) {
+  RULEBOOKITERATOR((AveragePooling_bp<T, 32, 32><<<32, dim3(32, 32)>>>(
+                       d_input_features, d_output_features, nPlanes,
+                       input_stride, output_stride, rbB, nHotB,
+                       1.0 / filterVolume));
+                   , )
+}
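RULEBOOKITERATOR is defined in the included RuleBookIterator.h, which is outside this diff. Judging from the call sites above, it runs its first argument once per rulebook group, binding rbB to a device copy of that group's (input, output) index pairs and nHotB to the pair count. A rough stand-in under those assumptions — not the repository's actual macro:

#include <vector>
#include <cuda_runtime.h>

typedef int Int; // assumption: Int is a plain integer type here
typedef std::vector<std::vector<Int>> RuleBook;

// Hypothetical stand-in for RULEBOOKITERATOR(X, Y): for each group of
// (input, output) pairs, copy the rules to the device, expose them as rbB
// with nHotB pairs, run statement X, then run Y. X is passed with its own
// trailing semicolon at the call sites, so it is spliced in bare.
#define RULEBOOKITERATOR_SKETCH(X, Y)                                         \
  for (auto &rule : _rules) {                                                 \
    Int nHotB = (Int)rule.size() / 2;                                         \
    if (nHotB > 0) {                                                          \
      Int *rbB = nullptr;                                                     \
      cudaMalloc(&rbB, sizeof(Int) * 2 * nHotB);                              \
      cudaMemcpy(rbB, rule.data(), sizeof(Int) * 2 * nHotB,                   \
                 cudaMemcpyHostToDevice);                                     \
      X cudaFree(rbB);                                                        \
    }                                                                         \
    Y                                                                         \
  }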
sparseconvnet/SCN/CUDA/AveragePooling.h
deleted 100644 → 0
View file @ f0407b36
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#ifndef CUDA_AVERAGEPOOLING_H
#define CUDA_AVERAGEPOOLING_H

// NTX must be >=2 so r is filled properly
template <typename T, Int NTX, Int NTY>
__global__ void AveragePooling_fp(T *input_features, T *output_features,
                                  Int nPlanes, Int input_stride,
                                  Int output_stride, Int *rules, Int nHot,
                                  T alpha) {
  __shared__ Int r[NTY * 2];
  for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
    {
      Int i = threadIdx.x + NTX * threadIdx.y;
      if (i < NTY * 2 and i < 2 * (nHot - n))
        r[i] = rules[2 * n + i];
    }
    __syncthreads();
    if (n + threadIdx.y < nHot) {
      Int i = r[2 * threadIdx.y] * input_stride;
      Int o = r[2 * threadIdx.y + 1] * output_stride;
      for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX)
        atomicAdd(&output_features[o + plane],
                  alpha * input_features[i + plane]);
    }
    __syncthreads();
  }
}

template <typename T>
void cuda_AveragePooling_ForwardPass(T *input_features, T *output_features,
                                     Int nPlanes, Int input_stride,
                                     Int output_stride, Int *rules, Int nHot,
                                     Int filterVolume) {
  AveragePooling_fp<T, 32, 32><<<32, dim3(32, 32)>>>(
      input_features, output_features, nPlanes, input_stride, output_stride,
      rules, nHot, 1.0 / filterVolume);
}

template <typename T, Int NTX, Int NTY>
__global__ void AveragePooling_bp(T *d_input_features, T *d_output_features,
                                  Int nPlanes, Int input_stride,
                                  Int output_stride, Int *rules, Int nHot,
                                  T alpha) {
  __shared__ Int r[NTY * 2];
  for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
    {
      Int i = threadIdx.x + NTX * threadIdx.y;
      if (i < NTY * 2 and i < 2 * (nHot - n))
        r[i] = rules[2 * n + i];
    }
    __syncthreads();
    if (n + threadIdx.y < nHot) {
      Int i = r[2 * threadIdx.y] * input_stride;
      Int o = r[2 * threadIdx.y + 1] * output_stride;
      for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX)
        d_input_features[i + plane] += alpha * d_output_features[o + plane];
    }
    __syncthreads();
  }
}

template <typename T>
void cuda_AveragePooling_BackwardPass(T *d_input_features,
                                      T *d_output_features, Int nPlanes,
                                      Int input_stride, Int output_stride,
                                      Int *rules, Int nHot, Int filterVolume) {
  AveragePooling_bp<T, 32, 32><<<32, dim3(32, 32)>>>(
      d_input_features, d_output_features, nPlanes, input_stride,
      output_stride, rules, nHot, 1.0 / filterVolume);
}

#endif /* CUDA_AVERAGEPOOLING_H */
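For orientation: the rulebook drives pooling as a flat list of nHot (inputRow, outputRow) pairs, and alpha = 1.0 / filterVolume turns the accumulated sums into averages. A CPU sketch of the same forward pass (illustrative; the function name is not from the library):

#include <vector>

// CPU reference for the rulebook-driven forward pass above: rules holds nHot
// (inputRow, outputRow) pairs; each pair adds alpha times one input row into
// the matching output row. alpha = 1.0 / filterVolume turns sums into means.
template <typename T>
void averagePoolingForwardReference(const T *input, T *output, int nPlanes,
                                    int inputStride, int outputStride,
                                    const std::vector<int> &rules, T alpha) {
  int nHot = (int)rules.size() / 2;
  for (int n = 0; n < nHot; n++) {
    const T *in = input + rules[2 * n] * inputStride;
    T *out = output + rules[2 * n + 1] * outputStride;
    for (int p = 0; p < nPlanes; p++)
      out[p] += alpha * in[p]; // the CUDA kernel does this with atomicAdd
  }
}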
sparseconvnet/SCN/CUDA/BatchNormalization.cpp
0 → 100644
View file @ de3743f6
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

template <typename T>
void bn_f(T *iF, T *oF, Int nPlanes, Int input_stride, Int output_stride,
          Int nActive, T *saveMean, T *saveInvStd, T *runningMean,
          T *runningVar, T *weight, T *bias, T eps, T momentum, bool train,
          T leakiness);

template <typename T>
void bn_b(T *input_features, T *d_input_features, T *output_features,
          T *d_output_features, Int nPlanes, Int input_stride,
          Int output_stride, Int nActive, T *saveMean, T *saveInvStd,
          T *runningMean, T *runningVar, T *weight, T *bias, T *d_weight,
          T *d_bias, T leakiness);

template <typename T>
void cuda_BatchNormalization_updateOutput(
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features,
    /*cuda float*/ at::Tensor saveMean, /*cuda float*/ at::Tensor saveInvStd,
    /*cuda float*/ at::Tensor runningMean,
    /*cuda float*/ at::Tensor runningVar, /*cuda float*/ at::Tensor weight,
    /*cuda float*/ at::Tensor bias, T eps, T momentum, bool train,
    T leakiness) {
  output_features.resize_as_(input_features);
  if (input_features.ndimension() == 2) {
    auto nActive = input_features.size(0);
    auto nPlanes = input_features.size(1);
    auto input_stride = input_features.stride(0);
    auto output_stride = output_features.stride(0);
    bn_f(input_features.data<T>(), output_features.data<T>(), nPlanes,
         input_stride, output_stride, nActive, saveMean.data<T>(),
         saveInvStd.data<T>(), runningMean.data<T>(), runningVar.data<T>(),
         OptionalTensorData<T>(weight), OptionalTensorData<T>(bias), eps,
         momentum, train, leakiness);
  }
}

template <typename T>
void cuda_BatchNormalization_backward(
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor output_features,
    /*cuda float*/ at::Tensor d_output_features,
    /*cuda float*/ at::Tensor saveMean, /*cuda float*/ at::Tensor saveInvStd,
    /*cuda float*/ at::Tensor runningMean,
    /*cuda float*/ at::Tensor runningVar, /*cuda float*/ at::Tensor weight,
    /*cuda float*/ at::Tensor bias, /*cuda float*/ at::Tensor d_weight,
    /*cuda float*/ at::Tensor d_bias, T leakiness) {
  d_input_features.resize_as_(d_output_features);
  if (input_features.ndimension() == 2) {
    auto nActive = input_features.size(0);
    auto nPlanes = input_features.size(1);
    auto input_stride = input_features.stride(0);
    auto output_stride = output_features.stride(0);
    bn_b(input_features.data<T>(), d_input_features.data<T>(),
         output_features.data<T>(), d_output_features.data<T>(), nPlanes,
         input_stride, output_stride, nActive, saveMean.data<T>(),
         saveInvStd.data<T>(), runningMean.data<T>(), runningVar.data<T>(),
         OptionalTensorData<T>(weight), OptionalTensorData<T>(bias),
         OptionalTensorData<T>(d_weight), OptionalTensorData<T>(d_bias),
         leakiness);
  }
}
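This .cpp file only declares bn_f and bn_b; their definitions live in BatchNormalization.cu, so the host-side wrapper compiles without nvcc. One common way to make such a split link is explicit template instantiation in the .cu file; a minimal sketch of that pattern with made-up names (the repository may wire its instantiations differently):

// bn_host.cpp — compiled by a plain C++ compiler; sees only a declaration.
template <typename T> void scale(T *data, int n, T factor);

void run(float *d, int n) { scale(d, n, 2.0f); }

// bn_kernels.cu — compiled by nvcc; defines the template and explicitly
// instantiates it so the linker can resolve the call from bn_host.cpp.
template <typename T>
__global__ void scaleKernel(T *data, int n, T factor) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    data[i] *= factor;
}

template <typename T> void scale(T *data, int n, T factor) {
  scaleKernel<T><<<(n + 255) / 256, 256>>>(data, n, factor);
}

template void scale<float>(float *, int, float); // explicit instantiation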
sparseconvnet/SCN/CUDA/BatchNormalization.cu
View file @ de3743f6

@@ -4,56 +4,215 @@
 // This source code is licensed under the license found in the
 // LICENSE file in the root directory of this source tree.
-#include "BatchNormalization.h"
-
-#define BN_F_MACRO(N)                                                         \
-  if (nPlanes % N == 0) {                                                     \
-    BatchNormalization_ForwardPass<T, N, 64>(                                 \
-        input_features.data<T>(), output_features.data<T>(), nPlanes,         \
-        input_stride, output_stride, nActive, saveMean.data<T>(),             \
-        saveInvStd.data<T>(), runningMean.data<T>(), runningVar.data<T>(),    \
-        OptionalTensorData<T>(weight), OptionalTensorData<T>(bias), eps,      \
-        momentum, train, leakiness);                                          \
-  }
-
-template <typename T>
-void cuda_BatchNormalization_updateOutput(
-    /*cuda float*/ at::Tensor input_features,
-    /*cuda float*/ at::Tensor output_features,
-    /*cuda float*/ at::Tensor saveMean, /*cuda float*/ at::Tensor saveInvStd,
-    /*cuda float*/ at::Tensor runningMean,
-    /*cuda float*/ at::Tensor runningVar, /*cuda float*/ at::Tensor weight,
-    /*cuda float*/ at::Tensor bias, T eps, T momentum, bool train,
-    T leakiness) {
-  output_features.resize_as_(input_features);
-  if (input_features.ndimension() == 2) {
-    auto nActive = input_features.size(0);
-    auto nPlanes = input_features.size(1);
-    auto input_stride = input_features.stride(0);
-    auto output_stride = output_features.stride(0);
-    BN_F_MACRO(16)
-    else BN_F_MACRO(12)
-    else BN_F_MACRO(8)
-    else BN_F_MACRO(4)
-    else BN_F_MACRO(1)
-  }
-}
-
-template <typename T>
-void cuda_BatchNormalizationInTensor_updateOutput(
-    /*cuda float*/ at::Tensor input_features,
-    /*cuda float*/ at::Tensor output_features,
-    /*cuda float*/ at::Tensor saveMean, /*cuda float*/ at::Tensor saveInvStd,
-    /*cuda float*/ at::Tensor runningMean,
-    /*cuda float*/ at::Tensor runningVar, /*cuda float*/ at::Tensor weight,
-    /*cuda float*/ at::Tensor bias, T eps, T momentum, bool train,
-    T leakiness) {
-  if (input_features.ndimension() == 2) {
-    auto nActive = input_features.size(0);
-    auto nPlanes = input_features.size(1);
-    auto input_stride = input_features.stride(0);
-    auto output_stride = output_features.stride(0);
-    BN_F_MACRO(16)
-    else BN_F_MACRO(12)
-    else BN_F_MACRO(8)
-    else BN_F_MACRO(4)
-    else BN_F_MACRO(1)
-  }
-}
+#include <cassert>
+
+// input_stride and output_stride are normally the same as nPlanes; allow
+// larger values to act on a subset of columns, i.e. an inplace DenseNet block
+// NTX ~ 16 - nPlanes must be a multiple of this
+// NTY ~ 64 - at least 4
+template <typename T, Int NTX, Int NTY>
+__global__ void BatchNormalization_f_train(
+    T *input_features, T *output_features, Int nPlanes, Int input_stride,
+    Int output_stride, Int nActive, T *saveMean, T *saveInvStd,
+    T *runningMean, T *runningVar, T *weight, T *bias, T eps, T momentum,
+    T leakiness) {
+  __shared__ T t[NTY][NTX];
+  __shared__ T t2[NTY][NTX];
+  for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
+       plane += gridDim.x * NTX) {
+    t[threadIdx.y][threadIdx.x] = 0;
+    t2[threadIdx.y][threadIdx.x] = 0;
+    for (Int row = threadIdx.y, c = plane + threadIdx.y * input_stride;
+         row < nActive; row += NTY, c += input_stride * NTY) {
+      T i = input_features[c];
+      t[threadIdx.y][threadIdx.x] += i;
+      t2[threadIdx.y][threadIdx.x] += i * i;
+    }
+    __syncthreads();
+    T _saveMean = 0;
+    T _saveInvStd = 0;
+    for (Int row = 0; row < NTY; row++) {
+      _saveMean += t[row][threadIdx.x];
+      _saveInvStd += t2[row][threadIdx.x];
+    }
+    _saveMean /= nActive;
+    _saveInvStd = _saveInvStd - _saveMean * _saveMean * nActive;
+    if (threadIdx.y == 0) {
+      saveMean[plane] = _saveMean;
+      runningMean[plane] =
+          momentum * runningMean[plane] + (1 - momentum) * _saveMean;
+      runningVar[plane] = momentum * runningVar[plane] +
+                          (1 - momentum) * _saveInvStd / (nActive - 1);
+    }
+    _saveInvStd = pow(_saveInvStd / nActive + eps, -0.5);
+    if (threadIdx.y == 0)
+      saveInvStd[plane] = _saveInvStd;
+    __syncthreads();
+    if (threadIdx.y == 0) {
+      t[0][threadIdx.x] = _saveInvStd * (weight ? weight[plane] : 1);
+      t[1][threadIdx.x] =
+          -_saveMean * t[0][threadIdx.x] + (bias ? bias[plane] : 0);
+    }
+    __syncthreads();
+    T W = t[0][threadIdx.x];
+    T B = t[1][threadIdx.x];
+    for (Int row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
+             co = plane + threadIdx.y * output_stride;
+         row < nActive;
+         row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
+      T out = W * input_features[ci] + B;
+      output_features[co] = (out > 0) ? out : (out * leakiness);
+    }
+    __syncthreads();
+  }
+}
+
+template <typename T, Int NTX, Int NTY>
+__global__ void BatchNormalization_f_test(
+    T *input_features, T *output_features, Int nPlanes, Int input_stride,
+    Int output_stride, Int nActive, T *saveMean, T *saveInvStd,
+    T *runningMean, T *runningVar, T *weight, T *bias, T eps, T momentum,
+    T leakiness) {
+  __shared__ T W[NTX];
+  __shared__ T B[NTX];
+  for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
+       plane += gridDim.x * NTX) {
+    if (threadIdx.y == 0) {
+      W[threadIdx.x] =
+          pow(runningVar[plane] + eps, -0.5) * (weight ? weight[plane] : 1);
+      B[threadIdx.x] =
+          (bias ? bias[plane] : 0) - runningMean[plane] * W[threadIdx.x];
+    }
+    __syncthreads();
+    float w = W[threadIdx.x], b = B[threadIdx.x];
+    for (Int row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
+             co = plane + threadIdx.y * output_stride;
+         row < nActive;
+         row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
+      T out = w * input_features[ci] + b;
+      output_features[co] = (out > 0) ? out : (out * leakiness);
+    }
+    __syncthreads();
+  }
+}
+
+template <typename T, Int NTX, Int NTY>
+void BatchNormalization_ForwardPass(T *input_features, T *output_features,
+                                    Int nPlanes, Int input_stride,
+                                    Int output_stride, Int nActive,
+                                    T *saveMean, T *saveInvStd,
+                                    T *runningMean, T *runningVar, T *weight,
+                                    T *bias, T eps, T momentum, bool train,
+                                    T leakiness) {
+  if (train) {
+    BatchNormalization_f_train<T, NTX, NTY>
+        <<<std::min((Int)16, nPlanes / NTX), dim3(NTX, NTY)>>>(
+            input_features, output_features, nPlanes, input_stride,
+            output_stride, nActive, saveMean, saveInvStd, runningMean,
+            runningVar, weight, bias, eps, momentum, leakiness);
+  } else {
+    BatchNormalization_f_test<T, NTX, NTY>
+        <<<std::min((Int)16, nPlanes / NTX), dim3(NTX, NTY)>>>(
+            input_features, output_features, nPlanes, input_stride,
+            output_stride, nActive, saveMean, saveInvStd, runningMean,
+            runningVar, weight, bias, eps, momentum, leakiness);
+  }
+}
+
+template <typename T, Int NTX, Int NTY>
+__global__ void BatchNormalization_b(
+    T *input_features, T *d_input_features, T *output_features,
+    T *d_output_features, Int nPlanes, Int input_stride, Int output_stride,
+    Int nActive, T *saveMean, T *saveInvStd, T *runningMean, T *runningVar,
+    T *weight, T *bias, T *d_weight, T *d_bias, T leakiness) {
+  __shared__ T t[NTY][NTX];
+  __shared__ T t2[NTY][NTX];
+  for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
+       plane += gridDim.x * NTX) {
+    if (threadIdx.y == 0) {
+      t[0][threadIdx.x] = saveMean[plane];
+      t[1][threadIdx.x] = saveInvStd[plane];
+      t[2][threadIdx.x] = (weight ? weight[plane] : 1);
+    }
+    __syncthreads();
+    T _saveMean = t[0][threadIdx.x];
+    T _saveInvStd = t[1][threadIdx.x];
+    T _weight = t[2][threadIdx.x];
+    __syncthreads();
+    t[threadIdx.y][threadIdx.x] = 0;
+    t2[threadIdx.y][threadIdx.x] = 0;
+    for (Int row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
+             co = plane + threadIdx.y * output_stride;
+         row < nActive;
+         row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
+      T d = d_output_features[co];
+      d = (output_features[co] > 0) ? d : (d * leakiness);
+      d_output_features[co] = d;
+      t[threadIdx.y][threadIdx.x] += d;
+      t2[threadIdx.y][threadIdx.x] += (input_features[ci] - _saveMean) * d;
+    }
+    __syncthreads();
+    T gradMean = 0;
+    T dotp = 0;
+    for (int row = 0; row < NTY; row++) {
+      gradMean += t[row][threadIdx.x];
+      dotp += t2[row][threadIdx.x];
+    }
+    __syncthreads();
+    if (d_weight)
+      d_weight[plane] = dotp * _saveInvStd;
+    if (d_bias)
+      d_bias[plane] = gradMean; // sum really
+    gradMean /= nActive;
+    T k = dotp * _saveInvStd * _saveInvStd / nActive;
+    for (Int row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
+             co = plane + threadIdx.y * output_stride;
+         row < nActive;
+         row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
+      d_input_features[ci] = (d_output_features[co] - gradMean -
+                              (input_features[ci] - _saveMean) * k) *
+                             _saveInvStd * _weight;
+    }
+    __syncthreads();
+  }
+}
+
+template <typename T, Int NTX, Int NTY>
+void BatchNormalization_BackwardPass(T *input_features, T *d_input_features,
+                                     T *output_features,
+                                     T *d_output_features, Int nPlanes,
+                                     Int input_stride, Int output_stride,
+                                     Int nActive, T *saveMean, T *saveInvStd,
+                                     T *runningMean, T *runningVar,
+                                     T *weight, T *bias, T *d_weight,
+                                     T *d_bias, T leakiness) {
+  BatchNormalization_b<T, NTX, NTY>
+      <<<std::min((Int)16, nPlanes / NTX), dim3(NTX, NTY)>>>(
+          input_features, d_input_features, output_features,
+          d_output_features, nPlanes, input_stride, output_stride, nActive,
+          saveMean, saveInvStd, runningMean, runningVar, weight, bias,
+          d_weight, d_bias, leakiness);
+}
+
+#define BN_F_MACRO(N)                                                         \
+  if (nPlanes % N == 0) {                                                     \
+    BatchNormalization_ForwardPass<T, N, 64>(                                 \
+        iF, oF, nPlanes, input_stride, output_stride, nActive, saveMean,      \
+        saveInvStd, runningMean, runningVar, weight, bias, eps, momentum,     \
+        train, leakiness);                                                    \
+  }
+
+template <typename T>
+void bn_f(T *iF, T *oF, Int nPlanes, Int input_stride, Int output_stride,
+          Int nActive, T *saveMean, T *saveInvStd, T *runningMean,
+          T *runningVar, T *weight, T *bias, T eps, T momentum, bool train,
+          T leakiness) {
+  BN_F_MACRO(16)
+  else BN_F_MACRO(12)
+  else BN_F_MACRO(8)
+  else BN_F_MACRO(4)
+  else BN_F_MACRO(1)
+}
 #undef BN_F_MACRO

@@ -61,34 +220,19 @@ void cuda_BatchNormalizationInTensor_updateOutput(
-#define BN_B_MACRO(N)                                                         \
-  if (nPlanes % N == 0) {                                                     \
-    BatchNormalization_BackwardPass<T, N, 64>(                                \
-        input_features.data<T>(), d_input_features.data<T>(),                 \
-        output_features.data<T>(), d_output_features.data<T>(), nPlanes,      \
-        input_stride, output_stride, nActive, saveMean.data<T>(),             \
-        saveInvStd.data<T>(), runningMean.data<T>(), runningVar.data<T>(),    \
-        OptionalTensorData<T>(weight), OptionalTensorData<T>(bias),           \
-        OptionalTensorData<T>(d_weight), OptionalTensorData<T>(d_bias),       \
-        leakiness);                                                           \
-  }
-
-template <typename T>
-void cuda_BatchNormalization_backward(
-    /*cuda float*/ at::Tensor input_features,
-    /*cuda float*/ at::Tensor d_input_features,
-    /*cuda float*/ at::Tensor output_features,
-    /*cuda float*/ at::Tensor d_output_features,
-    /*cuda float*/ at::Tensor saveMean, /*cuda float*/ at::Tensor saveInvStd,
-    /*cuda float*/ at::Tensor runningMean,
-    /*cuda float*/ at::Tensor runningVar, /*cuda float*/ at::Tensor weight,
-    /*cuda float*/ at::Tensor bias, /*cuda float*/ at::Tensor d_weight,
-    /*cuda float*/ at::Tensor d_bias, T leakiness) {
-  d_input_features.resize_as_(d_output_features);
-  if (input_features.ndimension() == 2) {
-    auto nActive = input_features.size(0);
-    auto nPlanes = input_features.size(1);
-    auto input_stride = input_features.stride(0);
-    auto output_stride = output_features.stride(0);
-    BN_B_MACRO(16)
-    else BN_B_MACRO(12)
-    else BN_B_MACRO(8)
-    else BN_B_MACRO(4)
-    else BN_B_MACRO(1)
-  }
-}
+#define BN_B_MACRO(N)                                                         \
+  if (nPlanes % N == 0) {                                                     \
+    BatchNormalization_BackwardPass<T, N, 64>(                                \
+        input_features, d_input_features, output_features,                    \
+        d_output_features, nPlanes, input_stride, output_stride, nActive,     \
+        saveMean, saveInvStd, runningMean, runningVar, weight, bias,          \
+        d_weight, d_bias, leakiness);                                         \
+  }
+
+template <typename T>
+void bn_b(T *input_features, T *d_input_features, T *output_features,
+          T *d_output_features, Int nPlanes, Int input_stride,
+          Int output_stride, Int nActive, T *saveMean, T *saveInvStd,
+          T *runningMean, T *runningVar, T *weight, T *bias, T *d_weight,
+          T *d_bias, T leakiness) {
+  BN_B_MACRO(16)
+  else BN_B_MACRO(12)
+  else BN_B_MACRO(8)
+  else BN_B_MACRO(4)
+  else BN_B_MACRO(1)
+}
 #undef BN_B_MACRO
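Spelled out per plane, BatchNormalization_f_train accumulates the sums of x and x² in the shared-memory tiles, normalizes with the biased variance (divide by nActive, plus eps), and updates runningVar with the unbiased estimate (divide by nActive − 1). A scalar CPU sketch of the same statistics update (illustrative, not library code):

#include <cmath>

// Per-plane statistics as computed by BatchNormalization_f_train, for one
// column x[0..nActive-1]. Scalar version of the kernel math.
template <typename T>
void bnTrainStats(const T *x, int nActive, T eps, T momentum, T &saveMean,
                  T &saveInvStd, T &runningMean, T &runningVar) {
  T sum = 0, sum2 = 0;
  for (int i = 0; i < nActive; i++) {
    sum += x[i];
    sum2 += x[i] * x[i];
  }
  T mean = sum / nActive;
  T ss = sum2 - mean * mean * nActive; // = nActive * biased variance
  saveMean = mean;
  runningMean = momentum * runningMean + (1 - momentum) * mean;
  runningVar = momentum * runningVar + (1 - momentum) * ss / (nActive - 1);
  saveInvStd = std::pow(ss / nActive + eps, T(-0.5)); // 1/sqrt(var + eps)
}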
sparseconvnet/SCN/CUDA/BatchNormalization.h
deleted 100644 → 0
View file @ f0407b36
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#ifndef CUDA_BATCHNORMALIZATION_H
#define CUDA_BATCHNORMALIZATION_H

#include <cassert>

// input_stride and output_stride are normally the same as nPlanes; allow
// larger values to act on a subset of columns, i.e. an inplace DenseNet block
// NTX ~ 16 - nPlanes must be a multiple of this
// NTY ~ 64 - at least 4
template <typename T, Int NTX, Int NTY>
__global__ void BatchNormalization_f_train(
    T *input_features, T *output_features, Int nPlanes, Int input_stride,
    Int output_stride, Int nActive, T *saveMean, T *saveInvStd,
    T *runningMean, T *runningVar, T *weight, T *bias, T eps, T momentum,
    T leakiness) {
  __shared__ T t[NTY][NTX];
  __shared__ T t2[NTY][NTX];
  for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
       plane += gridDim.x * NTX) {
    t[threadIdx.y][threadIdx.x] = 0;
    t2[threadIdx.y][threadIdx.x] = 0;
    for (Int row = threadIdx.y, c = plane + threadIdx.y * input_stride;
         row < nActive; row += NTY, c += input_stride * NTY) {
      T i = input_features[c];
      t[threadIdx.y][threadIdx.x] += i;
      t2[threadIdx.y][threadIdx.x] += i * i;
    }
    __syncthreads();
    T _saveMean = 0;
    T _saveInvStd = 0;
    for (Int row = 0; row < NTY; row++) {
      _saveMean += t[row][threadIdx.x];
      _saveInvStd += t2[row][threadIdx.x];
    }
    _saveMean /= nActive;
    _saveInvStd = _saveInvStd - _saveMean * _saveMean * nActive;
    if (threadIdx.y == 0) {
      saveMean[plane] = _saveMean;
      runningMean[plane] =
          momentum * runningMean[plane] + (1 - momentum) * _saveMean;
      runningVar[plane] = momentum * runningVar[plane] +
                          (1 - momentum) * _saveInvStd / (nActive - 1);
    }
    _saveInvStd = pow(_saveInvStd / nActive + eps, -0.5);
    if (threadIdx.y == 0)
      saveInvStd[plane] = _saveInvStd;
    __syncthreads();
    if (threadIdx.y == 0) {
      t[0][threadIdx.x] = _saveInvStd * (weight ? weight[plane] : 1);
      t[1][threadIdx.x] =
          -_saveMean * t[0][threadIdx.x] + (bias ? bias[plane] : 0);
    }
    __syncthreads();
    T W = t[0][threadIdx.x];
    T B = t[1][threadIdx.x];
    for (Int row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
             co = plane + threadIdx.y * output_stride;
         row < nActive;
         row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
      T out = W * input_features[ci] + B;
      output_features[co] = (out > 0) ? out : (out * leakiness);
    }
    __syncthreads();
  }
}

template <typename T, Int NTX, Int NTY>
__global__ void BatchNormalization_f_test(
    T *input_features, T *output_features, Int nPlanes, Int input_stride,
    Int output_stride, Int nActive, T *saveMean, T *saveInvStd,
    T *runningMean, T *runningVar, T *weight, T *bias, T eps, T momentum,
    T leakiness) {
  __shared__ T W[NTX];
  __shared__ T B[NTX];
  for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
       plane += gridDim.x * NTX) {
    if (threadIdx.y == 0) {
      W[threadIdx.x] =
          pow(runningVar[plane] + eps, -0.5) * (weight ? weight[plane] : 1);
      B[threadIdx.x] =
          (bias ? bias[plane] : 0) - runningMean[plane] * W[threadIdx.x];
    }
    __syncthreads();
    float w = W[threadIdx.x], b = B[threadIdx.x];
    for (Int row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
             co = plane + threadIdx.y * output_stride;
         row < nActive;
         row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
      T out = w * input_features[ci] + b;
      output_features[co] = (out > 0) ? out : (out * leakiness);
    }
    __syncthreads();
  }
}

template <typename T, Int NTX, Int NTY>
void BatchNormalization_ForwardPass(T *input_features, T *output_features,
                                    Int nPlanes, Int input_stride,
                                    Int output_stride, Int nActive,
                                    T *saveMean, T *saveInvStd,
                                    T *runningMean, T *runningVar, T *weight,
                                    T *bias, T eps, T momentum, bool train,
                                    T leakiness) {
  if (train) {
    BatchNormalization_f_train<T, NTX, NTY>
        <<<std::min((Int)16, nPlanes / NTX), dim3(NTX, NTY)>>>(
            input_features, output_features, nPlanes, input_stride,
            output_stride, nActive, saveMean, saveInvStd, runningMean,
            runningVar, weight, bias, eps, momentum, leakiness);
  } else {
    BatchNormalization_f_test<T, NTX, NTY>
        <<<std::min((Int)16, nPlanes / NTX), dim3(NTX, NTY)>>>(
            input_features, output_features, nPlanes, input_stride,
            output_stride, nActive, saveMean, saveInvStd, runningMean,
            runningVar, weight, bias, eps, momentum, leakiness);
  }
}

template <typename T, Int NTX, Int NTY>
__global__ void BatchNormalization_b(
    T *input_features, T *d_input_features, T *output_features,
    T *d_output_features, Int nPlanes, Int input_stride, Int output_stride,
    Int nActive, T *saveMean, T *saveInvStd, T *runningMean, T *runningVar,
    T *weight, T *bias, T *d_weight, T *d_bias, T leakiness) {
  __shared__ T t[NTY][NTX];
  __shared__ T t2[NTY][NTX];
  for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
       plane += gridDim.x * NTX) {
    if (threadIdx.y == 0) {
      t[0][threadIdx.x] = saveMean[plane];
      t[1][threadIdx.x] = saveInvStd[plane];
      t[2][threadIdx.x] = (weight ? weight[plane] : 1);
    }
    __syncthreads();
    T _saveMean = t[0][threadIdx.x];
    T _saveInvStd = t[1][threadIdx.x];
    T _weight = t[2][threadIdx.x];
    __syncthreads();
    t[threadIdx.y][threadIdx.x] = 0;
    t2[threadIdx.y][threadIdx.x] = 0;
    for (Int row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
             co = plane + threadIdx.y * output_stride;
         row < nActive;
         row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
      T d = d_output_features[co];
      d = (output_features[co] > 0) ? d : (d * leakiness);
      d_output_features[co] = d;
      t[threadIdx.y][threadIdx.x] += d;
      t2[threadIdx.y][threadIdx.x] += (input_features[ci] - _saveMean) * d;
    }
    __syncthreads();
    T gradMean = 0;
    T dotp = 0;
    for (int row = 0; row < NTY; row++) {
      gradMean += t[row][threadIdx.x];
      dotp += t2[row][threadIdx.x];
    }
    __syncthreads();
    if (d_weight)
      d_weight[plane] = dotp * _saveInvStd;
    if (d_bias)
      d_bias[plane] = gradMean; // sum really
    gradMean /= nActive;
    T k = dotp * _saveInvStd * _saveInvStd / nActive;
    for (Int row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
             co = plane + threadIdx.y * output_stride;
         row < nActive;
         row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
      d_input_features[ci] = (d_output_features[co] - gradMean -
                              (input_features[ci] - _saveMean) * k) *
                             _saveInvStd * _weight;
    }
    __syncthreads();
  }
}

template <typename T, Int NTX, Int NTY>
void BatchNormalization_BackwardPass(T *input_features, T *d_input_features,
                                     T *output_features,
                                     T *d_output_features, Int nPlanes,
                                     Int input_stride, Int output_stride,
                                     Int nActive, T *saveMean, T *saveInvStd,
                                     T *runningMean, T *runningVar,
                                     T *weight, T *bias, T *d_weight,
                                     T *d_bias, T leakiness) {
  BatchNormalization_b<T, NTX, NTY>
      <<<std::min((Int)16, nPlanes / NTX), dim3(NTX, NTY)>>>(
          input_features, d_input_features, output_features,
          d_output_features, nPlanes, input_stride, output_stride, nActive,
          saveMean, saveInvStd, runningMean, runningVar, weight, bias,
          d_weight, d_bias, leakiness);
}

#undef NTX
#undef NTY

#endif /* CUDA_BATCHNORMALIZATION_H */
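The backward kernel folds the leaky-ReLU gradient into d_out in place, then applies the standard batch-norm gradient: with m = saveMean, s = saveInvStd, gradMean the mean of d_out, and k = dotp·s²/nActive, it computes d_in = (d_out − gradMean − (x − m)·k)·s·weight, with d_weight = dotp·s and d_bias = Σ d_out. A scalar sketch for one plane (illustrative, not library code):

// Scalar version of the d_input computation in BatchNormalization_b for one
// plane; d_out is assumed to already include the leaky-ReLU gradient, as in
// the kernel.
template <typename T>
void bnBackwardPlane(const T *x, const T *d_out, T *d_in, int nActive, T mean,
                     T invStd, T weight) {
  T gradSum = 0, dotp = 0;
  for (int i = 0; i < nActive; i++) {
    gradSum += d_out[i];
    dotp += (x[i] - mean) * d_out[i];
  }
  T gradMean = gradSum / nActive;         // d_bias is gradSum itself
  T k = dotp * invStd * invStd / nActive; // d_weight is dotp * invStd
  for (int i = 0; i < nActive; i++)
    d_in[i] = (d_out[i] - gradMean - (x[i] - mean) * k) * invStd * weight;
}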
sparseconvnet/SCN/CUDA/BatchwiseMultiplicativeDropout.cpp
0 → 100644
View file @ de3743f6
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

template <typename T>
void bmd_f(T *input_features, T *output_features, T *noise, Int nActive,
           Int nPlanes, T alpha);

template <typename T>
void bmd_b(T *input_features, T *d_input_features, T *d_output_features,
           T *noise, Int nActive, Int nPlanes, T alpha);

template <typename T>
void cuda_BatchwiseMultiplicativeDropout_updateOutput(
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features,
    /*cuda float*/ at::Tensor noise, T alpha) {
  output_features.resize_as_(input_features);
  auto nActive = input_features.size(0);
  auto nPlanes = input_features.size(1);
  bmd_f(input_features.data<T>(), output_features.data<T>(), noise.data<T>(),
        nActive, nPlanes, alpha);
}

template <typename T>
void cuda_BatchwiseMultiplicativeDropout_updateGradInput(
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features,
    /*cuda float*/ at::Tensor noise, T alpha) {
  d_input_features.resize_as_(d_output_features);
  auto nActive = input_features.size(0);
  auto nPlanes = input_features.size(1);
  bmd_b(input_features.data<T>(), d_input_features.data<T>(),
        d_output_features.data<T>(), noise.data<T>(), nActive, nPlanes,
        alpha);
}
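bmd_f scales every feature by a per-plane noise value, with negative activations additionally scaled by alpha — batchwise multiplicative dropout fused with a leaky-ReLU-style gate. A CPU reference of the forward pass, assuming contiguous [nActive × nPlanes] storage (a sketch, not library code):

// CPU reference for bmd_f with input_stride == output_stride == nPlanes.
template <typename T>
void bmdForwardReference(const T *in, T *out, const T *noise, int nActive,
                         int nPlanes, T alpha) {
  for (int row = 0; row < nActive; row++)
    for (int p = 0; p < nPlanes; p++) {
      T v = in[row * nPlanes + p];
      out[row * nPlanes + p] = v * noise[p] * ((v > 0) ? T(1) : alpha);
    }
}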
sparseconvnet/SCN/CUDA/BatchwiseMultiplicativeDropout.cu
View file @ de3743f6

@@ -4,28 +4,63 @@
 // This source code is licensed under the license found in the
 // LICENSE file in the root directory of this source tree.
-#include "BatchwiseMultiplicativeDropout.h"
+template <typename T, Int NTX, Int NTY>
+__global__ void BatchwiseMultiplicativeDropout_fp(
+    T *input_features, T *output_features, T *noise, Int nActive,
+    Int nPlanes, Int input_stride, Int output_stride, T alpha) {
+  __shared__ T nz[NTX];
+  for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
+       plane += gridDim.x * NTX) {
+    if (threadIdx.y == 0)
+      nz[threadIdx.x] = noise[plane];
+    __syncthreads();
+    for (Int row = threadIdx.y + blockIdx.y * NTY; row < nActive;
+         row += gridDim.y * NTY) {
+      Int i = row * input_stride + plane;
+      Int o = row * output_stride + plane;
+      output_features[o] = input_features[i] * nz[threadIdx.x] *
+                           ((input_features[i] > 0) ? 1 : alpha);
+    }
+    __syncthreads();
+  }
+}
+
+template <typename T, Int NTX, Int NTY>
+__global__ void BatchwiseMultiplicativeDropout_bp(
+    T *input_features, T *d_input_features, T *d_output_features, T *noise,
+    Int nActive, Int nPlanes, Int input_stride, Int output_stride, T alpha) {
+  __shared__ T nz[NTX];
+  for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
+       plane += gridDim.x * NTX) {
+    if (threadIdx.y == 0)
+      nz[threadIdx.x] = noise[plane];
+    __syncthreads();
+    for (Int row = threadIdx.y + blockIdx.y * NTY; row < nActive;
+         row += gridDim.y * NTY) {
+      Int i = row * input_stride + plane;
+      Int o = row * output_stride + plane;
+      d_input_features[i] = d_output_features[o] * nz[threadIdx.x] *
+                            ((input_features[i] > 0) ? 1 : alpha);
+    }
+    __syncthreads();
+  }
+}
+
 #define SPARSECONVNET_FOO(NTX, NTY)                                           \
   {                                                                           \
     if (nPlanes % NTX == 0) {                                                 \
-      BatchwiseMultiplicativeDropout_fp<                                      \
-          T, NTX,                                                             \
-          NTY><<<dim3(std::min(16L, nPlanes / NTX), 16), dim3(NTX, NTY)>>>(   \
-          input_features.data<T>(), output_features.data<T>(),                \
-          noise.data<T>(), nActive, nPlanes, nPlanes, nPlanes, alpha);        \
+      BatchwiseMultiplicativeDropout_fp<T, NTX, NTY><<<                       \
+          dim3(std::min((Int)16, nPlanes / NTX), 16), dim3(NTX, NTY)>>>(      \
+          input_features, output_features, noise, nActive, nPlanes, nPlanes,  \
+          nPlanes, alpha);                                                    \
       return;                                                                 \
     }                                                                         \
   }

 template <typename T>
-void cuda_BatchwiseMultiplicativeDropout_updateOutput(
-    /*cuda float*/ at::Tensor input_features,
-    /*cuda float*/ at::Tensor output_features,
-    /*cuda float*/ at::Tensor noise, float alpha) {
-  output_features.resize_as_(input_features);
-  auto nActive = input_features.size(0);
-  auto nPlanes = input_features.size(1);
+void bmd_f(T *input_features, T *output_features, T *noise, Int nActive,
+           Int nPlanes, T alpha) {
   SPARSECONVNET_FOO(32, 32)
   SPARSECONVNET_FOO(24, 32)
   SPARSECONVNET_FOO(16, 64)

@@ -39,25 +74,17 @@ void cuda_BatchwiseMultiplicativeDropout_updateOutput(
 #define SPARSECONVNET_FOO(NTX, NTY)                                           \
   {                                                                           \
     if (nPlanes % NTX == 0) {                                                 \
-      BatchwiseMultiplicativeDropout_bp<                                      \
-          T, NTX,                                                             \
-          NTY><<<dim3(std::min(16L, nPlanes / NTX), 16), dim3(NTX, NTY)>>>(   \
-          input_features.data<T>(), d_input_features.data<T>(),               \
-          d_output_features.data<T>(), noise.data<T>(), nActive, nPlanes,     \
-          nPlanes, nPlanes, alpha);                                           \
+      BatchwiseMultiplicativeDropout_bp<T, NTX, NTY><<<                       \
+          dim3(std::min((Int)16, nPlanes / NTX), 16), dim3(NTX, NTY)>>>(      \
+          input_features, d_input_features, d_output_features, noise,         \
+          nActive, nPlanes, nPlanes, nPlanes, alpha);                         \
       return;                                                                 \
     }                                                                         \
   }

-template <typename T>
-void cuda_BatchwiseMultiplicativeDropout_updateGradInput(
-    /*cuda float*/ at::Tensor input_features,
-    /*cuda float*/ at::Tensor d_input_features,
-    /*cuda float*/ at::Tensor d_output_features,
-    /*cuda float*/ at::Tensor noise, float alpha) {
-  d_input_features.resize_as_(d_output_features);
-  auto nActive = input_features.size(0);
-  auto nPlanes = input_features.size(1);
+template <typename T>
+void bmd_b(T *input_features, T *d_input_features, T *d_output_features,
+           T *noise, Int nActive, Int nPlanes, T alpha) {
   SPARSECONVNET_FOO(32, 32)
   SPARSECONVNET_FOO(24, 32)
   SPARSECONVNET_FOO(16, 64)

@@ -66,4 +93,5 @@ void cuda_BatchwiseMultiplicativeDropout_updateGradInput(
   SPARSECONVNET_FOO(4, 64)
   SPARSECONVNET_FOO(1, 64)
 }
 #undef SPARSECONVNET_FOO
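The SPARSECONVNET_FOO chain tries tile widths from 32 down to 1 until one divides nPlanes, then launches a grid of min(16, nPlanes/NTX) × 16 blocks of NTX × NTY threads. A small host-side sketch of that dispatch, using only the candidates visible in this diff (the collapsed hunks may try others):

#include <algorithm>
#include <cstdio>

// Mirrors the SPARSECONVNET_FOO dispatch: pick the first tile width NTX that
// divides nPlanes; grid.x is capped at 16 plane-tiles and grid.y is 16.
void describeLaunch(int nPlanes) {
  const int candidates[] = {32, 24, 16, 4, 1};
  for (int ntx : candidates) {
    if (nPlanes % ntx == 0) {
      int gridX = std::min(16, nPlanes / ntx);
      printf("nPlanes=%d -> NTX=%d, grid=(%d,16)\n", nPlanes, ntx, gridX);
      return;
    }
  }
}

int main() {
  describeLaunch(96); // -> NTX=32, grid=(3,16)
  describeLaunch(20); // -> NTX=4, grid=(5,16)
}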