Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
SparseConvNet
Commits
66986767
Commit
66986767
authored
Jul 31, 2018
by
Benjamin Thomas Graham
Browse files
fixes
parent
edf89af3
Changes
8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
425 additions
and
411 deletions
+425
-411
sparseconvnet/SCN/CPU/IOLayers.cpp
sparseconvnet/SCN/CPU/IOLayers.cpp
+8
-8
sparseconvnet/SCN/CUDA/Convolution.cpp
sparseconvnet/SCN/CUDA/Convolution.cpp
+6
-7
sparseconvnet/SCN/CUDA/Convolution.cu
sparseconvnet/SCN/CUDA/Convolution.cu
+181
-182
sparseconvnet/SCN/CUDA/Deconvolution.cpp
sparseconvnet/SCN/CUDA/Deconvolution.cpp
+1
-1
sparseconvnet/SCN/CUDA/Deconvolution.cu
sparseconvnet/SCN/CUDA/Deconvolution.cu
+161
-157
sparseconvnet/SCN/cuda.cu
sparseconvnet/SCN/cuda.cu
+55
-55
sparseconvnet/__init__.py
sparseconvnet/__init__.py
+1
-1
sparseconvnet/activations.py
sparseconvnet/activations.py
+12
-0
No files found.
sparseconvnet/SCN/CPU/IOLayers.cpp
View file @
66986767
...
@@ -17,14 +17,14 @@ void InputLayer_ForwardPass(T *input_features, T *output_features, Int nRows,
...
@@ -17,14 +17,14 @@ void InputLayer_ForwardPass(T *input_features, T *output_features, Int nRows,
for
(
row
=
0
;
row
<
nRows
;
row
++
)
{
for
(
row
=
0
;
row
<
nRows
;
row
++
)
{
auto
nActive
=
rules
[
0
];
auto
nActive
=
rules
[
0
];
T
multiplier
=
(
average
and
nActive
>
0
)
?
(
T
)
1
/
nActive
:
(
T
)
1
;
T
multiplier
=
(
average
and
nActive
>
0
)
?
(
T
)
1
/
nActive
:
(
T
)
1
;
auto
out_f
=
output_features
+
row
*
nPlanes
;
auto
r
=
rules
+
row
*
(
1
+
maxActive
);
for
(
Int
i
=
1
;
i
<=
nActive
;
++
i
)
{
for
(
Int
i
=
1
;
i
<=
nActive
;
++
i
)
{
auto
in_f
=
input_features
+
nPlanes
*
rules
[
i
]
;
auto
in_f
=
input_features
+
r
[
i
]
*
nPlanes
;
for
(
Int
plane
=
0
;
plane
<
nPlanes
;
plane
++
)
{
for
(
Int
plane
=
0
;
plane
<
nPlanes
;
plane
++
)
{
out
put_features
[
plane
]
+=
multiplier
*
in_f
[
plane
];
out
_f
[
plane
]
+=
multiplier
*
in_f
[
plane
];
}
}
}
}
output_features
+=
nPlanes
;
rules
+=
1
+
maxActive
;
}
}
}
}
template
<
typename
T
>
template
<
typename
T
>
...
@@ -36,13 +36,13 @@ void InputLayer_BackwardPass(T *d_input_features, T *d_output_features,
...
@@ -36,13 +36,13 @@ void InputLayer_BackwardPass(T *d_input_features, T *d_output_features,
for
(
row
=
0
;
row
<
nRows
;
row
++
)
{
for
(
row
=
0
;
row
<
nRows
;
row
++
)
{
auto
nActive
=
rules
[
0
];
auto
nActive
=
rules
[
0
];
T
multiplier
=
(
average
and
nActive
>
0
)
?
(
T
)
1
/
nActive
:
(
T
)
1
;
T
multiplier
=
(
average
and
nActive
>
0
)
?
(
T
)
1
/
nActive
:
(
T
)
1
;
auto
d_out_f
=
d_output_features
+
row
*
nPlanes
;
auto
r
=
rules
+
row
*
(
1
+
maxActive
);
for
(
Int
i
=
1
;
i
<=
nActive
;
++
i
)
{
for
(
Int
i
=
1
;
i
<=
nActive
;
++
i
)
{
auto
d_in_f
=
d_input_features
+
nPlanes
*
rules
[
i
]
;
auto
d_in_f
=
d_input_features
+
r
[
i
]
*
nPlanes
;
for
(
Int
plane
=
0
;
plane
<
nPlanes
;
plane
++
)
for
(
Int
plane
=
0
;
plane
<
nPlanes
;
plane
++
)
d_in_f
[
plane
]
+=
multiplier
*
d_out
put_features
[
plane
];
d_in_f
[
plane
]
+=
multiplier
*
d_out
_f
[
plane
];
}
}
d_output_features
+=
nPlanes
;
rules
+=
1
+
maxActive
;
}
}
}
}
...
...
sparseconvnet/SCN/CUDA/Convolution.cpp
View file @
66986767
...
@@ -5,10 +5,9 @@
...
@@ -5,10 +5,9 @@
// LICENSE file in the root directory of this source tree.
// LICENSE file in the root directory of this source tree.
template
<
typename
T
>
template
<
typename
T
>
void
Convolution_fp_bias
(
T
*
o
f
,
T
*
b
,
Int
nPlanes
,
Int
nActive
Out
);
void
Convolution_fp_bias
(
T
*
o
F
,
T
*
b
,
Int
nPlanes
,
Int
nActive
);
template
<
typename
T
>
template
<
typename
T
>
void
Convolution_bp_bias
(
T
*
matrix
,
T
*
target
,
Int
nRows
,
Int
nColumns
,
void
Convolution_bp_bias
(
T
*
d_oF
,
T
*
d_b
,
Int
nPlanes
,
Int
nActive
);
Int
nCOLUMNS
);
template
<
typename
T
>
template
<
typename
T
>
double
dConvolution_forward2
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
double
dConvolution_forward2
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
RuleBook
_rules
,
Int
input_nPlanes
,
RuleBook
_rules
,
Int
input_nPlanes
,
...
@@ -84,7 +83,7 @@ void cuda_Convolution_backward(
...
@@ -84,7 +83,7 @@ void cuda_Convolution_backward(
if
(
d_bias
.
numel
())
{
if
(
d_bias
.
numel
())
{
auto
db
=
d_bias
.
data
<
T
>
();
auto
db
=
d_bias
.
data
<
T
>
();
Convolution_bp_bias
(
doF
,
db
,
op
,
op
,
nActiveOut
);
Convolution_bp_bias
(
doF
,
db
,
op
,
nActiveOut
);
}
}
}
}
}
}
...
@@ -147,7 +146,7 @@ void cuda_SubmanifoldConvolution_backward(
...
@@ -147,7 +146,7 @@ void cuda_SubmanifoldConvolution_backward(
if
(
d_bias
.
numel
())
{
if
(
d_bias
.
numel
())
{
auto
db
=
d_bias
.
data
<
T
>
();
auto
db
=
d_bias
.
data
<
T
>
();
Convolution_bp_bias
(
doF
,
db
,
op
,
op
,
nActive
);
Convolution_bp_bias
(
doF
,
db
,
op
,
nActive
);
}
}
}
}
}
}
...
@@ -216,7 +215,7 @@ void cuda_FullConvolution_backward(
...
@@ -216,7 +215,7 @@ void cuda_FullConvolution_backward(
if
(
d_bias
.
numel
())
{
if
(
d_bias
.
numel
())
{
auto
db
=
d_bias
.
data
<
T
>
();
auto
db
=
d_bias
.
data
<
T
>
();
Convolution_bp_bias
(
doF
,
db
,
op
,
op
,
nActiveOut
);
Convolution_bp_bias
(
doF
,
db
,
op
,
nActiveOut
);
}
}
}
}
}
}
...
@@ -283,7 +282,7 @@ void cuda_RandomizedStrideConvolution_backward(
...
@@ -283,7 +282,7 @@ void cuda_RandomizedStrideConvolution_backward(
if
(
d_bias
.
numel
())
{
if
(
d_bias
.
numel
())
{
auto
db
=
d_bias
.
data
<
T
>
();
auto
db
=
d_bias
.
data
<
T
>
();
Convolution_bp_bias
(
doF
,
db
,
op
,
op
,
nActiveOut
);
Convolution_bp_bias
(
doF
,
db
,
op
,
nActiveOut
);
}
}
}
}
}
}
sparseconvnet/SCN/CUDA/Convolution.cu
View file @
66986767
...
@@ -5,10 +5,11 @@
...
@@ -5,10 +5,11 @@
// LICENSE file in the root directory of this source tree.
// LICENSE file in the root directory of this source tree.
#include "RuleBookIterator.h"
#include "RuleBookIterator.h"
#define TACC double
template
<
typename
T
>
template
<
typename
T
>
__global__
void
Convolution_fp_bias_
(
T
*
output_features
,
T
*
bias
,
Int
nPlanes
,
__global__
void
Convolution_fp_bias_
(
T
*
output_features
,
T
*
bias
,
Int
nPlanes
,
Int
nActive
)
{
Int
nActive
)
{
Int
n
=
blockIdx
.
x
*
32
+
threadIdx
.
x
;
Int
n
=
blockIdx
.
x
*
32
+
threadIdx
.
x
;
T
b
=
bias
[
n
];
T
b
=
bias
[
n
];
output_features
+=
n
;
output_features
+=
n
;
...
@@ -21,41 +22,38 @@ template <typename T>
...
@@ -21,41 +22,38 @@ template <typename T>
void
Convolution_fp_bias
(
T
*
oF
,
T
*
b
,
Int
nPlanes
,
Int
nActive
)
{
void
Convolution_fp_bias
(
T
*
oF
,
T
*
b
,
Int
nPlanes
,
Int
nActive
)
{
if
(
nPlanes
/
32
>
0
)
if
(
nPlanes
/
32
>
0
)
Convolution_fp_bias_
<<<
dim3
(
nPlanes
/
32
,
4096
),
32
>>>
(
oF
,
b
,
nPlanes
,
Convolution_fp_bias_
<<<
dim3
(
nPlanes
/
32
,
4096
),
32
>>>
(
oF
,
b
,
nPlanes
,
nActive
);
nActive
);
if
(
nPlanes
%
32
>
0
)
{
if
(
nPlanes
%
32
>
0
)
{
Int
o
=
nPlanes
/
32
*
32
;
Int
o
=
nPlanes
/
32
*
32
;
Convolution_fp_bias_
<<<
dim3
(
1
,
4096
),
nPlanes
-
o
>>>
(
oF
+
o
,
b
+
o
,
nPlanes
,
Convolution_fp_bias_
<<<
dim3
(
1
,
4096
),
nPlanes
-
o
>>>
(
oF
+
o
,
b
+
o
,
nPlanes
,
nActive
);
nActive
);
}
}
}
}
template
<
typename
T
>
template
<
typename
T
>
__global__
void
dColumnSum
(
T
*
matrix
,
T
*
target
,
Int
n
Row
s
,
Int
n
Columns
,
__global__
void
Convolution_bp_bias_
(
T
*
d_oF
,
T
*
d_b
,
Int
n
Plane
s
,
Int
n
Active
)
{
Int
nCOLUMNS
)
{
Int
n
=
blockIdx
.
x
*
32
+
threadIdx
.
x
;
Int
i
=
blockIdx
.
x
*
32
+
threadIdx
.
x
;
d_oF
+=
n
;
T
t
=
0
;
T
ACC
t
=
0
;
for
(
Int
j
=
blockIdx
.
y
;
j
<
n
Rows
;
j
+=
32
)
for
(
Int
row
=
blockIdx
.
y
;
row
<
n
Active
;
row
+=
gridDim
.
y
)
t
+=
matrix
[
j
*
nCOLUMNS
+
i
];
t
+=
d_oF
[
row
*
nPlanes
];
atomicAdd
(
&
target
[
i
],
t
);
atomicAdd
(
&
d_b
[
n
],
t
);
}
}
template
<
typename
T
>
template
<
typename
T
>
void
Convolution_bp_bias
(
T
*
matrix
,
T
*
target
,
Int
nRows
,
Int
nColumns
,
void
Convolution_bp_bias
(
T
*
d_oF
,
T
*
d_b
,
Int
nPlanes
,
Int
nActive
)
{
Int
nCOLUMNS
)
{
if
(
nPlanes
/
32
>
0
)
if
(
nColumns
/
32
>
0
)
Convolution_bp_bias_
<<<
dim3
(
nPlanes
/
32
,
32
),
32
>>>
(
d_oF
,
d_b
,
nPlanes
,
nActive
);
dColumnSum
<<<
dim3
(
nColumns
/
32
,
32
),
32
>>>
(
matrix
,
target
,
nRows
,
nColumns
,
if
(
nPlanes
%
32
>
0
)
{
nCOLUMNS
);
Int
o
=
nPlanes
/
32
*
32
;
if
(
nColumns
%
32
>
0
)
{
Convolution_bp_bias_
<<<
dim3
(
1
,
32
),
nPlanes
-
o
>>>
(
d_oF
+
o
,
d_b
+
o
,
nPlanes
,
nActive
);
Int
o
=
nColumns
/
32
*
32
;
dColumnSum
<<<
dim3
(
1
,
32
),
nColumns
-
o
>>>
(
matrix
+
o
,
target
+
o
,
nRows
,
nColumns
,
nCOLUMNS
);
}
}
}
}
template
<
typename
T
,
Int
K
,
Int
V
>
template
<
typename
T
,
Int
K
,
Int
V
>
__global__
void
__global__
void
dConvolution_KMxKN_forwardA
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
Int
*
rules
,
dConvolution_KMxKN_forwardA
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
Int
output_nPlanes
,
Int
output_stride
)
{
// nHot must be a multiple of K!!
// nHot must be a multiple of K!!
// Input x Weight -> Output
// Input x Weight -> Output
...
@@ -70,7 +68,7 @@ dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules,
...
@@ -70,7 +68,7 @@ dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules,
outFeatures
+=
n
*
K
;
outFeatures
+=
n
*
K
;
w
+=
n
*
K
;
w
+=
n
*
K
;
T
O
[
V
];
T
ACC
O
[
V
];
__shared__
T
W
[
K
][
K
];
__shared__
T
W
[
K
][
K
];
__shared__
T
I
[
K
][
K
];
__shared__
T
I
[
K
][
K
];
Int
R0
[
V
];
Int
R0
[
V
];
...
@@ -90,31 +88,31 @@ dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules,
...
@@ -90,31 +88,31 @@ dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules,
for
(
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
for
(
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
R0
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])];
R0
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])];
R1
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])
+
1
];
R1
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])
+
1
];
}
}
__syncthreads
();
__syncthreads
();
// Read input, reset O[]
// Read input, reset O[]
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
I
[
ty
[
v
]][
tx
]
=
inFeatures
[
R0
[
v
]
*
input_stride
+
tx
];
I
[
ty
[
v
]][
tx
]
=
inFeatures
[
R0
[
v
]
*
input_stride
+
tx
];
O
[
v
]
=
0
;
O
[
v
]
=
0
;
}
}
__syncthreads
();
__syncthreads
();
#pragma unroll
#pragma unroll
for
(
int
k
=
0
;
k
<
K
;
k
++
)
for
(
int
k
=
0
;
k
<
K
;
k
++
)
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
for
(
int
v
=
0
;
v
<
V
;
v
++
)
O
[
v
]
+=
I
[
ty
[
v
]][
k
]
*
W
[
k
][
tx
];
O
[
v
]
+=
I
[
ty
[
v
]][
k
]
*
W
[
k
][
tx
];
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
for
(
int
v
=
0
;
v
<
V
;
v
++
)
O
[
v
]
+=
outFeatures
[
R1
[
v
]
*
output_stride
+
tx
];
O
[
v
]
+=
outFeatures
[
R1
[
v
]
*
output_stride
+
tx
];
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
for
(
int
v
=
0
;
v
<
V
;
v
++
)
outFeatures
[
R1
[
v
]
*
output_stride
+
tx
]
=
O
[
v
];
outFeatures
[
R1
[
v
]
*
output_stride
+
tx
]
=
O
[
v
];
__syncthreads
();
__syncthreads
();
}
}
w
+=
K
*
output_nPlanes
;
w
+=
K
*
output_nPlanes
;
...
@@ -124,8 +122,8 @@ dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules,
...
@@ -124,8 +122,8 @@ dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules,
template
<
typename
T
,
Int
K
,
Int
V
>
template
<
typename
T
,
Int
K
,
Int
V
>
__global__
void
__global__
void
dConvolution_KMxKN_forwardB
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
Int
*
rules
,
dConvolution_KMxKN_forwardB
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
Int
output_nPlanes
,
Int
output_stride
)
{
// Input x Weight -> Output
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// K is a multiple of V,
// K is a multiple of V,
...
@@ -138,7 +136,7 @@ dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules,
...
@@ -138,7 +136,7 @@ dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules,
outFeatures
+=
n
*
K
;
outFeatures
+=
n
*
K
;
w
+=
n
*
K
;
w
+=
n
*
K
;
T
O
[
V
];
T
ACC
O
[
V
];
__shared__
T
W
[
K
][
K
];
__shared__
T
W
[
K
][
K
];
__shared__
T
I
[
K
][
K
];
__shared__
T
I
[
K
][
K
];
Int
R0
[
V
];
Int
R0
[
V
];
...
@@ -158,36 +156,36 @@ dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules,
...
@@ -158,36 +156,36 @@ dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules,
for
(
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
for
(
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
if
(
s
+
ty
[
v
]
<
nHot
)
{
if
(
s
+
ty
[
v
]
<
nHot
)
{
R0
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])];
R0
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])];
R1
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])
+
1
];
R1
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])
+
1
];
}
}
}
}
__syncthreads
();
__syncthreads
();
// Read input, reset O[]
// Read input, reset O[]
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
if
(
s
+
ty
[
v
]
<
nHot
)
if
(
s
+
ty
[
v
]
<
nHot
)
I
[
ty
[
v
]][
tx
]
=
inFeatures
[
R0
[
v
]
*
input_stride
+
tx
];
I
[
ty
[
v
]][
tx
]
=
inFeatures
[
R0
[
v
]
*
input_stride
+
tx
];
O
[
v
]
=
0
;
O
[
v
]
=
0
;
}
}
__syncthreads
();
__syncthreads
();
#pragma unroll
#pragma unroll
for
(
int
k
=
0
;
k
<
K
;
k
++
)
for
(
int
k
=
0
;
k
<
K
;
k
++
)
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
for
(
int
v
=
0
;
v
<
V
;
v
++
)
O
[
v
]
+=
I
[
ty
[
v
]][
k
]
*
W
[
k
][
tx
];
O
[
v
]
+=
I
[
ty
[
v
]][
k
]
*
W
[
k
][
tx
];
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
for
(
int
v
=
0
;
v
<
V
;
v
++
)
if
(
s
+
ty
[
v
]
<
nHot
)
if
(
s
+
ty
[
v
]
<
nHot
)
O
[
v
]
+=
outFeatures
[
R1
[
v
]
*
output_stride
+
tx
];
O
[
v
]
+=
outFeatures
[
R1
[
v
]
*
output_stride
+
tx
];
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
for
(
int
v
=
0
;
v
<
V
;
v
++
)
if
(
s
+
ty
[
v
]
<
nHot
)
if
(
s
+
ty
[
v
]
<
nHot
)
outFeatures
[
R1
[
v
]
*
output_stride
+
tx
]
=
O
[
v
];
outFeatures
[
R1
[
v
]
*
output_stride
+
tx
]
=
O
[
v
];
__syncthreads
();
__syncthreads
();
}
}
w
+=
K
*
output_nPlanes
;
w
+=
K
*
output_nPlanes
;
...
@@ -200,24 +198,24 @@ dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules,
...
@@ -200,24 +198,24 @@ dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules,
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
Int o = (nHot / K) * K; \
Int o = (nHot / K) * K; \
if (o >= K) \
if (o >= K) \
dConvolution_KMxKN_forwardA< \
dConvolution_KMxKN_forwardA< \
T, K, V><<<dim3(std::min(o / K, (Int)512), output_nPlanes / K), \
T, K, V><<<dim3(std::min(o / K, (Int)512), output_nPlanes / K), \
dim3(K, K / V)>>>(inFeatures, outFeatures, w, rules, o, \
dim3(K, K / V)>>>(inFeatures, outFeatures, w, rules, o, \
input_nPlanes, input_stride, \
input_nPlanes, input_stride, \
output_nPlanes, output_stride); \
output_nPlanes, output_stride); \
if (nHot > o) \
if (nHot > o) \
dConvolution_KMxKN_forwardB< \
dConvolution_KMxKN_forwardB< \
T, K, V><<<dim3(1, output_nPlanes / K), dim3(K, K / V)>>>( \
T, K, V><<<dim3(1, output_nPlanes / K), dim3(K, K / V)>>>( \
inFeatures, outFeatures, w, rules + 2 * o, nHot - o, \
inFeatures, outFeatures, w, rules + 2 * o, nHot - o, \
input_nPlanes, input_stride, output_nPlanes, output_stride); \
input_nPlanes, input_stride, output_nPlanes, output_stride); \
return; \
return; \
} \
} \
}
}
template
<
typename
T
>
template
<
typename
T
>
void
dConvolution_forward
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
Int
*
rules
,
void
dConvolution_forward
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
Int
output_nPlanes
,
Int
output_stride
)
{
FOO
(
T
,
64
,
16
)
FOO
(
T
,
64
,
16
)
FOO
(
T
,
32
,
8
)
FOO
(
T
,
32
,
8
)
FOO
(
T
,
16
,
4
)
FOO
(
T
,
16
,
4
)
...
@@ -226,9 +224,9 @@ void dConvolution_forward(T *inFeatures, T *outFeatures, T *w, Int *rules,
...
@@ -226,9 +224,9 @@ void dConvolution_forward(T *inFeatures, T *outFeatures, T *w, Int *rules,
}
}
template
<
>
template
<
>
void
dConvolution_forward
<
double
>
(
double
*
inFeatures
,
double
*
outFeatures
,
void
dConvolution_forward
<
double
>
(
double
*
inFeatures
,
double
*
outFeatures
,
double
*
w
,
Int
*
rules
,
Int
nHot
,
double
*
w
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
Int
output_nPlanes
,
Int
output_stride
)
{
FOO
(
double
,
32
,
8
)
FOO
(
double
,
32
,
8
)
FOO
(
double
,
16
,
4
)
FOO
(
double
,
16
,
4
)
FOO
(
double
,
8
,
2
)
FOO
(
double
,
8
,
2
)
...
@@ -242,9 +240,9 @@ void dConvolution_forward<double>(double *inFeatures, double *outFeatures,
...
@@ -242,9 +240,9 @@ void dConvolution_forward<double>(double *inFeatures, double *outFeatures,
template
<
typename
T
,
Int
K
,
Int
V
>
template
<
typename
T
,
Int
K
,
Int
V
>
__global__
void
__global__
void
dConvolution_KMxKN_backward_dW_A
(
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
dConvolution_KMxKN_backward_dW_A
(
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
T
*
w
,
T
*
dw
,
Int
*
rules
,
Int
nHot
,
T
*
w
,
T
*
dw
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
Int
output_nPlanes
,
Int
output_stride
)
{
// M = gridDim.y == input_nPlanes / K
// M = gridDim.y == input_nPlanes / K
Int
N
=
output_nPlanes
/
K
;
Int
N
=
output_nPlanes
/
K
;
Int
m
=
blockIdx
.
y
;
Int
m
=
blockIdx
.
y
;
...
@@ -253,8 +251,8 @@ dConvolution_KMxKN_backward_dW_A(T *inFeatures, T *dInFeatures, T *dOutFeatures,
...
@@ -253,8 +251,8 @@ dConvolution_KMxKN_backward_dW_A(T *inFeatures, T *dInFeatures, T *dOutFeatures,
w
+=
m
*
K
*
output_nPlanes
;
w
+=
m
*
K
*
output_nPlanes
;
dw
+=
m
*
K
*
output_nPlanes
;
dw
+=
m
*
K
*
output_nPlanes
;
T
dI
[
V
];
T
ACC
dI
[
V
];
T
dW
[
V
];
T
ACC
dW
[
V
];
__shared__
T
I
[
K
][
K
];
__shared__
T
I
[
K
][
K
];
__shared__
T
dO
[
K
][
K
];
__shared__
T
dO
[
K
][
K
];
__shared__
T
W
[
K
][
K
];
__shared__
T
W
[
K
][
K
];
...
@@ -277,31 +275,31 @@ dConvolution_KMxKN_backward_dW_A(T *inFeatures, T *dInFeatures, T *dOutFeatures,
...
@@ -277,31 +275,31 @@ dConvolution_KMxKN_backward_dW_A(T *inFeatures, T *dInFeatures, T *dOutFeatures,
for
(
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
for
(
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
R0
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])];
R0
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])];
R1
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])
+
1
];
R1
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])
+
1
];
dI
[
v
]
=
0
;
dI
[
v
]
=
0
;
}
}
__syncthreads
();
__syncthreads
();
// Read input and dOutput
// Read input and dOutput
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
I
[
ty
[
v
]][
tx
]
=
inFeatures
[
R0
[
v
]
*
input_stride
+
tx
];
I
[
ty
[
v
]][
tx
]
=
inFeatures
[
R0
[
v
]
*
input_stride
+
tx
];
dO
[
ty
[
v
]][
tx
]
=
dOutFeatures
[
R1
[
v
]
*
output_stride
+
tx
];
dO
[
ty
[
v
]][
tx
]
=
dOutFeatures
[
R1
[
v
]
*
output_stride
+
tx
];
}
}
__syncthreads
();
__syncthreads
();
#pragma unroll
#pragma unroll
for
(
int
k
=
0
;
k
<
K
;
k
++
)
for
(
int
k
=
0
;
k
<
K
;
k
++
)
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
dI
[
v
]
+=
dO
[
ty
[
v
]][
k
]
*
W
[
tx
][
k
];
dI
[
v
]
+=
dO
[
ty
[
v
]][
k
]
*
W
[
tx
][
k
];
dW
[
v
]
+=
I
[
k
][
ty
[
v
]]
*
dO
[
k
][
tx
];
dW
[
v
]
+=
I
[
k
][
ty
[
v
]]
*
dO
[
k
][
tx
];
}
}
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
for
(
int
v
=
0
;
v
<
V
;
v
++
)
dI
[
v
]
+=
dInFeatures
[
R0
[
v
]
*
input_stride
+
tx
];
dI
[
v
]
+=
dInFeatures
[
R0
[
v
]
*
input_stride
+
tx
];
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
for
(
int
v
=
0
;
v
<
V
;
v
++
)
dInFeatures
[
R0
[
v
]
*
input_stride
+
tx
]
=
dI
[
v
];
dInFeatures
[
R0
[
v
]
*
input_stride
+
tx
]
=
dI
[
v
];
__syncthreads
();
__syncthreads
();
}
}
#pragma unroll
#pragma unroll
...
@@ -319,9 +317,9 @@ dConvolution_KMxKN_backward_dW_A(T *inFeatures, T *dInFeatures, T *dOutFeatures,
...
@@ -319,9 +317,9 @@ dConvolution_KMxKN_backward_dW_A(T *inFeatures, T *dInFeatures, T *dOutFeatures,
template
<
typename
T
,
Int
K
,
Int
V
>
template
<
typename
T
,
Int
K
,
Int
V
>
__global__
void
__global__
void
dConvolution_KMxKN_backward_dW_B
(
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
dConvolution_KMxKN_backward_dW_B
(
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
T
*
w
,
T
*
dw
,
Int
*
rules
,
Int
nHot
,
T
*
w
,
T
*
dw
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
Int
output_nPlanes
,
Int
output_stride
)
{
// M = gridDim.y == input_nPlanes / K
// M = gridDim.y == input_nPlanes / K
Int
N
=
output_nPlanes
/
K
;
Int
N
=
output_nPlanes
/
K
;
Int
m
=
blockIdx
.
y
;
Int
m
=
blockIdx
.
y
;
...
@@ -330,8 +328,8 @@ dConvolution_KMxKN_backward_dW_B(T *inFeatures, T *dInFeatures, T *dOutFeatures,
...
@@ -330,8 +328,8 @@ dConvolution_KMxKN_backward_dW_B(T *inFeatures, T *dInFeatures, T *dOutFeatures,
w
+=
m
*
K
*
output_nPlanes
;
w
+=
m
*
K
*
output_nPlanes
;
dw
+=
m
*
K
*
output_nPlanes
;
dw
+=
m
*
K
*
output_nPlanes
;
T
dI
[
V
];
T
ACC
dI
[
V
];
T
dW
[
V
];
T
ACC
dW
[
V
];
__shared__
T
I
[
K
][
K
];
__shared__
T
I
[
K
][
K
];
__shared__
T
dO
[
K
][
K
];
__shared__
T
dO
[
K
][
K
];
__shared__
T
W
[
K
][
K
];
__shared__
T
W
[
K
][
K
];
...
@@ -354,39 +352,39 @@ dConvolution_KMxKN_backward_dW_B(T *inFeatures, T *dInFeatures, T *dOutFeatures,
...
@@ -354,39 +352,39 @@ dConvolution_KMxKN_backward_dW_B(T *inFeatures, T *dInFeatures, T *dOutFeatures,
for
(
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
for
(
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
if
(
s
+
ty
[
v
]
<
nHot
)
{
if
(
s
+
ty
[
v
]
<
nHot
)
{
R0
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])];
R0
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])];
R1
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])
+
1
];
R1
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])
+
1
];
}
}
dI
[
v
]
=
0
;
dI
[
v
]
=
0
;
}
}
__syncthreads
();
__syncthreads
();
// Read input and dOutput
// Read input and dOutput
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
for
(
int
v
=
0
;
v
<
V
;
v
++
)
if
(
s
+
ty
[
v
]
<
nHot
)
{
if
(
s
+
ty
[
v
]
<
nHot
)
{
I
[
ty
[
v
]][
tx
]
=
inFeatures
[
R0
[
v
]
*
input_stride
+
tx
];
I
[
ty
[
v
]][
tx
]
=
inFeatures
[
R0
[
v
]
*
input_stride
+
tx
];
dO
[
ty
[
v
]][
tx
]
=
dOutFeatures
[
R1
[
v
]
*
output_stride
+
tx
];
dO
[
ty
[
v
]][
tx
]
=
dOutFeatures
[
R1
[
v
]
*
output_stride
+
tx
];
}
else
{
}
else
{
I
[
ty
[
v
]][
tx
]
=
0
;
I
[
ty
[
v
]][
tx
]
=
0
;
dO
[
ty
[
v
]][
tx
]
=
0
;
dO
[
ty
[
v
]][
tx
]
=
0
;
}
}
__syncthreads
();
__syncthreads
();
#pragma unroll
#pragma unroll
for
(
int
k
=
0
;
k
<
K
;
k
++
)
for
(
int
k
=
0
;
k
<
K
;
k
++
)
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
dI
[
v
]
+=
dO
[
ty
[
v
]][
k
]
*
W
[
tx
][
k
];
dI
[
v
]
+=
dO
[
ty
[
v
]][
k
]
*
W
[
tx
][
k
];
dW
[
v
]
+=
I
[
k
][
ty
[
v
]]
*
dO
[
k
][
tx
];
dW
[
v
]
+=
I
[
k
][
ty
[
v
]]
*
dO
[
k
][
tx
];
}
}
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
for
(
int
v
=
0
;
v
<
V
;
v
++
)
if
(
s
+
ty
[
v
]
<
nHot
)
if
(
s
+
ty
[
v
]
<
nHot
)
dI
[
v
]
+=
dInFeatures
[
R0
[
v
]
*
input_stride
+
tx
];
dI
[
v
]
+=
dInFeatures
[
R0
[
v
]
*
input_stride
+
tx
];
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
for
(
int
v
=
0
;
v
<
V
;
v
++
)
if
(
s
+
ty
[
v
]
<
nHot
)
if
(
s
+
ty
[
v
]
<
nHot
)
dInFeatures
[
R0
[
v
]
*
input_stride
+
tx
]
=
dI
[
v
];
dInFeatures
[
R0
[
v
]
*
input_stride
+
tx
]
=
dI
[
v
];
__syncthreads
();
__syncthreads
();
}
}
#pragma unroll
#pragma unroll
...
@@ -403,26 +401,26 @@ dConvolution_KMxKN_backward_dW_B(T *inFeatures, T *dInFeatures, T *dOutFeatures,
...
@@ -403,26 +401,26 @@ dConvolution_KMxKN_backward_dW_B(T *inFeatures, T *dInFeatures, T *dOutFeatures,
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
Int o = (nHot / K) * K; \
Int o = (nHot / K) * K; \
if (o >= K) \
if (o >= K) \
dConvolution_KMxKN_backward_dW_A< \
dConvolution_KMxKN_backward_dW_A< \
T, K, V><<<dim3(std::min(o / K, (Int)512), input_nPlanes / K), \
T, K, V><<<dim3(std::min(o / K, (Int)512), input_nPlanes / K), \
dim3(K, K / V)>>>( \
dim3(K, K / V)>>>( \
inFeatures, dInFeatures, dOutFeatures, w, dw, rules, o, \
inFeatures, dInFeatures, dOutFeatures, w, dw, rules, o, \
input_nPlanes, input_stride, output_nPlanes, output_stride); \
input_nPlanes, input_stride, output_nPlanes, output_stride); \
if (nHot > o) \
if (nHot > o) \
dConvolution_KMxKN_backward_dW_B< \
dConvolution_KMxKN_backward_dW_B< \
T, K, V><<<dim3(1, input_nPlanes / K), dim3(K, K / V)>>>( \
T, K, V><<<dim3(1, input_nPlanes / K), dim3(K, K / V)>>>( \
inFeatures, dInFeatures, dOutFeatures, w, dw, rules + 2 * o, \
inFeatures, dInFeatures, dOutFeatures, w, dw, rules + 2 * o, \
nHot - o, input_nPlanes, input_stride, output_nPlanes, \
nHot - o, input_nPlanes, input_stride, output_nPlanes, \
output_stride); \
output_stride); \
return; \
return; \
} \
} \
}
}
template
<
typename
T
>
template
<
typename
T
>
void
dConvolution_backward_dW
(
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
void
dConvolution_backward_dW
(
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
T
*
w
,
T
*
dw
,
Int
*
rules
,
Int
nHot
,
T
*
w
,
T
*
dw
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
Int
output_nPlanes
,
Int
output_stride
)
{
FOO
(
T
,
32
,
8
)
FOO
(
T
,
32
,
8
)
FOO
(
T
,
16
,
4
)
FOO
(
T
,
16
,
4
)
FOO
(
T
,
8
,
2
)
FOO
(
T
,
8
,
2
)
...
@@ -433,8 +431,8 @@ void dConvolution_backward_dW(T *inFeatures, T *dInFeatures, T *dOutFeatures,
...
@@ -433,8 +431,8 @@ void dConvolution_backward_dW(T *inFeatures, T *dInFeatures, T *dOutFeatures,
template
<
typename
T
,
Int
K
,
Int
V
>
template
<
typename
T
,
Int
K
,
Int
V
>
__global__
void
__global__
void
dConvolution_KMxKN_forward2
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
Int
*
rules
,
dConvolution_KMxKN_forward2
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
Int
output_nPlanes
,
Int
output_stride
)
{
// Input x Weight -> Output
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// K is a multiple of V,
// K is a multiple of V,
...
@@ -449,7 +447,7 @@ dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
...
@@ -449,7 +447,7 @@ dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
w
+=
n
*
K
;
w
+=
n
*
K
;
Int
KO
=
min
(
K
,
output_nPlanes
-
K
*
n
);
Int
KO
=
min
(
K
,
output_nPlanes
-
K
*
n
);
T
O
[
V
];
T
ACC
O
[
V
];
__shared__
T
W
[
K
][
K
];
__shared__
T
W
[
K
][
K
];
__shared__
T
I
[
K
][
K
];
__shared__
T
I
[
K
][
K
];
__shared__
Int
R
[
K
*
2
];
__shared__
Int
R
[
K
*
2
];
...
@@ -466,40 +464,40 @@ dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
...
@@ -466,40 +464,40 @@ dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
for
(
int
v
=
0
;
v
<
V
;
v
++
)
if
(
ty
[
v
]
<
KI
and
tx
<
KO
)
if
(
ty
[
v
]
<
KI
and
tx
<
KO
)
W
[
ty
[
v
]][
tx
]
=
w
[
ty
[
v
]
*
output_nPlanes
+
tx
];
W
[
ty
[
v
]][
tx
]
=
w
[
ty
[
v
]
*
output_nPlanes
+
tx
];
for
(
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
for
(
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
// Read rules for K input/output pairs
// Read rules for K input/output pairs
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
if
(
ty
[
v
]
<
2
)
{
if
(
ty
[
v
]
<
2
)
{
int
q
=
ty
[
v
]
*
K
+
tx
;
int
q
=
ty
[
v
]
*
K
+
tx
;
if
(
s
+
q
/
2
<
nHot
)
if
(
s
+
q
/
2
<
nHot
)
R
[
q
]
=
rules
[
2
*
s
+
q
];
R
[
q
]
=
rules
[
2
*
s
+
q
];
}
}
}
}
__syncthreads
();
__syncthreads
();
// Read input, reset O[]
// Read input, reset O[]
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
if
(
tx
<
KI
and
s
+
ty
[
v
]
<
nHot
)
if
(
tx
<
KI
and
s
+
ty
[
v
]
<
nHot
)
I
[
ty
[
v
]][
tx
]
=
inFeatures
[
R
[
2
*
ty
[
v
]]
*
input_stride
+
tx
];
I
[
ty
[
v
]][
tx
]
=
inFeatures
[
R
[
2
*
ty
[
v
]]
*
input_stride
+
tx
];
O
[
v
]
=
0
;
O
[
v
]
=
0
;
}
}
__syncthreads
();
__syncthreads
();
#pragma unroll
#pragma unroll
for
(
int
k
=
0
;
k
<
KI
;
k
++
)
for
(
int
k
=
0
;
k
<
KI
;
k
++
)
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
for
(
int
v
=
0
;
v
<
V
;
v
++
)
O
[
v
]
+=
I
[
ty
[
v
]][
k
]
*
W
[
k
][
tx
];
O
[
v
]
+=
I
[
ty
[
v
]][
k
]
*
W
[
k
][
tx
];
__syncthreads
();
__syncthreads
();
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
for
(
int
v
=
0
;
v
<
V
;
v
++
)
if
(
tx
<
KO
and
s
+
ty
[
v
]
<
nHot
)
if
(
tx
<
KO
and
s
+
ty
[
v
]
<
nHot
)
outFeatures
[
R
[
2
*
ty
[
v
]
+
1
]
*
output_stride
+
tx
]
+=
O
[
v
];
outFeatures
[
R
[
2
*
ty
[
v
]
+
1
]
*
output_stride
+
tx
]
+=
O
[
v
];
__syncthreads
();
__syncthreads
();
}
}
w
+=
K
*
output_nPlanes
;
w
+=
K
*
output_nPlanes
;
...
@@ -513,9 +511,9 @@ dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
...
@@ -513,9 +511,9 @@ dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
template
<
typename
T
,
Int
K
,
Int
V
>
template
<
typename
T
,
Int
K
,
Int
V
>
__global__
void
__global__
void
dConvolution_KMxKN_backward_dW2
(
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
dConvolution_KMxKN_backward_dW2
(
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
T
*
w
,
T
*
dw
,
Int
*
rules
,
Int
nHot
,
T
*
w
,
T
*
dw
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
Int
output_nPlanes
,
Int
output_stride
)
{
// M = gridDim.y == input_nPlanes / K
// M = gridDim.y == input_nPlanes / K
Int
N
=
(
output_nPlanes
+
K
-
1
)
/
K
;
Int
N
=
(
output_nPlanes
+
K
-
1
)
/
K
;
Int
m
=
blockIdx
.
y
;
Int
m
=
blockIdx
.
y
;
...
@@ -525,8 +523,8 @@ dConvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
...
@@ -525,8 +523,8 @@ dConvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
dw
+=
m
*
K
*
output_nPlanes
;
dw
+=
m
*
K
*
output_nPlanes
;
Int
KI
=
min
(
K
,
input_nPlanes
-
K
*
m
);
Int
KI
=
min
(
K
,
input_nPlanes
-
K
*
m
);
T
dI
[
V
];
T
ACC
dI
[
V
];
T
dW
[
V
];
T
ACC
dW
[
V
];
__shared__
T
I
[
K
][
K
];
__shared__
T
I
[
K
][
K
];
__shared__
T
dO
[
K
][
K
];
__shared__
T
dO
[
K
][
K
];
__shared__
T
W
[
K
][
K
];
__shared__
T
W
[
K
][
K
];
...
@@ -544,7 +542,7 @@ dConvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
...
@@ -544,7 +542,7 @@ dConvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
if
(
ty
[
v
]
<
KI
and
tx
<
KO
)
if
(
ty
[
v
]
<
KI
and
tx
<
KO
)
W
[
ty
[
v
]][
tx
]
=
w
[
ty
[
v
]
*
output_nPlanes
+
tx
];
W
[
ty
[
v
]][
tx
]
=
w
[
ty
[
v
]
*
output_nPlanes
+
tx
];
dW
[
v
]
=
0
;
dW
[
v
]
=
0
;
}
}
...
@@ -552,48 +550,48 @@ dConvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
...
@@ -552,48 +550,48 @@ dConvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
// Read rules for K input/output pairs, reset dI[]
// Read rules for K input/output pairs, reset dI[]
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
if
(
ty
[
v
]
<
2
)
{
if
(
ty
[
v
]
<
2
)
{
int
q
=
ty
[
v
]
*
K
+
tx
;
int
q
=
ty
[
v
]
*
K
+
tx
;
if
(
s
+
q
/
2
<
nHot
)
if
(
s
+
q
/
2
<
nHot
)
R
[
q
]
=
rules
[
2
*
s
+
q
];
R
[
q
]
=
rules
[
2
*
s
+
q
];
}
}
dI
[
v
]
=
0
;
dI
[
v
]
=
0
;
}
}
__syncthreads
();
__syncthreads
();
// Read input and dOutput
// Read input and dOutput
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
if
(
tx
<
KI
and
s
+
ty
[
v
]
<
nHot
)
if
(
tx
<
KI
and
s
+
ty
[
v
]
<
nHot
)
I
[
ty
[
v
]][
tx
]
=
inFeatures
[
R
[
2
*
ty
[
v
]]
*
input_stride
+
tx
];
I
[
ty
[
v
]][
tx
]
=
inFeatures
[
R
[
2
*
ty
[
v
]]
*
input_stride
+
tx
];
else
else
I
[
ty
[
v
]][
tx
]
=
0
;
I
[
ty
[
v
]][
tx
]
=
0
;
if
(
tx
<
KO
and
s
+
ty
[
v
]
<
nHot
)
if
(
tx
<
KO
and
s
+
ty
[
v
]
<
nHot
)
dO
[
ty
[
v
]][
tx
]
=
dOutFeatures
[
R
[
2
*
ty
[
v
]
+
1
]
*
output_stride
+
tx
];
dO
[
ty
[
v
]][
tx
]
=
dOutFeatures
[
R
[
2
*
ty
[
v
]
+
1
]
*
output_stride
+
tx
];
else
else
dO
[
ty
[
v
]][
tx
]
=
0
;
dO
[
ty
[
v
]][
tx
]
=
0
;
}
}
__syncthreads
();
__syncthreads
();
#pragma unroll
#pragma unroll
for
(
int
k
=
0
;
k
<
KO
;
k
++
)
for
(
int
k
=
0
;
k
<
KO
;
k
++
)
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
for
(
int
v
=
0
;
v
<
V
;
v
++
)
dI
[
v
]
+=
dO
[
ty
[
v
]][
k
]
*
W
[
tx
][
k
];
dI
[
v
]
+=
dO
[
ty
[
v
]][
k
]
*
W
[
tx
][
k
];
#pragma unroll
#pragma unroll
for
(
int
k
=
0
;
k
<
K
;
k
++
)
for
(
int
k
=
0
;
k
<
K
;
k
++
)
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
for
(
int
v
=
0
;
v
<
V
;
v
++
)
dW
[
v
]
+=
I
[
k
][
ty
[
v
]]
*
dO
[
k
][
tx
];
dW
[
v
]
+=
I
[
k
][
ty
[
v
]]
*
dO
[
k
][
tx
];
__syncthreads
();
__syncthreads
();
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
for
(
int
v
=
0
;
v
<
V
;
v
++
)
if
(
tx
<
KI
and
s
+
ty
[
v
]
<
nHot
)
if
(
tx
<
KI
and
s
+
ty
[
v
]
<
nHot
)
dInFeatures
[
R
[
2
*
ty
[
v
]]
*
input_stride
+
tx
]
+=
dI
[
v
];
dInFeatures
[
R
[
2
*
ty
[
v
]]
*
input_stride
+
tx
]
+=
dI
[
v
];
__syncthreads
();
__syncthreads
();
}
}
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
for
(
int
v
=
0
;
v
<
V
;
v
++
)
if
(
ty
[
v
]
<
KI
and
tx
<
KO
)
if
(
ty
[
v
]
<
KI
and
tx
<
KO
)
atomicAdd
(
&
dw
[
ty
[
v
]
*
output_nPlanes
+
tx
],
dW
[
v
]);
atomicAdd
(
&
dw
[
ty
[
v
]
*
output_nPlanes
+
tx
],
dW
[
v
]);
w
+=
K
;
w
+=
K
;
dw
+=
K
;
dw
+=
K
;
dOutFeatures
+=
K
;
dOutFeatures
+=
K
;
...
@@ -602,51 +600,52 @@ dConvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
...
@@ -602,51 +600,52 @@ dConvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
template
<
typename
T
>
template
<
typename
T
>
double
dConvolution_forward2
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
double
dConvolution_forward2
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
RuleBook
_rules
,
Int
input_nPlanes
,
RuleBook
_rules
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
Int
output_stride
)
{
Int
c
=
input_nPlanes
*
output_nPlanes
;
Int
c
=
input_nPlanes
*
output_nPlanes
;
double
flops
=
0
;
double
flops
=
0
;
if
(
input_nPlanes
%
8
!=
0
or
output_nPlanes
%
8
!=
0
)
{
if
(
input_nPlanes
%
8
!=
0
or
output_nPlanes
%
8
!=
0
)
{
const
int
K
=
16
;
const
int
K
=
16
;
const
int
V
=
4
;
const
int
V
=
4
;
RULEBOOKITERATOR
(
RULEBOOKITERATOR
(
(
dConvolution_KMxKN_forward2
<
(
dConvolution_KMxKN_forward2
<
T
,
K
,
T
,
K
,
V
><<<
dim3
(
128
,
(
output_nPlanes
+
K
-
1
)
/
K
),
dim3
(
K
,
K
/
V
)
>>>
(
V
><<<
dim3
(
128
,
(
output_nPlanes
+
K
-
1
)
/
K
),
dim3
(
K
,
K
/
V
)
>>>
(
inFeatures
,
outFeatures
,
w
,
rbB
,
nHotB
,
input_nPlanes
,
input_stride
,
inFeatures
,
outFeatures
,
w
,
rbB
,
nHotB
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
));
output_nPlanes
,
output_stride
));
,
w
+=
c
;
flops
+=
nHotB
*
c
;)
,
w
+=
c
;
flops
+=
nHotB
*
c
;)
}
else
{
}
else
{
RULEBOOKITERATOR
(
dConvolution_forward
(
inFeatures
,
outFeatures
,
w
,
rbB
,
RULEBOOKITERATOR
(
dConvolution_forward
(
inFeatures
,
outFeatures
,
w
,
rbB
,
nHotB
,
input_nPlanes
,
input_stride
,
nHotB
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
);
output_nPlanes
,
output_stride
);
,
w
+=
c
;
flops
+=
nHotB
*
c
;)
,
w
+=
c
;
flops
+=
nHotB
*
c
;)
}
}
return
flops
;
return
flops
;
}
}
template
<
typename
T
>
template
<
typename
T
>
void
dConvolution_backward_dW2
(
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
void
dConvolution_backward_dW2
(
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
T
*
w
,
T
*
dw
,
RuleBook
_rules
,
Int
input_nPlanes
,
T
*
w
,
T
*
dw
,
RuleBook
_rules
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
Int
output_stride
)
{
Int
c
=
input_nPlanes
*
output_nPlanes
;
Int
c
=
input_nPlanes
*
output_nPlanes
;
if
(
input_nPlanes
%
8
!=
0
or
output_nPlanes
%
8
!=
0
)
{
if
(
input_nPlanes
%
8
!=
0
or
output_nPlanes
%
8
!=
0
)
{
const
int
K
=
16
;
const
int
K
=
16
;
const
int
V
=
4
;
const
int
V
=
4
;
RULEBOOKITERATOR
(
RULEBOOKITERATOR
(
(
dConvolution_KMxKN_backward_dW2
<
(
dConvolution_KMxKN_backward_dW2
<
T
,
K
,
T
,
K
,
V
><<<
dim3
(
128
,
(
input_nPlanes
+
K
-
1
)
/
K
),
dim3
(
K
,
K
/
V
)
>>>
(
V
><<<
dim3
(
128
,
(
input_nPlanes
+
K
-
1
)
/
K
),
dim3
(
K
,
K
/
V
)
>>>
(
inFeatures
,
dInFeatures
,
dOutFeatures
,
w
,
dw
,
rbB
,
nHotB
,
inFeatures
,
dInFeatures
,
dOutFeatures
,
w
,
dw
,
rbB
,
nHotB
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
));
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
));
,
w
+=
c
;
dw
+=
c
;)
,
w
+=
c
;
dw
+=
c
;)
}
else
{
}
else
{
RULEBOOKITERATOR
(
dConvolution_backward_dW
(
inFeatures
,
dInFeatures
,
RULEBOOKITERATOR
(
dConvolution_backward_dW
(
inFeatures
,
dInFeatures
,
dOutFeatures
,
w
,
dw
,
rbB
,
nHotB
,
dOutFeatures
,
w
,
dw
,
rbB
,
nHotB
,
input_nPlanes
,
input_stride
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
);
output_nPlanes
,
output_stride
);
,
w
+=
c
;
dw
+=
c
;)
,
w
+=
c
;
dw
+=
c
;)
}
}
}
}
#undef TACC
\ No newline at end of file
sparseconvnet/SCN/CUDA/Deconvolution.cpp
View file @
66986767
...
@@ -78,7 +78,7 @@ void cuda_Deconvolution_backward(
...
@@ -78,7 +78,7 @@ void cuda_Deconvolution_backward(
dDeconvolution_backward_dW2
<
T
>
(
iF
,
diF
,
doF
,
w
,
dw
,
_rules
,
ip
,
ip
,
op
,
op
);
dDeconvolution_backward_dW2
<
T
>
(
iF
,
diF
,
doF
,
w
,
dw
,
_rules
,
ip
,
ip
,
op
,
op
);
if
(
d_bias
.
numel
())
{
if
(
d_bias
.
numel
())
{
auto
db
=
d_bias
.
data
<
T
>
();
auto
db
=
d_bias
.
data
<
T
>
();
Convolution_bp_bias
(
doF
,
db
,
op
,
op
,
nActiveOut
);
Convolution_bp_bias
(
doF
,
db
,
op
,
nActiveOut
);
}
}
}
}
}
}
sparseconvnet/SCN/CUDA/Deconvolution.cu
View file @
66986767
...
@@ -4,11 +4,13 @@
...
@@ -4,11 +4,13 @@
// This source code is licensed under the license found in the
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
// LICENSE file in the root directory of this source tree.
#define TACC double
template
<
typename
T
,
Int
K
,
Int
V
>
template
<
typename
T
,
Int
K
,
Int
V
>
__global__
void
__global__
void
dDeconvolution_KMxKN_forwardA
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
Int
*
rules
,
dDeconvolution_KMxKN_forwardA
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
Int
output_nPlanes
,
Int
output_stride
)
{
// nHot must be a multiple of K!!
// nHot must be a multiple of K!!
// Input x Weight -> Output
// Input x Weight -> Output
...
@@ -23,7 +25,7 @@ dDeconvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules,
...
@@ -23,7 +25,7 @@ dDeconvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules,
outFeatures
+=
n
*
K
;
outFeatures
+=
n
*
K
;
w
+=
n
*
K
;
w
+=
n
*
K
;
T
O
[
V
];
T
ACC
O
[
V
];
__shared__
T
W
[
K
][
K
];
__shared__
T
W
[
K
][
K
];
__shared__
T
I
[
K
][
K
];
__shared__
T
I
[
K
][
K
];
Int
R0
[
V
];
Int
R0
[
V
];
...
@@ -43,31 +45,31 @@ dDeconvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules,
...
@@ -43,31 +45,31 @@ dDeconvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules,
for
(
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
for
(
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
R1
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])];
R1
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])];
R0
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])
+
1
];
R0
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])
+
1
];
}
}
__syncthreads
();
__syncthreads
();
// Read input, reset O[]
// Read input, reset O[]
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
I
[
ty
[
v
]][
tx
]
=
inFeatures
[
R0
[
v
]
*
input_stride
+
tx
];
I
[
ty
[
v
]][
tx
]
=
inFeatures
[
R0
[
v
]
*
input_stride
+
tx
];
O
[
v
]
=
0
;
O
[
v
]
=
0
;
}
}
__syncthreads
();
__syncthreads
();
#pragma unroll
#pragma unroll
for
(
int
k
=
0
;
k
<
K
;
k
++
)
for
(
int
k
=
0
;
k
<
K
;
k
++
)
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
for
(
int
v
=
0
;
v
<
V
;
v
++
)
O
[
v
]
+=
I
[
ty
[
v
]][
k
]
*
W
[
k
][
tx
];
O
[
v
]
+=
I
[
ty
[
v
]][
k
]
*
W
[
k
][
tx
];
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
for
(
int
v
=
0
;
v
<
V
;
v
++
)
O
[
v
]
+=
outFeatures
[
R1
[
v
]
*
output_stride
+
tx
];
O
[
v
]
+=
outFeatures
[
R1
[
v
]
*
output_stride
+
tx
];
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
for
(
int
v
=
0
;
v
<
V
;
v
++
)
outFeatures
[
R1
[
v
]
*
output_stride
+
tx
]
=
O
[
v
];
outFeatures
[
R1
[
v
]
*
output_stride
+
tx
]
=
O
[
v
];
__syncthreads
();
__syncthreads
();
}
}
w
+=
K
*
output_nPlanes
;
w
+=
K
*
output_nPlanes
;
...
@@ -77,8 +79,8 @@ dDeconvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules,
...
@@ -77,8 +79,8 @@ dDeconvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules,
template
<
typename
T
,
Int
K
,
Int
V
>
template
<
typename
T
,
Int
K
,
Int
V
>
__global__
void
__global__
void
dDeconvolution_KMxKN_forwardB
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
Int
*
rules
,
dDeconvolution_KMxKN_forwardB
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
Int
output_nPlanes
,
Int
output_stride
)
{
// Input x Weight -> Output
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// K is a multiple of V,
// K is a multiple of V,
...
@@ -91,7 +93,7 @@ dDeconvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules,
...
@@ -91,7 +93,7 @@ dDeconvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules,
outFeatures
+=
n
*
K
;
outFeatures
+=
n
*
K
;
w
+=
n
*
K
;
w
+=
n
*
K
;
T
O
[
V
];
T
ACC
O
[
V
];
__shared__
T
W
[
K
][
K
];
__shared__
T
W
[
K
][
K
];
__shared__
T
I
[
K
][
K
];
__shared__
T
I
[
K
][
K
];
Int
R0
[
V
];
Int
R0
[
V
];
...
@@ -111,36 +113,36 @@ dDeconvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules,
...
@@ -111,36 +113,36 @@ dDeconvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules,
for
(
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
for
(
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
if
(
s
+
ty
[
v
]
<
nHot
)
{
if
(
s
+
ty
[
v
]
<
nHot
)
{
R1
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])];
R1
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])];
R0
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])
+
1
];
R0
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])
+
1
];
}
}
}
}
__syncthreads
();
__syncthreads
();
// Read input, reset O[]
// Read input, reset O[]
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
if
(
s
+
ty
[
v
]
<
nHot
)
if
(
s
+
ty
[
v
]
<
nHot
)
I
[
ty
[
v
]][
tx
]
=
inFeatures
[
R0
[
v
]
*
input_stride
+
tx
];
I
[
ty
[
v
]][
tx
]
=
inFeatures
[
R0
[
v
]
*
input_stride
+
tx
];
O
[
v
]
=
0
;
O
[
v
]
=
0
;
}
}
__syncthreads
();
__syncthreads
();
#pragma unroll
#pragma unroll
for
(
int
k
=
0
;
k
<
K
;
k
++
)
for
(
int
k
=
0
;
k
<
K
;
k
++
)
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
for
(
int
v
=
0
;
v
<
V
;
v
++
)
O
[
v
]
+=
I
[
ty
[
v
]][
k
]
*
W
[
k
][
tx
];
O
[
v
]
+=
I
[
ty
[
v
]][
k
]
*
W
[
k
][
tx
];
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
for
(
int
v
=
0
;
v
<
V
;
v
++
)
if
(
s
+
ty
[
v
]
<
nHot
)
if
(
s
+
ty
[
v
]
<
nHot
)
O
[
v
]
+=
outFeatures
[
R1
[
v
]
*
output_stride
+
tx
];
O
[
v
]
+=
outFeatures
[
R1
[
v
]
*
output_stride
+
tx
];
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
for
(
int
v
=
0
;
v
<
V
;
v
++
)
if
(
s
+
ty
[
v
]
<
nHot
)
if
(
s
+
ty
[
v
]
<
nHot
)
outFeatures
[
R1
[
v
]
*
output_stride
+
tx
]
=
O
[
v
];
outFeatures
[
R1
[
v
]
*
output_stride
+
tx
]
=
O
[
v
];
__syncthreads
();
__syncthreads
();
}
}
w
+=
K
*
output_nPlanes
;
w
+=
K
*
output_nPlanes
;
...
@@ -153,24 +155,24 @@ dDeconvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules,
...
@@ -153,24 +155,24 @@ dDeconvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules,
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
Int o = (nHot / K) * K; \
Int o = (nHot / K) * K; \
if (o >= K) \
if (o >= K) \
dDeconvolution_KMxKN_forwardA< \
dDeconvolution_KMxKN_forwardA< \
T, K, V><<<dim3(std::min(o / K, (Int)512), output_nPlanes / K), \
T, K, V><<<dim3(std::min(o / K, (Int)512), output_nPlanes / K), \
dim3(K, K / V)>>>(inFeatures, outFeatures, w, rules, o, \
dim3(K, K / V)>>>(inFeatures, outFeatures, w, rules, o, \
input_nPlanes, input_stride, \
input_nPlanes, input_stride, \
output_nPlanes, output_stride); \
output_nPlanes, output_stride); \
if (nHot > o) \
if (nHot > o) \
dDeconvolution_KMxKN_forwardB< \
dDeconvolution_KMxKN_forwardB< \
T, K, V><<<dim3(1, output_nPlanes / K), dim3(K, K / V)>>>( \
T, K, V><<<dim3(1, output_nPlanes / K), dim3(K, K / V)>>>( \
inFeatures, outFeatures, w, rules + 2 * o, nHot - o, \
inFeatures, outFeatures, w, rules + 2 * o, nHot - o, \
input_nPlanes, input_stride, output_nPlanes, output_stride); \
input_nPlanes, input_stride, output_nPlanes, output_stride); \
return; \
return; \
} \
} \
}
}
template
<
typename
T
>
template
<
typename
T
>
void
dDeconvolution_forward
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
Int
*
rules
,
void
dDeconvolution_forward
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
Int
output_nPlanes
,
Int
output_stride
)
{
FOO
(
T
,
64
,
16
)
FOO
(
T
,
64
,
16
)
FOO
(
T
,
32
,
8
)
FOO
(
T
,
32
,
8
)
FOO
(
T
,
16
,
4
)
FOO
(
T
,
16
,
4
)
...
@@ -179,9 +181,9 @@ void dDeconvolution_forward(T *inFeatures, T *outFeatures, T *w, Int *rules,
...
@@ -179,9 +181,9 @@ void dDeconvolution_forward(T *inFeatures, T *outFeatures, T *w, Int *rules,
}
}
template
<
>
template
<
>
void
dDeconvolution_forward
<
double
>
(
double
*
inFeatures
,
double
*
outFeatures
,
void
dDeconvolution_forward
<
double
>
(
double
*
inFeatures
,
double
*
outFeatures
,
double
*
w
,
Int
*
rules
,
Int
nHot
,
double
*
w
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
Int
output_nPlanes
,
Int
output_stride
)
{
FOO
(
double
,
32
,
8
)
FOO
(
double
,
32
,
8
)
FOO
(
double
,
16
,
4
)
FOO
(
double
,
16
,
4
)
FOO
(
double
,
8
,
2
)
FOO
(
double
,
8
,
2
)
...
@@ -205,8 +207,8 @@ __global__ void dDeconvolution_KMxKN_backward_dW_A(
...
@@ -205,8 +207,8 @@ __global__ void dDeconvolution_KMxKN_backward_dW_A(
w
+=
m
*
K
*
output_nPlanes
;
w
+=
m
*
K
*
output_nPlanes
;
dw
+=
m
*
K
*
output_nPlanes
;
dw
+=
m
*
K
*
output_nPlanes
;
T
dI
[
V
];
T
ACC
dI
[
V
];
T
dW
[
V
];
T
ACC
dW
[
V
];
__shared__
T
I
[
K
][
K
];
__shared__
T
I
[
K
][
K
];
__shared__
T
dO
[
K
][
K
];
__shared__
T
dO
[
K
][
K
];
__shared__
T
W
[
K
][
K
];
__shared__
T
W
[
K
][
K
];
...
@@ -229,31 +231,31 @@ __global__ void dDeconvolution_KMxKN_backward_dW_A(
...
@@ -229,31 +231,31 @@ __global__ void dDeconvolution_KMxKN_backward_dW_A(
for
(
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
for
(
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
R1
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])];
R1
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])];
R0
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])
+
1
];
R0
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])
+
1
];
dI
[
v
]
=
0
;
dI
[
v
]
=
0
;
}
}
__syncthreads
();
__syncthreads
();
// Read input and dOutput
// Read input and dOutput
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
I
[
ty
[
v
]][
tx
]
=
inFeatures
[
R0
[
v
]
*
input_stride
+
tx
];
I
[
ty
[
v
]][
tx
]
=
inFeatures
[
R0
[
v
]
*
input_stride
+
tx
];
dO
[
ty
[
v
]][
tx
]
=
dOutFeatures
[
R1
[
v
]
*
output_stride
+
tx
];
dO
[
ty
[
v
]][
tx
]
=
dOutFeatures
[
R1
[
v
]
*
output_stride
+
tx
];
}
}
__syncthreads
();
__syncthreads
();
#pragma unroll
#pragma unroll
for
(
int
k
=
0
;
k
<
K
;
k
++
)
for
(
int
k
=
0
;
k
<
K
;
k
++
)
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
dI
[
v
]
+=
dO
[
ty
[
v
]][
k
]
*
W
[
tx
][
k
];
dI
[
v
]
+=
dO
[
ty
[
v
]][
k
]
*
W
[
tx
][
k
];
dW
[
v
]
+=
I
[
k
][
ty
[
v
]]
*
dO
[
k
][
tx
];
dW
[
v
]
+=
I
[
k
][
ty
[
v
]]
*
dO
[
k
][
tx
];
}
}
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
for
(
int
v
=
0
;
v
<
V
;
v
++
)
dI
[
v
]
+=
dInFeatures
[
R0
[
v
]
*
input_stride
+
tx
];
dI
[
v
]
+=
dInFeatures
[
R0
[
v
]
*
input_stride
+
tx
];
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
for
(
int
v
=
0
;
v
<
V
;
v
++
)
dInFeatures
[
R0
[
v
]
*
input_stride
+
tx
]
=
dI
[
v
];
dInFeatures
[
R0
[
v
]
*
input_stride
+
tx
]
=
dI
[
v
];
__syncthreads
();
__syncthreads
();
}
}
#pragma unroll
#pragma unroll
...
@@ -281,8 +283,8 @@ __global__ void dDeconvolution_KMxKN_backward_dW_B(
...
@@ -281,8 +283,8 @@ __global__ void dDeconvolution_KMxKN_backward_dW_B(
w
+=
m
*
K
*
output_nPlanes
;
w
+=
m
*
K
*
output_nPlanes
;
dw
+=
m
*
K
*
output_nPlanes
;
dw
+=
m
*
K
*
output_nPlanes
;
T
dI
[
V
];
T
ACC
dI
[
V
];
T
dW
[
V
];
T
ACC
dW
[
V
];
__shared__
T
I
[
K
][
K
];
__shared__
T
I
[
K
][
K
];
__shared__
T
dO
[
K
][
K
];
__shared__
T
dO
[
K
][
K
];
__shared__
T
W
[
K
][
K
];
__shared__
T
W
[
K
][
K
];
...
@@ -305,39 +307,39 @@ __global__ void dDeconvolution_KMxKN_backward_dW_B(
...
@@ -305,39 +307,39 @@ __global__ void dDeconvolution_KMxKN_backward_dW_B(
for
(
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
for
(
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
if
(
s
+
ty
[
v
]
<
nHot
)
{
if
(
s
+
ty
[
v
]
<
nHot
)
{
R1
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])];
R1
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])];
R0
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])
+
1
];
R0
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])
+
1
];
}
}
dI
[
v
]
=
0
;
dI
[
v
]
=
0
;
}
}
__syncthreads
();
__syncthreads
();
// Read input and dOutput
// Read input and dOutput
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
for
(
int
v
=
0
;
v
<
V
;
v
++
)
if
(
s
+
ty
[
v
]
<
nHot
)
{
if
(
s
+
ty
[
v
]
<
nHot
)
{
I
[
ty
[
v
]][
tx
]
=
inFeatures
[
R0
[
v
]
*
input_stride
+
tx
];
I
[
ty
[
v
]][
tx
]
=
inFeatures
[
R0
[
v
]
*
input_stride
+
tx
];
dO
[
ty
[
v
]][
tx
]
=
dOutFeatures
[
R1
[
v
]
*
output_stride
+
tx
];
dO
[
ty
[
v
]][
tx
]
=
dOutFeatures
[
R1
[
v
]
*
output_stride
+
tx
];
}
else
{
}
else
{
I
[
ty
[
v
]][
tx
]
=
0
;
I
[
ty
[
v
]][
tx
]
=
0
;
dO
[
ty
[
v
]][
tx
]
=
0
;
dO
[
ty
[
v
]][
tx
]
=
0
;
}
}
__syncthreads
();
__syncthreads
();
#pragma unroll
#pragma unroll
for
(
int
k
=
0
;
k
<
K
;
k
++
)
for
(
int
k
=
0
;
k
<
K
;
k
++
)
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
dI
[
v
]
+=
dO
[
ty
[
v
]][
k
]
*
W
[
tx
][
k
];
dI
[
v
]
+=
dO
[
ty
[
v
]][
k
]
*
W
[
tx
][
k
];
dW
[
v
]
+=
I
[
k
][
ty
[
v
]]
*
dO
[
k
][
tx
];
dW
[
v
]
+=
I
[
k
][
ty
[
v
]]
*
dO
[
k
][
tx
];
}
}
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
for
(
int
v
=
0
;
v
<
V
;
v
++
)
if
(
s
+
ty
[
v
]
<
nHot
)
if
(
s
+
ty
[
v
]
<
nHot
)
dI
[
v
]
+=
dInFeatures
[
R0
[
v
]
*
input_stride
+
tx
];
dI
[
v
]
+=
dInFeatures
[
R0
[
v
]
*
input_stride
+
tx
];
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
for
(
int
v
=
0
;
v
<
V
;
v
++
)
if
(
s
+
ty
[
v
]
<
nHot
)
if
(
s
+
ty
[
v
]
<
nHot
)
dInFeatures
[
R0
[
v
]
*
input_stride
+
tx
]
=
dI
[
v
];
dInFeatures
[
R0
[
v
]
*
input_stride
+
tx
]
=
dI
[
v
];
__syncthreads
();
__syncthreads
();
}
}
#pragma unroll
#pragma unroll
...
@@ -354,26 +356,26 @@ __global__ void dDeconvolution_KMxKN_backward_dW_B(
...
@@ -354,26 +356,26 @@ __global__ void dDeconvolution_KMxKN_backward_dW_B(
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
Int o = (nHot / K) * K; \
Int o = (nHot / K) * K; \
if (o >= K) \
if (o >= K) \
dDeconvolution_KMxKN_backward_dW_A< \
dDeconvolution_KMxKN_backward_dW_A< \
T, K, V><<<dim3(std::min(o / K, (Int)512), input_nPlanes / K), \
T, K, V><<<dim3(std::min(o / K, (Int)512), input_nPlanes / K), \
dim3(K, K / V)>>>( \
dim3(K, K / V)>>>( \
inFeatures, dInFeatures, dOutFeatures, w, dw, rules, o, \
inFeatures, dInFeatures, dOutFeatures, w, dw, rules, o, \
input_nPlanes, input_stride, output_nPlanes, output_stride); \
input_nPlanes, input_stride, output_nPlanes, output_stride); \
if (nHot > o) \
if (nHot > o) \
dDeconvolution_KMxKN_backward_dW_B< \
dDeconvolution_KMxKN_backward_dW_B< \
T, K, V><<<dim3(1, input_nPlanes / K), dim3(K, K / V)>>>( \
T, K, V><<<dim3(1, input_nPlanes / K), dim3(K, K / V)>>>( \
inFeatures, dInFeatures, dOutFeatures, w, dw, rules + 2 * o, \
inFeatures, dInFeatures, dOutFeatures, w, dw, rules + 2 * o, \
nHot - o, input_nPlanes, input_stride, output_nPlanes, \
nHot - o, input_nPlanes, input_stride, output_nPlanes, \
output_stride); \
output_stride); \
return; \
return; \
} \
} \
}
}
template
<
typename
T
>
template
<
typename
T
>
void
dDeconvolution_backward_dW
(
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
void
dDeconvolution_backward_dW
(
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
T
*
w
,
T
*
dw
,
Int
*
rules
,
Int
nHot
,
T
*
w
,
T
*
dw
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
Int
output_nPlanes
,
Int
output_stride
)
{
FOO
(
T
,
32
,
8
)
FOO
(
T
,
32
,
8
)
FOO
(
T
,
16
,
4
)
FOO
(
T
,
16
,
4
)
FOO
(
T
,
8
,
2
)
FOO
(
T
,
8
,
2
)
...
@@ -384,8 +386,8 @@ void dDeconvolution_backward_dW(T *inFeatures, T *dInFeatures, T *dOutFeatures,
...
@@ -384,8 +386,8 @@ void dDeconvolution_backward_dW(T *inFeatures, T *dInFeatures, T *dOutFeatures,
template
<
typename
T
,
Int
K
,
Int
V
>
template
<
typename
T
,
Int
K
,
Int
V
>
__global__
void
__global__
void
dDeconvolution_KMxKN_forward2
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
Int
*
rules
,
dDeconvolution_KMxKN_forward2
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
Int
output_nPlanes
,
Int
output_stride
)
{
// Input x Weight -> Output
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// K is a multiple of V,
// K is a multiple of V,
...
@@ -400,7 +402,7 @@ dDeconvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
...
@@ -400,7 +402,7 @@ dDeconvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
w
+=
n
*
K
;
w
+=
n
*
K
;
Int
KO
=
min
(
K
,
output_nPlanes
-
K
*
n
);
Int
KO
=
min
(
K
,
output_nPlanes
-
K
*
n
);
T
O
[
V
];
T
ACC
O
[
V
];
__shared__
T
W
[
K
][
K
];
__shared__
T
W
[
K
][
K
];
__shared__
T
I
[
K
][
K
];
__shared__
T
I
[
K
][
K
];
__shared__
Int
R
[
K
*
2
];
__shared__
Int
R
[
K
*
2
];
...
@@ -417,40 +419,40 @@ dDeconvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
...
@@ -417,40 +419,40 @@ dDeconvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
for
(
int
v
=
0
;
v
<
V
;
v
++
)
if
(
ty
[
v
]
<
KI
and
tx
<
KO
)
if
(
ty
[
v
]
<
KI
and
tx
<
KO
)
W
[
ty
[
v
]][
tx
]
=
w
[
ty
[
v
]
*
output_nPlanes
+
tx
];
W
[
ty
[
v
]][
tx
]
=
w
[
ty
[
v
]
*
output_nPlanes
+
tx
];
for
(
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
for
(
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
// Read rules for K input/output pairs
// Read rules for K input/output pairs
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
if
(
ty
[
v
]
<
2
)
{
if
(
ty
[
v
]
<
2
)
{
int
q
=
ty
[
v
]
*
K
+
tx
;
int
q
=
ty
[
v
]
*
K
+
tx
;
if
(
s
+
q
/
2
<
nHot
)
if
(
s
+
q
/
2
<
nHot
)
R
[
q
]
=
rules
[
2
*
s
+
q
];
R
[
q
]
=
rules
[
2
*
s
+
q
];
}
}
}
}
__syncthreads
();
__syncthreads
();
// Read input, reset O[]
// Read input, reset O[]
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
if
(
tx
<
KI
and
s
+
ty
[
v
]
<
nHot
)
if
(
tx
<
KI
and
s
+
ty
[
v
]
<
nHot
)
I
[
ty
[
v
]][
tx
]
=
inFeatures
[
R
[
2
*
ty
[
v
]
+
1
]
*
input_stride
+
tx
];
I
[
ty
[
v
]][
tx
]
=
inFeatures
[
R
[
2
*
ty
[
v
]
+
1
]
*
input_stride
+
tx
];
O
[
v
]
=
0
;
O
[
v
]
=
0
;
}
}
__syncthreads
();
__syncthreads
();
#pragma unroll
#pragma unroll
for
(
int
k
=
0
;
k
<
KI
;
k
++
)
for
(
int
k
=
0
;
k
<
KI
;
k
++
)
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
for
(
int
v
=
0
;
v
<
V
;
v
++
)
O
[
v
]
+=
I
[
ty
[
v
]][
k
]
*
W
[
k
][
tx
];
O
[
v
]
+=
I
[
ty
[
v
]][
k
]
*
W
[
k
][
tx
];
__syncthreads
();
__syncthreads
();
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
for
(
int
v
=
0
;
v
<
V
;
v
++
)
if
(
tx
<
KO
and
s
+
ty
[
v
]
<
nHot
)
if
(
tx
<
KO
and
s
+
ty
[
v
]
<
nHot
)
outFeatures
[
R
[
2
*
ty
[
v
]]
*
output_stride
+
tx
]
+=
O
[
v
];
outFeatures
[
R
[
2
*
ty
[
v
]]
*
output_stride
+
tx
]
+=
O
[
v
];
__syncthreads
();
__syncthreads
();
}
}
w
+=
K
*
output_nPlanes
;
w
+=
K
*
output_nPlanes
;
...
@@ -464,9 +466,9 @@ dDeconvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
...
@@ -464,9 +466,9 @@ dDeconvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
template
<
typename
T
,
Int
K
,
Int
V
>
template
<
typename
T
,
Int
K
,
Int
V
>
__global__
void
__global__
void
dDeconvolution_KMxKN_backward_dW2
(
T
*
inFeatures
,
T
*
dInFeatures
,
dDeconvolution_KMxKN_backward_dW2
(
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
T
*
w
,
T
*
dw
,
Int
*
rules
,
T
*
dOutFeatures
,
T
*
w
,
T
*
dw
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
Int
output_nPlanes
,
Int
output_stride
)
{
// M = gridDim.y == input_nPlanes / K
// M = gridDim.y == input_nPlanes / K
Int
N
=
(
output_nPlanes
+
K
-
1
)
/
K
;
Int
N
=
(
output_nPlanes
+
K
-
1
)
/
K
;
Int
m
=
blockIdx
.
y
;
Int
m
=
blockIdx
.
y
;
...
@@ -476,8 +478,8 @@ dDeconvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures,
...
@@ -476,8 +478,8 @@ dDeconvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures,
dw
+=
m
*
K
*
output_nPlanes
;
dw
+=
m
*
K
*
output_nPlanes
;
Int
KI
=
min
(
K
,
input_nPlanes
-
K
*
m
);
Int
KI
=
min
(
K
,
input_nPlanes
-
K
*
m
);
T
dI
[
V
];
T
ACC
dI
[
V
];
T
dW
[
V
];
T
ACC
dW
[
V
];
__shared__
T
I
[
K
][
K
];
__shared__
T
I
[
K
][
K
];
__shared__
T
dO
[
K
][
K
];
__shared__
T
dO
[
K
][
K
];
__shared__
T
W
[
K
][
K
];
__shared__
T
W
[
K
][
K
];
...
@@ -495,7 +497,7 @@ dDeconvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures,
...
@@ -495,7 +497,7 @@ dDeconvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures,
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
if
(
ty
[
v
]
<
KI
and
tx
<
KO
)
if
(
ty
[
v
]
<
KI
and
tx
<
KO
)
W
[
ty
[
v
]][
tx
]
=
w
[
ty
[
v
]
*
output_nPlanes
+
tx
];
W
[
ty
[
v
]][
tx
]
=
w
[
ty
[
v
]
*
output_nPlanes
+
tx
];
dW
[
v
]
=
0
;
dW
[
v
]
=
0
;
}
}
...
@@ -503,48 +505,48 @@ dDeconvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures,
...
@@ -503,48 +505,48 @@ dDeconvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures,
// Read rules for K input/output pairs, reset dI[]
// Read rules for K input/output pairs, reset dI[]
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
if
(
ty
[
v
]
<
2
)
{
if
(
ty
[
v
]
<
2
)
{
int
q
=
ty
[
v
]
*
K
+
tx
;
int
q
=
ty
[
v
]
*
K
+
tx
;
if
(
s
+
q
/
2
<
nHot
)
if
(
s
+
q
/
2
<
nHot
)
R
[
q
]
=
rules
[
2
*
s
+
q
];
R
[
q
]
=
rules
[
2
*
s
+
q
];
}
}
dI
[
v
]
=
0
;
dI
[
v
]
=
0
;
}
}
__syncthreads
();
__syncthreads
();
// Read input and dOutput
// Read input and dOutput
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
if
(
tx
<
KI
and
s
+
ty
[
v
]
<
nHot
)
if
(
tx
<
KI
and
s
+
ty
[
v
]
<
nHot
)
I
[
ty
[
v
]][
tx
]
=
inFeatures
[
R
[
2
*
ty
[
v
]
+
1
]
*
input_stride
+
tx
];
I
[
ty
[
v
]][
tx
]
=
inFeatures
[
R
[
2
*
ty
[
v
]
+
1
]
*
input_stride
+
tx
];
else
else
I
[
ty
[
v
]][
tx
]
=
0
;
I
[
ty
[
v
]][
tx
]
=
0
;
if
(
tx
<
KO
and
s
+
ty
[
v
]
<
nHot
)
if
(
tx
<
KO
and
s
+
ty
[
v
]
<
nHot
)
dO
[
ty
[
v
]][
tx
]
=
dOutFeatures
[
R
[
2
*
ty
[
v
]]
*
output_stride
+
tx
];
dO
[
ty
[
v
]][
tx
]
=
dOutFeatures
[
R
[
2
*
ty
[
v
]]
*
output_stride
+
tx
];
else
else
dO
[
ty
[
v
]][
tx
]
=
0
;
dO
[
ty
[
v
]][
tx
]
=
0
;
}
}
__syncthreads
();
__syncthreads
();
#pragma unroll
#pragma unroll
for
(
int
k
=
0
;
k
<
KO
;
k
++
)
for
(
int
k
=
0
;
k
<
KO
;
k
++
)
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
for
(
int
v
=
0
;
v
<
V
;
v
++
)
dI
[
v
]
+=
dO
[
ty
[
v
]][
k
]
*
W
[
tx
][
k
];
dI
[
v
]
+=
dO
[
ty
[
v
]][
k
]
*
W
[
tx
][
k
];
#pragma unroll
#pragma unroll
for
(
int
k
=
0
;
k
<
K
;
k
++
)
for
(
int
k
=
0
;
k
<
K
;
k
++
)
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
for
(
int
v
=
0
;
v
<
V
;
v
++
)
dW
[
v
]
+=
I
[
k
][
ty
[
v
]]
*
dO
[
k
][
tx
];
dW
[
v
]
+=
I
[
k
][
ty
[
v
]]
*
dO
[
k
][
tx
];
__syncthreads
();
__syncthreads
();
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
for
(
int
v
=
0
;
v
<
V
;
v
++
)
if
(
tx
<
KI
and
s
+
ty
[
v
]
<
nHot
)
if
(
tx
<
KI
and
s
+
ty
[
v
]
<
nHot
)
dInFeatures
[
R
[
2
*
ty
[
v
]
+
1
]
*
input_stride
+
tx
]
+=
dI
[
v
];
dInFeatures
[
R
[
2
*
ty
[
v
]
+
1
]
*
input_stride
+
tx
]
+=
dI
[
v
];
__syncthreads
();
__syncthreads
();
}
}
#pragma unroll
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
for
(
int
v
=
0
;
v
<
V
;
v
++
)
if
(
ty
[
v
]
<
KI
and
tx
<
KO
)
if
(
ty
[
v
]
<
KI
and
tx
<
KO
)
atomicAdd
(
&
dw
[
ty
[
v
]
*
output_nPlanes
+
tx
],
dW
[
v
]);
atomicAdd
(
&
dw
[
ty
[
v
]
*
output_nPlanes
+
tx
],
dW
[
v
]);
w
+=
K
;
w
+=
K
;
dw
+=
K
;
dw
+=
K
;
dOutFeatures
+=
K
;
dOutFeatures
+=
K
;
...
@@ -553,51 +555,53 @@ dDeconvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures,
...
@@ -553,51 +555,53 @@ dDeconvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures,
template
<
typename
T
>
template
<
typename
T
>
double
dDeconvolution_forward2
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
double
dDeconvolution_forward2
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
RuleBook
_rules
,
Int
input_nPlanes
,
RuleBook
_rules
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
Int
output_stride
)
{
Int
c
=
input_nPlanes
*
output_nPlanes
;
Int
c
=
input_nPlanes
*
output_nPlanes
;
double
flops
=
0
;
double
flops
=
0
;
if
(
input_nPlanes
%
8
!=
0
or
output_nPlanes
%
8
!=
0
)
{
if
(
input_nPlanes
%
8
!=
0
or
output_nPlanes
%
8
!=
0
)
{
const
int
K
=
16
;
const
int
K
=
16
;
const
int
V
=
4
;
const
int
V
=
4
;
RULEBOOKITERATOR
(
RULEBOOKITERATOR
(
(
dDeconvolution_KMxKN_forward2
<
(
dDeconvolution_KMxKN_forward2
<
T
,
K
,
T
,
K
,
V
><<<
dim3
(
128
,
(
output_nPlanes
+
K
-
1
)
/
K
),
dim3
(
K
,
K
/
V
)
>>>
(
V
><<<
dim3
(
128
,
(
output_nPlanes
+
K
-
1
)
/
K
),
dim3
(
K
,
K
/
V
)
>>>
(
inFeatures
,
outFeatures
,
w
,
rbB
,
nHotB
,
input_nPlanes
,
input_stride
,
inFeatures
,
outFeatures
,
w
,
rbB
,
nHotB
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
));
output_nPlanes
,
output_stride
));
,
w
+=
c
;
flops
+=
nHotB
*
c
;)
,
w
+=
c
;
flops
+=
nHotB
*
c
;)
}
else
{
}
else
{
RULEBOOKITERATOR
(
dDeconvolution_forward
(
inFeatures
,
outFeatures
,
w
,
rbB
,
RULEBOOKITERATOR
(
dDeconvolution_forward
(
inFeatures
,
outFeatures
,
w
,
rbB
,
nHotB
,
input_nPlanes
,
input_stride
,
nHotB
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
);
output_nPlanes
,
output_stride
);
,
w
+=
c
;
flops
+=
nHotB
*
c
;)
,
w
+=
c
;
flops
+=
nHotB
*
c
;)
}
}
return
flops
;
return
flops
;
}
}
// Dispatch the deconvolution backward pass over every rulebook entry,
// accumulating the input-feature gradient (dInFeatures) and the weight
// gradient (dw). Kernel choice mirrors dDeconvolution_forward2: the generic
// K=16 tiled kernel when plane counts are not multiples of 8, otherwise the
// specialized dDeconvolution_backward_dW path.
//
// NOTE(review): reconstructed from a side-by-side diff rendering. `rbB` and
// `nHotB` are presumably bound per-entry by the RULEBOOKITERATOR macro;
// confirm against the macro definition. Unlike the forward pass, the grid's
// second dimension is sized by input_nPlanes here.
template <typename T>
void dDeconvolution_backward_dW2(T *inFeatures, T *dInFeatures,
                                 T *dOutFeatures, T *w, T *dw, RuleBook _rules,
                                 Int input_nPlanes, Int input_stride,
                                 Int output_nPlanes, Int output_stride) {
  // Per-entry weight matrix size: advance both `w` and `dw` by this much
  // after each rulebook entry.
  Int c = input_nPlanes * output_nPlanes;
  if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) {
    const int K = 16;
    const int V = 4;
    RULEBOOKITERATOR(
        (dDeconvolution_KMxKN_backward_dW2<T, K, V>
         <<<dim3(128, (input_nPlanes + K - 1) / K), dim3(K, K / V)>>>(
             inFeatures, dInFeatures, dOutFeatures, w, dw, rbB, nHotB,
             input_nPlanes, input_stride, output_nPlanes, output_stride));
        , w += c; dw += c;)
  } else {
    RULEBOOKITERATOR(dDeconvolution_backward_dW(inFeatures, dInFeatures,
                                                dOutFeatures, w, dw, rbB,
                                                nHotB, input_nPlanes,
                                                input_stride, output_nPlanes,
                                                output_stride);
                     , w += c; dw += c;)
  }
}
#undef TACC
\ No newline at end of file
sparseconvnet/SCN/cuda.cu
View file @
66986767
...
@@ -15,15 +15,15 @@
...
@@ -15,15 +15,15 @@
#include "CUDA/UnPooling.cu"
#include "CUDA/UnPooling.cu"
template
void
ActivePooling_ForwardPass
<
float
>(
float
*
input_features
,
template
void
ActivePooling_ForwardPass
<
float
>(
float
*
input_features
,
float
*
output_features
,
float
*
output_features
,
Int
batchSize
,
Int
maxActive
,
Int
batchSize
,
Int
maxActive
,
Int
nPlanes
,
Int
*
rules
,
Int
nPlanes
,
Int
*
rules
,
bool
average
);
bool
average
);
template
void
ActivePooling_BackwardPass
<
float
>(
float
*
d_input_features
,
template
void
ActivePooling_BackwardPass
<
float
>(
float
*
d_input_features
,
float
*
d_output_features
,
float
*
d_output_features
,
Int
batchSize
,
Int
maxActive
,
Int
batchSize
,
Int
maxActive
,
Int
nPlanes
,
Int
*
rules
,
Int
nPlanes
,
Int
*
rules
,
bool
average
);
bool
average
);
template
void
dAffineReluTrivialConvolution_forward
<
float
>(
template
void
dAffineReluTrivialConvolution_forward
<
float
>(
float
*
inFeatures
,
float
*
outFeatures
,
float
*
affineWeight
,
float
*
inFeatures
,
float
*
outFeatures
,
float
*
affineWeight
,
...
@@ -43,10 +43,10 @@ template void cuda_AveragePooling_BackwardPass<float>(
...
@@ -43,10 +43,10 @@ template void cuda_AveragePooling_BackwardPass<float>(
float
*
d_input_features
,
float
*
d_output_features
,
Int
nPlanes
,
float
*
d_input_features
,
float
*
d_output_features
,
Int
nPlanes
,
Int
input_stride
,
Int
output_stride
,
RuleBook
_rules
,
Int
filterVolume
);
Int
input_stride
,
Int
output_stride
,
RuleBook
_rules
,
Int
filterVolume
);
template
void
Convolution_fp_bias
<
float
>(
float
*
o
f
,
float
*
b
,
Int
op
,
template
void
Convolution_fp_bias
<
float
>(
float
*
o
F
,
float
*
b
,
Int
nPlanes
,
Int
nActive
);
Int
nActive
);
template
void
Convolution_bp_bias
<
float
>(
float
*
matrix
,
float
*
target
,
template
void
Convolution_bp_bias
<
float
>(
float
*
d_oF
,
float
*
d_b
,
Int
nRows
,
Int
nColumn
s
,
Int
n
COLUMNS
);
Int
nPlane
s
,
Int
n
Active
);
template
double
dConvolution_forward2
<
float
>(
template
double
dConvolution_forward2
<
float
>(
float
*
inFeatures
,
float
*
outFeatures
,
float
*
w
,
RuleBook
_rules
,
float
*
inFeatures
,
float
*
outFeatures
,
float
*
w
,
RuleBook
_rules
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
);
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
);
...
@@ -66,65 +66,65 @@ template void dDeconvolution_backward_dW2<float>(
...
@@ -66,65 +66,65 @@ template void dDeconvolution_backward_dW2<float>(
Int
output_nPlanes
,
Int
output_stride
);
Int
output_nPlanes
,
Int
output_stride
);
template
void
InputLayer_fp
<
float
>(
float
*
input_features
,
template
void
InputLayer_fp
<
float
>(
float
*
input_features
,
float
*
output_features
,
Int
nRows
,
float
*
output_features
,
Int
nRows
,
Int
maxActive
,
Int
nPlanes
,
Int
*
rules_cpu
,
Int
maxActive
,
Int
nPlanes
,
Int
*
rules_cpu
,
Int
*
rules_gpu
,
bool
average
);
Int
*
rules_gpu
,
bool
average
);
template
void
InputLayer_bp
<
float
>(
float
*
d_input_features
,
template
void
InputLayer_bp
<
float
>(
float
*
d_input_features
,
float
*
d_output_features
,
Int
nRows
,
float
*
d_output_features
,
Int
nRows
,
Int
maxActive
,
Int
nPlanes
,
Int
*
rules_cpu
,
Int
maxActive
,
Int
nPlanes
,
Int
*
rules_cpu
,
Int
*
rules_gpu
,
bool
average
);
Int
*
rules_gpu
,
bool
average
);
template
void
LeakyReLU_fp
<
float
>(
float
*
input_features
,
float
*
output_features
,
template
void
LeakyReLU_fp
<
float
>(
float
*
input_features
,
float
*
output_features
,
Int
n
,
float
alpha
);
Int
n
,
float
alpha
);
template
void
LeakyReLU_bp
<
float
>(
float
*
input_features
,
template
void
LeakyReLU_bp
<
float
>(
float
*
input_features
,
float
*
d_input_features
,
float
*
d_input_features
,
float
*
output_features
,
Int
n
,
float
alpha
);
float
*
output_features
,
Int
n
,
float
alpha
);
template
void
cuda_MaxPooling_ForwardPass
<
float
>(
float
*
input_features
,
template
void
cuda_MaxPooling_ForwardPass
<
float
>(
float
*
input_features
,
float
*
output_features
,
float
*
output_features
,
Int
nPlanes
,
Int
input_stride
,
Int
nPlanes
,
Int
input_stride
,
Int
output_stride
,
Int
output_stride
,
RuleBook
_rules
);
RuleBook
_rules
);
template
void
cuda_MaxPooling_BackwardPass
<
float
>(
template
void
cuda_MaxPooling_BackwardPass
<
float
>(
float
*
input_features
,
float
*
d_input_features
,
float
*
output_features
,
float
*
input_features
,
float
*
d_input_features
,
float
*
output_features
,
float
*
d_output_features
,
Int
nPlanes
,
Int
input_stride
,
Int
output_stride
,
float
*
d_output_features
,
Int
nPlanes
,
Int
input_stride
,
Int
output_stride
,
RuleBook
_rules
);
RuleBook
_rules
);
template
void
cuda_SparseToDense_ForwardPass
<
float
>(
float
*
input_features
,
template
void
cuda_SparseToDense_ForwardPass
<
float
>(
float
*
input_features
,
float
*
output_features
,
float
*
output_features
,
Int
nPlanes
,
Int
nPlanes
,
Int
spatialVolume
,
Int
spatialVolume
,
RuleBook
_rules
);
RuleBook
_rules
);
template
void
cuda_SparseToDense_BackwardPass
<
float
>(
float
*
d_input_features
,
template
void
cuda_SparseToDense_BackwardPass
<
float
>(
float
*
d_input_features
,
float
*
d_output_features
,
float
*
d_output_features
,
Int
nPlanes
,
Int
nPlanes
,
Int
spatialVolume
,
Int
spatialVolume
,
RuleBook
_rules
);
RuleBook
_rules
);
template
void
cuda_UnPooling_ForwardPass
<
float
>(
float
*
input_features
,
template
void
cuda_UnPooling_ForwardPass
<
float
>(
float
*
input_features
,
float
*
output_features
,
float
*
output_features
,
Int
nPlanes
,
Int
input_stride
,
Int
nPlanes
,
Int
input_stride
,
Int
output_stride
,
Int
output_stride
,
RuleBook
_rules
);
RuleBook
_rules
);
template
void
cuda_UnPooling_BackwardPass
<
float
>(
float
*
d_input_features
,
template
void
cuda_UnPooling_BackwardPass
<
float
>(
float
*
d_input_features
,
float
*
d_output_features
,
float
*
d_output_features
,
Int
nPlanes
,
Int
input_stride
,
Int
nPlanes
,
Int
input_stride
,
Int
output_stride
,
Int
output_stride
,
RuleBook
_rules
);
RuleBook
_rules
);
template
void
bn_f
<
float
>(
float
*
iF
,
float
*
oF
,
Int
nPlanes
,
Int
input_stride
,
template
void
bn_f
<
float
>(
float
*
iF
,
float
*
oF
,
Int
nPlanes
,
Int
input_stride
,
Int
output_stride
,
Int
nActive
,
float
*
saveMean
,
Int
output_stride
,
Int
nActive
,
float
*
saveMean
,
float
*
saveInvStd
,
float
*
runningMean
,
float
*
saveInvStd
,
float
*
runningMean
,
float
*
runningVar
,
float
*
weight
,
float
*
bias
,
float
*
runningVar
,
float
*
weight
,
float
*
bias
,
float
eps
,
float
momentum
,
bool
train
,
float
eps
,
float
momentum
,
bool
train
,
float
leakiness
);
float
leakiness
);
template
void
bn_b
<
float
>(
float
*
input_features
,
float
*
d_input_features
,
template
void
bn_b
<
float
>(
float
*
input_features
,
float
*
d_input_features
,
float
*
output_features
,
float
*
d_output_features
,
float
*
output_features
,
float
*
d_output_features
,
Int
nPlanes
,
Int
input_stride
,
Int
output_stride
,
Int
nPlanes
,
Int
input_stride
,
Int
output_stride
,
Int
nActive
,
float
*
saveMean
,
float
*
saveInvStd
,
Int
nActive
,
float
*
saveMean
,
float
*
saveInvStd
,
float
*
runningMean
,
float
*
runningVar
,
float
*
weight
,
float
*
runningMean
,
float
*
runningVar
,
float
*
weight
,
float
*
bias
,
float
*
d_weight
,
float
*
d_bias
,
float
*
bias
,
float
*
d_weight
,
float
*
d_bias
,
float
leakiness
);
float
leakiness
);
template
void
bmd_f
<
float
>(
float
*
input_features
,
float
*
output_features
,
template
void
bmd_f
<
float
>(
float
*
input_features
,
float
*
output_features
,
float
*
noise
,
Int
nActive
,
Int
nPlanes
,
float
alpha
);
float
*
noise
,
Int
nActive
,
Int
nPlanes
,
float
alpha
);
template
void
bmd_b
<
float
>(
float
*
input_features
,
float
*
d_input_features
,
template
void
bmd_b
<
float
>(
float
*
input_features
,
float
*
d_input_features
,
float
*
d_output_features
,
float
*
noise
,
Int
nActive
,
float
*
d_output_features
,
float
*
noise
,
Int
nActive
,
Int
nPlanes
,
float
alpha
);
Int
nPlanes
,
float
alpha
);
sparseconvnet/__init__.py
View file @
66986767
...
@@ -6,7 +6,7 @@
...
@@ -6,7 +6,7 @@
forward_pass_multiplyAdd_count
=
0
forward_pass_multiplyAdd_count
=
0
forward_pass_hidden_states
=
0
forward_pass_hidden_states
=
0
from
.activations
import
Tanh
,
Sigmoid
,
ReLU
,
ELU
,
SELU
,
BatchNormELU
from
.activations
import
Tanh
,
Sigmoid
,
ReLU
,
LeakyReLU
,
ELU
,
SELU
,
BatchNormELU
from
.averagePooling
import
AveragePooling
from
.averagePooling
import
AveragePooling
from
.batchNormalization
import
BatchNormalization
,
BatchNormReLU
,
BatchNormLeakyReLU
from
.batchNormalization
import
BatchNormalization
,
BatchNormReLU
,
BatchNormLeakyReLU
from
.classificationTrainValidate
import
ClassificationTrainValidate
from
.classificationTrainValidate
import
ClassificationTrainValidate
...
...
sparseconvnet/activations.py
View file @
66986767
...
@@ -22,6 +22,18 @@ class Sigmoid(Module):
...
@@ -22,6 +22,18 @@ class Sigmoid(Module):
return
output
return
output
class LeakyReLU(Module):
    """Elementwise leaky-ReLU activation for SparseConvNetTensor inputs.

    Applies torch's leaky_relu to the tensor's feature matrix while passing
    the sparsity metadata and spatial size through unchanged.
    """

    def __init__(self, leak=1 / 3):
        # leak: negative-slope coefficient forwarded to F.leaky_relu.
        Module.__init__(self)
        self.leak = leak

    def forward(self, input):
        out = SparseConvNetTensor()
        out.features = F.leaky_relu(input.features, self.leak)
        # Activation is pointwise: active-site structure is unchanged.
        out.metadata = input.metadata
        out.spatial_size = input.spatial_size
        return out
class
Tanh
(
Module
):
class
Tanh
(
Module
):
def
forward
(
self
,
input
):
def
forward
(
self
,
input
):
output
=
SparseConvNetTensor
()
output
=
SparseConvNetTensor
()
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment