OpenDAS / SparseConvNet · Commits · 2c4ed608
"deploy/metrics/docker-compose.yml" did not exist on "df90e29eba6aaf62f8fa67b8e9092d8a81c25856"
Commit 2c4ed608, authored Jun 20, 2018 by Benjamin Thomas Graham

    Goodbye THNN. Hello ATen!

parent 6d4475db
Changes: 145 · Showing 20 changed files with 1431 additions and 118 deletions (+1431 −118)
sparseconvnet/SCN/CPU/BatchNormalization.h                +21  −21
sparseconvnet/SCN/CPU/BatchwiseMultiplicativeDropout.cpp  +38  −0
sparseconvnet/SCN/CPU/Convolution.cpp                     +365 −0
sparseconvnet/SCN/CPU/Deconvolution.cpp                   +89  −0
sparseconvnet/SCN/CPU/IOLayers.cpp                        +190 −0
sparseconvnet/SCN/CPU/IOLayers.h                          +11  −11
sparseconvnet/SCN/CPU/LeakyReLU.cpp                       +32  −0
sparseconvnet/SCN/CPU/MaxPooling.cpp                      +110 −0
sparseconvnet/SCN/CPU/MaxPooling.h                        +13  −13
sparseconvnet/SCN/CPU/NetworkInNetwork.cpp                +42  −0
sparseconvnet/SCN/CPU/SparseToDense.cpp                   +61  −0
sparseconvnet/SCN/CPU/SparseToDense.h                     +7   −7
sparseconvnet/SCN/CPU/UnPooling.cpp                       +56  −0
sparseconvnet/SCN/CPU/UnPooling.h                         +33  −0
sparseconvnet/SCN/CUDA/ActivePooling.cu                   +67  −0
sparseconvnet/SCN/CUDA/ActivePooling.h                    +55  −0
sparseconvnet/SCN/CUDA/AffineReluTrivialConvolution.cu    +47  −0
sparseconvnet/SCN/CUDA/AffineReluTrivialConvolution.h     +64  −66
sparseconvnet/SCN/CUDA/AveragePooling.cu                  +54  −0
sparseconvnet/SCN/CUDA/AveragePooling.h                   +76  −0
sparseconvnet/SCN/generic/CPU/BatchNormalization.h → sparseconvnet/SCN/CPU/BatchNormalization.h
@@ -6,7 +6,7 @@
 #ifndef CPU_BATCHNORMALIZATION_H
 #define CPU_BATCHNORMALIZATION_H
 #include "../SparseConvNet.h"
 #include <vector>
 // in/output_stride is normally the same as nPlanes; allow other values to act
@@ -14,28 +14,28 @@
 template <typename T>
 void BatchNormalization_ForwardPass(T *input_features, T *output_features,
-                                    uInt nPlanes, uInt input_stride,
-                                    uInt output_stride, uInt nActive,
+                                    Int nPlanes, Int input_stride,
+                                    Int output_stride, Int nActive,
                                     T *saveMean, T *saveInvStd, T *runningMean,
                                     T *runningVar, T *weight, T *bias, T eps,
                                     T momentum, bool train, T leakiness) {
   if (train) {
     std::memset(saveMean, 0, nPlanes * sizeof(T));
     std::memset(saveInvStd, 0, nPlanes * sizeof(T));
-    for (uInt row = 0, ci = 0; row < nActive;
+    for (Int row = 0, ci = 0; row < nActive;
          row++, ci += input_stride - nPlanes) {
-      for (uInt plane = 0; plane < nPlanes; plane++, ci++) {
+      for (Int plane = 0; plane < nPlanes; plane++, ci++) {
         saveMean[plane] += input_features[ci];
       }
     }
-    for (uInt plane = 0; plane < nPlanes; plane++) {
+    for (Int plane = 0; plane < nPlanes; plane++) {
       saveMean[plane] /= nActive;
       runningMean[plane] =
           momentum * runningMean[plane] + (1 - momentum) * saveMean[plane];
     }
-    for (uInt row = 0, ci = 0; row < nActive;
+    for (Int row = 0, ci = 0; row < nActive;
          row++, ci += input_stride - nPlanes) {
-      for (uInt plane = 0; plane < nPlanes; plane++, ci++) {
+      for (Int plane = 0; plane < nPlanes; plane++, ci++) {
         saveInvStd[plane] += (input_features[ci] - saveMean[plane]) *
                              (input_features[ci] - saveMean[plane]);
         // accumulate sum-squares
@@ -43,26 +43,26 @@ void BatchNormalization_ForwardPass(T *input_features, T *output_features,
         // rooting
       }
     }
-    for (uInt plane = 0; plane < nPlanes; plane++) {
+    for (Int plane = 0; plane < nPlanes; plane++) {
       runningVar[plane] = momentum * runningVar[plane] +
                           (1 - momentum) * saveInvStd[plane] / (nActive - 1);
       saveInvStd[plane] = powf(saveInvStd[plane] / nActive + eps, -0.5);
     }
   } else {
-    for (uInt plane = 0; plane < nPlanes; plane++) {
+    for (Int plane = 0; plane < nPlanes; plane++) {
       saveMean[plane] = runningMean[plane];
       saveInvStd[plane] = powf(runningVar[plane] + eps, -0.5);
     }
   }
   std::vector<T> w(nPlanes);
   std::vector<T> b(nPlanes);
-  for (uInt plane = 0; plane < nPlanes; plane++) {
+  for (Int plane = 0; plane < nPlanes; plane++) {
     w[plane] = saveInvStd[plane] * (weight ? weight[plane] : 1);
     b[plane] = -saveMean[plane] * w[plane] + (bias ? bias[plane] : 0);
   }
-  for (uInt row = 0, ci = 0, co = 0; row < nActive;
+  for (Int row = 0, ci = 0, co = 0; row < nActive;
        row++, ci += input_stride - nPlanes, co += output_stride - nPlanes) {
-    for (uInt plane = 0; plane < nPlanes; plane++, ci++, co++) {
+    for (Int plane = 0; plane < nPlanes; plane++, ci++, co++) {
       T out = input_features[ci] * w[plane] + b[plane];
       out = (out > 0) ? out : (out * leakiness);
       output_features[co] = out;
@@ -73,17 +73,17 @@ void BatchNormalization_ForwardPass(T *input_features, T *output_features,
 template <typename T>
 void BatchNormalization_BackwardPass(T *input_features, T *d_input_features,
                                      T *output_features, T *d_output_features,
-                                     uInt nPlanes, uInt input_stride,
-                                     uInt output_stride, uInt nActive,
+                                     Int nPlanes, Int input_stride,
+                                     Int output_stride, Int nActive,
                                      T *saveMean, T *saveInvStd,
                                      T *runningMean, T *runningVar, T *weight,
                                      T *bias, T *d_weight, T *d_bias,
                                      T leakiness) {
   std::vector<T> gradMean(nPlanes);
   std::vector<T> dotp(nPlanes);
   std::vector<T> k(nPlanes);
-  for (uInt row = 0, ci = 0, co = 0; row < nActive;
+  for (Int row = 0, ci = 0, co = 0; row < nActive;
        row++, ci += input_stride - nPlanes, co += output_stride - nPlanes) {
-    for (uInt plane = 0; plane < nPlanes; plane++, ci++, co++) {
+    for (Int plane = 0; plane < nPlanes; plane++, ci++, co++) {
       T d = d_output_features[co];
       d = (output_features[co] > 0) ? d : (d * leakiness);
       d_output_features[co] = d;
@@ -91,15 +91,15 @@ void BatchNormalization_BackwardPass(T *input_features, T *d_input_features,
       dotp[plane] += (input_features[ci] - saveMean[plane]) * d;
     }
   }
-  for (uInt plane = 0; plane < nPlanes; plane++) {
+  for (Int plane = 0; plane < nPlanes; plane++) {
     if (d_bias)
       d_bias[plane] = gradMean[plane]; // sum of grads, really, until ...
     gradMean[plane] /= nActive;        // ...now
     k[plane] = dotp[plane] * saveInvStd[plane] * saveInvStd[plane] / nActive;
   }
-  for (uInt row = 0, ci = 0, co = 0; row < nActive;
+  for (Int row = 0, ci = 0, co = 0; row < nActive;
        row++, ci += input_stride - nPlanes, co += output_stride - nPlanes) {
-    for (uInt plane = 0; plane < nPlanes; plane++, ci++, co++) {
+    for (Int plane = 0; plane < nPlanes; plane++, ci++, co++) {
       d_input_features[ci] =
           (d_output_features[co] - gradMean[plane] -
            (input_features[ci] - saveMean[plane]) * k[plane]) *
@@ -107,7 +107,7 @@ void BatchNormalization_BackwardPass(T *input_features, T *d_input_features,
     }
   }
   if (d_weight)
-    for (uInt plane = 0; plane < nPlanes; plane++) {
+    for (Int plane = 0; plane < nPlanes; plane++) {
      d_weight[plane] = dotp[plane] * saveInvStd[plane];
    }
 }
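Reading the forward kernel above: per plane, the normalization, scale and shift collapse into a single multiply-add, followed by the leaky ReLU. With per-plane mean $\mu$ (saveMean) and inverse standard deviation $\hat\sigma^{-1} = (\sigma^2 + \epsilon)^{-1/2}$ (saveInvStd),

$$y = \gamma\,\hat\sigma^{-1}(x - \mu) + \beta = w\,x + b, \qquad w = \hat\sigma^{-1}\gamma, \quad b = \beta - \mu w,$$

which is exactly the precomputed w[plane] and b[plane] in the code, with weight ($\gamma$) defaulting to 1 and bias ($\beta$) to 0 when the pointers are null.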
sparseconvnet/SCN/CPU/BatchwiseMultiplicativeDropout.cpp · new file (0 → 100644)
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

template <typename T>
void cpu_BatchwiseMultiplicativeDropout_updateOutput(
    /*float*/ at::Tensor input_features, /*float*/ at::Tensor output_features,
    /*float*/ at::Tensor noise, float alpha) {
  output_features.resize_as_(input_features);
  auto nActive = input_features.size(0);
  auto nPlanes = input_features.size(1);
  auto iF = input_features.data<T>();
  auto oF = output_features.data<T>();
  auto nz = noise.data<T>();
  for (Int row = 0; row < nActive; row++)
    for (Int plane = 0, o = row * nPlanes, i = row * nPlanes;
         plane < nPlanes; plane++, o++, i++)
      oF[o] = (iF[i] > 0) ? iF[i] * nz[plane] : iF[i] * nz[plane] * alpha;
}

template <typename T>
void cpu_BatchwiseMultiplicativeDropout_updateGradInput(
    /*float*/ at::Tensor input_features, /*float*/ at::Tensor d_input_features,
    /*float*/ at::Tensor d_output_features, /*float*/ at::Tensor noise,
    float alpha) {
  d_input_features.resize_as_(d_output_features);
  auto nActive = input_features.size(0);
  auto nPlanes = input_features.size(1);
  auto iF = input_features.data<T>();
  auto diF = d_input_features.data<T>();
  auto doF = d_output_features.data<T>();
  auto nz = noise.data<T>();
  for (Int row = 0; row < nActive; row++)
    for (Int plane = 0, o = row * nPlanes, i = row * nPlanes;
         plane < nPlanes; plane++, o++, i++)
      diF[i] = (iF[i] > 0) ? doF[o] * nz[plane] : doF[o] * nz[plane] * alpha;
}
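Note that dropout and the nonlinearity are fused here: every row is scaled by the same per-plane noise vector nz, and negative inputs are additionally scaled by the leaky-ReLU slope alpha, so the backward pass can reconstruct the effective mask from input_features and noise alone.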
sparseconvnet/SCN/CPU/Convolution.cpp · new file (0 → 100644)
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#include <cstring>

template <typename T>
void rule_index_select(at::Tensor target, at::Tensor src, Int nRules,
                       Int *rules) {
  auto t_ptr = target.data<T>();
  auto s_ptr = src.data<T>();
  auto n = target.size(1);
  for (int i = 0; i < nRules; ++i)
    std::memcpy(t_ptr + i * n, s_ptr + rules[2 * i] * n, sizeof(T) * n);
}

template <typename T>
void rule_index_add_(at::Tensor target, at::Tensor src, Int nRules,
                     Int *rules) {
  auto t_ptr = target.data<T>();
  auto s_ptr = src.data<T>();
  auto n = target.size(1);
  for (int i = 0; i < nRules; ++i) {
    auto t = t_ptr + rules[2 * i] * n;
    auto s = s_ptr + i * n;
    for (int j = 0; j < n; ++j)
      t[j] += s[j];
  }
}

template <typename T, Int Dimension>
double cpu_Convolution_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize, /*long*/ at::Tensor filterStride,
    Metadata<Dimension> &m, /*float*/ at::Tensor input_features,
    /*float*/ at::Tensor output_features, /*float*/ at::Tensor weight,
    /*float*/ at::Tensor bias) {
  auto _rules =
      m.getRuleBook(inputSize, outputSize, filterSize, filterStride, true);
  Int nActive = m.getNActive(outputSize);
  output_features.resize_({nActive, weight.size(2)});
  if (bias.numel() and nActive)
    output_features.copy_(bias);
  else
    output_features.zero_();
  double flops = 0;
  auto ip = weight.size(1);
  auto op = weight.size(2);
  for (Int i = 0; i < (Int)_rules.size(); i++) {
    auto r = _rules[i];
    int nRules = r.size() / 2;
    if (nRules) {
      flops += nRules * ip * op;
      // auto rt = torch::CPU(at_kINT).tensorFromBlob(&r[0], {nRules, 2});
      // auto input_rows = input_features.index_select(0, rt.select(1, 0));
      // auto w = weight.select(0, i);
      // auto output_rows = at::mm(input_rows, w);
      // output_features.index_add_(0, rt.select(1, 1), output_rows);
      auto input_rows = input_features.type().tensor({nRules, ip});
      rule_index_select<T>(input_rows, input_features, nRules, &r[0]);
      auto w = weight.select(0, i);
      auto output_rows = at::mm(input_rows, w);
      rule_index_add_<T>(output_features, output_rows, nRules, &r[1]);
    }
  }
  return flops;
}

template <typename T, Int Dimension>
void cpu_Convolution_backward(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize, /*long*/ at::Tensor filterStride,
    Metadata<Dimension> &m, /*float*/ at::Tensor input_features,
    /*float*/ at::Tensor d_input_features,
    /*float*/ at::Tensor d_output_features, /*float*/ at::Tensor weight,
    /*float*/ at::Tensor d_weight, /*float*/ at::Tensor d_bias) {
  auto _rules =
      m.getRuleBook(inputSize, outputSize, filterSize, filterStride, true);
  Int nActive = m.getNActive(inputSize);
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();
  if (nActive and d_bias.numel())
    at::sum_out(d_bias, d_output_features, {0}, false);
  auto ip = weight.size(1);
  auto op = weight.size(2);
  for (Int i = 0; i < (Int)_rules.size(); i++) {
    auto r = _rules[i];
    int nRules = r.size() / 2;
    if (nRules) {
      auto w = weight.select(0, i);
      auto dw = d_weight.select(0, i);
      // auto rt = torch::CPU(at_kINT).tensorFromBlob(&r[0], {nRules, 2});
      // auto input_rows = input_features.index_select(0, rt.select(1, 0));
      // auto d_output_rows = d_output_features.index_select(0, rt.select(1,
      // 1));
      // at::mm_out(dw, input_rows.t(), d_output_rows);
      // auto d_input_rows = at::mm(d_output_rows, w.t());
      // d_input_features.index_add_(0, rt.select(1, 0), d_input_rows);
      auto input_rows = input_features.type().tensor({nRules, ip});
      rule_index_select<T>(input_rows, input_features, nRules, &r[0]);
      auto d_output_rows = d_output_features.type().tensor({nRules, op});
      rule_index_select<T>(d_output_rows, d_output_features, nRules, &r[1]);
      at::mm_out(dw, input_rows.t(), d_output_rows);
      auto d_input_rows = at::mm(d_output_rows, w.t());
      rule_index_add_<T>(d_input_features, d_input_rows, nRules, &r[0]);
    }
  }
}

template <typename T, Int Dimension>
double cpu_SubmanifoldConvolution_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor filterSize,
    Metadata<Dimension> &m, /*float*/ at::Tensor input_features,
    /*float*/ at::Tensor output_features, /*float*/ at::Tensor weight,
    /*float*/ at::Tensor bias) {
  auto _rules = m.getSubmanifoldRuleBook(inputSize, filterSize, true);
  Int nActive = m.getNActive(inputSize);
  output_features.resize_({nActive, weight.size(2)});
  if (bias.numel() and nActive)
    output_features.copy_(bias);
  else
    output_features.zero_();
  double flops = 0;
  auto ip = weight.size(1);
  auto op = weight.size(2);
  for (Int i = 0; i < (Int)_rules.size(); i++) {
    auto r = _rules[i];
    int nRules = r.size() / 2;
    if (nRules) {
      flops += nRules * ip * op;
      // auto rt = torch::CPU(at_kINT).tensorFromBlob(&r[0], {nRules, 2});
      // auto input_rows = input_features.index_select(0, rt.select(1, 0));
      // auto w = weight.select(0, i);
      // auto output_rows = at::mm(input_rows, w);
      // output_features.index_add_(0, rt.select(1, 1), output_rows);
      auto input_rows = input_features.type().tensor({nRules, ip});
      rule_index_select<T>(input_rows, input_features, nRules, &r[0]);
      auto w = weight.select(0, i);
      auto output_rows = at::mm(input_rows, w);
      rule_index_add_<T>(output_features, output_rows, nRules, &r[1]);
    }
  }
  return flops;
}

template <typename T, Int Dimension>
void cpu_SubmanifoldConvolution_backward(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor filterSize,
    Metadata<Dimension> &m, /*float*/ at::Tensor input_features,
    /*float*/ at::Tensor d_input_features,
    /*float*/ at::Tensor d_output_features, /*float*/ at::Tensor weight,
    /*float*/ at::Tensor d_weight, /*float*/ at::Tensor d_bias) {
  auto _rules = m.getSubmanifoldRuleBook(inputSize, filterSize, true);
  Int nActive = m.getNActive(inputSize);
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();
  if (nActive and d_bias.numel())
    at::sum_out(d_bias, d_output_features, {0}, false);
  auto ip = weight.size(1);
  auto op = weight.size(2);
  for (Int i = 0; i < (Int)_rules.size(); i++) {
    auto r = _rules[i];
    int nRules = r.size() / 2;
    if (nRules) {
      auto w = weight.select(0, i);
      auto dw = d_weight.select(0, i);
      // auto rt = torch::CPU(at_kINT).tensorFromBlob(&r[0], {nRules, 2});
      // auto input_rows = input_features.index_select(0, rt.select(1, 0));
      // auto d_output_rows = d_output_features.index_select(0, rt.select(1,
      // 1));
      // at::mm_out(dw, input_rows.t(), d_output_rows);
      // auto d_input_rows = at::mm(d_output_rows, w.t());
      // d_input_features.index_add_(0, rt.select(1, 0), d_input_rows);
      auto input_rows = input_features.type().tensor({nRules, ip});
      rule_index_select<T>(input_rows, input_features, nRules, &r[0]);
      auto d_output_rows = d_output_features.type().tensor({nRules, op});
      rule_index_select<T>(d_output_rows, d_output_features, nRules, &r[1]);
      at::mm_out(dw, input_rows.t(), d_output_rows);
      auto d_input_rows = at::mm(d_output_rows, w.t());
      rule_index_add_<T>(d_input_features, d_input_rows, nRules, &r[0]);
    }
  }
}

template <typename T, Int Dimension>
double cpu_FullConvolution_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize, /*long*/ at::Tensor filterStride,
    Metadata<Dimension> &mIn, Metadata<Dimension> &mOut,
    /*float*/ at::Tensor input_features, /*float*/ at::Tensor output_features,
    /*float*/ at::Tensor weight, /*float*/ at::Tensor bias) {
  auto _rules = mIn.getFullConvolutionRuleBook(inputSize, outputSize,
                                               filterSize, filterStride, mOut);
  Int nActive = mOut.getNActive(outputSize);
  output_features.resize_({nActive, weight.size(2)});
  if (bias.numel() and nActive)
    output_features.copy_(bias);
  else
    output_features.zero_();
  double flops = 0;
  auto ip = weight.size(1);
  auto op = weight.size(2);
  for (Int i = 0; i < (Int)_rules.size(); i++) {
    auto r = _rules[i];
    int nRules = r.size() / 2;
    if (nRules) {
      flops += nRules * ip * op;
      // auto rt = torch::CPU(at_kINT).tensorFromBlob(&r[0], {nRules, 2});
      // auto input_rows = input_features.index_select(0, rt.select(1, 0));
      // auto w = weight.select(0, i);
      // auto output_rows = at::mm(input_rows, w);
      // output_features.index_add_(0, rt.select(1, 1), output_rows);
      auto input_rows = input_features.type().tensor({nRules, ip});
      rule_index_select<T>(input_rows, input_features, nRules, &r[0]);
      auto w = weight.select(0, i);
      auto output_rows = at::mm(input_rows, w);
      rule_index_add_<T>(output_features, output_rows, nRules, &r[1]);
    }
  }
  return flops;
}

template <typename T, Int Dimension>
void cpu_FullConvolution_backward(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize, /*long*/ at::Tensor filterStride,
    Metadata<Dimension> &mIn, Metadata<Dimension> &mOut,
    /*float*/ at::Tensor input_features,
    /*float*/ at::Tensor d_input_features,
    /*float*/ at::Tensor d_output_features, /*float*/ at::Tensor weight,
    /*float*/ at::Tensor d_weight, /*float*/ at::Tensor d_bias) {
  auto _rules = mIn.getFullConvolutionRuleBook(inputSize, outputSize,
                                               filterSize, filterStride, mOut);
  Int nActive = mOut.getNActive(inputSize);
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();
  if (nActive and d_bias.numel())
    at::sum_out(d_bias, d_output_features, {0}, false);
  auto ip = weight.size(1);
  auto op = weight.size(2);
  for (Int i = 0; i < (Int)_rules.size(); i++) {
    auto r = _rules[i];
    int nRules = r.size() / 2;
    if (nRules) {
      auto w = weight.select(0, i);
      auto dw = d_weight.select(0, i);
      // auto rt = torch::CPU(at_kINT).tensorFromBlob(&r[0], {nRules, 2});
      // auto input_rows = input_features.index_select(0, rt.select(1, 0));
      // auto d_output_rows = d_output_features.index_select(0, rt.select(1,
      // 1));
      // at::mm_out(dw, input_rows.t(), d_output_rows);
      // auto d_input_rows = at::mm(d_output_rows, w.t());
      // d_input_features.index_add_(0, rt.select(1, 0), d_input_rows);
      auto input_rows = input_features.type().tensor({nRules, ip});
      rule_index_select<T>(input_rows, input_features, nRules, &r[0]);
      auto d_output_rows = d_output_features.type().tensor({nRules, op});
      rule_index_select<T>(d_output_rows, d_output_features, nRules, &r[1]);
      at::mm_out(dw, input_rows.t(), d_output_rows);
      auto d_input_rows = at::mm(d_output_rows, w.t());
      rule_index_add_<T>(d_input_features, d_input_rows, nRules, &r[0]);
    }
  }
}

template <typename T, Int Dimension>
double cpu_RandomizedStrideConvolution_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize, /*long*/ at::Tensor filterStride,
    Metadata<Dimension> &m, /*float*/ at::Tensor input_features,
    /*float*/ at::Tensor output_features, /*float*/ at::Tensor weight,
    /*float*/ at::Tensor bias) {
  auto _rules = m.getRandomizedStrideRuleBook(inputSize, outputSize,
                                              filterSize, filterStride, true);
  Int nActive = m.getNActive(outputSize);
  output_features.resize_({nActive, weight.size(2)});
  if (bias.numel() and nActive)
    output_features.copy_(bias);
  else
    output_features.zero_();
  double flops = 0;
  auto ip = weight.size(1);
  auto op = weight.size(2);
  for (Int i = 0; i < (Int)_rules.size(); i++) {
    auto r = _rules[i];
    int nRules = r.size() / 2;
    if (nRules) {
      flops += nRules * ip * op;
      // auto rt = torch::CPU(at_kINT).tensorFromBlob(&r[0], {nRules, 2});
      // auto input_rows = input_features.index_select(0, rt.select(1, 0));
      // auto w = weight.select(0, i);
      // auto output_rows = at::mm(input_rows, w);
      // output_features.index_add_(0, rt.select(1, 1), output_rows);
      auto input_rows = input_features.type().tensor({nRules, ip});
      rule_index_select<T>(input_rows, input_features, nRules, &r[0]);
      auto w = weight.select(0, i);
      auto output_rows = at::mm(input_rows, w);
      rule_index_add_<T>(output_features, output_rows, nRules, &r[1]);
    }
  }
  return flops;
}

template <typename T, Int Dimension>
void cpu_RandomizedStrideConvolution_backward(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize, /*long*/ at::Tensor filterStride,
    Metadata<Dimension> &m, /*float*/ at::Tensor input_features,
    /*float*/ at::Tensor d_input_features,
    /*float*/ at::Tensor d_output_features, /*float*/ at::Tensor weight,
    /*float*/ at::Tensor d_weight, /*float*/ at::Tensor d_bias) {
  auto _rules = m.getRandomizedStrideRuleBook(inputSize, outputSize,
                                              filterSize, filterStride, true);
  Int nActive = m.getNActive(inputSize);
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();
  if (nActive and d_bias.numel())
    at::sum_out(d_bias, d_output_features, {0}, false);
  auto ip = weight.size(1);
  auto op = weight.size(2);
  for (Int i = 0; i < (Int)_rules.size(); i++) {
    auto r = _rules[i];
    int nRules = r.size() / 2;
    if (nRules) {
      auto w = weight.select(0, i);
      auto dw = d_weight.select(0, i);
      // auto rt = torch::CPU(at_kINT).tensorFromBlob(&r[0], {nRules, 2});
      // auto input_rows = input_features.index_select(0, rt.select(1, 0));
      // auto d_output_rows = d_output_features.index_select(0, rt.select(1,
      // 1));
      // at::mm_out(dw, input_rows.t(), d_output_rows);
      // auto d_input_rows = at::mm(d_output_rows, w.t());
      // d_input_features.index_add_(0, rt.select(1, 0), d_input_rows);
      auto input_rows = input_features.type().tensor({nRules, ip});
      rule_index_select<T>(input_rows, input_features, nRules, &r[0]);
      auto d_output_rows = d_output_features.type().tensor({nRules, op});
      rule_index_select<T>(d_output_rows, d_output_features, nRules, &r[1]);
      at::mm_out(dw, input_rows.t(), d_output_rows);
      auto d_input_rows = at::mm(d_output_rows, w.t());
      rule_index_add_<T>(d_input_features, d_input_rows, nRules, &r[0]);
    }
  }
}
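As the commented-out lines show, rule_index_select and rule_index_add_ replace ATen's index_select / index_add_ with direct memcpy-based gather and scatter-add over the rulebook: each rule is an (input row, output row) pair, and passing &r[1] instead of &r[0] shifts the array by one so the same loop reads the other half of each pair. A minimal standalone sketch of that gather/GEMM/scatter pattern (illustrative only, not code from this commit; plain C arrays stand in for at::Tensor):

#include <cstdio>
#include <cstring>

int main() {
  const int n = 2;             // feature planes per row
  const int nRules = 2;        // number of (in, out) pairs
  float src[4][2] = {{1, 2}, {3, 4}, {5, 6}, {7, 8}};
  int rules[4] = {0, 2, 3, 0}; // pairs: (in=0, out=2), (in=3, out=0)
  float gathered[2][2];
  float dst[4][2] = {{0}};
  // rule_index_select: gather the input rows named by the even entries
  for (int i = 0; i < nRules; ++i)
    std::memcpy(gathered[i], src[rules[2 * i]], sizeof(float) * n);
  // (a real layer multiplies `gathered` by the per-offset weight matrix
  // with at::mm here; this sketch scatters the gathered rows directly)
  // rule_index_add_: scatter-add into the rows named by the odd entries
  for (int i = 0; i < nRules; ++i)
    for (int j = 0; j < n; ++j)
      dst[rules[2 * i + 1]][j] += gathered[i][j];
  std::printf("dst[2] = {%g, %g}\n", dst[2][0], dst[2][1]); // {1, 2}
  return 0;
}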
sparseconvnet/SCN/CPU/Deconvolution.cpp · new file (0 → 100644)
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

template <typename T, Int Dimension>
double cpu_Deconvolution_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize, /*long*/ at::Tensor filterStride,
    Metadata<Dimension> &m, /*float*/ at::Tensor input_features,
    /*float*/ at::Tensor output_features, /*float*/ at::Tensor weight,
    /*float*/ at::Tensor bias) {
  auto _rules =
      m.getRuleBook(outputSize, inputSize, filterSize, filterStride, true);
  Int nActive = m.getNActive(outputSize);
  output_features.resize_({nActive, weight.size(2)});
  if (bias.numel() and nActive)
    output_features.copy_(bias);
  else
    output_features.zero_();
  double flops = 0;
  auto ip = weight.size(1);
  auto op = weight.size(2);
  for (Int i = 0; i < (Int)_rules.size(); i++) {
    auto r = _rules[i];
    int nRules = r.size() / 2;
    if (nRules) {
      flops += nRules * ip * op;
      // auto rt = torch::CPU(at_kINT).tensorFromBlob(&r[0], {nRules, 2});
      // auto input_rows = input_features.index_select(0, rt.select(1, 1));
      // auto w = weight.select(0, i);
      // auto output_rows = at::mm(input_rows, w);
      // output_features.index_add_(0, rt.select(1, 0), output_rows);
      auto input_rows = input_features.type().tensor({nRules, ip});
      rule_index_select<T>(input_rows, input_features, nRules, &r[1]);
      auto w = weight.select(0, i);
      auto output_rows = at::mm(input_rows, w);
      rule_index_add_<T>(output_features, output_rows, nRules, &r[0]);
    }
  }
  return flops;
}

template <typename T, Int Dimension>
void cpu_Deconvolution_backward(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize, /*long*/ at::Tensor filterStride,
    Metadata<Dimension> &m, /*float*/ at::Tensor input_features,
    /*float*/ at::Tensor d_input_features,
    /*float*/ at::Tensor d_output_features, /*float*/ at::Tensor weight,
    /*float*/ at::Tensor d_weight, /*float*/ at::Tensor d_bias) {
  auto _rules =
      m.getRuleBook(outputSize, inputSize, filterSize, filterStride, true);
  Int nActive = m.getNActive(inputSize);
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();
  if (nActive and d_bias.numel())
    at::sum_out(d_bias, d_output_features, {0}, false);
  auto ip = weight.size(1);
  auto op = weight.size(2);
  for (Int i = 0; i < (Int)_rules.size(); i++) {
    auto r = _rules[i];
    int nRules = r.size() / 2;
    if (nRules) {
      auto w = weight.select(0, i);
      auto dw = d_weight.select(0, i);
      // auto rt = torch::CPU(at_kINT).tensorFromBlob(&r[0], {nRules, 2});
      // auto input_rows = input_features.index_select(0, rt.select(1, 1));
      // auto d_output_rows = d_output_features.index_select(0, rt.select(1,
      // 0));
      // at::mm_out(dw, input_rows.t(), d_output_rows);
      // auto d_input_rows = at::mm(d_output_rows, w.t());
      // d_input_features.index_add_(0, rt.select(1, 1), d_input_rows);
      auto input_rows = input_features.type().tensor({nRules, ip});
      rule_index_select<T>(input_rows, input_features, nRules, &r[1]);
      auto d_output_rows = d_output_features.type().tensor({nRules, op});
      rule_index_select<T>(d_output_rows, d_output_features, nRules, &r[0]);
      at::mm_out(dw, input_rows.t(), d_output_rows);
      auto d_input_rows = at::mm(d_output_rows, w.t());
      rule_index_add_<T>(d_input_features, d_input_rows, nRules, &r[1]);
    }
  }
}
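Deconvolution reuses the convolution machinery with the roles of input and output swapped: the rulebook is built as getRuleBook(outputSize, inputSize, ...), and each rule pair is consumed from the opposite half (&r[1] to gather input rows, &r[0] to scatter output rows) relative to cpu_Convolution_updateOutput.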
sparseconvnet/SCN/CPU/IOLayers.cpp · new file (0 → 100644)
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#include "IOLayers.h"

template <typename T, Int Dimension>
void cpu_InputLayer_updateOutput(Metadata<Dimension> &m,
                                 /*long*/ at::Tensor spatialSize,
                                 /*long*/ at::Tensor input_coords,
                                 /*float*/ at::Tensor input_features,
                                 /*float*/ at::Tensor output_features,
                                 long batchSize, long mode) {
  m.inputLayer(spatialSize, input_coords, batchSize, mode);
  auto nPlanes = input_features.size(1);
  auto &rules = m.inputLayerRuleBook;
  auto maxActive = rules[0][1];
  auto nRows = rules[0][3];
  if (mode == 0) {
    output_features.resize_as_(input_features);
    output_features.copy_(input_features);
  } else {
    output_features.resize_({*m.inputNActive, nPlanes});
    output_features.zero_();
    InputLayer_ForwardPass<T>(input_features.data<T>(),
                              output_features.data<T>(), nRows, maxActive,
                              nPlanes, &rules[1][0], mode == 4);
  }
}

template <typename T, Int Dimension>
void cpu_InputLayer_updateGradInput(Metadata<Dimension> &m,
                                    /*float*/ at::Tensor d_input_features,
                                    /*float*/ at::Tensor d_output_features) {
  auto &rules = m.inputLayerRuleBook;
  auto nPlanes = d_output_features.size(1);
  auto mode = rules[0][0];
  auto maxActive = rules[0][1];
  auto nRows = rules[0][3];
  if (mode == 0) {
    d_input_features.resize_as_(d_output_features);
    d_input_features.copy_(d_output_features);
  } else {
    d_input_features.resize_({rules[0][2], nPlanes});
    d_input_features.zero_();
    InputLayer_BackwardPass<T>(d_input_features.data<T>(),
                               d_output_features.data<T>(), nRows, maxActive,
                               nPlanes, &rules[1][0], mode == 4);
  }
}

template <typename T, Int Dimension>
void cpu_OutputLayer_updateOutput(Metadata<Dimension> &m,
                                  /*float*/ at::Tensor input_features,
                                  /*float*/ at::Tensor output_features) {
  auto &rules = m.inputLayerRuleBook;
  auto nPlanes = input_features.size(1);
  auto mode = rules[0][0];
  auto maxActive = rules[0][1];
  auto nRows = rules[0][3];
  if (mode == 0) {
    output_features.resize_as_(input_features);
    output_features.copy_(input_features);
  } else {
    output_features.resize_({rules[0][2], nPlanes});
    output_features.zero_();
    InputLayer_BackwardPass<T>(output_features.data<T>(),
                               input_features.data<T>(), nRows, maxActive,
                               nPlanes, &rules[1][0], false);
  }
}

template <typename T, Int Dimension>
void cpu_OutputLayer_updateGradInput(Metadata<Dimension> &m,
                                     /*float*/ at::Tensor d_input_features,
                                     /*float*/ at::Tensor d_output_features) {
  auto &rules = m.inputLayerRuleBook;
  auto nPlanes = d_output_features.size(1);
  auto mode = rules[0][0];
  auto maxActive = rules[0][1];
  auto nRows = rules[0][3];
  if (mode == 0) {
    d_input_features.resize_as_(d_output_features);
    d_input_features.copy_(d_output_features);
  } else {
    d_input_features.resize_({nRows, nPlanes});
    d_input_features.zero_();
    InputLayer_ForwardPass<T>(d_output_features.data<T>(),
                              d_input_features.data<T>(), nRows, maxActive,
                              nPlanes, &rules[1][0], false);
  }
}

template <typename T, Int Dimension>
void cpu_BLInputLayer_updateOutput(Metadata<Dimension> &m,
                                   /*long*/ at::Tensor spatialSize,
                                   /*long*/ at::Tensor input_coords,
                                   /*float*/ at::Tensor input_features,
                                   /*float*/ at::Tensor output_features,
                                   long mode) {
  m.blLayer(spatialSize, input_coords, mode);
  auto nPlanes = input_features.size(2);
  auto &rules = m.blLayerRuleBook;
  auto maxActive = rules[0][1];
  auto nRows = rules[0][4];
  if (mode == 0) {
    output_features.resize_as_(input_features);
    output_features.copy_(input_features);
    output_features.resize_({*m.inputNActive, nPlanes});
  } else {
    output_features.resize_({*m.inputNActive, nPlanes});
    output_features.zero_();
    InputLayer_ForwardPass<T>(input_features.data<T>(),
                              output_features.data<T>(), nRows, maxActive,
                              nPlanes, &rules[1][0], mode == 4);
  }
}

template <typename T, Int Dimension>
void cpu_BLInputLayer_updateGradInput(Metadata<Dimension> &m,
                                      /*float*/ at::Tensor d_input_features,
                                      /*float*/ at::Tensor d_output_features) {
  auto &rules = m.blLayerRuleBook;
  auto nPlanes = d_output_features.size(1);
  auto mode = rules[0][0];
  auto maxActive = rules[0][1];
  auto nRows = rules[0][4];
  if (mode == 0) {
    d_input_features.resize_as_(d_output_features);
    d_input_features.copy_(d_output_features);
    d_input_features.resize_({rules[0][2], rules[0][3], nPlanes});
  } else {
    d_input_features.resize_({rules[0][2], rules[0][3], nPlanes});
    d_input_features.zero_();
    InputLayer_BackwardPass<T>(d_input_features.data<T>(),
                               d_output_features.data<T>(), nRows, maxActive,
                               nPlanes, &rules[1][0], mode == 4);
  }
}

template <typename T, Int Dimension>
void cpu_BLOutputLayer_updateOutput(Metadata<Dimension> &m,
                                    /*float*/ at::Tensor input_features,
                                    /*float*/ at::Tensor output_features) {
  auto &rules = m.blLayerRuleBook;
  auto nPlanes = input_features.size(1);
  auto mode = rules[0][0];
  auto maxActive = rules[0][1];
  auto nRows = rules[0][4];
  if (mode == 0) {
    output_features.resize_as_(input_features);
    output_features.copy_(input_features);
    output_features.resize_({rules[0][2], rules[0][3], nPlanes});
  } else {
    output_features.resize_({rules[0][2], rules[0][3], nPlanes});
    output_features.zero_();
    InputLayer_BackwardPass<T>(output_features.data<T>(),
                               input_features.data<T>(), nRows, maxActive,
                               nPlanes, &rules[1][0], false);
  }
}

template <typename T, Int Dimension>
void cpu_BLOutputLayer_updateGradInput(Metadata<Dimension> &m,
                                       /*float*/ at::Tensor d_input_features,
                                       /*float*/ at::Tensor d_output_features) {
  auto &rules = m.blLayerRuleBook;
  auto nPlanes = d_output_features.size(2);
  auto mode = rules[0][0];
  auto maxActive = rules[0][1];
  auto nRows = rules[0][4];
  if (mode == 0) {
    d_input_features.resize_as_(d_output_features);
    d_input_features.copy_(d_output_features);
    d_input_features.resize_({nRows, nPlanes});
  } else {
    d_input_features.resize_({nRows, nPlanes});
    d_input_features.zero_();
    InputLayer_ForwardPass<T>(d_output_features.data<T>(),
                              d_input_features.data<T>(), nRows, maxActive,
                              nPlanes, &rules[1][0], false);
  }
}
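In all of these layers rules[0] is a small header: rules[0][0] holds the mode, rules[0][1] the maximum number of input sites per output row (maxActive), and the row count sits at rules[0][3] for the plain input layer and rules[0][4] for the batch-list "BL" variants, with the remaining entries recording the original input shape used to size gradients. mode == 0 is a pure copy-through, and mode == 4 turns the summation into an average.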
sparseconvnet/SCN/generic/CPU/IOLayers.h → sparseconvnet/SCN/CPU/IOLayers.h
@@ -6,21 +6,21 @@
 #ifndef CPU_IOLAYERS_H
 #define CPU_IOLAYERS_H
 #include "../SparseConvNet.h"
 #include <cstring>
 // Assume output and d_input_features have been zero-ed
 template <typename T>
-void InputLayer_ForwardPass(T *input_features, T *output_features, uInt nRows,
-                            uInt maxActive, uInt nPlanes, uInt *rules,
+void InputLayer_ForwardPass(T *input_features, T *output_features, Int nRows,
+                            Int maxActive, Int nPlanes, Int *rules,
                             bool average) {
-  for (uInt row = 0; row < nRows; row++) {
+  for (Int row = 0; row < nRows; row++) {
     auto nActive = rules[0];
     T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
-    for (uInt i = 1; i <= nActive; ++i) {
+    for (Int i = 1; i <= nActive; ++i) {
       auto in_f = input_features + nPlanes * rules[i];
-      for (uInt plane = 0; plane < nPlanes; plane++) {
+      for (Int plane = 0; plane < nPlanes; plane++) {
         output_features[plane] += multiplier * in_f[plane];
       }
     }
@@ -30,14 +30,14 @@ void InputLayer_ForwardPass(T *input_features, T *output_features, uInt nRows,
 }
 template <typename T>
 void InputLayer_BackwardPass(T *d_input_features, T *d_output_features,
-                             uInt nRows, uInt maxActive, uInt nPlanes,
-                             uInt *rules, bool average) {
-  for (uInt row = 0; row < nRows; row++) {
+                             Int nRows, Int maxActive, Int nPlanes,
+                             Int *rules, bool average) {
+  for (Int row = 0; row < nRows; row++) {
     auto nActive = rules[0];
     T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
-    for (uInt i = 1; i <= nActive; ++i) {
+    for (Int i = 1; i <= nActive; ++i) {
       auto d_in_f = d_input_features + nPlanes * rules[i];
-      for (uInt plane = 0; plane < nPlanes; plane++)
+      for (Int plane = 0; plane < nPlanes; plane++)
         d_in_f[plane] += multiplier * d_output_features[plane];
     }
     d_output_features += nPlanes;
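A worked example of the rule format consumed above: with nPlanes = 2, an output row whose rule list begins {3, 5, 9, 12} has nActive = 3 contributing input sites, namely rows 5, 9 and 12 of input_features. The forward pass adds each of those rows into the current output row's two planes, scaled by 1/3 when average is set, before moving on to the next output row.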
sparseconvnet/SCN/CPU/LeakyReLU.cpp · new file (0 → 100644)
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

template <typename T>
void cpu_LeakyReLU_updateOutput(/*float*/ at::Tensor input_features,
                                /*float*/ at::Tensor output_features,
                                float alpha) {
  output_features.resize_as_(input_features);
  auto iF = input_features.data<T>();
  auto oF = output_features.data<T>();
  auto n = input_features.numel();
  for (Int i = 0; i < n; i++)
    oF[i] = (iF[i] > 0) ? iF[i] : iF[i] * alpha;
}

template <typename T>
void cpu_LeakyReLU_updateGradInput(/*float*/ at::Tensor input_features,
                                   /*float*/ at::Tensor d_input_features,
                                   /*float*/ at::Tensor d_output_features,
                                   float alpha) {
  d_input_features.resize_as_(d_output_features);
  auto iF = input_features.data<T>();
  auto diF = d_input_features.data<T>();
  auto doF = d_output_features.data<T>();
  auto n = d_input_features.numel();
  for (Int i = 0; i < n; i++)
    diF[i] = (iF[i] > 0) ? doF[i] : doF[i] * alpha;
}
sparseconvnet/SCN/CPU/MaxPooling.cpp · new file (0 → 100644)
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#include "MaxPooling.h"

template <typename T, Int Dimension>
void cpu_MaxPooling_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor poolSize, /*long*/ at::Tensor poolStride,
    Metadata<Dimension> &m, /*float*/ at::Tensor input_features,
    /*float*/ at::Tensor output_features, long nFeaturesToDrop) {
  Int nPlanes = input_features.size(1) - nFeaturesToDrop;
  auto _rules =
      m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  Int nActive = m.getNActive(outputSize);
  output_features.resize_({nActive, input_features.size(1) - nFeaturesToDrop});
  output_features.zero_();
  auto iF = input_features.data<T>() + nFeaturesToDrop;
  auto oF = output_features.data<T>();
  for (auto &r : _rules) {
    Int nHot = r.size() / 2;
    MaxPooling_ForwardPass<T>(iF, oF, nPlanes, input_features.stride(0),
                              output_features.stride(0), &r[0], nHot);
  }
}

template <typename T, Int Dimension>
void cpu_MaxPooling_updateGradInput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor poolSize, /*long*/ at::Tensor poolStride,
    Metadata<Dimension> &m, /*float*/ at::Tensor input_features,
    /*float*/ at::Tensor d_input_features,
    /*float*/ at::Tensor output_features,
    /*float*/ at::Tensor d_output_features, long nFeaturesToDrop) {
  Int nPlanes = input_features.size(1) - nFeaturesToDrop;
  auto _rules =
      m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();
  auto iF = input_features.data<T>();
  auto oF = output_features.data<T>();
  auto diF = d_input_features.data<T>();
  auto doF = d_output_features.data<T>();
  for (auto &r : _rules) {
    Int nHot = r.size() / 2;
    MaxPooling_BackwardPass<T>(iF, diF, oF, doF, nPlanes,
                               input_features.stride(0),
                               output_features.stride(0), &r[0], nHot);
  }
}

template <typename T, Int Dimension>
void cpu_RandomizedStrideMaxPooling_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor poolSize, /*long*/ at::Tensor poolStride,
    Metadata<Dimension> &m, /*float*/ at::Tensor input_features,
    /*float*/ at::Tensor output_features, long nFeaturesToDrop) {
  Int nPlanes = input_features.size(1) - nFeaturesToDrop;
  auto _rules = m.getRandomizedStrideRuleBook(inputSize, outputSize, poolSize,
                                              poolStride, true);
  Int nActive = m.getNActive(outputSize);
  output_features.resize_({nActive, input_features.size(1) - nFeaturesToDrop});
  output_features.zero_();
  auto iF = input_features.data<T>() + nFeaturesToDrop;
  auto oF = output_features.data<T>();
  for (auto &r : _rules) {
    Int nHot = r.size() / 2;
    MaxPooling_ForwardPass<T>(iF, oF, nPlanes, input_features.stride(0),
                              output_features.stride(0), &r[0], nHot);
  }
}

template <typename T, Int Dimension>
void cpu_RandomizedStrideMaxPooling_updateGradInput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor poolSize, /*long*/ at::Tensor poolStride,
    Metadata<Dimension> &m, /*float*/ at::Tensor input_features,
    /*float*/ at::Tensor d_input_features,
    /*float*/ at::Tensor output_features,
    /*float*/ at::Tensor d_output_features, long nFeaturesToDrop) {
  Int nPlanes = input_features.size(1) - nFeaturesToDrop;
  auto _rules = m.getRandomizedStrideRuleBook(inputSize, outputSize, poolSize,
                                              poolStride, true);
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();
  auto iF = input_features.data<T>();
  auto oF = output_features.data<T>();
  auto diF = d_input_features.data<T>();
  auto doF = d_output_features.data<T>();
  for (auto &r : _rules) {
    Int nHot = r.size() / 2;
    MaxPooling_BackwardPass<T>(iF, diF, oF, doF, nPlanes,
                               input_features.stride(0),
                               output_features.stride(0), &r[0], nHot);
  }
}
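The nFeaturesToDrop argument trims leading feature planes: the input data pointer is offset by that many planes and the output is allocated with input_features.size(1) - nFeaturesToDrop planes, so pooling acts only on the kept suffix of each row while the full row stride is preserved.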
sparseconvnet/SCN/generic/CPU/MaxPooling.h → sparseconvnet/SCN/CPU/MaxPooling.h
@@ -6,16 +6,16 @@
 #ifndef CPU_MAXPOOLING_H
 #define CPU_MAXPOOLING_H
 #include "../SparseConvNet.h"
 template <typename T>
 void MaxPooling_ForwardPass(T *input_features, T *output_features,
-                            uInt nPlanes, uInt input_stride,
-                            uInt output_stride, uInt *rules, uInt nHot) {
-  for (uInt outSite = 0; outSite < nHot; outSite++) {
-    uInt i = rules[2 * outSite] * input_stride;
-    uInt o = rules[2 * outSite + 1] * output_stride;
-    for (uInt plane = 0; plane < nPlanes; plane++)
+                            Int nPlanes, Int input_stride,
+                            Int output_stride, Int *rules, Int nHot) {
+  for (Int outSite = 0; outSite < nHot; outSite++) {
+    Int i = rules[2 * outSite] * input_stride;
+    Int o = rules[2 * outSite + 1] * output_stride;
+    for (Int plane = 0; plane < nPlanes; plane++)
       if (output_features[o + plane] < input_features[i + plane])
         output_features[o + plane] = input_features[i + plane];
   }
@@ -23,12 +23,12 @@ void MaxPooling_ForwardPass(T *input_features, T *output_features,
 template <typename T>
 void MaxPooling_BackwardPass(T *input_features, T *d_input_features,
                              T *output_features, T *d_output_features,
-                             uInt nPlanes, uInt input_stride,
-                             uInt output_stride, uInt *rules, uInt nHot) {
-  for (uInt outSite = 0; outSite < nHot; outSite++) {
-    uInt i = rules[2 * outSite] * input_stride;
-    uInt o = rules[2 * outSite + 1] * output_stride;
-    for (uInt plane = 0; plane < nPlanes; plane++)
+                             Int nPlanes, Int input_stride,
+                             Int output_stride, Int *rules, Int nHot) {
+  for (Int outSite = 0; outSite < nHot; outSite++) {
+    Int i = rules[2 * outSite] * input_stride;
+    Int o = rules[2 * outSite + 1] * output_stride;
+    for (Int plane = 0; plane < nPlanes; plane++)
       if (output_features[o + plane] == input_features[i + plane])
         d_input_features[i + plane] += d_output_features[o + plane];
   }
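In MaxPooling_BackwardPass the argmax is recovered by value comparison (output_features[o + plane] == input_features[i + plane]) rather than from a stored index, so when several pooled inputs tie for the maximum, each tied site receives the full output gradient.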
sparseconvnet/SCN/CPU/NetworkInNetwork.cpp · new file (0 → 100644)
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

template <typename T>
double cpu_NetworkInNetwork_updateOutput(
    /*float*/ at::Tensor input_features, /*float*/ at::Tensor output_features,
    /*float*/ at::Tensor weight, /*float*/ at::Tensor bias) {
  auto nActive = input_features.size(0);
  auto input_nPlanes = weight.size(0);
  auto output_nPlanes = weight.size(1);
  output_features.resize_({nActive, output_nPlanes});
  if (bias.numel())
    output_features.copy_(bias);
  else
    output_features.zero_();
  output_features.addmm(input_features, weight);
  return nActive * input_nPlanes * output_nPlanes;
}

template <typename T>
void cpu_NetworkInNetwork_updateGradInput(
    /*float*/ at::Tensor d_input_features,
    /*float*/ at::Tensor d_output_features, /*float*/ at::Tensor weight) {
  d_input_features.resize_({(int)d_output_features.size(0), weight.size(0)});
  d_input_features.zero_();
  at::mm_out(d_input_features, d_output_features, weight.t());
}

template <typename T>
void cpu_NetworkInNetwork_accGradParameters(
    /*float*/ at::Tensor input_features,
    /*float*/ at::Tensor d_output_features, /*float*/ at::Tensor d_weight,
    /*float*/ at::Tensor d_bias) {
  auto nActive = input_features.size(0);
  if (nActive and d_bias.numel())
    at::sum_out(d_bias, d_output_features, {0}, false);
  at::mm_out(d_weight, input_features.t(), d_output_features);
}
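NetworkInNetwork is a 1x1 (per-site) convolution: since sites never mix, no rulebook is needed and the whole layer is a single matrix product of the (nActive, input_nPlanes) feature matrix with the weight matrix, plus an optional broadcast bias; the returned FLOP estimate is nActive * input_nPlanes * output_nPlanes.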
sparseconvnet/SCN/CPU/SparseToDense.cpp · new file (0 → 100644)
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#include "SparseToDense.h"

template <typename T, Int Dimension>
void cpu_SparseToDense_updateOutput(/*long*/ at::Tensor inputSize,
                                    Metadata<Dimension> &m,
                                    /*float*/ at::Tensor input_features,
                                    /*float*/ at::Tensor output_features,
                                    long nPlanes) {
  {
    std::array<long, Dimension + 2> sz;
    sz[0] = m.grids.begin()->second.size(); // batch size
    sz[1] = nPlanes;
    long *in_sz = inputSize.data<long>();
    for (Int i = 0; i < Dimension; ++i)
      sz[i + 2] = in_sz[i];
    output_features.resize_(sz);
    output_features.zero_();
  }
  if (input_features.ndimension() == 2) {
    auto _rules = m.getSparseToDenseRuleBook(inputSize, true);
    Int _nPlanes = input_features.size(1);
    auto iF = input_features.data<T>();
    auto oF = output_features.data<T>();
    long spatialVolume = inputSize.prod().data<long>()[0];
    for (auto &r : _rules) {
      Int nHot = r.size() / 2;
      SparseToDense_ForwardPass<T>(iF, oF, _nPlanes, spatialVolume, &r[0],
                                   nHot);
      oF += _nPlanes * spatialVolume;
    }
  }
}

template <typename T, Int Dimension>
void cpu_SparseToDense_updateGradInput(/*long*/ at::Tensor inputSize,
                                       Metadata<Dimension> &m,
                                       /*float*/ at::Tensor input_features,
                                       /*float*/ at::Tensor d_input_features,
                                       /*float*/ at::Tensor d_output_features) {
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();
  if (input_features.ndimension() == 2) {
    auto _rules = m.getSparseToDenseRuleBook(inputSize, true);
    long spatialVolume = inputSize.prod().data<long>()[0];
    Int _nPlanes = d_input_features.size(1);
    auto diF = d_input_features.data<T>();
    auto doF = d_output_features.data<T>();
    for (auto &r : _rules) {
      Int nHot = r.size() / 2;
      SparseToDense_BackwardPass<T>(diF, doF, _nPlanes, spatialVolume, &r[0],
                                    nHot);
      doF += _nPlanes * spatialVolume;
    }
  }
}
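The dense output is laid out as (batchSize, nPlanes, spatial...): each rule pair maps an active row of input_features to a flattened spatial offset within one batch item (hence the o[plane * spatialVolume] indexing in SparseToDense_ForwardPass in the header diff below), and the oF/doF pointers advance by nPlanes * spatialVolume per batch item.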
sparseconvnet/SCN/generic/CPU/SparseToDense.h → sparseconvnet/SCN/CPU/SparseToDense.h
@@ -6,29 +6,29 @@
 #ifndef CPU_SPARSETODENSE_H
 #define CPU_SPARSETODENSE_H
 #include "../SparseConvNet.h"
 template <typename T>
 void SparseToDense_ForwardPass(T *input_features, T *output_features,
-                               uInt nPlanes, uInt spatialVolume, uInt *rules,
+                               Int nPlanes, Int spatialVolume, Int *rules,
                                int nHot) {
-  for (uInt outSite = 0; outSite < nHot; outSite++) {
+  for (Int outSite = 0; outSite < nHot; outSite++) {
     T *i = input_features + rules[2 * outSite] * nPlanes;
     T *o = output_features + rules[2 * outSite + 1];
-    for (uInt plane = 0; plane < nPlanes; plane++)
+    for (Int plane = 0; plane < nPlanes; plane++)
       o[plane * spatialVolume] = i[plane];
   }
 }
 template <typename T>
 void SparseToDense_BackwardPass(T *d_input_features, T *d_output_features,
-                                uInt nPlanes, uInt spatialVolume, uInt *rules,
+                                Int nPlanes, Int spatialVolume, Int *rules,
                                 int nHot) {
-  for (uInt outSite = 0; outSite < nHot; outSite++) {
+  for (Int outSite = 0; outSite < nHot; outSite++) {
     T *d_i = d_input_features + rules[2 * outSite] * nPlanes;
     T *d_o = d_output_features + rules[2 * outSite + 1];
-    for (uInt plane = 0; plane < nPlanes; plane++)
+    for (Int plane = 0; plane < nPlanes; plane++)
       d_i[plane] = d_o[plane * spatialVolume];
   }
 }
sparseconvnet/SCN/CPU/UnPooling.cpp · new file (0 → 100644)
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#include "UnPooling.h"

template <typename T, Int Dimension>
void cpu_UnPooling_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor poolSize, /*long*/ at::Tensor poolStride,
    Metadata<Dimension> &m, /*float*/ at::Tensor input_features,
    /*float*/ at::Tensor output_features, long nFeaturesToDrop) {
  Int nPlanes = input_features.size(1) - nFeaturesToDrop;
  auto _rules =
      m.getRuleBook(outputSize, inputSize, poolSize, poolStride, true);
  Int nActive = m.getNActive(outputSize);
  output_features.resize_({nActive, input_features.size(1) - nFeaturesToDrop});
  output_features.zero_();
  auto iF = input_features.data<T>() + nFeaturesToDrop;
  auto oF = output_features.data<T>();
  for (auto &r : _rules) {
    Int nHot = r.size() / 2;
    UnPooling_ForwardPass<T>(iF, oF, nPlanes, input_features.size(1),
                             output_features.size(1), &r[0], nHot);
  }
}

template <typename T, Int Dimension>
void cpu_UnPooling_updateGradInput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor poolSize, /*long*/ at::Tensor poolStride,
    Metadata<Dimension> &m, /*float*/ at::Tensor input_features,
    /*float*/ at::Tensor d_input_features,
    /*float*/ at::Tensor d_output_features, long nFeaturesToDrop) {
  Int nPlanes = input_features.size(1) - nFeaturesToDrop;
  auto _rules =
      m.getRuleBook(outputSize, inputSize, poolSize, poolStride, true);
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();
  auto diF = d_input_features.data<T>() + nFeaturesToDrop;
  auto doF = d_output_features.data<T>();
  for (auto &r : _rules) {
    Int nHot = r.size() / 2;
    UnPooling_BackwardPass<T>(diF, doF, nPlanes, input_features.size(1),
                              d_output_features.size(1), &r[0], nHot);
  }
}
sparseconvnet/SCN/generic/CPU/UnPooling.h → sparseconvnet/SCN/CPU/UnPooling.h
@@ -6,27 +6,27 @@
 #ifndef CPU_UNPOOLING_H
 #define CPU_UNPOOLING_H
 #include "../SparseConvNet.h"
 template <typename T>
-void UnPooling_ForwardPass(T *input_features, T *output_features,
-                           uInt nPlanes, uInt input_stride, uInt output_stride,
-                           uInt *rules, uInt nHot, uInt filterVolume) {
-  for (uInt outSite = 0; outSite < nHot; outSite++) {
-    uInt i = rules[2 * outSite + 1] * input_stride;
-    uInt o = rules[2 * outSite] * output_stride;
-    for (uInt plane = 0; plane < nPlanes; plane++)
+void UnPooling_ForwardPass(T *input_features, T *output_features,
+                           Int nPlanes, Int input_stride, Int output_stride,
+                           Int *rules, Int nHot) {
+  for (Int outSite = 0; outSite < nHot; outSite++) {
+    Int i = rules[2 * outSite + 1] * input_stride;
+    Int o = rules[2 * outSite] * output_stride;
+    for (Int plane = 0; plane < nPlanes; plane++)
       output_features[o + plane] += input_features[i + plane];
   }
 }
 template <typename T>
-void UnPooling_BackwardPass(T *d_input_features, T *d_output_features,
-                            uInt nPlanes, uInt input_stride, uInt output_stride,
-                            uInt *rules, uInt nHot, uInt filterVolume) {
-  for (uInt outSite = 0; outSite < nHot; outSite++) {
-    uInt i = rules[2 * outSite + 1] * input_stride;
-    uInt o = rules[2 * outSite] * output_stride;
-    for (uInt plane = 0; plane < nPlanes; plane++)
+void UnPooling_BackwardPass(T *d_input_features, T *d_output_features,
+                            Int nPlanes, Int input_stride, Int output_stride,
+                            Int *rules, Int nHot) {
+  for (Int outSite = 0; outSite < nHot; outSite++) {
+    Int i = rules[2 * outSite + 1] * input_stride;
+    Int o = rules[2 * outSite] * output_stride;
+    for (Int plane = 0; plane < nPlanes; plane++)
       d_input_features[i + plane] += d_output_features[o + plane];
   }
 }
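Besides the uInt to Int switch, this rename drops the unused filterVolume parameter from both passes. Forward unpooling scatter-adds each input row into every output site its rules point at, and the backward pass accumulates the same links in reverse, so gradients sum correctly when sites are shared.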
sparseconvnet/SCN/CUDA/ActivePooling.cu · new file (0 → 100644)
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#include "ActivePooling.h"

template <typename T, Int Dimension>
void cuda_ActivePooling_updateOutput(/*long*/ at::Tensor inputSize,
                                     Metadata<Dimension> &m,
                                     /*cuda float*/ at::Tensor input_features,
                                     /*cuda float*/ at::Tensor output_features,
                                     bool average) {
  Int nPlanes = input_features.size(1);
  auto _rules = m.getActivePoolingRuleBook(inputSize);
  Int batchSize = _rules[1][0];
  Int maxActive = _rules[1][1];
  output_features.resize_({batchSize, nPlanes});
  output_features.zero_();
  auto rulesBuffer = at::CUDA(at_kINT).tensor({1 << 22});
  Int *rb = rulesBuffer.data<Int>();
  Int rowBatchSize = std::min((Int)32768, (1 << 22) / (maxActive + 1));
  assert(rowBatchSize > 0);
  auto iF = input_features.data<T>();
  auto oF = output_features.data<T>();
  for (Int o = 0; o < batchSize; o += rowBatchSize) {
    Int batchSize_ = std::min(rowBatchSize, (Int(batchSize - o)));
    cudaMemcpy(rb, &_rules[0][o * (maxActive + 1)],
               sizeof(Int) * (maxActive + 1) * batchSize_,
               cudaMemcpyHostToDevice);
    ActivePooling_ForwardPass<T>(iF, oF + o * nPlanes, batchSize_, maxActive,
                                 nPlanes, rb, average);
  }
}

template <typename T, Int Dimension>
void cuda_ActivePooling_updateGradInput(
    /*long*/ at::Tensor inputSize, Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features, bool average) {
  Int nPlanes = input_features.size(1);
  auto _rules = m.getActivePoolingRuleBook(inputSize);
  Int batchSize = _rules[1][0];
  Int maxActive = _rules[1][1];
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();
  auto rulesBuffer = at::CUDA(at_kINT).tensor({1 << 22});
  Int *rb = rulesBuffer.data<Int>();
  Int rowBatchSize = std::min((Int)32768, (1 << 22) / (maxActive + 1));
  assert(rowBatchSize > 0);
  auto diF = d_input_features.data<T>();
  auto doF = d_output_features.data<T>();
  for (Int o = 0; o < batchSize; o += rowBatchSize) {
    Int batchSize_ = std::min(rowBatchSize, (Int(batchSize - o)));
    cudaMemcpy(rb, &_rules[0][o * (maxActive + 1)],
               sizeof(Int) * (maxActive + 1) * batchSize_,
               cudaMemcpyHostToDevice);
    ActivePooling_BackwardPass<T>(diF, doF + o * nPlanes, batchSize_,
                                  maxActive, nPlanes, rb, average);
  }
}
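The rulebook lives in host memory, so the rules are staged to the device through a fixed 2^22-entry integer buffer and copied over with cudaMemcpy in chunks of at most 32768 rows, shrunk further when maxActive + 1 entries per row would overflow the buffer (the assert guards the degenerate case).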
sparseconvnet/SCN/generic/GPU/ActivePooling.h → sparseconvnet/SCN/CUDA/ActivePooling.h
...
...
@@ -4,54 +4,52 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
-#ifndef GPU_ACTIVEPOOLING_H
-#define GPU_ACTIVEPOOLING_H
+#ifndef CUDA_ACTIVEPOOLING_H
+#define CUDA_ACTIVEPOOLING_H
 template <typename T>
 __global__ void ActivePooling_fp(T *input_features, T *output_features,
-                                 uInt maxActive, uInt nPlanes, uInt *rules,
+                                 Int maxActive, Int nPlanes, Int *rules,
                                  bool average) {
   T *out = &output_features[blockIdx.x * nPlanes];
-  uInt *r = &rules[blockIdx.x * (maxActive + 1)];
-  uInt nActive = *r++;
+  Int *r = &rules[blockIdx.x * (maxActive + 1)];
+  Int nActive = *r++;
   T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
   while (nActive-- > 0) {
     T *inp = &input_features[(*r++) * nPlanes];
-    for (uInt plane = threadIdx.x; plane < nPlanes; plane += 32)
+    for (Int plane = threadIdx.x; plane < nPlanes; plane += 32)
       out[plane] += inp[plane] * multiplier;
   }
 }
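Reading the kernel back in scalar form may help: each pooling-output row owns maxActive + 1 ints in rules, the first being the number of active input rows and the rest their indices. A hedged CPU reference for the same forward semantics (a sketch inferred from the kernel above, not code from the commit):

// CPU sketch of ActivePooling_fp's effect; assumes the rules layout above.
template <typename T>
void activePoolingForwardRef(const T *in, T *out, int batchSize, int maxActive,
                             int nPlanes, const int *rules, bool average) {
  for (int b = 0; b < batchSize; b++) {
    const int *r = rules + b * (maxActive + 1);
    int nActive = *r++;                                // slot 0: count
    T mult = (average && nActive > 0) ? T(1) / nActive : T(1);
    for (int k = 0; k < nActive; k++) {
      const T *inp = in + r[k] * nPlanes;              // slots 1..: input rows
      for (int p = 0; p < nPlanes; p++)
        out[b * nPlanes + p] += inp[p] * mult;
    }
  }
}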
 template <typename T>
 void ActivePooling_ForwardPass(T *input_features, T *output_features,
-                               uInt batchSize, uInt maxActive, uInt nPlanes,
-                               uInt *rules, bool average) {
-  uInt kernelBlockDim = std::min(nPlanes, (uInt)32);
-  ActivePooling_fp<T><<<batchSize, kernelBlockDim, 0,
-                        THCState_getCurrentStream(state)>>>(
-      input_features, output_features, maxActive, nPlanes, rules, average);
+                               Int batchSize, Int maxActive, Int nPlanes,
+                               Int *rules, bool average) {
+  Int kernelBlockDim = std::min(nPlanes, (Int)32);
+  ActivePooling_fp<T><<<batchSize, kernelBlockDim>>>(
+      input_features, output_features, maxActive, nPlanes, rules, average);
 }
 template <typename T>
 __global__ void ActivePooling_bp(T *d_input_features, T *d_output_features,
-                                 uInt maxActive, uInt nPlanes, uInt *rules,
+                                 Int maxActive, Int nPlanes, Int *rules,
                                  bool average) {
   T *out = &d_output_features[blockIdx.x * nPlanes];
-  uInt *r = &rules[blockIdx.x * (maxActive + 1)];
-  uInt nActive = *r++;
+  Int *r = &rules[blockIdx.x * (maxActive + 1)];
+  Int nActive = *r++;
   T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
   while (nActive-- > 0) {
     T *inp = &d_input_features[(*r++) * nPlanes];
-    for (uInt plane = threadIdx.x; plane < nPlanes; plane += 32)
+    for (Int plane = threadIdx.x; plane < nPlanes; plane += 32)
       inp[plane] = out[plane] * multiplier;
   }
 }
 template <typename T>
 void ActivePooling_BackwardPass(T *d_input_features, T *d_output_features,
-                                uInt batchSize, uInt maxActive, uInt nPlanes,
-                                uInt *rules, bool average) {
-  uInt kernelBlockDim = std::min(nPlanes, (uInt)32);
-  ActivePooling_bp<T><<<batchSize, kernelBlockDim, 0,
-                        THCState_getCurrentStream(state)>>>(
-      d_input_features, d_output_features, maxActive, nPlanes, rules, average);
+                                Int batchSize, Int maxActive, Int nPlanes,
+                                Int *rules, bool average) {
+  Int kernelBlockDim = std::min(nPlanes, (Int)32);
+  ActivePooling_bp<T><<<batchSize, kernelBlockDim>>>(
+      d_input_features, d_output_features, maxActive, nPlanes, rules, average);
 }
-#endif /* GPU_ActivePOOLING_H */
+#endif /* CUDA_ActivePOOLING_H */
sparseconvnet/SCN/CUDA/AffineReluTrivialConvolution.cu
0 → 100644
View file @ 2c4ed608
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "AffineReluTrivialConvolution.h"
template <typename T>
double cuda_AffineReluTrivialConvolution_updateOutput(
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features,
    /*cuda float*/ at::Tensor affineWeight,
    /*cuda float*/ at::Tensor affineBias,
    /*cuda float*/ at::Tensor convWeight) {
  output_features.resize_({input_features.size(0), convWeight.size(1)});
  dAffineReluTrivialConvolution_forward<T>(
      input_features.data<T>(), output_features.data<T>(),
      affineWeight.data<T>(), affineBias.data<T>(), convWeight.data<T>(),
      convWeight.size(0), input_features.stride(0), convWeight.size(1),
      output_features.size(1), input_features.size(0));
  return input_features.size(0) * input_features.size(1) *
         output_features.size(1);
}
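The double returned here is rows x input planes x output planes, i.e. the multiply count of the implicit dense matrix product; presumably the caller accumulates it as a work/FLOP statistic (an inference, the diff itself does not say). For illustrative sizes:

#include <cstdio>
int main() {
  // Hypothetical sizes: 4096 active sites, 64 input planes, 128 output planes.
  double multiplies = 4096.0 * 64.0 * 128.0;
  std::printf("%.0f multiply-adds\n", multiplies); // 33554432
  return 0;
}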
template <typename T>
void cuda_AffineReluTrivialConvolution_backward(
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features,
    /*cuda float*/ at::Tensor affineWeight,
    /*cuda float*/ at::Tensor d_affineWeight,
    /*cuda float*/ at::Tensor affineBias,
    /*cuda float*/ at::Tensor d_affineBias,
    /*cuda float*/ at::Tensor convWeight,
    /*cuda float*/ at::Tensor d_convWeight, bool additiveGrad) {
  d_input_features.resize_as_(input_features);
  dAffineReluTrivialConvolution_backward_dW<T>(
      input_features.data<T>(), d_input_features.data<T>(),
      d_output_features.data<T>(), affineWeight.data<T>(),
      d_affineWeight.data<T>(), affineBias.data<T>(), d_affineBias.data<T>(),
      convWeight.data<T>(), d_convWeight.data<T>(), convWeight.size(0),
      input_features.stride(0), convWeight.size(1),
      d_output_features.stride(0), input_features.size(0), additiveGrad);
}
sparseconvnet/SCN/generic/GPU/AffineReluTrivialConvolution.h → sparseconvnet/SCN/CUDA/AffineReluTrivialConvolution.h
View file @ 2c4ed608
...
...
@@ -4,18 +4,18 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
-#ifndef GPU_AFFINERELUTRIVIALCONVOLUTION_H
-#define GPU_AFFINERELUTRIVIALCONVOLUTION_H
+#ifndef CUDA_AFFINERELUTRIVIALCONVOLUTION_H
+#define CUDA_AFFINERELUTRIVIALCONVOLUTION_H
// check if A+B is faster than just B
// check if loading affineBias into shared memory is faster than loading
// multiple times (if not try 64,16 backwards case)
-template <typename T, uInt K, uInt V>
+template <typename T, Int K, Int V>
 __global__ void dAffineReluTrivialConvolution_forwardA(
     T *inFeatures, T *outFeatures, T *affineWeight, T *affineBias,
-    T *convWeight, uInt input_nPlanes, uInt input_stride,
-    uInt output_nPlanes, uInt output_stride, uInt nActive) {
+    T *convWeight, Int input_nPlanes, Int input_stride,
+    Int output_nPlanes, Int output_stride, Int nActive) {
// nActive must be a multiple of K!!
// Input x Weight -> Output
...
...
@@ -24,9 +24,9 @@ __global__ void dAffineReluTrivialConvolution_forwardA(
   // nActive x KM -> nActive x KN - parallel over N,nActive - loop over M
-  uInt M = input_nPlanes / K;
+  Int M = input_nPlanes / K;
   // N = gridDim.y == output_nPlanes/K
-  uInt n = blockIdx.y;
+  Int n = blockIdx.y;
   outFeatures += n * K;
   convWeight += n * K;
...
...
@@ -35,7 +35,7 @@ __global__ void dAffineReluTrivialConvolution_forwardA(
   __shared__ T AW[K];
   __shared__ T AB[K];
   __shared__ T CW[K][K];
-  const uInt tx = threadIdx.x;
+  const Int tx = threadIdx.x;
   int ty[V];
 #pragma unroll
   for (int v = 0; v < V; v++)
...
...
@@ -52,7 +52,7 @@ __global__ void dAffineReluTrivialConvolution_forwardA(
     CW[ty[v]][tx] = convWeight[ty[v] * output_nPlanes + tx];
   __syncthreads();
-  for (uInt s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
+  for (Int s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
     // Read input, do affine + relu, set O[]
 #pragma unroll
     for (int v = 0; v < V; v++) {
...
...
@@ -82,20 +82,20 @@ __global__ void dAffineReluTrivialConvolution_forwardA(
     inFeatures += K;
   }
 }
-template <typename T, uInt K, uInt V>
+template <typename T, Int K, Int V>
 __global__ void dAffineReluTrivialConvolution_forwardB(
     T *inFeatures, T *outFeatures, T *affineWeight, T *affineBias,
-    T *convWeight, uInt input_nPlanes, uInt input_stride,
-    uInt output_nPlanes, uInt output_stride, uInt nActive) {
+    T *convWeight, Int input_nPlanes, Int input_stride,
+    Int output_nPlanes, Int output_stride, Int nActive) {
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// K is a multiple of V,
// nActive x KM -> nActive x KN - parallel over N,nActive - loop over M
-  uInt M = input_nPlanes / K;
+  Int M = input_nPlanes / K;
   // N = gridDim.y == output_nPlanes/K
-  uInt n = blockIdx.y;
+  Int n = blockIdx.y;
   outFeatures += n * K;
   convWeight += n * K;
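To make the launch geometry in the comments above concrete: blockDim = (K, K/V, 1) and gridDim = (nBlocks, output_nPlanes/K, 1), so each thread strides through V of the K rows of a K x K weight tile. A small illustration with assumed values (K = 32, V = 8 is one of the pairs dispatched by the FOO macro below; the plane count is made up):

#include <cstdio>
int main() {
  const int K = 32, V = 8;        // one (K, V) instantiation used below
  const int output_nPlanes = 128; // illustrative only
  std::printf("blockDim = (%d, %d, 1)\n", K, K / V);   // (32, 4, 1)
  std::printf("gridDim.y = %d\n", output_nPlanes / K); // 4
  return 0;
}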
...
...
@@ -104,7 +104,7 @@ __global__ void dAffineReluTrivialConvolution_forwardB(
   __shared__ T AW[K];
   __shared__ T AB[K];
   __shared__ T CW[K][K];
-  const uInt tx = threadIdx.x;
+  const Int tx = threadIdx.x;
   int ty[V];
 #pragma unroll
   for (int v = 0; v < V; v++)
...
...
@@ -121,7 +121,7 @@ __global__ void dAffineReluTrivialConvolution_forwardB(
     CW[ty[v]][tx] = convWeight[ty[v] * output_nPlanes + tx];
   __syncthreads();
-  for (uInt s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
+  for (Int s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
     // Read input, do affine + relu, set O[]
 #pragma unroll
     for (int v = 0; v < V; v++) {
...
...
@@ -158,20 +158,19 @@ __global__ void dAffineReluTrivialConvolution_forwardB(
 #define FOO(T, K, V)                                                         \
   {                                                                          \
     if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {                \
-      uInt o = (nActive / K) * K;                                            \
+      Int o = (nActive / K) * K;                                             \
       if (o > 0)                                                             \
-        dAffineReluTrivialConvolution_forwardA<T, K, V> << <                 \
-            dim3(std::min(o / K, (uInt)512), output_nPlanes / K),            \
-            dim3(K, K / V), 0, THCState_getCurrentStream(state)>>>           \
-            (inFeatures, outFeatures, affineWeight, affineBias, convWeight,  \
-             input_nPlanes, input_stride, output_nPlanes, output_stride, o); \
+        dAffineReluTrivialConvolution_forwardA<                              \
+            T, K, V><<<dim3(std::min(o / K, (Int)512), output_nPlanes / K),  \
+                       dim3(K, K / V)>>>(                                    \
+            inFeatures, outFeatures, affineWeight, affineBias, convWeight,   \
+            input_nPlanes, input_stride, output_nPlanes, output_stride, o);  \
       if (nActive > o)                                                       \
-        dAffineReluTrivialConvolution_forwardB<T, K, V> << <                 \
-            dim3(1, output_nPlanes / K), dim3(K, K / V), 0,                  \
-            THCState_getCurrentStream(state)>>>                              \
-            (inFeatures + o * input_stride, outFeatures + o * output_stride, \
-             affineWeight, affineBias, convWeight, input_nPlanes,            \
-             input_stride, output_nPlanes, output_stride, nActive - o);      \
+        dAffineReluTrivialConvolution_forwardB<                              \
+            T, K, V><<<dim3(1, output_nPlanes / K), dim3(K, K / V)>>>(       \
+            inFeatures + o * input_stride, outFeatures + o * output_stride,  \
+            affineWeight, affineBias, convWeight, input_nPlanes,             \
+            input_stride, output_nPlanes, output_stride, nActive - o);       \
       return;                                                                \
     }                                                                        \
   }
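FOO is a dispatch macro: the first (K, V) pair whose K divides both plane counts wins, forwardA handles the first o = (nActive / K) * K rows in full K-row tiles, and forwardB mops up the remainder. A sketch of the selection order, outside the macro machinery (the 64/16 and 32/8 entries are visible in the function below; the smaller fallback tiles here are assumed from the backward path, since the tail of the forward list is collapsed in this view):

#include <cstdio>
int main() {
  int input_nPlanes = 96, output_nPlanes = 160; // illustrative
  const int tiles[][2] = {{64, 16}, {32, 8}, {16, 4}, {8, 2}};
  for (const auto &t : tiles)
    if (input_nPlanes % t[0] == 0 && output_nPlanes % t[0] == 0) {
      std::printf("dispatch K=%d, V=%d\n", t[0], t[1]); // K=32, V=8 here
      break;
    }
  return 0;
}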
...
...
@@ -179,10 +178,10 @@ __global__ void dAffineReluTrivialConvolution_forwardB(
 template <typename T>
 void dAffineReluTrivialConvolution_forward(T *inFeatures, T *outFeatures,
                                            T *affineWeight, T *affineBias,
-                                           T *convWeight, uInt input_nPlanes,
-                                           uInt input_stride,
-                                           uInt output_nPlanes,
-                                           uInt output_stride, uInt nActive) {
+                                           T *convWeight, Int input_nPlanes,
+                                           Int input_stride,
+                                           Int output_nPlanes,
+                                           Int output_stride, Int nActive) {
   FOO(T, 64, 16)
   FOO(T, 32, 8)
...
...
@@ -193,8 +192,8 @@ void dAffineReluTrivialConvolution_forward(T *inFeatures, T *outFeatures,
 template <>
 void dAffineReluTrivialConvolution_forward<double>(
     double *inFeatures, double *outFeatures, double *affineWeight,
-    double *affineBias, double *convWeight, uInt input_nPlanes,
-    uInt input_stride, uInt output_nPlanes, uInt output_stride,
-    uInt nActive) {
+    double *affineBias, double *convWeight, Int input_nPlanes,
+    Int input_stride, Int output_nPlanes, Int output_stride,
+    Int nActive) {
   FOO(double, 32, 8)
   FOO(double, 16, 4)
...
...
@@ -206,15 +205,15 @@ void dAffineReluTrivialConvolution_forward<double>(
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
-template <typename T, uInt K, uInt V>
+template <typename T, Int K, Int V>
 __global__ void dAffineReluTrivialConvolution_backward_dW_A(
     T *inFeatures, T *dInFeatures, T *dOutFeatures, T *affineWeight,
     T *dAffineWeight, T *affineBias, T *dAffineBias, T *convWeight,
-    T *dConvWeight, uInt input_nPlanes, uInt input_stride,
-    uInt output_nPlanes, uInt output_stride, uInt nActive,
-    bool additiveGrad) {
+    T *dConvWeight, Int input_nPlanes, Int input_stride,
+    Int output_nPlanes, Int output_stride, Int nActive,
+    bool additiveGrad) {
   // M = gridDim.y == input_nPlanes / K
-  uInt N = output_nPlanes / K;
-  uInt m = blockIdx.y;
+  Int N = output_nPlanes / K;
+  Int m = blockIdx.y;
   inFeatures += m * K;
   dInFeatures += m * K;
   convWeight += m * K * output_nPlanes;
...
...
@@ -234,7 +233,7 @@ __global__ void dAffineReluTrivialConvolution_backward_dW_A(
   __shared__ T AW[K];
   __shared__ T AB[K];
   __shared__ T CW[K][K];
-  const uInt tx = threadIdx.x;
+  const Int tx = threadIdx.x;
   int ty[V];
 #pragma unroll
   for (int v = 0; v < V; v++)
...
...
@@ -253,7 +252,7 @@ __global__ void dAffineReluTrivialConvolution_backward_dW_A(
   }
   __syncthreads();
-  for (uInt s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
+  for (Int s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
 #pragma unroll
     for (int v = 0; v < V; v++)
       dI[v] = 0;
...
...
@@ -303,15 +302,15 @@ __global__ void dAffineReluTrivialConvolution_backward_dW_A(
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
-template <typename T, uInt K, uInt V>
+template <typename T, Int K, Int V>
 __global__ void dAffineReluTrivialConvolution_backward_dW_B(
     T *inFeatures, T *dInFeatures, T *dOutFeatures, T *affineWeight,
     T *dAffineWeight, T *affineBias, T *dAffineBias, T *convWeight,
-    T *dConvWeight, uInt input_nPlanes, uInt input_stride,
-    uInt output_nPlanes, uInt output_stride, uInt nActive,
-    bool additiveGrad) {
+    T *dConvWeight, Int input_nPlanes, Int input_stride,
+    Int output_nPlanes, Int output_stride, Int nActive,
+    bool additiveGrad) {
   // M = gridDim.y == input_nPlanes / K
-  uInt N = output_nPlanes / K;
-  uInt m = blockIdx.y;
+  Int N = output_nPlanes / K;
+  Int m = blockIdx.y;
   inFeatures += m * K;
   dInFeatures += m * K;
   convWeight += m * K * output_nPlanes;
...
...
@@ -331,7 +330,7 @@ __global__ void dAffineReluTrivialConvolution_backward_dW_B(
   __shared__ T AW[K];
   __shared__ T AB[K];
   __shared__ T CW[K][K];
-  const uInt tx = threadIdx.x;
+  const Int tx = threadIdx.x;
   int ty[V];
 #pragma unroll
   for (int v = 0; v < V; v++)
...
...
@@ -350,7 +349,7 @@ __global__ void dAffineReluTrivialConvolution_backward_dW_B(
   }
   __syncthreads();
-  for (uInt s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
+  for (Int s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
 #pragma unroll
     for (int v = 0; v < V; v++)
       dI[v] = 0;
...
...
@@ -406,20 +405,19 @@ __global__ void dAffineReluTrivialConvolution_backward_dW_B(
 #define FOO(T, K, V)                                                         \
   {                                                                          \
     if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {                \
-      uInt o = (nActive / K) * K;                                            \
+      Int o = (nActive / K) * K;                                             \
       if (o > 0)                                                             \
-        dAffineReluTrivialConvolution_backward_dW_A<T, K, V> << <            \
-            dim3(std::min(o / K, (uInt)512), input_nPlanes / K),             \
-            dim3(K, K / V), 0, THCState_getCurrentStream(state)>>>           \
-            (inFeatures, dInFeatures, dOutFeatures, affineWeight,            \
+        dAffineReluTrivialConvolution_backward_dW_A<                         \
+            T, K, V><<<dim3(std::min(o / K, (Int)512), input_nPlanes / K),   \
+                       dim3(K, K / V)>>>(                                    \
+            inFeatures, dInFeatures, dOutFeatures, affineWeight,             \
             dAffineWeight, affineBias, dAffineBias, convWeight, dConvWeight, \
             input_nPlanes, input_stride, output_nPlanes, output_stride, o,   \
             additiveGrad);                                                   \
       if (nActive > o)                                                       \
-        dAffineReluTrivialConvolution_backward_dW_B<T, K, V> << <            \
-            dim3(1, input_nPlanes / K), dim3(K, K / V), 0,                   \
-            THCState_getCurrentStream(state)>>>                              \
-            (inFeatures + o * input_stride, dInFeatures + o * input_stride,  \
+        dAffineReluTrivialConvolution_backward_dW_B<                         \
+            T, K, V><<<dim3(1, input_nPlanes / K), dim3(K, K / V)>>>(        \
+            inFeatures + o * input_stride, dInFeatures + o * input_stride,   \
             dOutFeatures + o * output_stride, affineWeight, dAffineWeight,   \
             affineBias, dAffineBias, convWeight, dConvWeight, input_nPlanes, \
             input_stride, output_nPlanes, output_stride, nActive - o,        \
...
...
@@ -432,8 +430,8 @@ template <typename T>
 void dAffineReluTrivialConvolution_backward_dW(
     T *inFeatures, T *dInFeatures, T *dOutFeatures, T *affineWeight,
     T *dAffineWeight, T *affineBias, T *dAffineBias, T *convWeight,
-    T *dConvWeight, uInt input_nPlanes, uInt input_stride,
-    uInt output_nPlanes, uInt output_stride, uInt nActive,
-    bool additiveGrad) {
+    T *dConvWeight, Int input_nPlanes, Int input_stride,
+    Int output_nPlanes, Int output_stride, Int nActive,
+    bool additiveGrad) {
   FOO(T, 32, 8)
   FOO(T, 16, 4)
   FOO(T, 8, 2)
...
...
sparseconvnet/SCN/CUDA/AveragePooling.cu
0 → 100644
View file @ 2c4ed608
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "AveragePooling.h"
#include "RuleBookIterator.h"
template <typename T, Int Dimension>
void cuda_AveragePooling_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor poolSize, /*long*/ at::Tensor poolStride,
    Metadata<Dimension> &m, /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features, long nFeaturesToDrop) {
  Int nPlanes = input_features.size(1) - nFeaturesToDrop;
  auto _rules =
      m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  Int nActive = m.getNActive(outputSize);
  output_features.resize_({nActive, input_features.size(1) - nFeaturesToDrop});
  output_features.zero_();
  auto iF = input_features.data<T>() + nFeaturesToDrop;
  auto oF = output_features.data<T>();
  RULEBOOKITERATOR(cuda_AveragePooling_ForwardPass<T>(
                       iF, oF, nPlanes, input_features.size(1),
                       output_features.size(1), rbB, nHotB, _rules.size());
                   , )
}
template <typename T, Int Dimension>
void cuda_AveragePooling_updateGradInput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor poolSize, /*long*/ at::Tensor poolStride,
    Metadata<Dimension> &m, /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features, long nFeaturesToDrop) {
  Int nPlanes = input_features.size(1) - nFeaturesToDrop;
  auto _rules =
      m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();
  auto diF = d_input_features.data<T>() + nFeaturesToDrop;
  auto doF = d_output_features.data<T>();
  RULEBOOKITERATOR(cuda_AveragePooling_BackwardPass<T>(
                       diF, doF, nPlanes, input_features.size(1),
                       d_output_features.size(1), rbB, nHotB, _rules.size());
                   , )
}
sparseconvnet/SCN/generic/GPU/AveragePooling.h → sparseconvnet/SCN/CUDA/AveragePooling.h
View file @ 2c4ed608
...
...
@@ -4,27 +4,27 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
-#ifndef GPU_AVERAGEPOOLING_H
-#define GPU_AVERAGEPOOLING_H
+#ifndef CUDA_AVERAGEPOOLING_H
+#define CUDA_AVERAGEPOOLING_H
// NTX must be >=2 so r is filled properly
-template <typename T, uInt NTX, uInt NTY>
+template <typename T, Int NTX, Int NTY>
 __global__ void AveragePooling_fp(T *input_features, T *output_features,
-                                  uInt nPlanes, uInt input_stride,
-                                  uInt output_stride, uInt *rules, uInt nHot,
+                                  Int nPlanes, Int input_stride,
+                                  Int output_stride, Int *rules, Int nHot,
                                   T alpha) {
-  __shared__ uInt r[NTY * 2];
-  for (uInt n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
+  __shared__ Int r[NTY * 2];
+  for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
     {
-      uInt i = threadIdx.x + NTX * threadIdx.y;
-      if (i < NTY * 2 and i < 2 * (n - nHot))
+      Int i = threadIdx.x + NTX * threadIdx.y;
+      if (i < NTY * 2 and i < 2 * (nHot - n))
         r[i] = rules[2 * n + i];
     }
     __syncthreads();
     if (n + threadIdx.y < nHot) {
-      uInt i = r[2 * threadIdx.y] * input_stride;
-      uInt o = r[2 * threadIdx.y + 1] * output_stride;
-      for (uInt plane = threadIdx.x; plane < nPlanes; plane += NTX)
+      Int i = r[2 * threadIdx.y] * input_stride;
+      Int o = r[2 * threadIdx.y + 1] * output_stride;
+      for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX)
         atomicAdd(&output_features[o + plane],
                   alpha * input_features[i + plane]);
     }
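Beyond the uInt → Int rename, note the staging guard changed from i < 2 * (n - nHot) to i < 2 * (nHot - n). With unsigned arithmetic, n - nHot wraps to a huge value whenever n < nHot (always true inside the loop), so the old bound never engaged on the last, partially full batch of rows; the signed form counts the rows that actually remain (this looks like a bug fix riding along with the type change). A two-line demonstration of the wraparound, with illustrative values:

#include <cstdio>
int main() {
  unsigned n = 3, nHot = 10;
  std::printf("%u\n", 2 * (n - nHot));           // wraps: 4294967282
  std::printf("%d\n", 2 * ((int)nHot - (int)n)); // intended bound: 14
  return 0;
}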
...
...
@@ -33,31 +33,31 @@ __global__ void AveragePooling_fp(T *input_features, T *output_features,
}
 template <typename T>
-void AveragePooling_ForwardPass(cudaStream_t stream, T *input_features,
-                                T *output_features, uInt nPlanes,
-                                uInt input_stride, uInt output_stride,
-                                uInt *rules, uInt nHot, uInt filterVolume) {
-  AveragePooling_fp<T, 32, 32><<<32, dim3(32, 32), 0, stream>>>(
+void cuda_AveragePooling_ForwardPass(T *input_features, T *output_features,
+                                     Int nPlanes, Int input_stride,
+                                     Int output_stride, Int *rules, Int nHot,
+                                     Int filterVolume) {
+  AveragePooling_fp<T, 32, 32><<<32, dim3(32, 32)>>>(
       input_features, output_features, nPlanes, input_stride, output_stride,
       rules, nHot, 1.0 / filterVolume);
 }
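For orientation, the forward pass applies each rulebook pair independently: rules[2k] names an input row, rules[2k + 1] an output row, and every contribution is scaled by alpha = 1 / filterVolume, turning the sum over a pool window into an average. A hedged CPU sketch of the same effect (inferred from the kernel, not code from the commit):

// CPU sketch of cuda_AveragePooling_ForwardPass's semantics.
template <typename T>
void averagePoolingForwardRef(const T *in, T *out, int nPlanes, int inStride,
                              int outStride, const int *rules, int nHot,
                              int filterVolume) {
  T alpha = T(1.0 / filterVolume);
  for (int k = 0; k < nHot; k++) {
    const T *src = in + rules[2 * k] * inStride;  // input row of the pair
    T *dst = out + rules[2 * k + 1] * outStride;  // output row of the pair
    for (int p = 0; p < nPlanes; p++)
      dst[p] += alpha * src[p];
  }
}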
-template <typename T, uInt NTX, uInt NTY>
+template <typename T, Int NTX, Int NTY>
 __global__ void AveragePooling_bp(T *d_input_features, T *d_output_features,
-                                  uInt nPlanes, uInt input_stride,
-                                  uInt output_stride, uInt *rules, uInt nHot,
+                                  Int nPlanes, Int input_stride,
+                                  Int output_stride, Int *rules, Int nHot,
                                   T alpha) {
-  __shared__ uInt r[NTY * 2];
-  for (uInt n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
+  __shared__ Int r[NTY * 2];
+  for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
     {
-      uInt i = threadIdx.x + NTX * threadIdx.y;
-      if (i < NTY * 2 and i < 2 * (n - nHot))
+      Int i = threadIdx.x + NTX * threadIdx.y;
+      if (i < NTY * 2 and i < 2 * (nHot - n))
         r[i] = rules[2 * n + i];
     }
     __syncthreads();
     if (n + threadIdx.y < nHot) {
-      uInt i = r[2 * threadIdx.y] * input_stride;
-      uInt o = r[2 * threadIdx.y + 1] * output_stride;
-      for (uInt plane = threadIdx.x; plane < nPlanes; plane += NTX)
+      Int i = r[2 * threadIdx.y] * input_stride;
+      Int o = r[2 * threadIdx.y + 1] * output_stride;
+      for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX)
         d_input_features[i + plane] += alpha * d_output_features[o + plane];
     }
     __syncthreads();
...
...
@@ -65,12 +65,12 @@ __global__ void AveragePooling_bp(T *d_input_features, T *d_output_features,
}
 template <typename T>
-void AveragePooling_BackwardPass(cudaStream_t stream, T *d_input_features,
-                                 T *d_output_features, uInt nPlanes,
-                                 uInt input_stride, uInt output_stride,
-                                 uInt *rules, uInt nHot, uInt filterVolume) {
-  AveragePooling_bp<T, 32, 32><<<32, dim3(32, 32), 0, stream>>>(
+void cuda_AveragePooling_BackwardPass(T *d_input_features,
+                                      T *d_output_features, Int nPlanes,
+                                      Int input_stride, Int output_stride,
+                                      Int *rules, Int nHot, Int filterVolume) {
+  AveragePooling_bp<T, 32, 32><<<32, dim3(32, 32)>>>(
       d_input_features, d_output_features, nPlanes, input_stride,
       output_stride, rules, nHot, 1.0 / filterVolume);
 }
-#endif /* GPU_AVERAGEPOOLING_H */
+#endif /* CUDA_AVERAGEPOOLING_H */