OpenDAS / SparseConvNet — Commits

Commit 2c4ed608, authored Jun 20, 2018 by Benjamin Thomas Graham (parent: 6d4475db)

    Goodbye THNN. Hello ATen!

Showing 20 changed files, with 1431 additions and 118 deletions (+1431 / -118).
sparseconvnet/SCN/CPU/BatchNormalization.h                +21   -21
sparseconvnet/SCN/CPU/BatchwiseMultiplicativeDropout.cpp  +38   -0
sparseconvnet/SCN/CPU/Convolution.cpp                     +365  -0
sparseconvnet/SCN/CPU/Deconvolution.cpp                   +89   -0
sparseconvnet/SCN/CPU/IOLayers.cpp                        +190  -0
sparseconvnet/SCN/CPU/IOLayers.h                          +11   -11
sparseconvnet/SCN/CPU/LeakyReLU.cpp                       +32   -0
sparseconvnet/SCN/CPU/MaxPooling.cpp                      +110  -0
sparseconvnet/SCN/CPU/MaxPooling.h                        +13   -13
sparseconvnet/SCN/CPU/NetworkInNetwork.cpp                +42   -0
sparseconvnet/SCN/CPU/SparseToDense.cpp                   +61   -0
sparseconvnet/SCN/CPU/SparseToDense.h                     +7    -7
sparseconvnet/SCN/CPU/UnPooling.cpp                       +56   -0
sparseconvnet/SCN/CPU/UnPooling.h                         +33   -0
sparseconvnet/SCN/CUDA/ActivePooling.cu                   +67   -0
sparseconvnet/SCN/CUDA/ActivePooling.h                    +55   -0
sparseconvnet/SCN/CUDA/AffineReluTrivialConvolution.cu    +47   -0
sparseconvnet/SCN/CUDA/AffineReluTrivialConvolution.h     +64   -66
sparseconvnet/SCN/CUDA/AveragePooling.cu                  +54   -0
sparseconvnet/SCN/CUDA/AveragePooling.h                   +76   -0
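Every CPU entry point in this commit follows the same ATen calling convention: operator arguments arrive as at::Tensor objects, shape bookkeeping uses tensor methods (resize_, resize_as_, zero_, copy_), and the hot loops then run over raw pointers obtained with .data<T>(). As a reader's orientation, here is a minimal sketch of that idiom; cpu_Scale_updateOutput is an invented operator for illustration, not code from the commit:

#include <ATen/ATen.h>

// Hypothetical example of the ATen calling convention used in the diffs
// below (not SparseConvNet code): tensor methods for shapes, raw pointers
// for the inner loop.
template <typename T>
void cpu_Scale_updateOutput(/*float*/ at::Tensor input_features,
                            /*float*/ at::Tensor output_features, T alpha) {
  output_features.resize_as_(input_features); // shape bookkeeping via ATen
  auto iF = input_features.data<T>();         // raw pointers for the hot loop
  auto oF = output_features.data<T>();
  auto n = input_features.numel();
  for (int64_t i = 0; i < n; i++)
    oF[i] = alpha * iF[i];
}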
sparseconvnet/SCN/generic/CPU/BatchNormalization.h → sparseconvnet/SCN/CPU/BatchNormalization.h

@@ -6,7 +6,7 @@
 #ifndef CPU_BATCHNORMALIZATION_H
 #define CPU_BATCHNORMALIZATION_H
 #include "../SparseConvNet.h"
 #include <vector>
 // in/output_stride is normally the same as nPlanes; allow other values to act
@@ -14,28 +14,28 @@
 template <typename T>
 void BatchNormalization_ForwardPass(T *input_features, T *output_features,
-                                    uInt nPlanes, uInt input_stride,
-                                    uInt output_stride, uInt nActive,
+                                    Int nPlanes, Int input_stride,
+                                    Int output_stride, Int nActive,
                                     T *saveMean, T *saveInvStd, T *runningMean,
                                     T *runningVar, T *weight, T *bias, T eps,
                                     T momentum, bool train, T leakiness) {
   if (train) {
     std::memset(saveMean, 0, nPlanes * sizeof(T));
     std::memset(saveInvStd, 0, nPlanes * sizeof(T));
-    for (uInt row = 0, ci = 0; row < nActive;
+    for (Int row = 0, ci = 0; row < nActive;
          row++, ci += input_stride - nPlanes) {
-      for (uInt plane = 0; plane < nPlanes; plane++, ci++) {
+      for (Int plane = 0; plane < nPlanes; plane++, ci++) {
         saveMean[plane] += input_features[ci];
       }
     }
-    for (uInt plane = 0; plane < nPlanes; plane++) {
+    for (Int plane = 0; plane < nPlanes; plane++) {
       saveMean[plane] /= nActive;
       runningMean[plane] =
           momentum * runningMean[plane] + (1 - momentum) * saveMean[plane];
     }
-    for (uInt row = 0, ci = 0; row < nActive;
+    for (Int row = 0, ci = 0; row < nActive;
          row++, ci += input_stride - nPlanes) {
-      for (uInt plane = 0; plane < nPlanes; plane++, ci++) {
+      for (Int plane = 0; plane < nPlanes; plane++, ci++) {
         saveInvStd[plane] += (input_features[ci] - saveMean[plane]) *
                              (input_features[ci] - saveMean[plane]);
         // accumulate sum-squares
@@ -43,26 +43,26 @@ void BatchNormalization_ForwardPass(T *input_features, T *output_features,
         // rooting
       }
     }
-    for (uInt plane = 0; plane < nPlanes; plane++) {
+    for (Int plane = 0; plane < nPlanes; plane++) {
       runningVar[plane] = momentum * runningVar[plane] +
                           (1 - momentum) * saveInvStd[plane] / (nActive - 1);
       saveInvStd[plane] = powf(saveInvStd[plane] / nActive + eps, -0.5);
     }
   } else {
-    for (uInt plane = 0; plane < nPlanes; plane++) {
+    for (Int plane = 0; plane < nPlanes; plane++) {
       saveMean[plane] = runningMean[plane];
       saveInvStd[plane] = powf(runningVar[plane] + eps, -0.5);
     }
   }
   std::vector<T> w(nPlanes);
   std::vector<T> b(nPlanes);
-  for (uInt plane = 0; plane < nPlanes; plane++) {
+  for (Int plane = 0; plane < nPlanes; plane++) {
     w[plane] = saveInvStd[plane] * (weight ? weight[plane] : 1);
     b[plane] = -saveMean[plane] * w[plane] + (bias ? bias[plane] : 0);
   }
-  for (uInt row = 0, ci = 0, co = 0; row < nActive;
+  for (Int row = 0, ci = 0, co = 0; row < nActive;
        row++, ci += input_stride - nPlanes, co += output_stride - nPlanes) {
-    for (uInt plane = 0; plane < nPlanes; plane++, ci++, co++) {
+    for (Int plane = 0; plane < nPlanes; plane++, ci++, co++) {
       T out = input_features[ci] * w[plane] + b[plane];
       out = (out > 0) ? out : (out * leakiness);
       output_features[co] = out;
@@ -73,17 +73,17 @@ void BatchNormalization_ForwardPass(T *input_features, T *output_features,
 template <typename T>
 void BatchNormalization_BackwardPass(T *input_features, T *d_input_features,
                                      T *output_features, T *d_output_features,
-                                     uInt nPlanes, uInt input_stride,
-                                     uInt output_stride, uInt nActive,
+                                     Int nPlanes, Int input_stride,
+                                     Int output_stride, Int nActive,
                                      T *saveMean, T *saveInvStd, T *runningMean,
                                      T *runningVar, T *weight, T *bias,
                                      T *d_weight, T *d_bias, T leakiness) {
   std::vector<T> gradMean(nPlanes);
   std::vector<T> dotp(nPlanes);
   std::vector<T> k(nPlanes);
-  for (uInt row = 0, ci = 0, co = 0; row < nActive;
+  for (Int row = 0, ci = 0, co = 0; row < nActive;
        row++, ci += input_stride - nPlanes, co += output_stride - nPlanes) {
-    for (uInt plane = 0; plane < nPlanes; plane++, ci++, co++) {
+    for (Int plane = 0; plane < nPlanes; plane++, ci++, co++) {
       T d = d_output_features[co];
       d = (output_features[co] > 0) ? d : (d * leakiness);
       d_output_features[co] = d;
@@ -91,15 +91,15 @@ void BatchNormalization_BackwardPass(T *input_features, T *d_input_features,
       dotp[plane] += (input_features[ci] - saveMean[plane]) * d;
     }
   }
-  for (uInt plane = 0; plane < nPlanes; plane++) {
+  for (Int plane = 0; plane < nPlanes; plane++) {
     if (d_bias)
       d_bias[plane] = gradMean[plane]; // sum of grads, really, until ...
     gradMean[plane] /= nActive;        // ...now
     k[plane] = dotp[plane] * saveInvStd[plane] * saveInvStd[plane] / nActive;
   }
-  for (uInt row = 0, ci = 0, co = 0; row < nActive;
+  for (Int row = 0, ci = 0, co = 0; row < nActive;
        row++, ci += input_stride - nPlanes, co += output_stride - nPlanes) {
-    for (uInt plane = 0; plane < nPlanes; plane++, ci++, co++) {
+    for (Int plane = 0; plane < nPlanes; plane++, ci++, co++) {
       d_input_features[ci] =
           (d_output_features[co] - gradMean[plane] -
            (input_features[ci] - saveMean[plane]) * k[plane]) *
@@ -107,7 +107,7 @@ void BatchNormalization_BackwardPass(T *input_features, T *d_input_features,
     }
   }
   if (d_weight)
-    for (uInt plane = 0; plane < nPlanes; plane++) {
+    for (Int plane = 0; plane < nPlanes; plane++) {
       d_weight[plane] = dotp[plane] * saveInvStd[plane];
     }
 }
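A note on the w/b precomputation in the forward pass above: it folds the normalization and the affine parameters into one multiply-add per feature. In my notation (γ = weight[plane], β = bias[plane], μ = saveMean[plane], invStd = saveInvStd[plane]; this algebra is inferred from the code, not library documentation):

\[
y = \gamma\,\frac{x - \mu}{\sqrt{\sigma^2 + \epsilon}} + \beta = w\,x + b,
\qquad w = \gamma \cdot \text{invStd}, \quad b = \beta - \mu\,w,
\]

after which the fused leaky ReLU keeps y when y > 0 and scales it by leakiness otherwise.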
sparseconvnet/SCN/CPU/BatchwiseMultiplicativeDropout.cpp (new file, mode 100644)

// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

template <typename T>
void cpu_BatchwiseMultiplicativeDropout_updateOutput(
    /*float*/ at::Tensor input_features, /*float*/ at::Tensor output_features,
    /*float*/ at::Tensor noise, float alpha) {
  output_features.resize_as_(input_features);
  auto nActive = input_features.size(0);
  auto nPlanes = input_features.size(1);
  auto iF = input_features.data<T>();
  auto oF = output_features.data<T>();
  auto nz = noise.data<T>();
  for (Int row = 0; row < nActive; row++)
    for (Int plane = 0, o = row * nPlanes, i = row * nPlanes; plane < nPlanes;
         plane++, o++, i++)
      oF[o] = (iF[i] > 0) ? iF[i] * nz[plane] : iF[i] * nz[plane] * alpha;
}

template <typename T>
void cpu_BatchwiseMultiplicativeDropout_updateGradInput(
    /*float*/ at::Tensor input_features, /*float*/ at::Tensor d_input_features,
    /*float*/ at::Tensor d_output_features, /*float*/ at::Tensor noise,
    float alpha) {
  d_input_features.resize_as_(d_output_features);
  auto nActive = input_features.size(0);
  auto nPlanes = input_features.size(1);
  auto iF = input_features.data<T>();
  auto diF = d_input_features.data<T>();
  auto doF = d_output_features.data<T>();
  auto nz = noise.data<T>();
  for (Int row = 0; row < nActive; row++)
    for (Int plane = 0, o = row * nPlanes, i = row * nPlanes; plane < nPlanes;
         plane++, o++, i++)
      diF[i] = (iF[i] > 0) ? doF[o] * nz[plane] : doF[o] * nz[plane] * alpha;
}
sparseconvnet/SCN/CPU/Convolution.cpp (new file, mode 100644)

// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#include <cstring>

template <typename T>
void rule_index_select(at::Tensor target, at::Tensor src, Int nRules,
                       Int *rules) {
  auto t_ptr = target.data<T>();
  auto s_ptr = src.data<T>();
  auto n = target.size(1);
  for (int i = 0; i < nRules; ++i)
    std::memcpy(t_ptr + i * n, s_ptr + rules[2 * i] * n, sizeof(T) * n);
}

template <typename T>
void rule_index_add_(at::Tensor target, at::Tensor src, Int nRules,
                     Int *rules) {
  auto t_ptr = target.data<T>();
  auto s_ptr = src.data<T>();
  auto n = target.size(1);
  for (int i = 0; i < nRules; ++i) {
    auto t = t_ptr + rules[2 * i] * n;
    auto s = s_ptr + i * n;
    for (int j = 0; j < n; ++j)
      t[j] += s[j];
  }
}

template <typename T, Int Dimension>
double cpu_Convolution_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize, /*long*/ at::Tensor filterStride,
    Metadata<Dimension> &m, /*float*/ at::Tensor input_features,
    /*float*/ at::Tensor output_features, /*float*/ at::Tensor weight,
    /*float*/ at::Tensor bias) {
  auto _rules =
      m.getRuleBook(inputSize, outputSize, filterSize, filterStride, true);
  Int nActive = m.getNActive(outputSize);
  output_features.resize_({nActive, weight.size(2)});
  if (bias.numel() and nActive)
    output_features.copy_(bias);
  else
    output_features.zero_();
  double flops = 0;
  auto ip = weight.size(1);
  auto op = weight.size(2);
  for (Int i = 0; i < (Int)_rules.size(); i++) {
    auto r = _rules[i];
    int nRules = r.size() / 2;
    if (nRules) {
      flops += nRules * ip * op;
      // auto rt = torch::CPU(at_kINT).tensorFromBlob(&r[0], {nRules, 2});
      // auto input_rows = input_features.index_select(0, rt.select(1, 0));
      // auto w = weight.select(0, i);
      // auto output_rows = at::mm(input_rows, w);
      // output_features.index_add_(0, rt.select(1, 1), output_rows);
      auto input_rows = input_features.type().tensor({nRules, ip});
      rule_index_select<T>(input_rows, input_features, nRules, &r[0]);
      auto w = weight.select(0, i);
      auto output_rows = at::mm(input_rows, w);
      rule_index_add_<T>(output_features, output_rows, nRules, &r[1]);
    }
  }
  return flops;
}

template <typename T, Int Dimension>
void cpu_Convolution_backward(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize, /*long*/ at::Tensor filterStride,
    Metadata<Dimension> &m, /*float*/ at::Tensor input_features,
    /*float*/ at::Tensor d_input_features,
    /*float*/ at::Tensor d_output_features, /*float*/ at::Tensor weight,
    /*float*/ at::Tensor d_weight, /*float*/ at::Tensor d_bias) {
  auto _rules =
      m.getRuleBook(inputSize, outputSize, filterSize, filterStride, true);
  Int nActive = m.getNActive(inputSize);
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();
  if (nActive and d_bias.numel())
    at::sum_out(d_bias, d_output_features, {0}, false);
  auto ip = weight.size(1);
  auto op = weight.size(2);
  for (Int i = 0; i < (Int)_rules.size(); i++) {
    auto r = _rules[i];
    int nRules = r.size() / 2;
    if (nRules) {
      auto w = weight.select(0, i);
      auto dw = d_weight.select(0, i);
      // auto rt = torch::CPU(at_kINT).tensorFromBlob(&r[0], {nRules, 2});
      // auto input_rows = input_features.index_select(0, rt.select(1, 0));
      // auto d_output_rows = d_output_features.index_select(0, rt.select(1,
      // 1));
      // at::mm_out(dw, input_rows.t(), d_output_rows);
      // auto d_input_rows = at::mm(d_output_rows, w.t());
      // d_input_features.index_add_(0, rt.select(1, 0), d_input_rows);
      auto input_rows = input_features.type().tensor({nRules, ip});
      rule_index_select<T>(input_rows, input_features, nRules, &r[0]);
      auto d_output_rows = d_output_features.type().tensor({nRules, op});
      rule_index_select<T>(d_output_rows, d_output_features, nRules, &r[1]);
      at::mm_out(dw, input_rows.t(), d_output_rows);
      auto d_input_rows = at::mm(d_output_rows, w.t());
      rule_index_add_<T>(d_input_features, d_input_rows, nRules, &r[0]);
    }
  }
}

template <typename T, Int Dimension>
double cpu_SubmanifoldConvolution_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor filterSize,
    Metadata<Dimension> &m, /*float*/ at::Tensor input_features,
    /*float*/ at::Tensor output_features, /*float*/ at::Tensor weight,
    /*float*/ at::Tensor bias) {
  auto _rules = m.getSubmanifoldRuleBook(inputSize, filterSize, true);
  Int nActive = m.getNActive(inputSize);
  output_features.resize_({nActive, weight.size(2)});
  if (bias.numel() and nActive)
    output_features.copy_(bias);
  else
    output_features.zero_();
  double flops = 0;
  auto ip = weight.size(1);
  auto op = weight.size(2);
  for (Int i = 0; i < (Int)_rules.size(); i++) {
    auto r = _rules[i];
    int nRules = r.size() / 2;
    if (nRules) {
      flops += nRules * ip * op;
      // auto rt = torch::CPU(at_kINT).tensorFromBlob(&r[0], {nRules, 2});
      // auto input_rows = input_features.index_select(0, rt.select(1, 0));
      // auto w = weight.select(0, i);
      // auto output_rows = at::mm(input_rows, w);
      // output_features.index_add_(0, rt.select(1, 1), output_rows);
      auto input_rows = input_features.type().tensor({nRules, ip});
      rule_index_select<T>(input_rows, input_features, nRules, &r[0]);
      auto w = weight.select(0, i);
      auto output_rows = at::mm(input_rows, w);
      rule_index_add_<T>(output_features, output_rows, nRules, &r[1]);
    }
  }
  return flops;
}

template <typename T, Int Dimension>
void cpu_SubmanifoldConvolution_backward(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor filterSize,
    Metadata<Dimension> &m, /*float*/ at::Tensor input_features,
    /*float*/ at::Tensor d_input_features,
    /*float*/ at::Tensor d_output_features, /*float*/ at::Tensor weight,
    /*float*/ at::Tensor d_weight, /*float*/ at::Tensor d_bias) {
  auto _rules = m.getSubmanifoldRuleBook(inputSize, filterSize, true);
  Int nActive = m.getNActive(inputSize);
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();
  if (nActive and d_bias.numel())
    at::sum_out(d_bias, d_output_features, {0}, false);
  auto ip = weight.size(1);
  auto op = weight.size(2);
  for (Int i = 0; i < (Int)_rules.size(); i++) {
    auto r = _rules[i];
    int nRules = r.size() / 2;
    if (nRules) {
      auto w = weight.select(0, i);
      auto dw = d_weight.select(0, i);
      // auto rt = torch::CPU(at_kINT).tensorFromBlob(&r[0], {nRules, 2});
      // auto input_rows = input_features.index_select(0, rt.select(1, 0));
      // auto d_output_rows = d_output_features.index_select(0, rt.select(1,
      // 1));
      // at::mm_out(dw, input_rows.t(), d_output_rows);
      // auto d_input_rows = at::mm(d_output_rows, w.t());
      // d_input_features.index_add_(0, rt.select(1, 0), d_input_rows);
      auto input_rows = input_features.type().tensor({nRules, ip});
      rule_index_select<T>(input_rows, input_features, nRules, &r[0]);
      auto d_output_rows = d_output_features.type().tensor({nRules, op});
      rule_index_select<T>(d_output_rows, d_output_features, nRules, &r[1]);
      at::mm_out(dw, input_rows.t(), d_output_rows);
      auto d_input_rows = at::mm(d_output_rows, w.t());
      rule_index_add_<T>(d_input_features, d_input_rows, nRules, &r[0]);
    }
  }
}

template <typename T, Int Dimension>
double cpu_FullConvolution_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize, /*long*/ at::Tensor filterStride,
    Metadata<Dimension> &mIn, Metadata<Dimension> &mOut,
    /*float*/ at::Tensor input_features, /*float*/ at::Tensor output_features,
    /*float*/ at::Tensor weight, /*float*/ at::Tensor bias) {
  auto _rules = mIn.getFullConvolutionRuleBook(inputSize, outputSize,
                                               filterSize, filterStride, mOut);
  Int nActive = mOut.getNActive(outputSize);
  output_features.resize_({nActive, weight.size(2)});
  if (bias.numel() and nActive)
    output_features.copy_(bias);
  else
    output_features.zero_();
  double flops = 0;
  auto ip = weight.size(1);
  auto op = weight.size(2);
  for (Int i = 0; i < (Int)_rules.size(); i++) {
    auto r = _rules[i];
    int nRules = r.size() / 2;
    if (nRules) {
      flops += nRules * ip * op;
      // auto rt = torch::CPU(at_kINT).tensorFromBlob(&r[0], {nRules, 2});
      // auto input_rows = input_features.index_select(0, rt.select(1, 0));
      // auto w = weight.select(0, i);
      // auto output_rows = at::mm(input_rows, w);
      // output_features.index_add_(0, rt.select(1, 1), output_rows);
      auto input_rows = input_features.type().tensor({nRules, ip});
      rule_index_select<T>(input_rows, input_features, nRules, &r[0]);
      auto w = weight.select(0, i);
      auto output_rows = at::mm(input_rows, w);
      rule_index_add_<T>(output_features, output_rows, nRules, &r[1]);
    }
  }
  return flops;
}

template <typename T, Int Dimension>
void cpu_FullConvolution_backward(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize, /*long*/ at::Tensor filterStride,
    Metadata<Dimension> &mIn, Metadata<Dimension> &mOut,
    /*float*/ at::Tensor input_features,
    /*float*/ at::Tensor d_input_features,
    /*float*/ at::Tensor d_output_features, /*float*/ at::Tensor weight,
    /*float*/ at::Tensor d_weight, /*float*/ at::Tensor d_bias) {
  auto _rules = mIn.getFullConvolutionRuleBook(inputSize, outputSize,
                                               filterSize, filterStride, mOut);
  Int nActive = mOut.getNActive(inputSize);
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();
  if (nActive and d_bias.numel())
    at::sum_out(d_bias, d_output_features, {0}, false);
  auto ip = weight.size(1);
  auto op = weight.size(2);
  for (Int i = 0; i < (Int)_rules.size(); i++) {
    auto r = _rules[i];
    int nRules = r.size() / 2;
    if (nRules) {
      auto w = weight.select(0, i);
      auto dw = d_weight.select(0, i);
      // auto rt = torch::CPU(at_kINT).tensorFromBlob(&r[0], {nRules, 2});
      // auto input_rows = input_features.index_select(0, rt.select(1, 0));
      // auto d_output_rows = d_output_features.index_select(0, rt.select(1,
      // 1));
      // at::mm_out(dw, input_rows.t(), d_output_rows);
      // auto d_input_rows = at::mm(d_output_rows, w.t());
      // d_input_features.index_add_(0, rt.select(1, 0), d_input_rows);
      auto input_rows = input_features.type().tensor({nRules, ip});
      rule_index_select<T>(input_rows, input_features, nRules, &r[0]);
      auto d_output_rows = d_output_features.type().tensor({nRules, op});
      rule_index_select<T>(d_output_rows, d_output_features, nRules, &r[1]);
      at::mm_out(dw, input_rows.t(), d_output_rows);
      auto d_input_rows = at::mm(d_output_rows, w.t());
      rule_index_add_<T>(d_input_features, d_input_rows, nRules, &r[0]);
    }
  }
}

template <typename T, Int Dimension>
double cpu_RandomizedStrideConvolution_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize, /*long*/ at::Tensor filterStride,
    Metadata<Dimension> &m, /*float*/ at::Tensor input_features,
    /*float*/ at::Tensor output_features, /*float*/ at::Tensor weight,
    /*float*/ at::Tensor bias) {
  auto _rules = m.getRandomizedStrideRuleBook(inputSize, outputSize,
                                              filterSize, filterStride, true);
  Int nActive = m.getNActive(outputSize);
  output_features.resize_({nActive, weight.size(2)});
  if (bias.numel() and nActive)
    output_features.copy_(bias);
  else
    output_features.zero_();
  double flops = 0;
  auto ip = weight.size(1);
  auto op = weight.size(2);
  for (Int i = 0; i < (Int)_rules.size(); i++) {
    auto r = _rules[i];
    int nRules = r.size() / 2;
    if (nRules) {
      flops += nRules * ip * op;
      // auto rt = torch::CPU(at_kINT).tensorFromBlob(&r[0], {nRules, 2});
      // auto input_rows = input_features.index_select(0, rt.select(1, 0));
      // auto w = weight.select(0, i);
      // auto output_rows = at::mm(input_rows, w);
      // output_features.index_add_(0, rt.select(1, 1), output_rows);
      auto input_rows = input_features.type().tensor({nRules, ip});
      rule_index_select<T>(input_rows, input_features, nRules, &r[0]);
      auto w = weight.select(0, i);
      auto output_rows = at::mm(input_rows, w);
      rule_index_add_<T>(output_features, output_rows, nRules, &r[1]);
    }
  }
  return flops;
}

template <typename T, Int Dimension>
void cpu_RandomizedStrideConvolution_backward(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize, /*long*/ at::Tensor filterStride,
    Metadata<Dimension> &m, /*float*/ at::Tensor input_features,
    /*float*/ at::Tensor d_input_features,
    /*float*/ at::Tensor d_output_features, /*float*/ at::Tensor weight,
    /*float*/ at::Tensor d_weight, /*float*/ at::Tensor d_bias) {
  auto _rules = m.getRandomizedStrideRuleBook(inputSize, outputSize,
                                              filterSize, filterStride, true);
  Int nActive = m.getNActive(inputSize);
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();
  if (nActive and d_bias.numel())
    at::sum_out(d_bias, d_output_features, {0}, false);
  auto ip = weight.size(1);
  auto op = weight.size(2);
  for (Int i = 0; i < (Int)_rules.size(); i++) {
    auto r = _rules[i];
    int nRules = r.size() / 2;
    if (nRules) {
      auto w = weight.select(0, i);
      auto dw = d_weight.select(0, i);
      // auto rt = torch::CPU(at_kINT).tensorFromBlob(&r[0], {nRules, 2});
      // auto input_rows = input_features.index_select(0, rt.select(1, 0));
      // auto d_output_rows = d_output_features.index_select(0, rt.select(1,
      // 1));
      // at::mm_out(dw, input_rows.t(), d_output_rows);
      // auto d_input_rows = at::mm(d_output_rows, w.t());
      // d_input_features.index_add_(0, rt.select(1, 0), d_input_rows);
      auto input_rows = input_features.type().tensor({nRules, ip});
      rule_index_select<T>(input_rows, input_features, nRules, &r[0]);
      auto d_output_rows = d_output_features.type().tensor({nRules, op});
      rule_index_select<T>(d_output_rows, d_output_features, nRules, &r[1]);
      at::mm_out(dw, input_rows.t(), d_output_rows);
      auto d_input_rows = at::mm(d_output_rows, w.t());
      rule_index_add_<T>(d_input_features, d_input_rows, nRules, &r[0]);
    }
  }
}
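A reading aid for the rulebook indexing above, inferred from how rule_index_select and rule_index_add_ are called (the toy program below is illustrative, not commit code): each rule list r stores flat (input row, output row) pairs, so passing &r[0] makes the even slots the gather sources, while passing &r[1] shifts the view so the same rules[2 * i] expression reads the odd slots as scatter destinations.

#include <cstdio>

int main() {
  // Two rules: input row 0 -> output row 1, input row 2 -> output row 0.
  int rules[4] = {0, 1, 2, 0};
  float input[3][2] = {{1, 2}, {3, 4}, {5, 6}};
  float output[2][2] = {{0, 0}, {0, 0}};
  // Gather from even slots, scatter-add to odd slots (identity in place of
  // the at::mm step).
  for (int i = 0; i < 2; ++i)
    for (int j = 0; j < 2; ++j)
      output[rules[2 * i + 1]][j] += input[rules[2 * i]][j];
  printf("%g %g / %g %g\n", output[0][0], output[0][1], output[1][0],
         output[1][1]); // prints: 5 6 / 1 2
}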
sparseconvnet/SCN/CPU/Deconvolution.cpp (new file, mode 100644)

// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

template <typename T, Int Dimension>
double cpu_Deconvolution_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize, /*long*/ at::Tensor filterStride,
    Metadata<Dimension> &m, /*float*/ at::Tensor input_features,
    /*float*/ at::Tensor output_features, /*float*/ at::Tensor weight,
    /*float*/ at::Tensor bias) {
  auto _rules =
      m.getRuleBook(outputSize, inputSize, filterSize, filterStride, true);
  Int nActive = m.getNActive(outputSize);
  output_features.resize_({nActive, weight.size(2)});
  if (bias.numel() and nActive)
    output_features.copy_(bias);
  else
    output_features.zero_();
  double flops = 0;
  auto ip = weight.size(1);
  auto op = weight.size(2);
  for (Int i = 0; i < (Int)_rules.size(); i++) {
    auto r = _rules[i];
    int nRules = r.size() / 2;
    if (nRules) {
      flops += nRules * ip * op;
      // auto rt = torch::CPU(at_kINT).tensorFromBlob(&r[0], {nRules, 2});
      // auto input_rows = input_features.index_select(0, rt.select(1, 1));
      // auto w = weight.select(0, i);
      // auto output_rows = at::mm(input_rows, w);
      // output_features.index_add_(0, rt.select(1, 0), output_rows);
      auto input_rows = input_features.type().tensor({nRules, ip});
      rule_index_select<T>(input_rows, input_features, nRules, &r[1]);
      auto w = weight.select(0, i);
      auto output_rows = at::mm(input_rows, w);
      rule_index_add_<T>(output_features, output_rows, nRules, &r[0]);
    }
  }
  return flops;
}

template <typename T, Int Dimension>
void cpu_Deconvolution_backward(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize, /*long*/ at::Tensor filterStride,
    Metadata<Dimension> &m, /*float*/ at::Tensor input_features,
    /*float*/ at::Tensor d_input_features,
    /*float*/ at::Tensor d_output_features, /*float*/ at::Tensor weight,
    /*float*/ at::Tensor d_weight, /*float*/ at::Tensor d_bias) {
  auto _rules =
      m.getRuleBook(outputSize, inputSize, filterSize, filterStride, true);
  Int nActive = m.getNActive(inputSize);
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();
  if (nActive and d_bias.numel())
    at::sum_out(d_bias, d_output_features, {0}, false);
  auto ip = weight.size(1);
  auto op = weight.size(2);
  for (Int i = 0; i < (Int)_rules.size(); i++) {
    auto r = _rules[i];
    int nRules = r.size() / 2;
    if (nRules) {
      auto w = weight.select(0, i);
      auto dw = d_weight.select(0, i);
      // auto rt = torch::CPU(at_kINT).tensorFromBlob(&r[0], {nRules, 2});
      // auto input_rows = input_features.index_select(0, rt.select(1, 1));
      // auto d_output_rows = d_output_features.index_select(0, rt.select(1,
      // 0));
      // at::mm_out(dw, input_rows.t(), d_output_rows);
      // auto d_input_rows = at::mm(d_output_rows, w.t());
      // d_input_features.index_add_(0, rt.select(1, 1), d_input_rows);
      auto input_rows = input_features.type().tensor({nRules, ip});
      rule_index_select<T>(input_rows, input_features, nRules, &r[1]);
      auto d_output_rows = d_output_features.type().tensor({nRules, op});
      rule_index_select<T>(d_output_rows, d_output_features, nRules, &r[0]);
      at::mm_out(dw, input_rows.t(), d_output_rows);
      auto d_input_rows = at::mm(d_output_rows, w.t());
      rule_index_add_<T>(d_input_features, d_input_rows, nRules, &r[1]);
    }
  }
}
sparseconvnet/SCN/CPU/IOLayers.cpp (new file, mode 100644)

// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#include "IOLayers.h"

template <typename T, Int Dimension>
void cpu_InputLayer_updateOutput(Metadata<Dimension> &m,
                                 /*long*/ at::Tensor spatialSize,
                                 /*long*/ at::Tensor input_coords,
                                 /*float*/ at::Tensor input_features,
                                 /*float*/ at::Tensor output_features,
                                 long batchSize, long mode) {
  m.inputLayer(spatialSize, input_coords, batchSize, mode);
  auto nPlanes = input_features.size(1);
  auto &rules = m.inputLayerRuleBook;
  auto maxActive = rules[0][1];
  auto nRows = rules[0][3];
  if (mode == 0) {
    output_features.resize_as_(input_features);
    output_features.copy_(input_features);
  } else {
    output_features.resize_({*m.inputNActive, nPlanes});
    output_features.zero_();
    InputLayer_ForwardPass<T>(input_features.data<T>(),
                              output_features.data<T>(), nRows, maxActive,
                              nPlanes, &rules[1][0], mode == 4);
  }
}

template <typename T, Int Dimension>
void cpu_InputLayer_updateGradInput(Metadata<Dimension> &m,
                                    /*float*/ at::Tensor d_input_features,
                                    /*float*/ at::Tensor d_output_features) {
  auto &rules = m.inputLayerRuleBook;
  auto nPlanes = d_output_features.size(1);
  auto mode = rules[0][0];
  auto maxActive = rules[0][1];
  auto nRows = rules[0][3];
  if (mode == 0) {
    d_input_features.resize_as_(d_output_features);
    d_input_features.copy_(d_output_features);
  } else {
    d_input_features.resize_({rules[0][2], nPlanes});
    d_input_features.zero_();
    InputLayer_BackwardPass<T>(d_input_features.data<T>(),
                               d_output_features.data<T>(), nRows, maxActive,
                               nPlanes, &rules[1][0], mode == 4);
  }
}

template <typename T, Int Dimension>
void cpu_OutputLayer_updateOutput(Metadata<Dimension> &m,
                                  /*float*/ at::Tensor input_features,
                                  /*float*/ at::Tensor output_features) {
  auto &rules = m.inputLayerRuleBook;
  auto nPlanes = input_features.size(1);
  auto mode = rules[0][0];
  auto maxActive = rules[0][1];
  auto nRows = rules[0][3];
  if (mode == 0) {
    output_features.resize_as_(input_features);
    output_features.copy_(input_features);
  } else {
    output_features.resize_({rules[0][2], nPlanes});
    output_features.zero_();
    InputLayer_BackwardPass<T>(output_features.data<T>(),
                               input_features.data<T>(), nRows, maxActive,
                               nPlanes, &rules[1][0], false);
  }
}

template <typename T, Int Dimension>
void cpu_OutputLayer_updateGradInput(Metadata<Dimension> &m,
                                     /*float*/ at::Tensor d_input_features,
                                     /*float*/ at::Tensor d_output_features) {
  auto &rules = m.inputLayerRuleBook;
  auto nPlanes = d_output_features.size(1);
  auto mode = rules[0][0];
  auto maxActive = rules[0][1];
  auto nRows = rules[0][3];
  if (mode == 0) {
    d_input_features.resize_as_(d_output_features);
    d_input_features.copy_(d_output_features);
  } else {
    d_input_features.resize_({nRows, nPlanes});
    d_input_features.zero_();
    InputLayer_ForwardPass<T>(d_output_features.data<T>(),
                              d_input_features.data<T>(), nRows, maxActive,
                              nPlanes, &rules[1][0], false);
  }
}

template <typename T, Int Dimension>
void cpu_BLInputLayer_updateOutput(Metadata<Dimension> &m,
                                   /*long*/ at::Tensor spatialSize,
                                   /*long*/ at::Tensor input_coords,
                                   /*float*/ at::Tensor input_features,
                                   /*float*/ at::Tensor output_features,
                                   long mode) {
  m.blLayer(spatialSize, input_coords, mode);
  auto nPlanes = input_features.size(2);
  auto &rules = m.blLayerRuleBook;
  auto maxActive = rules[0][1];
  auto nRows = rules[0][4];
  if (mode == 0) {
    output_features.resize_as_(input_features);
    output_features.copy_(input_features);
    output_features.resize_({*m.inputNActive, nPlanes});
  } else {
    output_features.resize_({*m.inputNActive, nPlanes});
    output_features.zero_();
    InputLayer_ForwardPass<T>(input_features.data<T>(),
                              output_features.data<T>(), nRows, maxActive,
                              nPlanes, &rules[1][0], mode == 4);
  }
}

template <typename T, Int Dimension>
void cpu_BLInputLayer_updateGradInput(Metadata<Dimension> &m,
                                      /*float*/ at::Tensor d_input_features,
                                      /*float*/ at::Tensor d_output_features) {
  auto &rules = m.blLayerRuleBook;
  auto nPlanes = d_output_features.size(1);
  auto mode = rules[0][0];
  auto maxActive = rules[0][1];
  auto nRows = rules[0][4];
  if (mode == 0) {
    d_input_features.resize_as_(d_output_features);
    d_input_features.copy_(d_output_features);
    d_input_features.resize_({rules[0][2], rules[0][3], nPlanes});
  } else {
    d_input_features.resize_({rules[0][2], rules[0][3], nPlanes});
    d_input_features.zero_();
    InputLayer_BackwardPass<T>(d_input_features.data<T>(),
                               d_output_features.data<T>(), nRows, maxActive,
                               nPlanes, &rules[1][0], mode == 4);
  }
}

template <typename T, Int Dimension>
void cpu_BLOutputLayer_updateOutput(Metadata<Dimension> &m,
                                    /*float*/ at::Tensor input_features,
                                    /*float*/ at::Tensor output_features) {
  auto &rules = m.blLayerRuleBook;
  auto nPlanes = input_features.size(1);
  auto mode = rules[0][0];
  auto maxActive = rules[0][1];
  auto nRows = rules[0][4];
  if (mode == 0) {
    output_features.resize_as_(input_features);
    output_features.copy_(input_features);
    output_features.resize_({rules[0][2], rules[0][3], nPlanes});
  } else {
    output_features.resize_({rules[0][2], rules[0][3], nPlanes});
    output_features.zero_();
    InputLayer_BackwardPass<T>(output_features.data<T>(),
                               input_features.data<T>(), nRows, maxActive,
                               nPlanes, &rules[1][0], false);
  }
}

template <typename T, Int Dimension>
void cpu_BLOutputLayer_updateGradInput(Metadata<Dimension> &m,
                                       /*float*/ at::Tensor d_input_features,
                                       /*float*/ at::Tensor d_output_features) {
  auto &rules = m.blLayerRuleBook;
  auto nPlanes = d_output_features.size(2);
  auto mode = rules[0][0];
  auto maxActive = rules[0][1];
  auto nRows = rules[0][4];
  if (mode == 0) {
    d_input_features.resize_as_(d_output_features);
    d_input_features.copy_(d_output_features);
    d_input_features.resize_({nRows, nPlanes});
  } else {
    d_input_features.resize_({nRows, nPlanes});
    d_input_features.zero_();
    InputLayer_ForwardPass<T>(d_output_features.data<T>(),
                              d_input_features.data<T>(), nRows, maxActive,
                              nPlanes, &rules[1][0], false);
  }
}
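Worth noting in the file above: cpu_OutputLayer_updateOutput calls InputLayer_BackwardPass, and its gradient calls InputLayer_ForwardPass. Writing the input layer's gather/average as a sparse matrix R (my notation for the mapping encoded by inputLayerRuleBook, not the library's), the reuse is just transposition:

\[
\text{InputLayer: } y = R\,x \;\Rightarrow\; \frac{\partial L}{\partial x} = R^{\top}\frac{\partial L}{\partial y};
\qquad
\text{OutputLayer: } z = R^{\top} u \;\Rightarrow\; \frac{\partial L}{\partial u} = R\,\frac{\partial L}{\partial z},
\]

so one pair of kernels serves both layers with the argument roles swapped.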
sparseconvnet/SCN/generic/CPU/IOLayers.h → sparseconvnet/SCN/CPU/IOLayers.h

@@ -6,21 +6,21 @@
 #ifndef CPU_IOLAYERS_H
 #define CPU_IOLAYERS_H
 #include "../SparseConvNet.h"
 #include <cstring>
 // Assume output and d_input_features have been zero-ed
 template <typename T>
-void InputLayer_ForwardPass(T *input_features, T *output_features, uInt nRows,
-                            uInt maxActive, uInt nPlanes, uInt *rules,
+void InputLayer_ForwardPass(T *input_features, T *output_features, Int nRows,
+                            Int maxActive, Int nPlanes, Int *rules,
                             bool average) {
-  for (uInt row = 0; row < nRows; row++) {
+  for (Int row = 0; row < nRows; row++) {
     auto nActive = rules[0];
     T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
-    for (uInt i = 1; i <= nActive; ++i) {
+    for (Int i = 1; i <= nActive; ++i) {
       auto in_f = input_features + nPlanes * rules[i];
-      for (uInt plane = 0; plane < nPlanes; plane++) {
+      for (Int plane = 0; plane < nPlanes; plane++) {
         output_features[plane] += multiplier * in_f[plane];
       }
     }
@@ -30,14 +30,14 @@ void InputLayer_ForwardPass(T *input_features, T *output_features, uInt nRows,
 }
 template <typename T>
 void InputLayer_BackwardPass(T *d_input_features, T *d_output_features,
-                             uInt nRows, uInt maxActive, uInt nPlanes,
-                             uInt *rules, bool average) {
-  for (uInt row = 0; row < nRows; row++) {
+                             Int nRows, Int maxActive, Int nPlanes,
+                             Int *rules, bool average) {
+  for (Int row = 0; row < nRows; row++) {
     auto nActive = rules[0];
     T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
-    for (uInt i = 1; i <= nActive; ++i) {
+    for (Int i = 1; i <= nActive; ++i) {
       auto d_in_f = d_input_features + nPlanes * rules[i];
-      for (uInt plane = 0; plane < nPlanes; plane++)
+      for (Int plane = 0; plane < nPlanes; plane++)
         d_in_f[plane] += multiplier * d_output_features[plane];
     }
     d_output_features += nPlanes;
sparseconvnet/SCN/CPU/LeakyReLU.cpp (new file, mode 100644)

// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

template <typename T>
void cpu_LeakyReLU_updateOutput(/*float*/ at::Tensor input_features,
                                /*float*/ at::Tensor output_features,
                                float alpha) {
  output_features.resize_as_(input_features);
  auto iF = input_features.data<T>();
  auto oF = output_features.data<T>();
  auto n = input_features.numel();
  for (Int i = 0; i < n; i++)
    oF[i] = (iF[i] > 0) ? iF[i] : iF[i] * alpha;
}

template <typename T>
void cpu_LeakyReLU_updateGradInput(/*float*/ at::Tensor input_features,
                                   /*float*/ at::Tensor d_input_features,
                                   /*float*/ at::Tensor d_output_features,
                                   float alpha) {
  d_input_features.resize_as_(d_output_features);
  auto iF = input_features.data<T>();
  auto diF = d_input_features.data<T>();
  auto doF = d_output_features.data<T>();
  auto n = d_input_features.numel();
  for (Int i = 0; i < n; i++)
    diF[i] = (iF[i] > 0) ? doF[i] : doF[i] * alpha;
}
sparseconvnet/SCN/CPU/MaxPooling.cpp (new file, mode 100644)

// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#include "MaxPooling.h"

template <typename T, Int Dimension>
void cpu_MaxPooling_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor poolSize, /*long*/ at::Tensor poolStride,
    Metadata<Dimension> &m, /*float*/ at::Tensor input_features,
    /*float*/ at::Tensor output_features, long nFeaturesToDrop) {
  Int nPlanes = input_features.size(1) - nFeaturesToDrop;
  auto _rules =
      m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  Int nActive = m.getNActive(outputSize);
  output_features.resize_({nActive, input_features.size(1) - nFeaturesToDrop});
  output_features.zero_();
  auto iF = input_features.data<T>() + nFeaturesToDrop;
  auto oF = output_features.data<T>();
  for (auto &r : _rules) {
    Int nHot = r.size() / 2;
    MaxPooling_ForwardPass<T>(iF, oF, nPlanes, input_features.stride(0),
                              output_features.stride(0), &r[0], nHot);
  }
}

template <typename T, Int Dimension>
void cpu_MaxPooling_updateGradInput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor poolSize, /*long*/ at::Tensor poolStride,
    Metadata<Dimension> &m, /*float*/ at::Tensor input_features,
    /*float*/ at::Tensor d_input_features,
    /*float*/ at::Tensor output_features,
    /*float*/ at::Tensor d_output_features, long nFeaturesToDrop) {
  Int nPlanes = input_features.size(1) - nFeaturesToDrop;
  auto _rules =
      m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();
  auto iF = input_features.data<T>();
  auto oF = output_features.data<T>();
  auto diF = d_input_features.data<T>();
  auto doF = d_output_features.data<T>();
  for (auto &r : _rules) {
    Int nHot = r.size() / 2;
    MaxPooling_BackwardPass<T>(iF, diF, oF, doF, nPlanes,
                               input_features.stride(0),
                               output_features.stride(0), &r[0], nHot);
  }
}

template <typename T, Int Dimension>
void cpu_RandomizedStrideMaxPooling_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor poolSize, /*long*/ at::Tensor poolStride,
    Metadata<Dimension> &m, /*float*/ at::Tensor input_features,
    /*float*/ at::Tensor output_features, long nFeaturesToDrop) {
  Int nPlanes = input_features.size(1) - nFeaturesToDrop;
  auto _rules = m.getRandomizedStrideRuleBook(inputSize, outputSize, poolSize,
                                              poolStride, true);
  Int nActive = m.getNActive(outputSize);
  output_features.resize_({nActive, input_features.size(1) - nFeaturesToDrop});
  output_features.zero_();
  auto iF = input_features.data<T>() + nFeaturesToDrop;
  auto oF = output_features.data<T>();
  for (auto &r : _rules) {
    Int nHot = r.size() / 2;
    MaxPooling_ForwardPass<T>(iF, oF, nPlanes, input_features.stride(0),
                              output_features.stride(0), &r[0], nHot);
  }
}

template <typename T, Int Dimension>
void cpu_RandomizedStrideMaxPooling_updateGradInput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor poolSize, /*long*/ at::Tensor poolStride,
    Metadata<Dimension> &m, /*float*/ at::Tensor input_features,
    /*float*/ at::Tensor d_input_features,
    /*float*/ at::Tensor output_features,
    /*float*/ at::Tensor d_output_features, long nFeaturesToDrop) {
  Int nPlanes = input_features.size(1) - nFeaturesToDrop;
  auto _rules = m.getRandomizedStrideRuleBook(inputSize, outputSize, poolSize,
                                              poolStride, true);
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();
  auto iF = input_features.data<T>();
  auto oF = output_features.data<T>();
  auto diF = d_input_features.data<T>();
  auto doF = d_output_features.data<T>();
  for (auto &r : _rules) {
    Int nHot = r.size() / 2;
    MaxPooling_BackwardPass<T>(iF, diF, oF, doF, nPlanes,
                               input_features.stride(0),
                               output_features.stride(0), &r[0], nHot);
  }
}
sparseconvnet/SCN/generic/CPU/MaxPooling.h → sparseconvnet/SCN/CPU/MaxPooling.h

@@ -6,16 +6,16 @@
 #ifndef CPU_MAXPOOLING_H
 #define CPU_MAXPOOLING_H
 #include "../SparseConvNet.h"
 template <typename T>
 void MaxPooling_ForwardPass(T *input_features, T *output_features,
-                            uInt nPlanes, uInt input_stride,
-                            uInt output_stride, uInt *rules, uInt nHot) {
-  for (uInt outSite = 0; outSite < nHot; outSite++) {
-    uInt i = rules[2 * outSite] * input_stride;
-    uInt o = rules[2 * outSite + 1] * output_stride;
-    for (uInt plane = 0; plane < nPlanes; plane++)
+                            Int nPlanes, Int input_stride,
+                            Int output_stride, Int *rules, Int nHot) {
+  for (Int outSite = 0; outSite < nHot; outSite++) {
+    Int i = rules[2 * outSite] * input_stride;
+    Int o = rules[2 * outSite + 1] * output_stride;
+    for (Int plane = 0; plane < nPlanes; plane++)
       if (output_features[o + plane] < input_features[i + plane])
         output_features[o + plane] = input_features[i + plane];
   }
@@ -23,12 +23,12 @@ void MaxPooling_ForwardPass(T *input_features, T *output_features,
 template <typename T>
 void MaxPooling_BackwardPass(T *input_features, T *d_input_features,
                              T *output_features, T *d_output_features,
-                             uInt nPlanes, uInt input_stride,
-                             uInt output_stride, uInt *rules, uInt nHot) {
-  for (uInt outSite = 0; outSite < nHot; outSite++) {
-    uInt i = rules[2 * outSite] * input_stride;
-    uInt o = rules[2 * outSite + 1] * output_stride;
-    for (uInt plane = 0; plane < nPlanes; plane++)
+                             Int nPlanes, Int input_stride,
+                             Int output_stride, Int *rules, Int nHot) {
+  for (Int outSite = 0; outSite < nHot; outSite++) {
+    Int i = rules[2 * outSite] * input_stride;
+    Int o = rules[2 * outSite + 1] * output_stride;
+    for (Int plane = 0; plane < nPlanes; plane++)
       if (output_features[o + plane] == input_features[i + plane])
         d_input_features[i + plane] += d_output_features[o + plane];
   }
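One behavioral detail of MaxPooling_BackwardPass above: the gradient is routed to every input that compares equal to the pooled maximum, so exact ties are credited more than once. A standalone toy check of that behavior (not commit code):

#include <cstdio>

int main() {
  // One pooling site over two inputs with an exact tie: both match the max,
  // so the equality-based routing credits the gradient to both.
  float in[2] = {7.0f, 7.0f}, d_in[2] = {0.0f, 0.0f};
  float out = 7.0f, d_out = 1.0f;
  for (int i = 0; i < 2; ++i)
    if (out == in[i])
      d_in[i] += d_out;
  printf("%g %g\n", d_in[0], d_in[1]); // prints: 1 1 (the tie is double-counted)
}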
sparseconvnet/SCN/CPU/NetworkInNetwork.cpp (new file, mode 100644)

// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

template <typename T>
double cpu_NetworkInNetwork_updateOutput(/*float*/ at::Tensor input_features,
                                         /*float*/ at::Tensor output_features,
                                         /*float*/ at::Tensor weight,
                                         /*float*/ at::Tensor bias) {
  auto nActive = input_features.size(0);
  auto input_nPlanes = weight.size(0);
  auto output_nPlanes = weight.size(1);
  output_features.resize_({nActive, output_nPlanes});
  if (bias.numel())
    output_features.copy_(bias);
  else
    output_features.zero_();
  output_features.addmm(input_features, weight);
  return nActive * input_nPlanes * output_nPlanes;
}

template <typename T>
void cpu_NetworkInNetwork_updateGradInput(
    /*float*/ at::Tensor d_input_features,
    /*float*/ at::Tensor d_output_features, /*float*/ at::Tensor weight) {
  d_input_features.resize_({(int)d_output_features.size(0), weight.size(0)});
  d_input_features.zero_();
  at::mm_out(d_input_features, d_output_features, weight.t());
}

template <typename T>
void cpu_NetworkInNetwork_accGradParameters(
    /*float*/ at::Tensor input_features,
    /*float*/ at::Tensor d_output_features, /*float*/ at::Tensor d_weight,
    /*float*/ at::Tensor d_bias) {
  auto nActive = input_features.size(0);
  if (nActive and d_bias.numel())
    at::sum_out(d_bias, d_output_features, {0}, false);
  at::mm_out(d_weight, input_features.t(), d_output_features);
}
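NetworkInNetwork is a pointwise (1×1) convolution, so the whole forward pass is one dense product over the active sites, and the value returned as "flops" counts one multiply-accumulate per element of that product. Schematically, in my notation (not the library's):

\[
O = \mathbf{1}\,b^{\top} + F\,W,
\qquad F \in \mathbb{R}^{n_{\text{active}} \times n_{\text{in}}},\;
W \in \mathbb{R}^{n_{\text{in}} \times n_{\text{out}}},
\qquad \text{flops} = n_{\text{active}}\, n_{\text{in}}\, n_{\text{out}}.
\]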
sparseconvnet/SCN/CPU/SparseToDense.cpp (new file, mode 100644)

// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#include "SparseToDense.h"

template <typename T, Int Dimension>
void cpu_SparseToDense_updateOutput(/*long*/ at::Tensor inputSize,
                                    Metadata<Dimension> &m,
                                    /*float*/ at::Tensor input_features,
                                    /*float*/ at::Tensor output_features,
                                    long nPlanes) {
  {
    std::array<long, Dimension + 2> sz;
    sz[0] = m.grids.begin()->second.size(); // batch size
    sz[1] = nPlanes;
    long *in_sz = inputSize.data<long>();
    for (Int i = 0; i < Dimension; ++i)
      sz[i + 2] = in_sz[i];
    output_features.resize_(sz);
    output_features.zero_();
  }
  if (input_features.ndimension() == 2) {
    auto _rules = m.getSparseToDenseRuleBook(inputSize, true);
    Int _nPlanes = input_features.size(1);
    auto iF = input_features.data<T>();
    auto oF = output_features.data<T>();
    long spatialVolume = inputSize.prod().data<long>()[0];
    for (auto &r : _rules) {
      Int nHot = r.size() / 2;
      SparseToDense_ForwardPass<T>(iF, oF, _nPlanes, spatialVolume, &r[0],
                                   nHot);
      oF += _nPlanes * spatialVolume;
    }
  }
}

template <typename T, Int Dimension>
void cpu_SparseToDense_updateGradInput(/*long*/ at::Tensor inputSize,
                                       Metadata<Dimension> &m,
                                       /*float*/ at::Tensor input_features,
                                       /*float*/ at::Tensor d_input_features,
                                       /*float*/ at::Tensor d_output_features) {
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();
  if (input_features.ndimension() == 2) {
    auto _rules = m.getSparseToDenseRuleBook(inputSize, true);
    long spatialVolume = inputSize.prod().data<long>()[0];
    Int _nPlanes = d_input_features.size(1);
    auto diF = d_input_features.data<T>();
    auto doF = d_output_features.data<T>();
    for (auto &r : _rules) {
      Int nHot = r.size() / 2;
      SparseToDense_BackwardPass<T>(diF, doF, _nPlanes, spatialVolume, &r[0],
                                    nHot);
      doF += _nPlanes * spatialVolume;
    }
  }
}
sparseconvnet/SCN/generic/CPU/SparseToDense.h → sparseconvnet/SCN/CPU/SparseToDense.h

@@ -6,29 +6,29 @@
 #ifndef CPU_SPARSETODENSE_H
 #define CPU_SPARSETODENSE_H
 #include "../SparseConvNet.h"
 template <typename T>
 void SparseToDense_ForwardPass(T *input_features, T *output_features,
-                               uInt nPlanes, uInt spatialVolume, uInt *rules,
+                               Int nPlanes, Int spatialVolume, Int *rules,
                                int nHot) {
-  for (uInt outSite = 0; outSite < nHot; outSite++) {
+  for (Int outSite = 0; outSite < nHot; outSite++) {
     T *i = input_features + rules[2 * outSite] * nPlanes;
     T *o = output_features + rules[2 * outSite + 1];
-    for (uInt plane = 0; plane < nPlanes; plane++)
+    for (Int plane = 0; plane < nPlanes; plane++)
       o[plane * spatialVolume] = i[plane];
   }
 }
 template <typename T>
 void SparseToDense_BackwardPass(T *d_input_features, T *d_output_features,
-                                uInt nPlanes, uInt spatialVolume, uInt *rules,
+                                Int nPlanes, Int spatialVolume, Int *rules,
                                 int nHot) {
-  for (uInt outSite = 0; outSite < nHot; outSite++) {
+  for (Int outSite = 0; outSite < nHot; outSite++) {
     T *d_i = d_input_features + rules[2 * outSite] * nPlanes;
     T *d_o = d_output_features + rules[2 * outSite + 1];
-    for (uInt plane = 0; plane < nPlanes; plane++)
+    for (Int plane = 0; plane < nPlanes; plane++)
       d_i[plane] = d_o[plane * spatialVolume];
   }
 }
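The index arithmetic above assumes each batch item's dense block is laid out plane-major, i.e. shape (nPlanes, spatialVolume) with the plane as the slower axis, so a rule's odd slot is a flat spatial offset and o[plane * spatialVolume] visits that site in every plane. A standalone toy illustration (not commit code):

#include <cstdio>

int main() {
  // Dense layout (nPlanes, spatialVolume), planes outermost; the rule gives
  // the flat spatial offset of one active site.
  const int nPlanes = 3, spatialVolume = 4, site = 2; // site = rules[2*outSite+1]
  float dense[nPlanes * spatialVolume] = {0};
  float feature[nPlanes] = {10, 20, 30};
  float *o = dense + site;
  for (int plane = 0; plane < nPlanes; plane++)
    o[plane * spatialVolume] = feature[plane];
  // The site appears once per plane, spatialVolume entries apart.
  printf("%g %g %g\n", dense[2], dense[6], dense[10]); // prints: 10 20 30
}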
sparseconvnet/SCN/CPU/UnPooling.cpp (new file, mode 100644)

// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#include "UnPooling.h"

template <typename T, Int Dimension>
void cpu_UnPooling_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor poolSize, /*long*/ at::Tensor poolStride,
    Metadata<Dimension> &m, /*float*/ at::Tensor input_features,
    /*float*/ at::Tensor output_features, long nFeaturesToDrop) {
  Int nPlanes = input_features.size(1) - nFeaturesToDrop;
  auto _rules =
      m.getRuleBook(outputSize, inputSize, poolSize, poolStride, true);
  Int nActive = m.getNActive(outputSize);
  output_features.resize_({nActive, input_features.size(1) - nFeaturesToDrop});
  output_features.zero_();
  auto iF = input_features.data<T>() + nFeaturesToDrop;
  auto oF = output_features.data<T>();
  for (auto &r : _rules) {
    Int nHot = r.size() / 2;
    UnPooling_ForwardPass<T>(iF, oF, nPlanes, input_features.size(1),
                             output_features.size(1), &r[0], nHot);
  }
}

template <typename T, Int Dimension>
void cpu_UnPooling_updateGradInput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor poolSize, /*long*/ at::Tensor poolStride,
    Metadata<Dimension> &m, /*float*/ at::Tensor input_features,
    /*float*/ at::Tensor d_input_features,
    /*float*/ at::Tensor d_output_features, long nFeaturesToDrop) {
  Int nPlanes = input_features.size(1) - nFeaturesToDrop;
  auto _rules =
      m.getRuleBook(outputSize, inputSize, poolSize, poolStride, true);
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();
  auto diF = d_input_features.data<T>() + nFeaturesToDrop;
  auto doF = d_output_features.data<T>();
  for (auto &r : _rules) {
    Int nHot = r.size() / 2;
    UnPooling_BackwardPass<T>(diF, doF, nPlanes, input_features.size(1),
                              d_output_features.size(1), &r[0], nHot);
  }
}
sparseconvnet/SCN/generic/CPU/UnPooling.h → sparseconvnet/SCN/CPU/UnPooling.h

@@ -6,27 +6,27 @@
 #ifndef CPU_UNPOOLING_H
 #define CPU_UNPOOLING_H
 #include "../SparseConvNet.h"
 template <typename T>
-void UnPooling_ForwardPass(T *input_features, T *output_features,
-                           uInt nPlanes, uInt input_stride,
-                           uInt output_stride, uInt *rules, uInt nHot,
-                           uInt filterVolume) {
-  for (uInt outSite = 0; outSite < nHot; outSite++) {
-    uInt i = rules[2 * outSite + 1] * input_stride;
-    uInt o = rules[2 * outSite] * output_stride;
-    for (uInt plane = 0; plane < nPlanes; plane++)
+void UnPooling_ForwardPass(T *input_features, T *output_features, Int nPlanes,
+                           Int input_stride, Int output_stride, Int *rules,
+                           Int nHot) {
+  for (Int outSite = 0; outSite < nHot; outSite++) {
+    Int i = rules[2 * outSite + 1] * input_stride;
+    Int o = rules[2 * outSite] * output_stride;
+    for (Int plane = 0; plane < nPlanes; plane++)
       output_features[o + plane] += input_features[i + plane];
   }
 }
 template <typename T>
 void UnPooling_BackwardPass(T *d_input_features, T *d_output_features,
-                            uInt nPlanes, uInt input_stride,
-                            uInt output_stride, uInt *rules, uInt nHot,
-                            uInt filterVolume) {
-  for (uInt outSite = 0; outSite < nHot; outSite++) {
-    uInt i = rules[2 * outSite + 1] * input_stride;
-    uInt o = rules[2 * outSite] * output_stride;
-    for (uInt plane = 0; plane < nPlanes; plane++)
+                            Int nPlanes, Int input_stride, Int output_stride,
+                            Int *rules, Int nHot) {
+  for (Int outSite = 0; outSite < nHot; outSite++) {
+    Int i = rules[2 * outSite + 1] * input_stride;
+    Int o = rules[2 * outSite] * output_stride;
+    for (Int plane = 0; plane < nPlanes; plane++)
       d_input_features[i + plane] += d_output_features[o + plane];
   }
 }
sparseconvnet/SCN/CUDA/ActivePooling.cu (new file, mode 100644)

// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#include "ActivePooling.h"

template <typename T, Int Dimension>
void cuda_ActivePooling_updateOutput(/*long*/ at::Tensor inputSize,
                                     Metadata<Dimension> &m,
                                     /*cuda float*/ at::Tensor input_features,
                                     /*cuda float*/ at::Tensor output_features,
                                     bool average) {
  Int nPlanes = input_features.size(1);
  auto _rules = m.getActivePoolingRuleBook(inputSize);
  Int batchSize = _rules[1][0];
  Int maxActive = _rules[1][1];
  output_features.resize_({batchSize, nPlanes});
  output_features.zero_();
  auto rulesBuffer = at::CUDA(at_kINT).tensor({1 << 22});
  Int *rb = rulesBuffer.data<Int>();
  Int rowBatchSize = std::min((Int)32768, (1 << 22) / (maxActive + 1));
  assert(rowBatchSize > 0);
  auto iF = input_features.data<T>();
  auto oF = output_features.data<T>();
  for (Int o = 0; o < batchSize; o += rowBatchSize) {
    Int batchSize_ = std::min(rowBatchSize, (Int(batchSize - o)));
    cudaMemcpy(rb, &_rules[0][o * (maxActive + 1)],
               sizeof(Int) * (maxActive + 1) * batchSize_,
               cudaMemcpyHostToDevice);
    ActivePooling_ForwardPass<T>(iF, oF + o * nPlanes, batchSize_, maxActive,
                                 nPlanes, rb, average);
  }
}

template <typename T, Int Dimension>
void cuda_ActivePooling_updateGradInput(
    /*long*/ at::Tensor inputSize, Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features, bool average) {
  Int nPlanes = input_features.size(1);
  auto _rules = m.getActivePoolingRuleBook(inputSize);
  Int batchSize = _rules[1][0];
  Int maxActive = _rules[1][1];
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();
  auto rulesBuffer = at::CUDA(at_kINT).tensor({1 << 22});
  Int *rb = rulesBuffer.data<Int>();
  Int rowBatchSize = std::min((Int)32768, (1 << 22) / (maxActive + 1));
  assert(rowBatchSize > 0);
  auto diF = d_input_features.data<T>();
  auto doF = d_output_features.data<T>();
  for (Int o = 0; o < batchSize; o += rowBatchSize) {
    Int batchSize_ = std::min(rowBatchSize, (Int(batchSize - o)));
    cudaMemcpy(rb, &_rules[0][o * (maxActive + 1)],
               sizeof(Int) * (maxActive + 1) * batchSize_,
               cudaMemcpyHostToDevice);
    ActivePooling_BackwardPass<T>(diF, doF + o * nPlanes, batchSize_,
                                  maxActive, nPlanes, rb, average);
  }
}
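The rulebook here lives in host memory inside Metadata, so both functions stage it through a fixed 4 Mi-int (1 << 22) device buffer and upload at most rowBatchSize rows of maxActive + 1 ints per cudaMemcpy. A quick standalone check of that chunk arithmetic (not commit code):

#include <algorithm>
#include <cstdio>

int main() {
  // Chunk sizing as used above: at most 32768 rows per copy, and never more
  // than fits in the (1 << 22)-int staging buffer.
  int maxActive = 1000; // each row holds maxActive + 1 ints
  int rowBatchSize = std::min(32768, (1 << 22) / (maxActive + 1));
  printf("rows per chunk: %d (%d ints each)\n", rowBatchSize, maxActive + 1);
  // prints: rows per chunk: 4190 (1001 ints each)
}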
sparseconvnet/SCN/
generic/GPU
/ActivePooling.h
→
sparseconvnet/SCN/
CUDA
/ActivePooling.h
View file @
2c4ed608
@@ -4,54 +4,52 @@
 // This source code is licensed under the license found in the
 // LICENSE file in the root directory of this source tree.
-#ifndef GPU_ACTIVEPOOLING_H
-#define GPU_ACTIVEPOOLING_H
+#ifndef CUDA_ACTIVEPOOLING_H
+#define CUDA_ACTIVEPOOLING_H
 
 template <typename T>
 __global__ void ActivePooling_fp(T *input_features, T *output_features,
-                                 uInt maxActive, uInt nPlanes, uInt *rules,
+                                 Int maxActive, Int nPlanes, Int *rules,
                                  bool average) {
   T *out = &output_features[blockIdx.x * nPlanes];
-  uInt *r = &rules[blockIdx.x * (maxActive + 1)];
-  uInt nActive = *r++;
+  Int *r = &rules[blockIdx.x * (maxActive + 1)];
+  Int nActive = *r++;
   T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
   while (nActive-- > 0) {
     T *inp = &input_features[(*r++) * nPlanes];
-    for (uInt plane = threadIdx.x; plane < nPlanes; plane += 32)
+    for (Int plane = threadIdx.x; plane < nPlanes; plane += 32)
       out[plane] += inp[plane] * multiplier;
   }
 }
 
 template <typename T>
 void ActivePooling_ForwardPass(T *input_features, T *output_features,
-                               uInt batchSize, uInt maxActive, uInt nPlanes,
-                               uInt *rules, bool average) {
-  uInt kernelBlockDim = std::min(nPlanes, (uInt)32);
-  ActivePooling_fp<T><<<batchSize, kernelBlockDim, 0,
-                        THCState_getCurrentStream(state)>>>(
-      input_features, output_features, maxActive, nPlanes, rules, average);
+                               Int batchSize, Int maxActive, Int nPlanes,
+                               Int *rules, bool average) {
+  Int kernelBlockDim = std::min(nPlanes, (Int)32);
+  ActivePooling_fp<T><<<batchSize, kernelBlockDim>>>(
+      input_features, output_features, maxActive, nPlanes, rules, average);
 }
 
 template <typename T>
 __global__ void ActivePooling_bp(T *d_input_features, T *d_output_features,
-                                 uInt maxActive, uInt nPlanes, uInt *rules,
+                                 Int maxActive, Int nPlanes, Int *rules,
                                  bool average) {
   T *out = &d_output_features[blockIdx.x * nPlanes];
-  uInt *r = &rules[blockIdx.x * (maxActive + 1)];
-  uInt nActive = *r++;
+  Int *r = &rules[blockIdx.x * (maxActive + 1)];
+  Int nActive = *r++;
   T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
   while (nActive-- > 0) {
     T *inp = &d_input_features[(*r++) * nPlanes];
-    for (uInt plane = threadIdx.x; plane < nPlanes; plane += 32)
+    for (Int plane = threadIdx.x; plane < nPlanes; plane += 32)
       inp[plane] = out[plane] * multiplier;
   }
 }
 
 template <typename T>
 void ActivePooling_BackwardPass(T *d_input_features, T *d_output_features,
-                                uInt batchSize, uInt maxActive, uInt nPlanes,
-                                uInt *rules, bool average) {
-  uInt kernelBlockDim = std::min(nPlanes, (uInt)32);
-  ActivePooling_bp<T><<<batchSize, kernelBlockDim, 0,
-                        THCState_getCurrentStream(state)>>>(
-      d_input_features, d_output_features, maxActive, nPlanes, rules, average);
+                                Int batchSize, Int maxActive, Int nPlanes,
+                                Int *rules, bool average) {
+  Int kernelBlockDim = std::min(nPlanes, (Int)32);
+  ActivePooling_bp<T><<<batchSize, kernelBlockDim>>>(
+      d_input_features, d_output_features, maxActive, nPlanes, rules, average);
 }
 
-#endif /* GPU_ActivePOOLING_H */
+#endif /* CUDA_ActivePOOLING_H */
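For reference, the semantics of the two kernels above, written out serially: block b pools all of its nActive input rows into output row b, scaling by 1 / nActive when average is set, and the backward kernel broadcasts the (scaled) output row back to every contributing input row. A minimal CPU sketch of the forward pass (illustrative only, not part of the commit):

// Each rulebook row: { nActive, idx_0, ..., idx_{nActive-1} }, padded to
// maxActive + 1 entries. Int is the library's index typedef.
template <typename T>
void activePoolingForwardCPU(const T *in, T *out, Int batchSize, Int maxActive,
                             Int nPlanes, const Int *rules, bool average) {
  for (Int b = 0; b < batchSize; b++) {
    const Int *r = rules + b * (maxActive + 1);
    Int nActive = *r++;
    T multiplier = (average && nActive > 0) ? (T)1 / nActive : (T)1;
    for (Int a = 0; a < nActive; a++) {
      const T *inp = in + r[a] * nPlanes;
      for (Int p = 0; p < nPlanes; p++)
        out[b * nPlanes + p] += inp[p] * multiplier;
    }
  }
}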
sparseconvnet/SCN/CUDA/AffineReluTrivialConvolution.cu 0 → 100644
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#include "AffineReluTrivialConvolution.h"

template <typename T>
double cuda_AffineReluTrivialConvolution_updateOutput(
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features,
    /*cuda float*/ at::Tensor affineWeight,
    /*cuda float*/ at::Tensor affineBias,
    /*cuda float*/ at::Tensor convWeight) {
  output_features.resize_({input_features.size(0), convWeight.size(1)});
  dAffineReluTrivialConvolution_forward<T>(
      input_features.data<T>(), output_features.data<T>(),
      affineWeight.data<T>(), affineBias.data<T>(), convWeight.data<T>(),
      convWeight.size(0), input_features.stride(0), convWeight.size(1),
      output_features.size(1), input_features.size(0));
  return input_features.size(0) * input_features.size(1) *
         output_features.size(1);
}

template <typename T>
void cuda_AffineReluTrivialConvolution_backward(
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features,
    /*cuda float*/ at::Tensor affineWeight,
    /*cuda float*/ at::Tensor d_affineWeight,
    /*cuda float*/ at::Tensor affineBias,
    /*cuda float*/ at::Tensor d_affineBias,
    /*cuda float*/ at::Tensor convWeight,
    /*cuda float*/ at::Tensor d_convWeight, bool additiveGrad) {
  d_input_features.resize_as_(input_features);
  dAffineReluTrivialConvolution_backward_dW<T>(
      input_features.data<T>(), d_input_features.data<T>(),
      d_output_features.data<T>(), affineWeight.data<T>(),
      d_affineWeight.data<T>(), affineBias.data<T>(), d_affineBias.data<T>(),
      convWeight.data<T>(), d_convWeight.data<T>(), convWeight.size(0),
      input_features.stride(0), convWeight.size(1),
      d_output_features.stride(0), input_features.size(0), additiveGrad);
}
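These wrappers drive a kernel that fuses three steps: a per-plane affine transform, a ReLU, and a "trivial" (1x1) convolution, i.e. a dense product against convWeight; the double returned by updateOutput, rows x input planes x output planes, is the multiply-add count, presumably for operation accounting. Row by row the forward pass computes the following (serial sketch, illustrative only; convWeight is taken as row-major [inPlane][outPlane], matching the CW indexing in the header below):

template <typename T>
void affineReluTrivialConvRowCPU(const T *in, T *out, const T *affineWeight,
                                 const T *affineBias, const T *convWeight,
                                 Int inPlanes, Int outPlanes) {
  for (Int n = 0; n < outPlanes; n++)
    out[n] = 0;
  for (Int m = 0; m < inPlanes; m++) {
    T a = affineWeight[m] * in[m] + affineBias[m];
    if (a > 0) // ReLU: only positive activations contribute
      for (Int n = 0; n < outPlanes; n++)
        out[n] += a * convWeight[m * outPlanes + n];
  }
}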
sparseconvnet/SCN/generic/GPU/AffineReluTrivialConvolution.h → sparseconvnet/SCN/CUDA/AffineReluTrivialConvolution.h
@@ -4,18 +4,18 @@
 // This source code is licensed under the license found in the
 // LICENSE file in the root directory of this source tree.
-#ifndef GPU_AFFINERELUTRIVIALCONVOLUTION_H
-#define GPU_AFFINERELUTRIVIALCONVOLUTION_H
+#ifndef CUDA_AFFINERELUTRIVIALCONVOLUTION_H
+#define CUDA_AFFINERELUTRIVIALCONVOLUTION_H
 
 // check if A+B is faster than just B
 // check if loading affineBias into shared memory is faster than loading
 // multiple times (if not try 64,16 backwards case)
-template <typename T, uInt K, uInt V>
+template <typename T, Int K, Int V>
 __global__ void dAffineReluTrivialConvolution_forwardA(
     T *inFeatures, T *outFeatures, T *affineWeight, T *affineBias,
-    T *convWeight, uInt input_nPlanes, uInt input_stride,
-    uInt output_nPlanes, uInt output_stride, uInt nActive) {
+    T *convWeight, Int input_nPlanes, Int input_stride, Int output_nPlanes,
+    Int output_stride, Int nActive) {
   // nActive must be a multiple of K!!
   // Input x Weight -> Output
@@ -24,9 +24,9 @@ __global__ void dAffineReluTrivialConvolution_forwardA(
   // nActive x KM -> nActive x KN - parallel over N,nActive - loop over M
-  uInt M = input_nPlanes / K;
+  Int M = input_nPlanes / K;
   // N = gridDim.y == output_nPlanes/K
-  uInt n = blockIdx.y;
+  Int n = blockIdx.y;
   outFeatures += n * K;
   convWeight += n * K;
@@ -35,7 +35,7 @@ __global__ void dAffineReluTrivialConvolution_forwardA(
   __shared__ T AW[K];
   __shared__ T AB[K];
   __shared__ T CW[K][K];
-  const uInt tx = threadIdx.x;
+  const Int tx = threadIdx.x;
   int ty[V];
 #pragma unroll
   for (int v = 0; v < V; v++)
@@ -52,7 +52,7 @@ __global__ void dAffineReluTrivialConvolution_forwardA(
       CW[ty[v]][tx] = convWeight[ty[v] * output_nPlanes + tx];
   __syncthreads();
-  for (uInt s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
+  for (Int s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
     // Read input, do affine + relu, set O[]
 #pragma unroll
     for (int v = 0; v < V; v++) {
@@ -82,20 +82,20 @@ __global__ void dAffineReluTrivialConvolution_forwardA(
     inFeatures += K;
   }
 }
 
-template <typename T, uInt K, uInt V>
+template <typename T, Int K, Int V>
 __global__ void dAffineReluTrivialConvolution_forwardB(
     T *inFeatures, T *outFeatures, T *affineWeight, T *affineBias,
-    T *convWeight, uInt input_nPlanes, uInt input_stride,
-    uInt output_nPlanes, uInt output_stride, uInt nActive) {
+    T *convWeight, Int input_nPlanes, Int input_stride, Int output_nPlanes,
+    Int output_stride, Int nActive) {
   // Input x Weight -> Output
   // blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
   // K is a multiple of V,
   // nActive x KM -> nActive x KN - parallel over N,nActive - loop over M
-  uInt M = input_nPlanes / K;
+  Int M = input_nPlanes / K;
   // N = gridDim.y == output_nPlanes/K
-  uInt n = blockIdx.y;
+  Int n = blockIdx.y;
   outFeatures += n * K;
   convWeight += n * K;
@@ -104,7 +104,7 @@ __global__ void dAffineReluTrivialConvolution_forwardB(
   __shared__ T AW[K];
   __shared__ T AB[K];
   __shared__ T CW[K][K];
-  const uInt tx = threadIdx.x;
+  const Int tx = threadIdx.x;
   int ty[V];
 #pragma unroll
   for (int v = 0; v < V; v++)
@@ -121,7 +121,7 @@ __global__ void dAffineReluTrivialConvolution_forwardB(
       CW[ty[v]][tx] = convWeight[ty[v] * output_nPlanes + tx];
   __syncthreads();
-  for (uInt s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
+  for (Int s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
     // Read input, do affine + relu, set O[]
 #pragma unroll
     for (int v = 0; v < V; v++) {
@@ -158,20 +158,19 @@ __global__ void dAffineReluTrivialConvolution_forwardB(
 #define FOO(T, K, V)                                                         \
   {                                                                          \
     if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {                \
-      uInt o = (nActive / K) * K;                                            \
+      Int o = (nActive / K) * K;                                             \
       if (o > 0)                                                             \
-        dAffineReluTrivialConvolution_forwardA<T, K, V> << <                 \
-            dim3(std::min(o / K, (uInt)512), output_nPlanes / K),            \
-            dim3(K, K / V), 0, THCState_getCurrentStream(state)>>>           \
-            (inFeatures, outFeatures, affineWeight, affineBias, convWeight,  \
-             input_nPlanes, input_stride, output_nPlanes, output_stride, o); \
+        dAffineReluTrivialConvolution_forwardA<                              \
+            T, K, V><<<dim3(std::min(o / K, (Int)512), output_nPlanes / K),  \
+                       dim3(K, K / V)>>>(                                    \
+            inFeatures, outFeatures, affineWeight, affineBias, convWeight,   \
+            input_nPlanes, input_stride, output_nPlanes, output_stride, o);  \
       if (nActive > o)                                                       \
-        dAffineReluTrivialConvolution_forwardB<T, K, V> << <                 \
-            dim3(1, output_nPlanes / K), dim3(K, K / V), 0,                  \
-            THCState_getCurrentStream(state)>>>                              \
-            (inFeatures + o * input_stride, outFeatures + o * output_stride, \
-             affineWeight, affineBias, convWeight, input_nPlanes,            \
-             input_stride, output_nPlanes, output_stride, nActive - o);      \
+        dAffineReluTrivialConvolution_forwardB<                              \
+            T, K, V><<<dim3(1, output_nPlanes / K), dim3(K, K / V)>>>(       \
+            inFeatures + o * input_stride, outFeatures + o * output_stride,  \
+            affineWeight, affineBias, convWeight, input_nPlanes,             \
+            input_stride, output_nPlanes, output_stride, nActive - o);       \
       return;                                                                \
     }                                                                        \
   }
@@ -179,10 +178,10 @@ __global__ void dAffineReluTrivialConvolution_forwardB(
 template <typename T>
 void dAffineReluTrivialConvolution_forward(T *inFeatures, T *outFeatures,
                                            T *affineWeight, T *affineBias,
-                                           T *convWeight, uInt input_nPlanes,
-                                           uInt input_stride,
-                                           uInt output_nPlanes,
-                                           uInt output_stride, uInt nActive) {
+                                           T *convWeight, Int input_nPlanes,
+                                           Int input_stride,
+                                           Int output_nPlanes,
+                                           Int output_stride, Int nActive) {
   FOO(T, 64, 16)
   FOO(T, 32, 8)
@@ -193,8 +192,8 @@ void dAffineReluTrivialConvolution_forward(T *inFeatures, T *outFeatures,
 template <>
 void dAffineReluTrivialConvolution_forward<double>(
     double *inFeatures, double *outFeatures, double *affineWeight,
-    double *affineBias, double *convWeight, uInt input_nPlanes,
-    uInt input_stride, uInt output_nPlanes, uInt output_stride,
-    uInt nActive) {
+    double *affineBias, double *convWeight, Int input_nPlanes,
+    Int input_stride, Int output_nPlanes, Int output_stride, Int nActive) {
   FOO(double, 32, 8)
   FOO(double, 16, 4)
@@ -206,15 +205,15 @@ void dAffineReluTrivialConvolution_forward<double>(
 // dOutput x W^T -> dInput and
 // Input^T x dOutput -> dW
 // blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
-template <typename T, uInt K, uInt V>
+template <typename T, Int K, Int V>
 __global__ void dAffineReluTrivialConvolution_backward_dW_A(
     T *inFeatures, T *dInFeatures, T *dOutFeatures, T *affineWeight,
     T *dAffineWeight, T *affineBias, T *dAffineBias, T *convWeight,
-    T *dConvWeight, uInt input_nPlanes, uInt input_stride,
-    uInt output_nPlanes, uInt output_stride, uInt nActive,
-    bool additiveGrad) {
+    T *dConvWeight, Int input_nPlanes, Int input_stride, Int output_nPlanes,
+    Int output_stride, Int nActive, bool additiveGrad) {
   // M = gridDim.y == input_nPlanes / K
-  uInt N = output_nPlanes / K;
-  uInt m = blockIdx.y;
+  Int N = output_nPlanes / K;
+  Int m = blockIdx.y;
   inFeatures += m * K;
   dInFeatures += m * K;
   convWeight += m * K * output_nPlanes;
@@ -234,7 +233,7 @@ __global__ void dAffineReluTrivialConvolution_backward_dW_A(
   __shared__ T AW[K];
   __shared__ T AB[K];
   __shared__ T CW[K][K];
-  const uInt tx = threadIdx.x;
+  const Int tx = threadIdx.x;
   int ty[V];
 #pragma unroll
   for (int v = 0; v < V; v++)
@@ -253,7 +252,7 @@ __global__ void dAffineReluTrivialConvolution_backward_dW_A(
   }
   __syncthreads();
-  for (uInt s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
+  for (Int s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
 #pragma unroll
     for (int v = 0; v < V; v++)
       dI[v] = 0;
@@ -303,15 +302,15 @@ __global__ void dAffineReluTrivialConvolution_backward_dW_A(
 // dOutput x W^T -> dInput and
 // Input^T x dOutput -> dW
 // blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
-template <typename T, uInt K, uInt V>
+template <typename T, Int K, Int V>
 __global__ void dAffineReluTrivialConvolution_backward_dW_B(
     T *inFeatures, T *dInFeatures, T *dOutFeatures, T *affineWeight,
     T *dAffineWeight, T *affineBias, T *dAffineBias, T *convWeight,
-    T *dConvWeight, uInt input_nPlanes, uInt input_stride,
-    uInt output_nPlanes, uInt output_stride, uInt nActive,
-    bool additiveGrad) {
+    T *dConvWeight, Int input_nPlanes, Int input_stride, Int output_nPlanes,
+    Int output_stride, Int nActive, bool additiveGrad) {
   // M = gridDim.y == input_nPlanes / K
-  uInt N = output_nPlanes / K;
-  uInt m = blockIdx.y;
+  Int N = output_nPlanes / K;
+  Int m = blockIdx.y;
   inFeatures += m * K;
   dInFeatures += m * K;
   convWeight += m * K * output_nPlanes;
@@ -331,7 +330,7 @@ __global__ void dAffineReluTrivialConvolution_backward_dW_B(
   __shared__ T AW[K];
   __shared__ T AB[K];
   __shared__ T CW[K][K];
-  const uInt tx = threadIdx.x;
+  const Int tx = threadIdx.x;
   int ty[V];
 #pragma unroll
   for (int v = 0; v < V; v++)
@@ -350,7 +349,7 @@ __global__ void dAffineReluTrivialConvolution_backward_dW_B(
   }
   __syncthreads();
-  for (uInt s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
+  for (Int s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
 #pragma unroll
     for (int v = 0; v < V; v++)
       dI[v] = 0;
@@ -406,24 +405,23 @@ __global__ void dAffineReluTrivialConvolution_backward_dW_B(
 #define FOO(T, K, V)                                                          \
   {                                                                           \
     if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {                 \
-      uInt o = (nActive / K) * K;                                             \
+      Int o = (nActive / K) * K;                                              \
       if (o > 0)                                                              \
-        dAffineReluTrivialConvolution_backward_dW_A<T, K, V> << <             \
-            dim3(std::min(o / K, (uInt)512), input_nPlanes / K),              \
-            dim3(K, K / V), 0, THCState_getCurrentStream(state)>>>            \
-            (inFeatures, dInFeatures, dOutFeatures, affineWeight,             \
-             dAffineWeight, affineBias, dAffineBias, convWeight, dConvWeight, \
-             input_nPlanes, input_stride, output_nPlanes, output_stride, o,   \
-             additiveGrad);                                                   \
+        dAffineReluTrivialConvolution_backward_dW_A<                          \
+            T, K, V><<<dim3(std::min(o / K, (Int)512), input_nPlanes / K),    \
+                       dim3(K, K / V)>>>(                                     \
+            inFeatures, dInFeatures, dOutFeatures, affineWeight,              \
+            dAffineWeight, affineBias, dAffineBias, convWeight, dConvWeight,  \
+            input_nPlanes, input_stride, output_nPlanes, output_stride, o,    \
+            additiveGrad);                                                    \
       if (nActive > o)                                                        \
-        dAffineReluTrivialConvolution_backward_dW_B<T, K, V> << <             \
-            dim3(1, input_nPlanes / K), dim3(K, K / V), 0,                    \
-            THCState_getCurrentStream(state)>>>                               \
-            (inFeatures + o * input_stride, dInFeatures + o * input_stride,   \
-             dOutFeatures + o * output_stride, affineWeight, dAffineWeight,   \
-             affineBias, dAffineBias, convWeight, dConvWeight, input_nPlanes, \
-             input_stride, output_nPlanes, output_stride, nActive - o,        \
-             additiveGrad);                                                   \
+        dAffineReluTrivialConvolution_backward_dW_B<                          \
+            T, K, V><<<dim3(1, input_nPlanes / K), dim3(K, K / V)>>>(         \
+            inFeatures + o * input_stride, dInFeatures + o * input_stride,    \
+            dOutFeatures + o * output_stride, affineWeight, dAffineWeight,    \
+            affineBias, dAffineBias, convWeight, dConvWeight, input_nPlanes,  \
+            input_stride, output_nPlanes, output_stride, nActive - o,         \
+            additiveGrad);                                                    \
       return;                                                                 \
     }                                                                         \
   }
@@ -432,8 +430,8 @@ template <typename T>
 void dAffineReluTrivialConvolution_backward_dW(
     T *inFeatures, T *dInFeatures, T *dOutFeatures, T *affineWeight,
     T *dAffineWeight, T *affineBias, T *dAffineBias, T *convWeight,
-    T *dConvWeight, uInt input_nPlanes, uInt input_stride,
-    uInt output_nPlanes, uInt output_stride, uInt nActive,
-    bool additiveGrad) {
+    T *dConvWeight, Int input_nPlanes, Int input_stride, Int output_nPlanes,
+    Int output_stride, Int nActive, bool additiveGrad) {
   FOO(T, 32, 8)
   FOO(T, 16, 4)
   FOO(T, 8, 2)
...
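On the FOO dispatch used by both entry points above: it tries tile shapes (K, V) from largest to smallest and takes the first K dividing both plane counts; the multiple-of-K prefix of the rows, o = (nActive / K) * K, goes to the tiled _A kernel and the _B kernel mops up the nActive - o remainder. The control flow, stripped of the launches (illustrative sketch; tryTile is a hypothetical stand-in for the macro):

bool tryTile(Int K, Int input_nPlanes, Int output_nPlanes, Int nActive) {
  if (input_nPlanes % K == 0 && output_nPlanes % K == 0) {
    Int o = (nActive / K) * K; // rows handled by the fully tiled kernel _A
    if (o > 0) { /* launch ..._A on rows [0, o) */ }
    if (nActive > o) { /* launch ..._B on rows [o, nActive) */ }
    return true; // first matching K wins, so larger tiles are preferred
  }
  return false;
}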
sparseconvnet/SCN/CUDA/AveragePooling.cu 0 → 100644
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#include "AveragePooling.h"
#include "RuleBookIterator.h"

template <typename T, Int Dimension>
void cuda_AveragePooling_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor poolSize, /*long*/ at::Tensor poolStride,
    Metadata<Dimension> &m, /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features, long nFeaturesToDrop) {
  Int nPlanes = input_features.size(1) - nFeaturesToDrop;
  auto _rules =
      m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  Int nActive = m.getNActive(outputSize);
  output_features.resize_({nActive, input_features.size(1) - nFeaturesToDrop});
  output_features.zero_();
  auto iF = input_features.data<T>() + nFeaturesToDrop;
  auto oF = output_features.data<T>();
  RULEBOOKITERATOR(cuda_AveragePooling_ForwardPass<T>(
                       iF, oF, nPlanes, input_features.size(1),
                       output_features.size(1), rbB, nHotB, _rules.size());
                   , )
}

template <typename T, Int Dimension>
void cuda_AveragePooling_updateGradInput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor poolSize, /*long*/ at::Tensor poolStride,
    Metadata<Dimension> &m, /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features, long nFeaturesToDrop) {
  Int nPlanes = input_features.size(1) - nFeaturesToDrop;
  auto _rules =
      m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();
  auto diF = d_input_features.data<T>() + nFeaturesToDrop;
  auto doF = d_output_features.data<T>();
  RULEBOOKITERATOR(cuda_AveragePooling_BackwardPass<T>(
                       diF, doF, nPlanes, input_features.size(1),
                       d_output_features.size(1), rbB, nHotB, _rules.size());
                   , )
}
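Both wrappers hand the launches to RULEBOOKITERATOR (defined in RuleBookIterator.h, not shown on this page). Judging from the call sites, it appears to loop over the rulebook's per-offset sub-lists, binding rbB to a device copy of each sub-list and nHotB to its pair count before executing the first macro argument; the second argument is an empty per-iteration epilogue here. Under that assumption, the expansion behaves roughly like the following (sketch only; see the actual macro for the real code):

// for (Int i = 0; i < _rules.size(); i++) {
//   Int nHotB = _rules[i].size() / 2;  // number of (input, output) pairs
//   if (nHotB > 0) {
//     Int *rbB = /* device copy of _rules[i] */;
//     cuda_AveragePooling_ForwardPass<T>(iF, oF, nPlanes, input_stride,
//                                        output_stride, rbB, nHotB,
//                                        _rules.size());
//   }
// }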
sparseconvnet/SCN/generic/GPU/AveragePooling.h → sparseconvnet/SCN/CUDA/AveragePooling.h
@@ -4,27 +4,27 @@
 // This source code is licensed under the license found in the
 // LICENSE file in the root directory of this source tree.
-#ifndef GPU_AVERAGEPOOLING_H
-#define GPU_AVERAGEPOOLING_H
+#ifndef CUDA_AVERAGEPOOLING_H
+#define CUDA_AVERAGEPOOLING_H
 
 // NTX must be >=2 so r is filled properly
-template <typename T, uInt NTX, uInt NTY>
+template <typename T, Int NTX, Int NTY>
 __global__ void AveragePooling_fp(T *input_features, T *output_features,
-                                  uInt nPlanes, uInt input_stride,
-                                  uInt output_stride, uInt *rules, uInt nHot,
+                                  Int nPlanes, Int input_stride,
+                                  Int output_stride, Int *rules, Int nHot,
                                   T alpha) {
-  __shared__ uInt r[NTY * 2];
-  for (uInt n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
+  __shared__ Int r[NTY * 2];
+  for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
     {
-      uInt i = threadIdx.x + NTX * threadIdx.y;
-      if (i < NTY * 2 and i < 2 * (n - nHot))
+      Int i = threadIdx.x + NTX * threadIdx.y;
+      if (i < NTY * 2 and i < 2 * (nHot - n))
         r[i] = rules[2 * n + i];
     }
     __syncthreads();
     if (n + threadIdx.y < nHot) {
-      uInt i = r[2 * threadIdx.y] * input_stride;
-      uInt o = r[2 * threadIdx.y + 1] * output_stride;
-      for (uInt plane = threadIdx.x; plane < nPlanes; plane += NTX)
+      Int i = r[2 * threadIdx.y] * input_stride;
+      Int o = r[2 * threadIdx.y + 1] * output_stride;
+      for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX)
         atomicAdd(&output_features[o + plane],
                   alpha * input_features[i + plane]);
     }
@@ -33,31 +33,31 @@ __global__ void AveragePooling_fp(T *input_features, T *output_features,
 }
 
 template <typename T>
-void AveragePooling_ForwardPass(cudaStream_t stream, T *input_features,
-                                T *output_features, uInt nPlanes,
-                                uInt input_stride, uInt output_stride,
-                                uInt *rules, uInt nHot, uInt filterVolume) {
-  AveragePooling_fp<T, 32, 32><<<32, dim3(32, 32), 0, stream>>>(
+void cuda_AveragePooling_ForwardPass(T *input_features, T *output_features,
+                                     Int nPlanes, Int input_stride,
+                                     Int output_stride, Int *rules, Int nHot,
+                                     Int filterVolume) {
+  AveragePooling_fp<T, 32, 32><<<32, dim3(32, 32)>>>(
       input_features, output_features, nPlanes, input_stride, output_stride,
       rules, nHot, 1.0 / filterVolume);
 }
 
-template <typename T, uInt NTX, uInt NTY>
+template <typename T, Int NTX, Int NTY>
 __global__ void AveragePooling_bp(T *d_input_features, T *d_output_features,
-                                  uInt nPlanes, uInt input_stride,
-                                  uInt output_stride, uInt *rules, uInt nHot,
+                                  Int nPlanes, Int input_stride,
+                                  Int output_stride, Int *rules, Int nHot,
                                   T alpha) {
-  __shared__ uInt r[NTY * 2];
-  for (uInt n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
+  __shared__ Int r[NTY * 2];
+  for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
     {
-      uInt i = threadIdx.x + NTX * threadIdx.y;
-      if (i < NTY * 2 and i < 2 * (n - nHot))
+      Int i = threadIdx.x + NTX * threadIdx.y;
+      if (i < NTY * 2 and i < 2 * (nHot - n))
         r[i] = rules[2 * n + i];
     }
     __syncthreads();
     if (n + threadIdx.y < nHot) {
-      uInt i = r[2 * threadIdx.y] * input_stride;
-      uInt o = r[2 * threadIdx.y + 1] * output_stride;
-      for (uInt plane = threadIdx.x; plane < nPlanes; plane += NTX)
+      Int i = r[2 * threadIdx.y] * input_stride;
+      Int o = r[2 * threadIdx.y + 1] * output_stride;
+      for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX)
         d_input_features[i + plane] += alpha * d_output_features[o + plane];
     }
     __syncthreads();
@@ -65,12 +65,12 @@ __global__ void AveragePooling_bp(T *d_input_features, T *d_output_features,
 }
 
 template <typename T>
-void AveragePooling_BackwardPass(cudaStream_t stream, T *d_input_features,
-                                 T *d_output_features, uInt nPlanes,
-                                 uInt input_stride, uInt output_stride,
-                                 uInt *rules, uInt nHot, uInt filterVolume) {
-  AveragePooling_bp<T, 32, 32><<<32, dim3(32, 32), 0, stream>>>(
+void cuda_AveragePooling_BackwardPass(T *d_input_features,
+                                      T *d_output_features, Int nPlanes,
+                                      Int input_stride, Int output_stride,
+                                      Int *rules, Int nHot, Int filterVolume) {
+  AveragePooling_bp<T, 32, 32><<<32, dim3(32, 32)>>>(
       d_input_features, d_output_features, nPlanes, input_stride,
       output_stride, rules, nHot, 1.0 / filterVolume);
 }
 
-#endif /* GPU_AVERAGEPOOLING_H */
+#endif /* CUDA_AVERAGEPOOLING_H */
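The alpha threaded through both kernels above is 1.0 / filterVolume, so average pooling is a scaled scatter-add over the (input, output) rule pairs; note the divisor is the full filter volume, not the number of active sites that actually contribute. Serially, the forward pass amounts to (CPU sketch, illustrative only):

template <typename T>
void averagePoolForwardCPU(const T *in, T *out, Int nPlanes, Int input_stride,
                           Int output_stride, const Int *rules, Int nHot,
                           T alpha) { // alpha = 1 / filterVolume
  for (Int k = 0; k < nHot; k++) {
    const T *i = in + rules[2 * k] * input_stride;  // input row
    T *o = out + rules[2 * k + 1] * output_stride;  // output row
    for (Int p = 0; p < nPlanes; p++)
      o[p] += alpha * i[p];
  }
}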