Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
ad3ac5cc
Commit
ad3ac5cc
authored
Nov 15, 2019
by
Chao Liu
Browse files
adding col2im
parent
895e87c1
Changes
8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
956 additions
and
73 deletions
+956
-73
driver/CMakeLists.txt
driver/CMakeLists.txt
+7
-3
driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated.hpp
...volution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated.hpp
+38
-2
driver/include/device_tensor.hpp
driver/include/device_tensor.hpp
+2
-2
driver/include/host_col2im.hpp
driver/include/host_col2im.hpp
+71
-0
driver/src/col2im_driver.cpp
driver/src/col2im_driver.cpp
+54
-52
driver/src/col2im_driver.cu
driver/src/col2im_driver.cu
+386
-1
driver/src/col2im_driver.cu
driver/src/col2im_driver.cu
+386
-1
driver/src/conv_driver.cpp
driver/src/conv_driver.cpp
+12
-12
No files found.
driver/CMakeLists.txt
View file @
ad3ac5cc
...
...
@@ -15,10 +15,14 @@ install(TARGETS host LIBRARY DESTINATION lib)
if
(
DEVICE_BACKEND STREQUAL
"AMD"
)
set
(
DRIVER_SOURCE src/conv_driver.cpp
)
set
(
CONV_SOURCE src/conv_driver.cpp
)
set
(
COL2IM_SOURCE src/col2im_driver.cpp
)
elseif
(
DEVICE_BACKEND STREQUAL
"NVIDIA"
)
set
(
DRIVER_SOURCE src/conv_driver.cu
)
set
(
CONV_SOURCE src/conv_driver.cu
)
set
(
COL2IM_SOURCE src/col2im_driver.cu
)
endif
()
add_executable
(
conv
${
DRIVER_SOURCE
}
)
add_executable
(
conv
${
CONV_SOURCE
}
)
add_executable
(
col2im
${
COL2IM_SOURCE
}
)
target_link_libraries
(
conv PRIVATE host
)
target_link_libraries
(
col2im PRIVATE host
)
driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated.hpp
View file @
ad3ac5cc
...
...
@@ -46,7 +46,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated(InDesc,
wei_kcyx_device_buf
.
ToDevice
(
wei_kcyx
.
mData
.
data
());
out_nkhw_device_buf
.
ToDevice
(
out_nkhw
.
mData
.
data
());
#if
1
#if
0
// BlockSize = 256, blockwise-GEMM 128x128, each thread hold 64 data
constexpr index_t BlockSize = 256;
...
...
@@ -120,7 +120,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated(InDesc,
constexpr
index_t
WeiBlockCopySrcDataPerRead_E
=
4
;
constexpr
index_t
WeiBlockCopyDstDataPerWrite_K
=
1
;
#elif
1
#elif
0
// BlockSize = 256, blockwise-GEMM 64x128, each thread hold 32 data
constexpr
index_t
BlockSize
=
256
;
...
...
@@ -157,6 +157,42 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated(InDesc,
constexpr
index_t
WeiBlockCopySrcDataPerRead_E
=
2
;
constexpr
index_t
WeiBlockCopyDstDataPerWrite_K
=
1
;
#elif 1
constexpr
index_t
BlockSize
=
64
;
constexpr
index_t
BPerBlock
=
16
;
constexpr
index_t
KPerBlock
=
32
;
constexpr
index_t
EPerBlock
=
4
;
constexpr
index_t
GemmNRepeat
=
2
;
constexpr
index_t
GemmMPerThreadSubC
=
4
;
constexpr
index_t
GemmNPerThreadSubC
=
4
;
constexpr
index_t
GemmMLevel0Cluster
=
1
;
constexpr
index_t
GemmNLevel0Cluster
=
4
;
constexpr
index_t
GemmMLevel1Cluster
=
4
;
constexpr
index_t
GemmNLevel1Cluster
=
4
;
constexpr
index_t
GemmKPerThreadLoop
=
1
;
constexpr
index_t
GemmDataPerReadA
=
4
;
constexpr
index_t
GemmDataPerReadB
=
4
;
using
InBlockCopySubLengths_E_N1_B_N2
=
Sequence
<
1
,
2
,
1
,
4
>
;
using
InBlockCopyClusterLengths_E_N1_B_N2
=
Sequence
<
4
,
1
,
16
,
1
>
;
using
InBlockCopyThreadClusterArrangeOrder
=
Sequence
<
0
,
1
,
3
,
2
>
;
// [E, N1, N2, B]
using
InBlockCopySrcAccessOrder
=
Sequence
<
0
,
2
,
1
,
3
>
;
// [E, B, N1, N2]
using
InBlockCopyDstAccessOrder
=
Sequence
<
0
,
1
,
2
,
3
>
;
// [E, N1, B, N2]
constexpr
index_t
InBlockCopySrcDataPerRead_B
=
1
;
constexpr
index_t
InBlockCopyDstDataPerWrite_N2
=
4
;
using
WeiBlockCopySubLengths_E_K
=
Sequence
<
1
,
2
>
;
using
WeiBlockCopyClusterLengths_E_K
=
Sequence
<
4
,
16
>
;
using
WeiBlockCopyThreadClusterArrangeOrder
=
Sequence
<
1
,
0
>
;
// [K, E]
using
WeiBlockCopySrcAccessOrder
=
Sequence
<
1
,
0
>
;
// [K, E]
using
WeiBlockCopyDstAccessOrder
=
Sequence
<
0
,
1
>
;
// [E, K]
constexpr
index_t
WeiBlockCopySrcDataPerRead_E
=
1
;
constexpr
index_t
WeiBlockCopyDstDataPerWrite_K
=
2
;
#endif
constexpr
index_t
N1
=
GemmNRepeat
;
...
...
driver/include/device_tensor.hpp
View file @
ad3ac5cc
...
...
@@ -7,8 +7,8 @@
template
<
typename
ConstTensorDesc
,
std
::
size_t
...
Is
>
auto
make_TensorDescriptor_impl
(
ConstTensorDesc
,
std
::
integer_sequence
<
std
::
size_t
,
Is
...
>
)
{
std
::
initializer_list
<
std
::
size_t
>
lengths
=
{
ConstTensorDesc
::
GetLength
(
Is
)
...};
std
::
initializer_list
<
std
::
size_t
>
strides
=
{
ConstTensorDesc
::
GetStride
(
Is
)
...};
std
::
initializer_list
<
std
::
size_t
>
lengths
=
{
ConstTensorDesc
::
GetLength
s
()[
Is
]
...};
std
::
initializer_list
<
std
::
size_t
>
strides
=
{
ConstTensorDesc
::
GetStride
s
()[
Is
]
...};
return
TensorDescriptor
(
lengths
,
strides
);
}
...
...
driver/include/host_col2im.hpp
0 → 100644
View file @
ad3ac5cc
#pragma once
#include "tensor.hpp"
template
<
typename
T
,
typename
FilterSizes
,
typename
OutputSizes
,
typename
ConvStrides
,
typename
ConvDilations
,
typename
LeftPads
,
typename
RightPads
>
void
host_col2im
(
const
Tensor
<
T
>&
in_eb
,
Tensor
<
T
>&
in_nchw
,
FilterSizes
,
OutputSizes
,
ConvStrides
,
ConvDilations
,
LeftPads
,
RightPads
)
{
using
namespace
ck
;
int
N
=
in_nchw
.
mDesc
.
GetLengths
()[
0
];
int
C
=
in_nchw
.
mDesc
.
GetLengths
()[
1
];
int
HI
=
in_nchw
.
mDesc
.
GetLengths
()[
2
];
int
WI
=
in_nchw
.
mDesc
.
GetLengths
()[
3
];
int
Y
=
FilterSizes
{}[
0
];
int
X
=
FilterSizes
{}[
1
];
int
HO
=
OutputSizes
{}[
0
];
int
WO
=
OutputSizes
{}[
1
];
auto
f
=
[
&
](
auto
n
,
auto
c
,
auto
hi
,
auto
wi
)
{
double
v
=
0
;
for
(
int
y
=
0
;
y
<
Y
;
++
y
)
{
int
h_tmp
=
hi
+
LeftPads
{}[
0
]
-
y
*
ConvDilations
{}[
0
];
if
(
h_tmp
%
ConvStrides
{}[
0
]
==
0
)
{
int
ho
=
h_tmp
/
ConvStrides
{}[
0
];
for
(
int
x
=
0
;
x
<
X
;
++
x
)
{
int
w_tmp
=
wi
+
LeftPads
{}[
1
]
-
x
*
ConvDilations
{}[
1
];
if
(
w_tmp
%
ConvStrides
{}[
1
]
==
0
)
{
int
wo
=
w_tmp
/
ConvStrides
{}[
1
];
int
e
=
c
*
(
Y
*
X
)
+
y
*
X
+
x
;
int
b
=
n
*
(
HO
*
WO
)
+
ho
*
WO
+
wo
;
v
+=
in_eb
(
e
,
b
);
}
}
}
}
in_nchw
(
n
,
c
,
hi
,
wi
)
=
v
;
};
auto
f_par
=
make_ParallelTensorFunctor
(
f
,
in_nchw
.
mDesc
.
GetLengths
()[
0
],
in_nchw
.
mDesc
.
GetLengths
()[
1
],
in_nchw
.
mDesc
.
GetLengths
()[
2
],
in_nchw
.
mDesc
.
GetLengths
()[
3
]);
f_par
(
std
::
thread
::
hardware_concurrency
());
}
driver/src/col2im_driver.cpp
View file @
ad3ac5cc
...
...
@@ -4,30 +4,35 @@
#include <cstdlib>
#include <stdlib.h>
#include "config.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "print_array.hpp"
#include "print_sequence.hpp"
#include "device.hpp"
#include "tensor_generator.hpp"
#include "device_tensor.hpp"
#include "conv_common.hpp"
#include "host_col2im.hpp"
//#include "device_col2im.hpp"
int
main
(
int
argc
,
char
*
argv
[])
{
using
namespace
ck
;
#if
0
constexpr index_t N = 1
28
;
constexpr index_t C = 1
28
;
#if
1
constexpr
index_t
N
=
1
;
constexpr
index_t
C
=
1
;
constexpr
index_t
HI
=
17
;
constexpr
index_t
WI
=
17
;
constexpr index_t K = 1
28
;
constexpr index_t Y =
1
;
constexpr index_t X =
7
;
constexpr
index_t
K
=
1
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
X
=
3
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using LeftPads = Sequence<
0
,
3
>;
using RightPads = Sequence<
0
,
3
>;
using
LeftPads
=
Sequence
<
1
,
1
>
;
using
RightPads
=
Sequence
<
1
,
1
>
;
#elif 0
// 3x3, 34x34
constexpr
index_t
N
=
64
;
...
...
@@ -298,43 +303,32 @@ int main(int argc, char* argv[])
using
RightPads
=
Sequence
<
0
,
3
>
;
#endif
#if 0
auto in_nchw_desc = make_ConstantTensorDescriptor_packed(Sequence<N, C, HI, WI>{});
auto wei_kcyx_desc = make_ConstantTensorDescriptor_packed(Sequence<K, C, Y, X>{});
auto out_nkhw_desc = get_convolution_output_default_4d_tensor_descriptor_deprecated(
constexpr
auto
in_nchw_desc
=
make_native_tensor_descriptor_packed
(
Sequence
<
N
,
C
,
HI
,
WI
>
{});
constexpr
auto
wei_kcyx_desc
=
make_native_tensor_descriptor_packed
(
Sequence
<
K
,
C
,
Y
,
X
>
{});
constexpr
auto
out_nkhw_desc
=
get_convolution_output_default_4d_tensor_descriptor
(
in_nchw_desc
,
wei_kcyx_desc
,
ConvStrides
{},
ConvDilations
{},
LeftPads
{},
RightPads
{});
ostream_ConstantTensorDescriptor(in_nchw_desc, std::cout << "in_nchw_desc: ");
ostream_ConstantTensorDescriptor(wei_kcyx_desc, std::cout << "wei_kcyx_desc: ");
ostream_ConstantTensorDescriptor(out_nkhw_desc, std::cout << "out_nkhw_desc: ");
print_sequence("LeftPads", LeftPads{});
print_sequence("RightPads", RightPads{});
print_sequence("ConvStrides", ConvStrides{});
print_sequence("ConvDilations", ConvDilations{});
constexpr
index_t
HO
=
out_nkhw_desc
.
GetLengths
()[
2
];
constexpr
index_t
WO
=
out_nkhw_desc
.
GetLengths
()[
3
];
using in_data_t = float;
using out_data_t = float;
Tensor<in_data_t> in_nchw(make_TensorDescriptor(in_nchw_desc));
Tensor<in_data_t> wei_kcyx(make_TensorDescriptor(wei_kcyx_desc));
Tensor<out_data_t> out_nkhw_host(make_TensorDescriptor(out_nkhw_desc));
Tensor<out_data_t> out_nkhw_device(make_TensorDescriptor(out_nkhw_desc));
#else
auto
in_nchw_desc
=
make_native_tensor_descriptor_packed
(
Sequence
<
N
,
C
,
HI
,
WI
>
{});
auto
wei_kcyx_desc
=
make_native_tensor_descriptor_packed
(
Sequence
<
K
,
C
,
Y
,
X
>
{});
auto
out_nkhw_desc
=
get_convolution_output_default_4d_tensor_descriptor
(
in_nchw_desc
,
wei_kcyx_desc
,
ConvStrides
{},
ConvDilations
{},
LeftPads
{},
RightPads
{});
auto
in_eb_desc
=
make_native_tensor_descriptor_packed
(
Sequence
<
C
*
Y
*
X
,
N
*
HO
*
WO
>
{});
auto
in_eb_desc
=
make_native_tensor_descriptor_packed
(
Sequence
<
C
*
Y
*
X
,
N
*
Ho
*
Wo
>
{});
using
FilterSizes
=
Sequence
<
Y
,
X
>
;
using
OutputSizes
=
Sequence
<
HO
,
WO
>
;
ostream_ConstantTensorDescriptor
(
in_nchw_desc
,
std
::
cout
<<
"in_nchw_desc: "
);
ostream_ConstantTensorDescriptor
(
wei_kcyx_desc
,
std
::
cout
<<
"wei_kcyx_desc: "
);
ostream_ConstantTensorDescriptor
(
out_nkhw_desc
,
std
::
cout
<<
"out_nkhw_desc: "
);
ostream_ConstantTensorDescriptor
(
in_eb_desc
,
std
::
cout
<<
"in_eb_desc: "
);
print_sequence
(
"FilterSizes"
,
FilterSizes
{});
print_sequence
(
"OutputSizes"
,
OutputSizes
{});
print_sequence
(
"LeftPads"
,
LeftPads
{});
print_sequence
(
"LeftPads"
,
LeftPads
{});
print_sequence
(
"RightPads"
,
RightPads
{});
print_sequence
(
"ConvStrides"
,
ConvStrides
{});
print_sequence
(
"ConvDilations"
,
ConvDilations
{});
#endif
Tensor
<
float
>
in_eb
(
make_TensorDescriptor
(
in_eb_desc
));
Tensor
<
float
>
in_nchw_host
(
make_TensorDescriptor
(
in_nchw_desc
));
Tensor
<
float
>
in_nchw_device
(
make_TensorDescriptor
(
in_nchw_desc
));
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
();
...
...
@@ -349,36 +343,44 @@ int main(int argc, char* argv[])
if
(
do_verification
)
{
#if 1
in_eb
.
GenerateTensorValue
(
GeneratorTensor_1
{},
num_thread
);
#else
in_eb
.
GenerateTensorValue
(
GeneratorTensor_2
{
-
5
,
5
},
num_thread
);
#endif
}
device_col2im
(
in_nchw_desc
,
in_nchw
,
in_eb_desc
,
#if 0
device_col2im(in_eb_desc,
in_eb,
in_nchw_desc,
in_nchw_device,
FilterSizes{},
OutputSizes{},
ConvStrides{},
ConvDilations{},
LeftPads{},
RightPads{},
nrepeat);
#endif
if
(
do_verification
)
{
host_
direct_convolution
(
in_nchw
,
wei_kcyx
,
out_nkhw_host
,
ConvStrid
es
{},
ConvDilation
s
{},
LeftPad
s
{},
Righ
tPads
{}
);
check_error
(
out_nkhw_host
,
out_nkhw_device
);
#if 0
LogRange(std::cout << "in_nchw : ", in_nchw.mData, ",") << std::endl;
LogRange(std::cout << "
wei_kcyx: ", wei_kcyx
.mData, ",") << std::endl;
LogRange(std::cout << "
out
_n
k
hw_host
: ",
out
_n
k
hw_host.mData, ",") << std::endl;
LogRange(std::cout << "
out
_n
k
hw_device: ",
out
_n
k
hw_device.mData, ",") << std::endl;
host_
col2im
(
in_eb
,
in_nchw_host
,
FilterSizes
{}
,
OutputSiz
es
{},
ConvStride
s
{},
ConvDilation
s
{},
Lef
tPads
{}
,
RightPads
{});
check_error
(
in_nchw_host
,
in_nchw_device
);
#if 1
LogRange
(
std
::
cout
<<
"
in_eb : "
,
in_eb
.
mData
,
","
)
<<
std
::
endl
;
LogRange
(
std
::
cout
<<
"
in
_n
c
hw_host : "
,
in
_n
c
hw_host
.
mData
,
","
)
<<
std
::
endl
;
LogRange
(
std
::
cout
<<
"
in
_n
c
hw_device
: "
,
in
_n
c
hw_device
.
mData
,
","
)
<<
std
::
endl
;
#endif
}
}
driver/src/col2im_driver.cu
deleted
120000 → 0
View file @
895e87c1
col2im_driver
.
cpp
\ No newline at end of file
driver/src/col2im_driver.cu
0 → 100644
View file @
ad3ac5cc
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include "config.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "print_array.hpp"
#include "print_sequence.hpp"
#include "device.hpp"
#include "tensor_generator.hpp"
#include "device_tensor.hpp"
#include "conv_common.hpp"
#include "host_col2im.hpp"
//#include "device_col2im.hpp"
int
main
(
int
argc
,
char
*
argv
[])
{
using
namespace
ck
;
#if 1
constexpr
index_t
N
=
1
;
constexpr
index_t
C
=
1
;
constexpr
index_t
HI
=
17
;
constexpr
index_t
WI
=
17
;
constexpr
index_t
K
=
1
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
X
=
3
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
LeftPads
=
Sequence
<
1
,
1
>
;
using
RightPads
=
Sequence
<
1
,
1
>
;
#elif 0
// 3x3, 34x34
constexpr
index_t
N
=
64
;
constexpr
index_t
C
=
256
;
constexpr
index_t
HI
=
34
;
constexpr
index_t
WI
=
34
;
constexpr
index_t
K
=
128
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
X
=
3
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
LeftPads
=
Sequence
<
0
,
0
>
;
using
RightPads
=
Sequence
<
0
,
0
>
;
#elif 0
// 1x1 filter, 8x8 image
// cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@VII 42%
constexpr
index_t
N
=
64
;
constexpr
index_t
C
=
1536
;
constexpr
index_t
HI
=
8
;
constexpr
index_t
WI
=
8
;
constexpr
index_t
K
=
256
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
X
=
1
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
LeftPads
=
Sequence
<
0
,
0
>
;
using
RightPads
=
Sequence
<
0
,
0
>
;
#elif 0
// 1x1 filter, 8x8 image
// cudnn@V100 77%, ck@V100 76%, ck@P100 79%, ck@VII 51%
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
2048
;
constexpr
index_t
HI
=
8
;
constexpr
index_t
WI
=
8
;
constexpr
index_t
K
=
384
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
X
=
1
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
LeftPads
=
Sequence
<
0
,
0
>
;
using
RightPads
=
Sequence
<
0
,
0
>
;
#elif 0
// 1x1 filter, 7x7 image
// cudnn@V100 82%, ck@V100 76%, ck@P100 67%, ck@VII 64%
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
832
;
constexpr
index_t
HI
=
7
;
constexpr
index_t
WI
=
7
;
constexpr
index_t
K
=
384
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
X
=
1
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
LeftPads
=
Sequence
<
0
,
0
>
;
using
RightPads
=
Sequence
<
0
,
0
>
;
#elif 0
// 1x1 filter, 8x8 image
// cudnn@V100 83%, ck@V100 75%, ck@P100 78%, ck@VII 65%
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
1280
;
constexpr
index_t
HI
=
8
;
constexpr
index_t
WI
=
8
;
constexpr
index_t
K
=
384
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
X
=
1
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
LeftPads
=
Sequence
<
0
,
0
>
;
using
RightPads
=
Sequence
<
0
,
0
>
;
#elif 0
// 1x1 filter, 14x14 image
// cudnn@V100 62%, ck@V100 68%, ck@P100 70%, ck@VII 50%
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
512
;
constexpr
index_t
HI
=
14
;
constexpr
index_t
WI
=
14
;
constexpr
index_t
K
=
128
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
X
=
1
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
LeftPads
=
Sequence
<
0
,
0
>
;
using
RightPads
=
Sequence
<
0
,
0
>
;
#elif 0
// 1x1 filter, 8x8 image
// cudnn@V100 74%, ck@V100 57%, ck@P100 78%, ck@VII 61%
constexpr
index_t
N
=
64
;
constexpr
index_t
C
=
1536
;
constexpr
index_t
HI
=
8
;
constexpr
index_t
WI
=
8
;
constexpr
index_t
K
=
384
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
X
=
1
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
LeftPads
=
Sequence
<
0
,
0
>
;
using
RightPads
=
Sequence
<
0
,
0
>
;
#elif 0
// 1x1 filter, 28x28 image
// cudnn@V100 86%, ck@V100 84%, ck@P100 80%, ck@VII 69%
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
256
;
constexpr
index_t
HI
=
28
;
constexpr
index_t
WI
=
28
;
constexpr
index_t
K
=
128
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
X
=
1
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
LeftPads
=
Sequence
<
0
,
0
>
;
using
RightPads
=
Sequence
<
0
,
0
>
;
#elif 0
// 1x1 filter, 7x7 image
// cudnn@V100 71%, ck@V100 55%, ck@P100 70%, ck@VII 62%
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
832
;
constexpr
index_t
HI
=
7
;
constexpr
index_t
WI
=
7
;
constexpr
index_t
K
=
256
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
X
=
1
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
LeftPads
=
Sequence
<
0
,
0
>
;
using
RightPads
=
Sequence
<
0
,
0
>
;
#elif 0
// 1x1 filter, 17x17 input
// cudnn@V100 81%, ck@V100 76%, ck@P100 70%, ck@VII 76%
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
768
;
constexpr
index_t
HI
=
17
;
constexpr
index_t
WI
=
17
;
constexpr
index_t
K
=
128
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
X
=
1
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
LeftPads
=
Sequence
<
0
,
0
>
;
using
RightPads
=
Sequence
<
0
,
0
>
;
#elif 0
// 1x1 filter, 14x14 image
// cudnn@V100 73%, ck@V100 71%, ck@P100 70%, ck@VII 64%
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
528
;
constexpr
index_t
HI
=
14
;
constexpr
index_t
WI
=
14
;
constexpr
index_t
K
=
128
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
X
=
1
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
LeftPads
=
Sequence
<
0
,
0
>
;
using
RightPads
=
Sequence
<
0
,
0
>
;
#elif 0
// 1x1 filter, 14x14 image
// cudnn@V100 73%, ck@V100 72%, ck@P100 79%, ck@VII 75%
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
528
;
constexpr
index_t
HI
=
14
;
constexpr
index_t
WI
=
14
;
constexpr
index_t
K
=
256
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
X
=
1
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
LeftPads
=
Sequence
<
0
,
0
>
;
using
RightPads
=
Sequence
<
0
,
0
>
;
#elif 0
// 1x1 filter, 7x7 image
// cudnn@V100 49%, ck@V100 50%, ck@P100 61%, ck@VII 52%
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
832
;
constexpr
index_t
HI
=
7
;
constexpr
index_t
WI
=
7
;
constexpr
index_t
K
=
128
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
X
=
1
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
LeftPads
=
Sequence
<
0
,
0
>
;
using
RightPads
=
Sequence
<
0
,
0
>
;
#elif 0
// 3x3 filter, 2x2 stride, 35x35 input, 17x17 output
// cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@VII 81%
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
288
;
constexpr
index_t
HI
=
35
;
constexpr
index_t
WI
=
35
;
constexpr
index_t
K
=
384
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
X
=
3
;
using
ConvStrides
=
Sequence
<
2
,
2
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
LeftPads
=
Sequence
<
0
,
0
>
;
using
RightPads
=
Sequence
<
0
,
0
>
;
#elif 0
// 5x5 filter, 2x2 pad, 7x7 input
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
48
;
constexpr
index_t
HI
=
7
;
constexpr
index_t
WI
=
7
;
constexpr
index_t
K
=
128
;
constexpr
index_t
Y
=
5
;
constexpr
index_t
X
=
5
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
LeftPads
=
Sequence
<
2
,
2
>
;
using
RightPads
=
Sequence
<
2
,
2
>
;
#elif 0
// 7x1 filter, 3x0 pad, 17x17 input
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
128
;
constexpr
index_t
HI
=
17
;
constexpr
index_t
WI
=
17
;
constexpr
index_t
K
=
128
;
constexpr
index_t
Y
=
7
;
constexpr
index_t
X
=
1
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
LeftPads
=
Sequence
<
3
,
0
>
;
using
RightPads
=
Sequence
<
3
,
0
>
;
#elif 1
// 1x7 filter, 0x3 pad, 17x17 input
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
128
;
constexpr
index_t
HI
=
17
;
constexpr
index_t
WI
=
17
;
constexpr
index_t
K
=
128
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
X
=
7
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
LeftPads
=
Sequence
<
0
,
3
>
;
using
RightPads
=
Sequence
<
0
,
3
>
;
#endif
constexpr
auto
in_nchw_desc
=
make_native_tensor_descriptor_packed
(
Sequence
<
N
,
C
,
HI
,
WI
>
{});
constexpr
auto
wei_kcyx_desc
=
make_native_tensor_descriptor_packed
(
Sequence
<
K
,
C
,
Y
,
X
>
{});
constexpr
auto
out_nkhw_desc
=
get_convolution_output_default_4d_tensor_descriptor
(
in_nchw_desc
,
wei_kcyx_desc
,
ConvStrides
{},
ConvDilations
{},
LeftPads
{},
RightPads
{});
constexpr
index_t
HO
=
out_nkhw_desc
.
GetLengths
()[
2
];
constexpr
index_t
WO
=
out_nkhw_desc
.
GetLengths
()[
3
];
auto
in_eb_desc
=
make_native_tensor_descriptor_packed
(
Sequence
<
C
*
Y
*
X
,
N
*
HO
*
WO
>
{});
using
FilterSizes
=
Sequence
<
Y
,
X
>
;
using
OutputSizes
=
Sequence
<
HO
,
WO
>
;
ostream_ConstantTensorDescriptor
(
in_nchw_desc
,
std
::
cout
<<
"in_nchw_desc: "
);
ostream_ConstantTensorDescriptor
(
in_eb_desc
,
std
::
cout
<<
"in_eb_desc: "
);
print_sequence
(
"FilterSizes"
,
FilterSizes
{});
print_sequence
(
"OutputSizes"
,
OutputSizes
{});
print_sequence
(
"LeftPads"
,
LeftPads
{});
print_sequence
(
"LeftPads"
,
LeftPads
{});
print_sequence
(
"RightPads"
,
RightPads
{});
print_sequence
(
"ConvStrides"
,
ConvStrides
{});
print_sequence
(
"ConvDilations"
,
ConvDilations
{});
Tensor
<
float
>
in_eb
(
make_TensorDescriptor
(
in_eb_desc
));
Tensor
<
float
>
in_nchw_host
(
make_TensorDescriptor
(
in_nchw_desc
));
Tensor
<
float
>
in_nchw_device
(
make_TensorDescriptor
(
in_nchw_desc
));
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
();
if
(
argc
!=
3
)
{
printf
(
"arg1: do_verification, arg2: nrepeat
\n
"
);
exit
(
1
);
}
bool
do_verification
=
atoi
(
argv
[
1
]);
index_t
nrepeat
=
atoi
(
argv
[
2
]);
if
(
do_verification
)
{
#if 1
in_eb
.
GenerateTensorValue
(
GeneratorTensor_1
{},
num_thread
);
#else
in_eb
.
GenerateTensorValue
(
GeneratorTensor_2
{
-
5
,
5
},
num_thread
);
#endif
}
#if 0
device_col2im(in_eb_desc,
in_eb,
in_nchw_desc,
in_nchw_device,
FilterSizes{},
OutputSizes{},
ConvStrides{},
ConvDilations{},
LeftPads{},
RightPads{},
nrepeat);
#endif
if
(
do_verification
)
{
host_col2im
(
in_eb
,
in_nchw_host
,
FilterSizes
{},
OutputSizes
{},
ConvStrides
{},
ConvDilations
{},
LeftPads
{},
RightPads
{});
check_error
(
in_nchw_host
,
in_nchw_device
);
#if 1
LogRange
(
std
::
cout
<<
"in_eb : "
,
in_eb
.
mData
,
","
)
<<
std
::
endl
;
LogRange
(
std
::
cout
<<
"in_nchw_host : "
,
in_nchw_host
.
mData
,
","
)
<<
std
::
endl
;
LogRange
(
std
::
cout
<<
"in_nchw_device : "
,
in_nchw_device
.
mData
,
","
)
<<
std
::
endl
;
#endif
}
}
driver/src/conv_driver.cpp
View file @
ad3ac5cc
...
...
@@ -29,20 +29,20 @@ int main(int argc, char* argv[])
{
using
namespace
ck
;
#if
0
constexpr index_t N =
12
8;
constexpr index_t C =
128
;
constexpr index_t HI =
17
;
constexpr index_t WI =
17
;
constexpr index_t K =
128
;
constexpr index_t Y =
1
;
constexpr index_t X =
7
;
#if
1
constexpr
index_t
N
=
8
;
constexpr
index_t
C
=
32
;
constexpr
index_t
HI
=
28
;
constexpr
index_t
WI
=
28
;
constexpr
index_t
K
=
32
;
constexpr
index_t
Y
=
5
;
constexpr
index_t
X
=
5
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using ConvDilations = Sequence<
1
,
1
>;
using
ConvDilations
=
Sequence
<
2
,
2
>
;
using LeftPads = Sequence<0,
3
>;
using RightPads = Sequence<0,
3
>;
using
LeftPads
=
Sequence
<
0
,
0
>
;
using
RightPads
=
Sequence
<
0
,
0
>
;
#elif 0
// 3x3, 34x34
constexpr
index_t
N
=
64
;
...
...
@@ -393,7 +393,7 @@ int main(int argc, char* argv[])
#elif 0
device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw
(
(
in_nchw_desc
,
in_nchw
,
wei_kcyx_desc
,
wei_kcyx
,
out_nkhw_desc
,
out_nkhw_device
,
nrepeat
);
#elif
0
#elif
1
device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated
(
in_nchw_desc
,
in_nchw
,
wei_kcyx_desc
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment