Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
c9a8e558
"git@developer.sourcefind.cn:wangsen/mineru.git" did not exist on "1fd72f5f3a0e32ffd3b772b923a460ab9e3d9b11"
Commit
c9a8e558
authored
Jul 20, 2019
by
Chao Liu
Browse files
adding tensor_view
parent
8669e242
Changes
56
Hide whitespace changes
Inline
Side-by-side
Showing
16 changed files
with
104 additions
and
93 deletions
+104
-93
composable_kernel/include/utility/sequence.hpp
composable_kernel/include/utility/sequence.hpp
+8
-1
driver/include/conv_common.hpp
driver/include/conv_common.hpp
+1
-1
driver/include/device_convolution_direct_v2_nchw_kcyx_nkhw.hpp
...r/include/device_convolution_direct_v2_nchw_kcyx_nkhw.hpp
+3
-3
driver/include/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp
...de/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp
+6
-6
driver/include/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp
...de/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp
+4
-4
driver/include/device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp
...de/device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp
+6
-6
driver/include/device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp
...de/device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp
+4
-4
driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
.../device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
+3
-3
driver/include/device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw.hpp
.../device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw.hpp
+3
-3
driver/include/device_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw.hpp
.../device_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw.hpp
+7
-7
driver/include/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp
...device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp
+5
-5
driver/include/device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp
...ice_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp
+6
-6
driver/include/host_conv.hpp
driver/include/host_conv.hpp
+14
-14
driver/include/host_tensor.hpp
driver/include/host_tensor.hpp
+16
-16
driver/src/driver.cpp
driver/src/driver.cpp
+9
-6
driver/src/tensor.cpp
driver/src/tensor.cpp
+9
-8
No files found.
composable_kernel/include/utility/
S
equence.hpp
→
composable_kernel/include/utility/
s
equence.hpp
View file @
c9a8e558
...
...
@@ -433,7 +433,7 @@ __host__ __device__ void print_Sequence(const char* s, Sequence<Xs...>)
{
constexpr
index_t
nsize
=
Sequence
<
Xs
...
>::
GetSize
();
static_assert
(
nsize
<=
1
0
,
"wrong!"
);
static_assert
(
nsize
<=
1
2
,
"wrong!"
);
static_if
<
nsize
==
0
>
{}([
&
](
auto
)
{
printf
(
"%s size %u, {}
\n
"
,
s
,
nsize
,
Xs
...);
});
...
...
@@ -462,6 +462,13 @@ __host__ __device__ void print_Sequence(const char* s, Sequence<Xs...>)
static_if
<
nsize
==
10
>
{}(
[
&
](
auto
)
{
printf
(
"%s size %u, {%u %u %u %u %u %u %u %u %u %u}
\n
"
,
s
,
nsize
,
Xs
...);
});
static_if
<
nsize
==
11
>
{}(
[
&
](
auto
)
{
printf
(
"%s size %u, {%u %u %u %u %u %u %u %u %u %u %u}
\n
"
,
s
,
nsize
,
Xs
...);
});
static_if
<
nsize
==
12
>
{}([
&
](
auto
)
{
printf
(
"%s size %u, {%u %u %u %u %u %u %u %u %u %u %u %u}
\n
"
,
s
,
nsize
,
Xs
...);
});
}
}
// namespace ck
...
...
driver/include/conv_common.hpp
View file @
c9a8e558
#ifndef CONV_COMMON_HPP
#define CONV_COMMON_HPP
#include "
C
onstant
T
ensor
D
escriptor.hpp"
#include "
c
onstant
_t
ensor
_d
escriptor.hpp"
// this is ugly, only for 4d
template
<
class
InDesc
,
class
WeiDesc
>
...
...
driver/include/device_convolution_direct_v2_nchw_kcyx_nkhw.hpp
View file @
c9a8e558
...
...
@@ -9,11 +9,11 @@ using namespace ck;
template
<
class
T
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
>
void
device_convolution_direct_v2_nchw_kcyx_nkhw
(
InDesc
,
const
Tensor
<
T
>&
in
,
const
Host
Tensor
<
T
>&
in
,
WeiDesc
,
const
Tensor
<
T
>&
wei
,
const
Host
Tensor
<
T
>&
wei
,
OutDesc
,
Tensor
<
T
>&
out
,
Host
Tensor
<
T
>&
out
,
index_t
nrepeat
)
{
std
::
size_t
data_sz
=
sizeof
(
T
);
...
...
driver/include/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp
View file @
c9a8e558
...
...
@@ -12,11 +12,11 @@ using namespace ck;
template
<
class
T
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
>
void
device_convolution_implicit_gemm_v1_chwn_cyxk_khwn
(
InDesc
,
const
Tensor
<
T
>&
in_nchw
,
const
Host
Tensor
<
T
>&
in_nchw
,
WeiDesc
,
const
Tensor
<
T
>&
wei_kcyx
,
const
Host
Tensor
<
T
>&
wei_kcyx
,
OutDesc
,
Tensor
<
T
>&
out_nkhw
,
Host
Tensor
<
T
>&
out_nkhw
,
index_t
nrepeat
)
{
constexpr
auto
I0
=
Number
<
0
>
{};
...
...
@@ -44,7 +44,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
auto
wei_cyxk_desc
=
make_ConstantTensorDescriptor_packed
(
Sequence
<
C
,
Y
,
X
,
K
>
{});
ostream_ConstantTensorDescriptor
(
wei_cyxk_desc
,
std
::
cout
<<
"wei_cyxk_desc: "
);
Tensor
<
T
>
wei_cyxk
(
make_TensorDescriptor
(
wei_cyxk_desc
));
Host
Tensor
<
T
>
wei_cyxk
(
make_TensorDescriptor
(
wei_cyxk_desc
));
auto
f_reorder_kcyx2cyxk
=
[
&
](
auto
k
,
auto
c
,
auto
y
,
auto
x
)
{
wei_cyxk
(
c
,
y
,
x
,
k
)
=
wei_kcyx
(
k
,
c
,
y
,
x
);
...
...
@@ -57,7 +57,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
auto
in_chwn_desc
=
make_ConstantTensorDescriptor_packed
(
Sequence
<
C
,
Hi
,
Wi
,
N
>
{});
ostream_ConstantTensorDescriptor
(
in_chwn_desc
,
std
::
cout
<<
"in_chwn_desc: "
);
Tensor
<
T
>
in_chwn
(
make_TensorDescriptor
(
in_chwn_desc
));
Host
Tensor
<
T
>
in_chwn
(
make_TensorDescriptor
(
in_chwn_desc
));
auto
f_reorder_nchw2chwn
=
[
&
](
auto
n
,
auto
c
,
auto
hi
,
auto
wi
)
{
in_chwn
(
c
,
hi
,
wi
,
n
)
=
in_nchw
(
n
,
c
,
hi
,
wi
);
...
...
@@ -70,7 +70,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
auto
out_khwn_desc
=
make_ConstantTensorDescriptor_packed
(
Sequence
<
K
,
Ho
,
Wo
,
N
>
{});
ostream_ConstantTensorDescriptor
(
out_khwn_desc
,
std
::
cout
<<
"out_khwn_desc: "
);
Tensor
<
T
>
out_khwn
(
make_TensorDescriptor
(
out_khwn_desc
));
Host
Tensor
<
T
>
out_khwn
(
make_TensorDescriptor
(
out_khwn_desc
));
std
::
size_t
data_sz
=
sizeof
(
T
);
DeviceMem
in_chwn_device_buf
(
data_sz
*
in_chwn
.
mDesc
.
GetElementSpace
());
...
...
driver/include/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp
View file @
c9a8e558
...
...
@@ -10,11 +10,11 @@ using namespace ck;
template
<
class
T
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
>
void
device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw
(
InDesc
,
const
Tensor
<
T
>&
in_nchw
,
const
Host
Tensor
<
T
>&
in_nchw
,
WeiDesc
,
const
Tensor
<
T
>&
wei_kcyx
,
const
Host
Tensor
<
T
>&
wei_kcyx
,
OutDesc
,
Tensor
<
T
>&
out_nkhw
,
Host
Tensor
<
T
>&
out_nkhw
,
index_t
nrepeat
)
{
constexpr
auto
I0
=
Number
<
0
>
{};
...
...
@@ -42,7 +42,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
auto
wei_cyxk_desc
=
make_ConstantTensorDescriptor_packed
(
Sequence
<
C
,
Y
,
X
,
K
>
{});
ostream_ConstantTensorDescriptor
(
wei_cyxk_desc
,
std
::
cout
<<
"wei_cyxk_desc: "
);
Tensor
<
T
>
wei_cyxk
(
make_TensorDescriptor
(
wei_cyxk_desc
));
Host
Tensor
<
T
>
wei_cyxk
(
make_TensorDescriptor
(
wei_cyxk_desc
));
auto
f_reorder_kcyx2cyxk
=
[
&
](
auto
k
,
auto
c
,
auto
y
,
auto
x
)
{
wei_cyxk
(
c
,
y
,
x
,
k
)
=
wei_kcyx
(
k
,
c
,
y
,
x
);
...
...
driver/include/device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp
View file @
c9a8e558
...
...
@@ -10,11 +10,11 @@ using namespace ck;
template
<
class
T
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
>
void
device_convolution_implicit_gemm_v2_chwn_cyxk_khwn
(
InDesc
,
const
Tensor
<
T
>&
in_nchw
,
const
Host
Tensor
<
T
>&
in_nchw
,
WeiDesc
,
const
Tensor
<
T
>&
wei_kcyx
,
const
Host
Tensor
<
T
>&
wei_kcyx
,
OutDesc
,
Tensor
<
T
>&
out_nkhw
,
Host
Tensor
<
T
>&
out_nkhw
,
index_t
nrepeat
)
{
constexpr
auto
I0
=
Number
<
0
>
{};
...
...
@@ -44,7 +44,7 @@ void device_convolution_implicit_gemm_v2_chwn_cyxk_khwn(InDesc,
auto
in_chwn_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
C
,
Hi
,
Wi
,
N
>
{});
ostream_ConstantTensorDescriptor
(
in_chwn_desc
,
std
::
cout
<<
"in_chwn_desc: "
);
Tensor
<
T
>
in_chwn
(
make_TensorDescriptor
(
in_chwn_desc
));
Host
Tensor
<
T
>
in_chwn
(
make_TensorDescriptor
(
in_chwn_desc
));
make_ParallelTensorFunctor
(
[
&
](
auto
n
,
auto
c
,
auto
hi
,
auto
wi
)
{
in_chwn
(
c
,
hi
,
wi
,
n
)
=
in_nchw
(
n
,
c
,
hi
,
wi
);
},
...
...
@@ -57,7 +57,7 @@ void device_convolution_implicit_gemm_v2_chwn_cyxk_khwn(InDesc,
auto
wei_cyxk_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
C
,
Y
,
X
,
K
>
{});
ostream_ConstantTensorDescriptor
(
wei_cyxk_desc
,
std
::
cout
<<
"wei_cyxk_desc: "
);
Tensor
<
T
>
wei_cyxk
(
make_TensorDescriptor
(
wei_cyxk_desc
));
Host
Tensor
<
T
>
wei_cyxk
(
make_TensorDescriptor
(
wei_cyxk_desc
));
make_ParallelTensorFunctor
(
[
&
](
auto
k
,
auto
c
,
auto
y
,
auto
x
)
{
wei_cyxk
(
c
,
y
,
x
,
k
)
=
wei_kcyx
(
k
,
c
,
y
,
x
);
},
...
...
@@ -70,7 +70,7 @@ void device_convolution_implicit_gemm_v2_chwn_cyxk_khwn(InDesc,
auto
out_khwn_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
K
,
Ho
,
Wo
,
N
>
{});
ostream_ConstantTensorDescriptor
(
out_khwn_desc
,
std
::
cout
<<
"out_khwn_desc: "
);
Tensor
<
T
>
out_khwn
(
make_TensorDescriptor
(
out_khwn_desc
));
Host
Tensor
<
T
>
out_khwn
(
make_TensorDescriptor
(
out_khwn_desc
));
#if 0
// 3x3, 34x34
...
...
driver/include/device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp
View file @
c9a8e558
...
...
@@ -8,11 +8,11 @@
template
<
class
T
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
>
void
device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw
(
InDesc
,
const
Tensor
<
T
>&
in_nchw
,
const
Host
Tensor
<
T
>&
in_nchw
,
WeiDesc
,
const
Tensor
<
T
>&
wei_kcyx
,
const
Host
Tensor
<
T
>&
wei_kcyx
,
OutDesc
,
Tensor
<
T
>&
out_nkhw
,
Host
Tensor
<
T
>&
out_nkhw
,
index_t
nrepeat
)
{
using
namespace
ck
;
...
...
@@ -42,7 +42,7 @@ void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc,
auto
wei_cyxk_desc
=
make_ConstantTensorDescriptor_packed
(
Sequence
<
C
,
Y
,
X
,
K
>
{});
ostream_ConstantTensorDescriptor
(
wei_cyxk_desc
,
std
::
cout
<<
"wei_cyxk_desc: "
);
Tensor
<
T
>
wei_cyxk
(
make_TensorDescriptor
(
wei_cyxk_desc
));
Host
Tensor
<
T
>
wei_cyxk
(
make_TensorDescriptor
(
wei_cyxk_desc
));
auto
f_reorder_kcyx2cyxk
=
[
&
](
auto
k
,
auto
c
,
auto
y
,
auto
x
)
{
wei_cyxk
(
c
,
y
,
x
,
k
)
=
wei_kcyx
(
k
,
c
,
y
,
x
);
...
...
driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
View file @
c9a8e558
...
...
@@ -13,11 +13,11 @@ template <class T,
class
ConvStrides
,
class
ConvDilations
>
void
device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw
(
InDesc
,
const
Tensor
<
T
>&
in_nchw
,
const
Host
Tensor
<
T
>&
in_nchw
,
WeiDesc
,
const
Tensor
<
T
>&
wei_kcyx
,
const
Host
Tensor
<
T
>&
wei_kcyx
,
OutDesc
,
Tensor
<
T
>&
out_nkhw
,
Host
Tensor
<
T
>&
out_nkhw
,
ConvStrides
,
ConvDilations
,
index_t
nrepeat
)
...
...
driver/include/device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw.hpp
View file @
c9a8e558
...
...
@@ -14,11 +14,11 @@ template <class T,
class
ConvStrides
,
class
ConvDilations
>
void
device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw
(
InDesc
,
const
Tensor
<
T
>&
in_nchw
,
const
Host
Tensor
<
T
>&
in_nchw
,
WeiDesc
,
const
Tensor
<
T
>&
wei_kcyx
,
const
Host
Tensor
<
T
>&
wei_kcyx
,
OutDesc
,
Tensor
<
T
>&
out_nkhw
,
Host
Tensor
<
T
>&
out_nkhw
,
ConvStrides
,
ConvDilations
,
index_t
nrepeat
)
...
...
driver/include/device_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw.hpp
View file @
c9a8e558
...
...
@@ -14,11 +14,11 @@ template <class T,
class
ConvStrides
,
class
ConvDilations
>
void
device_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw
(
InDesc
,
const
Tensor
<
T
>&
in_nchw
,
const
Host
Tensor
<
T
>&
in_nchw
,
WeiDesc
,
const
Tensor
<
T
>&
wei_kcyx
,
const
Host
Tensor
<
T
>&
wei_kcyx
,
OutDesc
,
Tensor
<
T
>&
out_nkhw
,
Host
Tensor
<
T
>&
out_nkhw
,
ConvStrides
,
ConvDilations
,
index_t
nrepeat
)
...
...
@@ -90,14 +90,14 @@ void device_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw(InDesc,
constexpr
index_t
InBlockCopyDataPerAccess_W2
=
4
;
using
WeiBlockCopySubLengths_E_K
=
Sequence
<
2
,
2
>
;
using
WeiBlockCopyClusterLengths_E_K
=
Sequence
<
4
,
64
>
;
using
WeiBlockCopySubLengths_E_K
=
Sequence
<
4
,
1
>
;
using
WeiBlockCopyClusterLengths_E_K
=
Sequence
<
2
,
128
>
;
using
WeiBlockCopyThreadClusterArrangeOrder
=
Sequence
<
1
,
0
>
;
// [K, E]
using
WeiBlockCopySrcAccessOrder
=
Sequence
<
1
,
0
>
;
// [K, E]
using
WeiBlockCopyDstAccessOrder
=
Sequence
<
0
,
1
>
;
// [E, K]
constexpr
index_t
WeiBlockCopySrcDataPerRead_E
=
1
;
constexpr
index_t
WeiBlockCopyDstDataPerWrite_K
=
2
;
constexpr
index_t
WeiBlockCopySrcDataPerRead_E
=
4
;
constexpr
index_t
WeiBlockCopyDstDataPerWrite_K
=
1
;
#endif
constexpr
index_t
N0
=
N
/
(
N1
*
N2
);
...
...
driver/include/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp
View file @
c9a8e558
...
...
@@ -8,11 +8,11 @@ using namespace ck;
template
<
class
TInWei
,
class
TOut
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
>
void
device_direct_convolution_2_vectorized_nchw_kcyx_nkhw
(
InDesc
,
const
Tensor
<
TInWei
>&
in_nchw
,
const
Host
Tensor
<
TInWei
>&
in_nchw
,
WeiDesc
,
const
Tensor
<
TInWei
>&
wei_kcyx
,
const
Host
Tensor
<
TInWei
>&
wei_kcyx
,
OutDesc
,
Tensor
<
TOut
>&
out_nkhw
,
Host
Tensor
<
TOut
>&
out_nkhw
,
index_t
nrepeat
)
{
// this suppose in / wei data type is int8x4
...
...
@@ -46,7 +46,7 @@ void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc,
auto
in_nchw_vec_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
N
,
C
/
NVector
,
Hi
,
Wi
>
{});
ostream_ConstantTensorDescriptor
(
in_nchw_vec_desc
,
std
::
cout
<<
"in_nchw_vec_desc: "
);
Tensor
<
vector_mem_t
>
in_nchw_vec
(
make_TensorDescriptor
(
in_nchw_vec_desc
));
Host
Tensor
<
vector_mem_t
>
in_nchw_vec
(
make_TensorDescriptor
(
in_nchw_vec_desc
));
auto
f_vectorized_nchw
=
[
&
](
auto
n
,
auto
c
,
auto
h
,
auto
w
)
{
#if 0
...
...
@@ -69,7 +69,7 @@ void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc,
auto
wei_kcyx_vec_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
K
,
C
/
NVector
,
Y
,
X
>
{});
ostream_ConstantTensorDescriptor
(
wei_kcyx_vec_desc
,
std
::
cout
<<
"wei_kcyx_vec_desc: "
);
Tensor
<
vector_mem_t
>
wei_kcyx_vec
(
make_TensorDescriptor
(
wei_kcyx_vec_desc
));
Host
Tensor
<
vector_mem_t
>
wei_kcyx_vec
(
make_TensorDescriptor
(
wei_kcyx_vec_desc
));
auto
f_vectorized_kcyx
=
[
&
](
auto
k
,
auto
c
,
auto
y
,
auto
x
)
{
#if 0
...
...
driver/include/device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp
View file @
c9a8e558
...
...
@@ -8,11 +8,11 @@ using namespace ck;
template
<
class
T
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
,
class
LowerPads
,
class
UpperPads
>
void
device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded
(
InDesc
,
const
Tensor
<
T
>&
in_nchw
,
const
Host
Tensor
<
T
>&
in_nchw
,
WeiDesc
,
const
Tensor
<
T
>&
wei_kcyx
,
const
Host
Tensor
<
T
>&
wei_kcyx
,
OutDesc
,
Tensor
<
T
>&
out_nkhw
,
Host
Tensor
<
T
>&
out_nkhw
,
LowerPads
,
UpperPads
,
index_t
nrepeat
)
...
...
@@ -42,7 +42,7 @@ void device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded(InDesc,
auto
wei_cyxk_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
C
,
Y
,
X
,
K
>
{});
ostream_ConstantTensorDescriptor
(
wei_cyxk_desc
,
std
::
cout
<<
"wei_cyxk_desc: "
);
Tensor
<
T
>
wei_cyxk
(
make_TensorDescriptor
(
wei_cyxk_desc
));
Host
Tensor
<
T
>
wei_cyxk
(
make_TensorDescriptor
(
wei_cyxk_desc
));
auto
f_reorder_kcyx2cyxk
=
[
&
](
auto
k
,
auto
c
,
auto
y
,
auto
x
)
{
wei_cyxk
(
c
,
y
,
x
,
k
)
=
wei_kcyx
(
k
,
c
,
y
,
x
);
...
...
@@ -55,7 +55,7 @@ void device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded(InDesc,
auto
in_chwn_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
C
,
Hi
,
Wi
,
N
>
{});
ostream_ConstantTensorDescriptor
(
in_chwn_desc
,
std
::
cout
<<
"in_chwn_desc: "
);
Tensor
<
T
>
in_chwn
(
make_TensorDescriptor
(
in_chwn_desc
));
Host
Tensor
<
T
>
in_chwn
(
make_TensorDescriptor
(
in_chwn_desc
));
auto
f_reorder_nchw2chwn
=
[
&
](
auto
n
,
auto
c
,
auto
hi
,
auto
wi
)
{
in_chwn
(
c
,
hi
,
wi
,
n
)
=
in_nchw
(
n
,
c
,
hi
,
wi
);
...
...
@@ -68,7 +68,7 @@ void device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded(InDesc,
auto
out_khwn_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
K
,
Ho
,
Wo
,
N
>
{});
ostream_ConstantTensorDescriptor
(
out_khwn_desc
,
std
::
cout
<<
"out_khwn_desc: "
);
Tensor
<
T
>
out_khwn
(
make_TensorDescriptor
(
out_khwn_desc
));
Host
Tensor
<
T
>
out_khwn
(
make_TensorDescriptor
(
out_khwn_desc
));
std
::
size_t
data_sz
=
sizeof
(
T
);
DeviceMem
in_chwn_device_buf
(
data_sz
*
in_chwn
.
mDesc
.
GetElementSpace
());
...
...
driver/include/host_conv.hpp
View file @
c9a8e558
#pragma once
#include "tensor.hpp"
#include "common_header.hpp"
#include "
C
onstant
T
ensor
D
escriptor.hpp"
#include "
c
onstant
_t
ensor
_d
escriptor.hpp"
// this is ugly, only for 4d
template
<
class
TConstTensorDesc
>
...
...
@@ -42,7 +42,7 @@ auto make_TensorDescriptor(TConstTensorDesc)
std
::
initializer_list
<
index_t
>
strides
=
{
desc
.
GetStride
(
I0
),
desc
.
GetStride
(
I1
),
desc
.
GetStride
(
I2
),
desc
.
GetStride
(
I3
)};
return
TensorDescriptor
(
lengths
,
strides
);
return
Host
TensorDescriptor
(
lengths
,
strides
);
}
template
<
class
TIn
,
...
...
@@ -52,9 +52,9 @@ template <class TIn,
class
ConvDilations
,
class
LowerPads
,
class
UpperPads
>
void
host_direct_convolution
(
const
Tensor
<
TIn
>&
in_nchw
,
const
Tensor
<
TWei
>&
wei_kcyx
,
Tensor
<
TOut
>&
out_nkhw
,
void
host_direct_convolution
(
const
Host
Tensor
<
TIn
>&
in_nchw
,
const
Host
Tensor
<
TWei
>&
wei_kcyx
,
Host
Tensor
<
TOut
>&
out_nkhw
,
ConvStrides
,
ConvDilations
,
LowerPads
,
...
...
@@ -99,9 +99,9 @@ void host_direct_convolution(const Tensor<TIn>& in_nchw,
}
template
<
class
TIn
,
class
TWei
,
class
TOut
,
class
LowerPads
,
class
UpperPads
>
void
host_winograd_3x3_convolution
(
const
Tensor
<
TIn
>&
in_nchw
,
const
Tensor
<
TWei
>&
wei_kcyx
,
Tensor
<
TOut
>&
out_nkhw
,
void
host_winograd_3x3_convolution
(
const
Host
Tensor
<
TIn
>&
in_nchw
,
const
Host
Tensor
<
TWei
>&
wei_kcyx
,
Host
Tensor
<
TOut
>&
out_nkhw
,
LowerPads
,
UpperPads
)
{
...
...
@@ -134,11 +134,11 @@ void host_winograd_3x3_convolution(const Tensor<TIn>& in_nchw,
std
::
size_t
HTile
=
(
HO
+
HoPerTile
-
1
)
/
HoPerTile
;
std
::
size_t
WTile
=
(
WO
+
WoPerTile
-
1
)
/
WoPerTile
;
Tensor
<
double
>
in_hold
({
N
,
C
,
HTile
,
WTile
,
HiPerTile
,
WiPerTile
});
Tensor
<
double
>
in_transform
({
N
,
C
,
HTile
,
WTile
,
HiPerTile
,
WiPerTile
});
Tensor
<
double
>
wei_transform
({
K
,
C
,
HiPerTile
,
WiPerTile
});
Tensor
<
double
>
out_transform
({
N
,
K
,
HTile
,
WTile
,
HiPerTile
,
HiPerTile
});
Tensor
<
double
>
out_hold
({
N
,
K
,
HTile
,
WTile
,
HoPerTile
,
WoPerTile
});
Host
Tensor
<
double
>
in_hold
({
N
,
C
,
HTile
,
WTile
,
HiPerTile
,
WiPerTile
});
Host
Tensor
<
double
>
in_transform
({
N
,
C
,
HTile
,
WTile
,
HiPerTile
,
WiPerTile
});
Host
Tensor
<
double
>
wei_transform
({
K
,
C
,
HiPerTile
,
WiPerTile
});
Host
Tensor
<
double
>
out_transform
({
N
,
K
,
HTile
,
WTile
,
HiPerTile
,
HiPerTile
});
Host
Tensor
<
double
>
out_hold
({
N
,
K
,
HTile
,
WTile
,
HoPerTile
,
WoPerTile
});
auto
f_in_hold
=
[
&
](
auto
n
,
auto
c
,
auto
htile
,
auto
wtile
)
{
for
(
int
j
=
0
;
j
<
HiPerTile
;
++
j
)
...
...
@@ -339,7 +339,7 @@ void host_winograd_3x3_convolution(const Tensor<TIn>& in_nchw,
}
template
<
class
T
>
void
check_error
(
const
Tensor
<
T
>&
ref
,
const
Tensor
<
T
>&
result
)
void
check_error
(
const
Host
Tensor
<
T
>&
ref
,
const
Host
Tensor
<
T
>&
result
)
{
float
error
=
0
;
float
max_diff
=
-
1
;
...
...
driver/include/tensor.hpp
→
driver/include/
host_
tensor.hpp
View file @
c9a8e558
#ifndef TENSOR_HPP
#define TENSOR_HPP
#ifndef
HOST_
TENSOR_HPP
#define
HOST_
TENSOR_HPP
#include <thread>
#include <vector>
...
...
@@ -65,24 +65,24 @@ auto construct_f_unpack_args(F, T args)
return
construct_f_unpack_args_impl
<
F
>
(
args
,
std
::
make_index_sequence
<
N
>
{});
}
struct
TensorDescriptor
struct
Host
TensorDescriptor
{
TensorDescriptor
()
=
delete
;
TensorDescriptor
(
std
::
initializer_list
<
std
::
size_t
>
lens
);
TensorDescriptor
(
std
::
initializer_list
<
std
::
size_t
>
lens
,
std
::
initializer_list
<
std
::
size_t
>
strides
);
TensorDescriptor
(
std
::
vector
<
std
::
size_t
>
lens
,
std
::
vector
<
std
::
size_t
>
strides
);
Host
TensorDescriptor
()
=
delete
;
Host
TensorDescriptor
(
std
::
initializer_list
<
std
::
size_t
>
lens
);
Host
TensorDescriptor
(
std
::
initializer_list
<
std
::
size_t
>
lens
,
std
::
initializer_list
<
std
::
size_t
>
strides
);
Host
TensorDescriptor
(
std
::
vector
<
std
::
size_t
>
lens
,
std
::
vector
<
std
::
size_t
>
strides
);
void
CalculateStrides
();
template
<
class
Range
>
TensorDescriptor
(
const
Range
&
lens
)
:
mLens
(
lens
.
begin
(),
lens
.
end
())
Host
TensorDescriptor
(
const
Range
&
lens
)
:
mLens
(
lens
.
begin
(),
lens
.
end
())
{
this
->
CalculateStrides
();
}
template
<
class
Range1
,
class
Range2
>
TensorDescriptor
(
const
Range1
&
lens
,
const
Range2
&
strides
)
Host
TensorDescriptor
(
const
Range1
&
lens
,
const
Range2
&
strides
)
:
mLens
(
lens
.
begin
(),
lens
.
end
()),
mStrides
(
strides
.
begin
(),
strides
.
end
())
{
}
...
...
@@ -185,25 +185,25 @@ auto make_ParallelTensorFunctor(F f, Xs... xs)
}
template
<
class
T
>
struct
Tensor
struct
Host
Tensor
{
template
<
class
X
>
Tensor
(
std
::
initializer_list
<
X
>
lens
)
:
mDesc
(
lens
),
mData
(
mDesc
.
GetElementSpace
())
Host
Tensor
(
std
::
initializer_list
<
X
>
lens
)
:
mDesc
(
lens
),
mData
(
mDesc
.
GetElementSpace
())
{
}
template
<
class
X
>
Tensor
(
std
::
vector
<
X
>
lens
)
:
mDesc
(
lens
),
mData
(
mDesc
.
GetElementSpace
())
Host
Tensor
(
std
::
vector
<
X
>
lens
)
:
mDesc
(
lens
),
mData
(
mDesc
.
GetElementSpace
())
{
}
template
<
class
X
,
class
Y
>
Tensor
(
std
::
vector
<
X
>
lens
,
std
::
vector
<
Y
>
strides
)
Host
Tensor
(
std
::
vector
<
X
>
lens
,
std
::
vector
<
Y
>
strides
)
:
mDesc
(
lens
,
strides
),
mData
(
mDesc
.
GetElementSpace
())
{
}
Tensor
(
const
TensorDescriptor
&
desc
)
:
mDesc
(
desc
),
mData
(
mDesc
.
GetElementSpace
())
{}
Host
Tensor
(
const
Host
TensorDescriptor
&
desc
)
:
mDesc
(
desc
),
mData
(
mDesc
.
GetElementSpace
())
{}
template
<
class
G
>
void
GenerateTensorValue
(
G
g
,
std
::
size_t
num_thread
=
1
)
...
...
@@ -265,7 +265,7 @@ struct Tensor
typename
std
::
vector
<
T
>::
const_iterator
end
()
const
{
return
mData
.
end
();
}
TensorDescriptor
mDesc
;
Host
TensorDescriptor
mDesc
;
std
::
vector
<
T
>
mData
;
};
...
...
driver/src/driver.cpp
View file @
c9a8e558
...
...
@@ -4,7 +4,7 @@
#include <cstdlib>
#include <stdlib.h>
#include "config.hpp"
#include "
C
onstant
T
ensor
D
escriptor.hpp"
#include "
c
onstant
_t
ensor
_d
escriptor.hpp"
#include "device.hpp"
#include "conv_common.hpp"
#include "host_conv.hpp"
...
...
@@ -473,10 +473,10 @@ int main(int argc, char* argv[])
using
in_data_t
=
float
;
using
out_data_t
=
float
;
Tensor
<
in_data_t
>
in_nchw
(
make_TensorDescriptor
(
in_nchw_desc
));
Tensor
<
in_data_t
>
wei_kcyx
(
make_TensorDescriptor
(
wei_kcyx_desc
));
Tensor
<
out_data_t
>
out_nkhw_host
(
make_TensorDescriptor
(
out_nkhw_desc
));
Tensor
<
out_data_t
>
out_nkhw_device
(
make_TensorDescriptor
(
out_nkhw_desc
));
Host
Tensor
<
in_data_t
>
in_nchw
(
make_TensorDescriptor
(
in_nchw_desc
));
Host
Tensor
<
in_data_t
>
wei_kcyx
(
make_TensorDescriptor
(
wei_kcyx_desc
));
Host
Tensor
<
out_data_t
>
out_nkhw_host
(
make_TensorDescriptor
(
out_nkhw_desc
));
Host
Tensor
<
out_data_t
>
out_nkhw_device
(
make_TensorDescriptor
(
out_nkhw_desc
));
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
();
...
...
@@ -491,7 +491,7 @@ int main(int argc, char* argv[])
if
(
do_verification
)
{
#if
1
#if
0
in_nchw.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
#elif
0
...
...
@@ -503,6 +503,9 @@ int main(int argc, char* argv[])
#elif 1
in_nchw
.
GenerateTensorValue
(
GeneratorTensor_2
{
-
5
,
5
},
num_thread
);
wei_kcyx
.
GenerateTensorValue
(
GeneratorTensor_2
{
-
5
,
5
},
num_thread
);
#elif 0
in_nchw
.
GenerateTensorValue
(
GeneratorTensor_1
{},
num_thread
);
wei_kcyx
.
GenerateTensorValue
(
GeneratorTensor_3
{},
num_thread
);
#elif 0
in_nchw
.
GenerateTensorValue
(
GeneratorTensor_2
{
1
,
5
},
num_thread
);
...
...
driver/src/tensor.cpp
View file @
c9a8e558
...
...
@@ -3,17 +3,18 @@
#include "tensor.hpp"
TensorDescriptor
::
TensorDescriptor
(
std
::
initializer_list
<
std
::
size_t
>
lens
)
:
mLens
(
lens
)
Host
TensorDescriptor
::
Host
TensorDescriptor
(
std
::
initializer_list
<
std
::
size_t
>
lens
)
:
mLens
(
lens
)
{
this
->
CalculateStrides
();
}
TensorDescriptor
::
TensorDescriptor
(
std
::
vector
<
std
::
size_t
>
lens
,
std
::
vector
<
std
::
size_t
>
strides
)
HostTensorDescriptor
::
HostTensorDescriptor
(
std
::
vector
<
std
::
size_t
>
lens
,
std
::
vector
<
std
::
size_t
>
strides
)
:
mLens
(
lens
),
mStrides
(
strides
)
{
}
void
TensorDescriptor
::
CalculateStrides
()
void
Host
TensorDescriptor
::
CalculateStrides
()
{
mStrides
.
clear
();
mStrides
.
resize
(
mLens
.
size
(),
0
);
...
...
@@ -25,21 +26,21 @@ void TensorDescriptor::CalculateStrides()
mLens
.
rbegin
(),
mLens
.
rend
()
-
1
,
mStrides
.
rbegin
()
+
1
,
std
::
multiplies
<
std
::
size_t
>
());
}
std
::
size_t
TensorDescriptor
::
GetNumOfDimension
()
const
{
return
mLens
.
size
();
}
std
::
size_t
Host
TensorDescriptor
::
GetNumOfDimension
()
const
{
return
mLens
.
size
();
}
std
::
size_t
TensorDescriptor
::
GetElementSize
()
const
std
::
size_t
Host
TensorDescriptor
::
GetElementSize
()
const
{
assert
(
mLens
.
size
()
==
mStrides
.
size
());
return
std
::
accumulate
(
mLens
.
begin
(),
mLens
.
end
(),
std
::
size_t
{
1
},
std
::
multiplies
<
std
::
size_t
>
());
}
std
::
size_t
TensorDescriptor
::
GetElementSpace
()
const
std
::
size_t
Host
TensorDescriptor
::
GetElementSpace
()
const
{
auto
ls
=
mLens
|
boost
::
adaptors
::
transformed
([](
std
::
size_t
v
)
{
return
v
-
1
;
});
return
std
::
inner_product
(
ls
.
begin
(),
ls
.
end
(),
mStrides
.
begin
(),
std
::
size_t
{
0
})
+
1
;
}
const
std
::
vector
<
std
::
size_t
>&
TensorDescriptor
::
GetLengths
()
const
{
return
mLens
;
}
const
std
::
vector
<
std
::
size_t
>&
Host
TensorDescriptor
::
GetLengths
()
const
{
return
mLens
;
}
const
std
::
vector
<
std
::
size_t
>&
TensorDescriptor
::
GetStrides
()
const
{
return
mStrides
;
}
const
std
::
vector
<
std
::
size_t
>&
Host
TensorDescriptor
::
GetStrides
()
const
{
return
mStrides
;
}
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment