Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
4904281f
"git@developer.sourcefind.cn:gaoqiong/composable_kernel.git" did not exist on "18ffbd680273c5970ff1d105c1ee6fca99e0df88"
Commit
4904281f
authored
May 18, 2020
by
root
Browse files
try vector-load for 13x13 image
parent
7d09790a
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
158 additions
and
4 deletions
+158
-4
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
...tensor_operation/threadwise_generic_tensor_slice_copy.hpp
+48
-0
driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
.../device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
+78
-3
driver/src/conv_driver.cpp
driver/src/conv_driver.cpp
+32
-1
No files found.
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
View file @
4904281f
...
...
@@ -97,6 +97,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
p_src_long_vector
[
i
]
=
0
;
}
#if 0 //original code
// load data from src to the long-vector buffer
for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i)
{
...
...
@@ -120,6 +121,53 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
p_src, src_coord.GetOffset(), p_src_long_vector, buffer_offset);
}
}
#else
//try vector load
// load data from src to the long-vector buffer
index_t
i
=
0
;
while
(
i
<
long_vector_size
){
auto
scalar_id
=
make_zero_array
<
index_t
,
nDim
>
();
scalar_id
(
vector_access_dim
)
=
i
;
const
index_t
buffer_offset
=
i
;
const
auto
src_coord
=
mSrcSliceOrigin
+
(
long_vector_data_begin_id
+
scalar_id
);
scalar_id
(
vector_access_dim
)
=
i
+
3
;
const
auto
src_coord3
=
mSrcSliceOrigin
+
(
long_vector_data_begin_id
+
scalar_id
);
scalar_id
(
vector_access_dim
)
=
i
+
1
;
const
auto
src_coord1
=
mSrcSliceOrigin
+
(
long_vector_data_begin_id
+
scalar_id
);
if
(((
long_vector_size
-
i
)
>=
4
)
&&
src_coord
.
IsOffsetValidAssumingUpperIndexIsValid
()
&&
src_coord3
.
IsOffsetValidAssumingUpperIndexIsValid
()
&&
(
src_coord
.
GetOffset
()
+
3
==
src_coord3
.
GetOffset
())){
transfer_data
<
SrcData
,
4
,
SrcAddressSpace
,
AddressSpace
::
Vgpr
,
InMemoryDataOperation
::
Set
>
(
p_src
,
src_coord
.
GetOffset
(),
p_src_long_vector
,
buffer_offset
);
i
=
i
+
4
;
}
else
if
(((
long_vector_size
-
i
)
>=
2
)
&&
src_coord
.
IsOffsetValidAssumingUpperIndexIsValid
()
&&
src_coord1
.
IsOffsetValidAssumingUpperIndexIsValid
()
&&
((
src_coord
.
GetOffset
()
+
1
)
==
src_coord1
.
GetOffset
())){
transfer_data
<
SrcData
,
2
,
SrcAddressSpace
,
AddressSpace
::
Vgpr
,
InMemoryDataOperation
::
Set
>
(
p_src
,
src_coord
.
GetOffset
(),
p_src_long_vector
,
buffer_offset
);
i
=
i
+
2
;
}
else
{
// Check src data's valid mapping situation, only check the first data in this src
// vector. It's user's responsiblity to make sure all data in the src vector
// has the valid/invalid mapping situation
if
(
src_coord
.
IsOffsetValidAssumingUpperIndexIsValid
())
{
transfer_data
<
SrcData
,
1
,
SrcAddressSpace
,
AddressSpace
::
Vgpr
,
InMemoryDataOperation
::
Set
>
(
p_src
,
src_coord
.
GetOffset
(),
p_src_long_vector
,
buffer_offset
);
}
i
++
;
}
}
#endif
// SrcData to DstData conversion
DstData
p_dst_long_vector
[
long_vector_size
];
...
...
driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
View file @
4904281f
...
...
@@ -147,7 +147,7 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
constexpr
index_t
GemmBBlockCopyDstDataPerWrite_GemmN
=
4
;
constexpr
index_t
GemmCThreadCopyDstDataPerWrite_GemmN1
=
4
;
#elif
1
#elif
0
// BlockSize = 256, GemmKPerBlock = 16
// for 1x1 filter, vector-read-b = 4
constexpr
index_t
BlockSize
=
256
;
...
...
@@ -179,7 +179,7 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
constexpr
index_t
GemmBBlockCopyDstDataPerWrite_GemmN
=
4
;
constexpr
index_t
GemmCThreadCopyDstDataPerWrite_GemmN1
=
4
;
#elif
1
#elif
0
// 1x1 filter, 14x14 image
constexpr
index_t
BlockSize
=
256
;
...
...
@@ -210,6 +210,81 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
constexpr
index_t
GemmBBlockCopyDstDataPerWrite_GemmN
=
2
;
constexpr
index_t
GemmCThreadCopyDstDataPerWrite_GemmN1
=
2
;
#elif 0
// 1x1 filter, 7x7 image
constexpr
index_t
BlockSize
=
128
;
constexpr
index_t
GemmMPerBlock
=
128
;
constexpr
index_t
GemmNPerBlock
=
64
;
constexpr
index_t
GemmKPerBlock
=
16
;
constexpr
index_t
GemmMPerThreadSubC
=
4
;
constexpr
index_t
GemmNPerThreadSubC
=
4
;
constexpr
index_t
GemmMLevel0Cluster
=
4
;
constexpr
index_t
GemmNLevel0Cluster
=
4
;
constexpr
index_t
GemmMLevel1Cluster
=
4
;
constexpr
index_t
GemmNLevel1Cluster
=
2
;
constexpr
index_t
GemmKPerThreadLoop
=
1
;
constexpr
index_t
ThreadGemmDataPerReadM
=
4
;
constexpr
index_t
ThreadGemmDataPerReadN
=
4
;
using
GemmABlockCopyThreadSliceLengths_GemmK_GemmM
=
Sequence
<
4
,
4
>
;
//4>;
using
GemmABlockCopyThreadClusterLengths_GemmK_GemmM
=
Sequence
<
4
,
32
>
;
//32>;
/*
constexpr auto long_vector_access_lengths = SliceLengths::Modify(
vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size); //jane:该维度需要vector-access多少次
*/
//using slicelength = Sequence<4, 32>;
//auto long_vector_access_lengths =
constexpr
index_t
GemmABlockCopySrcDataPerRead_GemmK
=
4
;
constexpr
index_t
GemmABlockCopyDstDataPerWrite_GemmM
=
4
;
using
GemmBBlockCopyThreadSliceLengths_GemmK_GemmN
=
Sequence
<
4
,
2
>
;
//;Sequence<8, 1>;
using
GemmBBlockCopyThreadClusterLengths_GemmK_GemmN
=
Sequence
<
4
,
32
>
;
//Sequence<2, 64>;
constexpr
index_t
GemmBBlockCopySrcDataPerRead_GemmN
=
2
;
constexpr
index_t
GemmBBlockCopyDstDataPerWrite_GemmN
=
1
;
constexpr
index_t
GemmCThreadCopyDstDataPerWrite_GemmN1
=
1
;
#elif 1
// 1x1 filter, 13x13 image
constexpr
index_t
BlockSize
=
128
;
constexpr
index_t
GemmMPerBlock
=
128
;
constexpr
index_t
GemmNPerBlock
=
64
;
constexpr
index_t
GemmKPerBlock
=
16
;
constexpr
index_t
GemmMPerThreadSubC
=
4
;
constexpr
index_t
GemmNPerThreadSubC
=
4
;
constexpr
index_t
GemmMLevel0Cluster
=
4
;
constexpr
index_t
GemmNLevel0Cluster
=
4
;
constexpr
index_t
GemmMLevel1Cluster
=
4
;
constexpr
index_t
GemmNLevel1Cluster
=
2
;
constexpr
index_t
GemmKPerThreadLoop
=
1
;
constexpr
index_t
ThreadGemmDataPerReadM
=
4
;
//这个不起作用
constexpr
index_t
ThreadGemmDataPerReadN
=
4
;
//这个不起作用
using
GemmABlockCopyThreadSliceLengths_GemmK_GemmM
=
Sequence
<
4
,
4
>
;
//4>;
using
GemmABlockCopyThreadClusterLengths_GemmK_GemmM
=
Sequence
<
4
,
32
>
;
//32>;
constexpr
index_t
GemmABlockCopySrcDataPerRead_GemmK
=
4
;
constexpr
index_t
GemmABlockCopyDstDataPerWrite_GemmM
=
4
;
#if 0 //vector load x1
using GemmBBlockCopyThreadSliceLengths_GemmK_GemmN = Sequence<8, 1>;//;Sequence<8, 1>;
using GemmBBlockCopyThreadClusterLengths_GemmK_GemmN = Sequence<2, 64>;//Sequence<2, 64>;
constexpr index_t GemmBBlockCopySrcDataPerRead_GemmN = 1;
#elif
0
//vector load x2
using
GemmBBlockCopyThreadSliceLengths_GemmK_GemmN
=
Sequence
<
4
,
2
>
;
//;Sequence<8, 1>;
using
GemmBBlockCopyThreadClusterLengths_GemmK_GemmN
=
Sequence
<
4
,
32
>
;
//Sequence<2, 64>;
constexpr
index_t
GemmBBlockCopySrcDataPerRead_GemmN
=
2
;
#elif 1 //vector load x4
using
GemmBBlockCopyThreadSliceLengths_GemmK_GemmN
=
Sequence
<
2
,
4
>
;
//;Sequence<8, 1>;
using
GemmBBlockCopyThreadClusterLengths_GemmK_GemmN
=
Sequence
<
8
,
16
>
;
//Sequence<2, 64>;
constexpr
index_t
GemmBBlockCopySrcDataPerRead_GemmN
=
4
;
#endif
constexpr
index_t
GemmBBlockCopyDstDataPerWrite_GemmN
=
1
;
constexpr
index_t
GemmCThreadCopyDstDataPerWrite_GemmN1
=
1
;
#endif
constexpr
index_t
GemmM
=
K
;
...
...
@@ -237,11 +312,11 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
GemmKPerBlock
,
GemmMPerThreadSubC
,
GemmNPerThreadSubC
,
GemmKPerThreadLoop
,
GemmMLevel0Cluster
,
GemmNLevel0Cluster
,
GemmMLevel1Cluster
,
GemmNLevel1Cluster
,
GemmKPerThreadLoop
,
ThreadGemmDataPerReadM
,
ThreadGemmDataPerReadN
,
GemmABlockCopyThreadSliceLengths_GemmK_GemmM
,
...
...
driver/src/conv_driver.cpp
View file @
4904281f
...
...
@@ -29,7 +29,38 @@ int main(int argc, char* argv[])
{
using
namespace
ck
;
#if 1
#if 0
// 1x1 for vector memory access , 7x7 image size
constexpr index_t N = 128;
constexpr index_t C = 256;
constexpr index_t HI = 7;
constexpr index_t WI = 7;
constexpr index_t K = 256;
constexpr index_t Y = 1;
constexpr index_t X = 1;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>;
#elif
1
// 1x1 for vector memory access, 13x13 image size
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
256
;
//1024;
constexpr
index_t
HI
=
13
;
constexpr
index_t
WI
=
13
;
constexpr
index_t
K
=
256
;
//2048;
constexpr
index_t
Y
=
1
;
constexpr
index_t
X
=
1
;
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
LeftPads
=
Sequence
<
0
,
0
>
;
using
RightPads
=
Sequence
<
0
,
0
>
;
#elif 0
// 1x1
constexpr
index_t
N
=
64
;
constexpr
index_t
C
=
64
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment