Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
e131f6aa
Commit
e131f6aa
authored
Apr 18, 2020
by
Chao Liu
Browse files
refactor
parent
f64fab12
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
27 additions
and
27 deletions
+27
-27
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp
...n_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp
+0
-10
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
.../tensor_operation/blockwise_generic_tensor_slice_copy.hpp
+27
-7
composable_kernel/include/tensor_operation/gridwise_gemm.hpp
composable_kernel/include/tensor_operation/gridwise_gemm.hpp
+0
-10
No files found.
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp
View file @
e131f6aa
...
...
@@ -164,7 +164,6 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
constexpr
index_t
KBlockWork
=
K
/
KPerBlock
;
constexpr
index_t
BBlockWork
=
B
/
BPerBlock
;
#if 0
constexpr
auto
block_work_desc
=
make_cluster_descriptor
(
Sequence
<
KBlockWork
,
BBlockWork
>
{});
...
...
@@ -172,15 +171,6 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
const
index_t
k_block_data_on_global
=
block_work_id
[
0
]
*
KPerBlock
;
const
index_t
b_block_data_on_global
=
block_work_id
[
1
]
*
BPerBlock
;
#else
constexpr
auto
block_work_desc
=
make_cluster_descriptor
(
Sequence
<
BBlockWork
,
KBlockWork
>
{});
const
auto
block_work_id
=
block_work_desc
.
CalculateClusterIndex
(
get_block_1d_id
());
const
index_t
b_block_data_on_global
=
block_work_id
[
0
]
*
BPerBlock
;
const
index_t
k_block_data_on_global
=
block_work_id
[
1
]
*
KPerBlock
;
#endif
// input tensor
// global tensor in global memory
...
...
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
View file @
e131f6aa
...
...
@@ -56,11 +56,6 @@ struct BlockwiseGenericTensorSliceCopy_v4
constexpr
auto
thread_cluster_desc
=
make_cluster_descriptor
(
ThreadClusterLengths
{},
ThreadClusterArrangeOrder
{});
#if 0
static_assert(BlockSize == thread_cluster_desc.GetElementSize(),
"wrong! BlockSize not consistent with ThreadClusterLengths");
#endif
const
auto
thread_cluster_id
=
thread_cluster_desc
.
CalculateClusterIndex
(
get_thread_local_1d_id
());
...
...
@@ -88,7 +83,19 @@ struct BlockwiseGenericTensorSliceCopy_v4
constexpr
auto
thread_cluster_desc
=
make_cluster_descriptor
(
ThreadClusterLengths
{},
ThreadClusterArrangeOrder
{});
if
(
get_thread_local_1d_id
()
<
thread_cluster_desc
.
GetElementSize
())
if
(
BlockSize
==
thread_cluster_desc
.
GetElementSize
())
{
// TODO: threadwise copy is still being tweaked
if
(
has_optimized_address_calculation
)
{
mThreadwiseLoad
.
Run_optimized_src_address_calculation
(
p_block_src
,
p_thread_buffer
);
}
else
{
mThreadwiseLoad
.
Run
(
p_block_src
,
p_thread_buffer
);
}
}
else
if
(
get_thread_local_1d_id
()
<
thread_cluster_desc
.
GetElementSize
())
{
// TODO: threadwise copy is still being tweaked
if
(
has_optimized_address_calculation
)
...
...
@@ -112,7 +119,20 @@ struct BlockwiseGenericTensorSliceCopy_v4
constexpr
auto
thread_cluster_desc
=
make_cluster_descriptor
(
ThreadClusterLengths
{},
ThreadClusterArrangeOrder
{});
if
(
get_thread_local_1d_id
()
<
thread_cluster_desc
.
GetElementSize
())
if
(
BlockSize
==
thread_cluster_desc
.
GetElementSize
())
{
// TODO: threadwise copy is still being tweaked
if
(
has_optimized_address_calculation
)
{
mThreadwiseStore
.
Run_optimized_dst_address_calculation
(
p_thread_buffer
,
p_block_dst
);
}
else
{
mThreadwiseStore
.
Run
(
p_thread_buffer
,
p_block_dst
);
}
}
else
if
(
get_thread_local_1d_id
()
<
thread_cluster_desc
.
GetElementSize
())
{
// TODO: threadwise copy is still being tweaked
if
(
has_optimized_address_calculation
)
...
...
composable_kernel/include/tensor_operation/gridwise_gemm.hpp
View file @
e131f6aa
...
...
@@ -111,7 +111,6 @@ struct GridwiseGemmTransposedANormalBNormalC_v1
constexpr
index_t
MBlockWork
=
M
/
MPerBlock
;
constexpr
index_t
NBlockWork
=
N
/
NPerBlock
;
#if 1
constexpr
auto
block_work_desc
=
make_cluster_descriptor
(
Sequence
<
MBlockWork
,
NBlockWork
>
{});
...
...
@@ -119,15 +118,6 @@ struct GridwiseGemmTransposedANormalBNormalC_v1
const
index_t
m_block_data_on_global
=
block_work_id
[
0
]
*
MPerBlock
;
const
index_t
n_block_data_on_global
=
block_work_id
[
1
]
*
NPerBlock
;
#else
constexpr
auto
block_work_desc
=
make_cluster_descriptor
(
Sequence
<
NBlockWork
,
MBlockWork
>
{});
const
auto
block_work_id
=
block_work_desc
.
CalculateClusterIndex
(
get_block_1d_id
());
const
index_t
n_block_data_on_global
=
block_work_id
[
0
]
*
NPerBlock
;
const
index_t
m_block_data_on_global
=
block_work_id
[
1
]
*
MPerBlock
;
#endif
// A matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment