Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
888a0a95
"...composable_kernel.git" did not exist on "20a672d0b836cac308518c41a78d486dce6d8e09"
Commit
888a0a95
authored
Jun 01, 2021
by
Chao Liu
Browse files
fixed perf issue
parent
4774d863
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
10 additions
and
62 deletions
+10
-62
composable_kernel/include/kernel_algorithm/transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp
...m_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp
+1
-2
composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_v1r2.hpp
...l/include/tensor_operation/gridwise_dynamic_gemm_v1r2.hpp
+9
-39
driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4r2_nchw_kcyx_nkhw.hpp
...nvolution_forward_implicit_gemm_v4r4r2_nchw_kcyx_nkhw.hpp
+0
-21
No files found.
composable_kernel/include/kernel_algorithm/transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp
View file @
888a0a95
...
@@ -93,8 +93,7 @@ transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw_pad(
...
@@ -93,8 +93,7 @@ transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw_pad(
// output tensor
// output tensor
const
auto
out_gemmm_gemmn_global_desc
=
transform_dynamic_tensor_descriptor
(
const
auto
out_gemmm_gemmn_global_desc
=
transform_dynamic_tensor_descriptor
(
make_dynamic_naive_tensor_descriptor_packed_v2
(
make_tuple
(
N
,
K
,
Ho
*
Wo
)),
make_dynamic_naive_tensor_descriptor_packed_v2
(
make_tuple
(
N
,
K
,
Ho
*
Wo
)),
make_tuple
(
make_pass_through_transform
(
K
),
make_tuple
(
make_pass_through_transform
(
K
),
make_merge_transform
(
make_tuple
(
N
,
Ho
*
Wo
))),
make_merge_transform
(
make_tuple
(
N
,
Ho
*
Wo
))),
make_tuple
(
Sequence
<
1
>
{},
Sequence
<
0
,
2
>
{}),
make_tuple
(
Sequence
<
1
>
{},
Sequence
<
0
,
2
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}));
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}));
...
...
composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_v1r2.hpp
View file @
888a0a95
...
@@ -206,12 +206,16 @@ struct GridwiseDynamicGemm_km_kn_m0m1n0n1_v1r2
...
@@ -206,12 +206,16 @@ struct GridwiseDynamicGemm_km_kn_m0m1n0n1_v1r2
const
auto
block_work_idx
=
const
auto
block_work_idx
=
c_block_cluster_desc
.
CalculateBottomIndex
(
make_multi_index
(
get_block_1d_id
()));
c_block_cluster_desc
.
CalculateBottomIndex
(
make_multi_index
(
get_block_1d_id
()));
// HACK: this force m/n_block_data_idx_on_global into SGPR
// HACK: this force index data into SGPR
const
index_t
m_block_work_idx
=
__builtin_amdgcn_readfirstlane
(
block_work_idx
[
I0
]);
const
index_t
n_block_work_idx
=
__builtin_amdgcn_readfirstlane
(
block_work_idx
[
I1
]);
const
index_t
m_block_data_idx_on_global
=
const
index_t
m_block_data_idx_on_global
=
__builtin_amdgcn_readfirstlane
(
block_work_idx
[
I0
]
*
MPerBlock
);
__builtin_amdgcn_readfirstlane
(
m_
block_work_idx
*
MPerBlock
);
const
index_t
n_block_data_idx_on_global
=
const
index_t
n_block_data_idx_on_global
=
__builtin_amdgcn_readfirstlane
(
block_work_idx
[
I1
]
*
NPerBlock
);
__builtin_amdgcn_readfirstlane
(
n_
block_work_idx
*
NPerBlock
);
// lds max alignment
// lds max alignment
constexpr
auto
max_lds_align
=
math
::
lcm
(
Number
<
ABlockTransferDstScalarPerVector_M
>
{},
constexpr
auto
max_lds_align
=
math
::
lcm
(
Number
<
ABlockTransferDstScalarPerVector_M
>
{},
...
@@ -470,39 +474,6 @@ struct GridwiseDynamicGemm_km_kn_m0m1n0n1_v1r2
...
@@ -470,39 +474,6 @@ struct GridwiseDynamicGemm_km_kn_m0m1n0n1_v1r2
// output: register to global memory
// output: register to global memory
{
{
#if 0
constexpr auto M11 = Number<M1PerThread * M1N1ThreadClusterM100 * M1N1ThreadClusterM101>{};
constexpr auto N11 = Number<N1PerThread * M1N1ThreadClusterN100 * M1N1ThreadClusterN101>{};
// hack to control index calculation when iterating over c_m10_n10_m11_n11_global tensor
constexpr auto c_m10_n10_m11_n11_global_tensor_iterator_hacks = CGridIteratorHacks{};
const auto c_thread_data_idx_on_block =
blockwise_gemm.CalculateCM0M1N0N1ThreadOriginIndex(get_thread_local_1d_id());
ThreadwiseDynamicTensorSliceTransfer_v1r3<FloatAcc,
FloatC,
decltype(c_m10_n10_m11_n11_thread_desc),
decltype(c_m10_n10_m11_n11_grid_desc),
decltype(c_m10_n10_m11_n11_thread_tensor_lengths),
CThreadTransferSrcDstAccessOrder,
CThreadTransferSrcDstVectorDim,
CThreadTransferDstScalarPerVector,
CGlobalMemoryDataOperation,
1,
true>{
c_m10_n10_m11_n11_grid_desc,
make_multi_index(m_block_data_idx_on_global / M11 + c_thread_data_idx_on_block[I0],
c_thread_data_idx_on_block[I1],
n_block_data_idx_on_global / N11 + c_thread_data_idx_on_block[I2],
c_thread_data_idx_on_block[I3])}
.Run(c_m10_n10_m11_n11_thread_desc,
make_tuple(I0, I0, I0, I0),
c_thread_buf,
c_m10_n10_m11_n11_grid_desc,
c_grid_buf,
c_m10_n10_m11_n11_global_tensor_iterator_hacks);
#else
constexpr
index_t
M11
=
M1PerThread
*
M1N1ThreadClusterM100
*
M1N1ThreadClusterM101
;
constexpr
index_t
M11
=
M1PerThread
*
M1N1ThreadClusterM100
*
M1N1ThreadClusterM101
;
constexpr
index_t
N11
=
N1PerThread
*
M1N1ThreadClusterN100
*
M1N1ThreadClusterN101
;
constexpr
index_t
N11
=
N1PerThread
*
M1N1ThreadClusterN100
*
M1N1ThreadClusterN101
;
...
@@ -541,10 +512,10 @@ struct GridwiseDynamicGemm_km_kn_m0m1n0n1_v1r2
...
@@ -541,10 +512,10 @@ struct GridwiseDynamicGemm_km_kn_m0m1n0n1_v1r2
CGlobalMemoryDataOperation
,
CGlobalMemoryDataOperation
,
1
,
1
,
true
>
{
c_m0_m10_m11_n0_n10_n11_grid_desc
,
true
>
{
c_m0_m10_m11_n0_n10_n11_grid_desc
,
make_multi_index
(
_
_builtin_amdgcn_readfirstlane
(
block_work_idx
[
I0
])
,
make_multi_index
(
m
_block_work_idx
,
c_m10_m11_n10_n11_thread_origin_idx_on_block
[
I0
],
c_m10_m11_n10_n11_thread_origin_idx_on_block
[
I0
],
c_m10_m11_n10_n11_thread_origin_idx_on_block
[
I1
],
c_m10_m11_n10_n11_thread_origin_idx_on_block
[
I1
],
_
_builtin_amdgcn_readfirstlane
(
block_work_idx
[
I1
])
,
n
_block_work_idx
,
c_m10_m11_n10_n11_thread_origin_idx_on_block
[
I2
],
c_m10_m11_n10_n11_thread_origin_idx_on_block
[
I2
],
c_m10_m11_n10_n11_thread_origin_idx_on_block
[
I3
])}
c_m10_m11_n10_n11_thread_origin_idx_on_block
[
I3
])}
.
Run
(
c_m0_m10_m11_n0_n10_n11_thread_desc
,
.
Run
(
c_m0_m10_m11_n0_n10_n11_thread_desc
,
...
@@ -553,7 +524,6 @@ struct GridwiseDynamicGemm_km_kn_m0m1n0n1_v1r2
...
@@ -553,7 +524,6 @@ struct GridwiseDynamicGemm_km_kn_m0m1n0n1_v1r2
c_m0_m10_m11_n0_n10_n11_grid_desc
,
c_m0_m10_m11_n0_n10_n11_grid_desc
,
c_grid_buf
,
c_grid_buf
,
CGridIteratorHacks
{});
CGridIteratorHacks
{});
#endif
}
}
}
}
...
...
driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4r2_nchw_kcyx_nkhw.hpp
View file @
888a0a95
...
@@ -499,18 +499,6 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4r2_nchw_kcyx_nkhw(
...
@@ -499,18 +499,6 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4r2_nchw_kcyx_nkhw(
constexpr
auto
in_gemmk_gemmn_grid_move_slice_window_iterator_hacks
=
constexpr
auto
in_gemmk_gemmn_grid_move_slice_window_iterator_hacks
=
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
1
,
2
>
{};
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
1
,
2
>
{};
#if 0
// hack to control index calculation when iterating over out_gemmm0_gemmm1_gemmn0_gemmn1_grid
constexpr auto out_gemmm0_gemmm1_gemmn0_gemmn1_grid_iterator_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{}));
#else
constexpr
auto
out_gemmm0_gemmm10_gemmm11_gemmn0_gemmn10_gemmn11_grid_iterator_hacks
=
constexpr
auto
out_gemmm0_gemmm10_gemmm11_gemmn0_gemmn10_gemmn11_grid_iterator_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
make_tuple
(
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
...
@@ -524,7 +512,6 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4r2_nchw_kcyx_nkhw(
...
@@ -524,7 +512,6 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4r2_nchw_kcyx_nkhw(
Sequence
<
0
,
0
,
2
,
0
,
0
>
{},
Sequence
<
0
,
0
,
2
,
0
,
0
>
{},
Sequence
<
0
,
0
,
2
,
0
,
0
>
{},
Sequence
<
0
,
0
,
2
,
0
,
0
>
{},
Sequence
<
0
,
0
,
2
,
0
,
0
>
{}));
Sequence
<
0
,
0
,
2
,
0
,
0
>
{}));
#endif
for
(
index_t
i
=
0
;
i
<
5
;
++
i
)
for
(
index_t
i
=
0
;
i
<
5
;
++
i
)
{
{
...
@@ -569,11 +556,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4r2_nchw_kcyx_nkhw(
...
@@ -569,11 +556,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4r2_nchw_kcyx_nkhw(
GemmCThreadTransferDstScalarPerVector_GemmN1
,
GemmCThreadTransferDstScalarPerVector_GemmN1
,
decltype
(
wei_gemmk_gemmm_grid_iterator_hacks
),
decltype
(
wei_gemmk_gemmm_grid_iterator_hacks
),
decltype
(
in_gemmk_gemmn_grid_iterator_hacks
),
decltype
(
in_gemmk_gemmn_grid_iterator_hacks
),
#if 0
decltype(out_gemmm0_gemmm1_gemmn0_gemmn1_grid_iterator_hacks),
#else
decltype
(
out_gemmm0_gemmm10_gemmm11_gemmn0_gemmn10_gemmn11_grid_iterator_hacks
),
decltype
(
out_gemmm0_gemmm10_gemmm11_gemmn0_gemmn10_gemmn11_grid_iterator_hacks
),
#endif
decltype
(
wei_gemmk_gemmm_grid_move_slice_window_iterator_hacks
),
decltype
(
wei_gemmk_gemmm_grid_move_slice_window_iterator_hacks
),
decltype
(
in_gemmk_gemmn_grid_move_slice_window_iterator_hacks
)
>
(
decltype
(
in_gemmk_gemmn_grid_move_slice_window_iterator_hacks
)
>
(
static_cast
<
typename
vector_type
<
TInWei
,
InWeiVectorSize
>::
type
*>
(
static_cast
<
typename
vector_type
<
TInWei
,
InWeiVectorSize
>::
type
*>
(
...
@@ -586,11 +569,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4r2_nchw_kcyx_nkhw(
...
@@ -586,11 +569,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4r2_nchw_kcyx_nkhw(
out_gemmm_gemmn_grid_desc
,
out_gemmm_gemmn_grid_desc
,
wei_gemmk_gemmm_grid_iterator_hacks
,
wei_gemmk_gemmm_grid_iterator_hacks
,
in_gemmk_gemmn_grid_iterator_hacks
,
in_gemmk_gemmn_grid_iterator_hacks
,
#if 0
out_gemmm0_gemmm1_gemmn0_gemmn1_grid_iterator_hacks,
#else
out_gemmm0_gemmm10_gemmm11_gemmn0_gemmn10_gemmn11_grid_iterator_hacks
,
out_gemmm0_gemmm10_gemmm11_gemmn0_gemmn10_gemmn11_grid_iterator_hacks
,
#endif
wei_gemmk_gemmm_grid_move_slice_window_iterator_hacks
,
wei_gemmk_gemmm_grid_move_slice_window_iterator_hacks
,
in_gemmk_gemmn_grid_move_slice_window_iterator_hacks
,
in_gemmk_gemmn_grid_move_slice_window_iterator_hacks
,
nrepeat
);
nrepeat
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment