Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
0924d5e5
Commit
0924d5e5
authored
Apr 14, 2021
by
Jing Zhang
Browse files
debugging
parent
5e127c69
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
36 additions
and
34 deletions
+36
-34
composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_v2.hpp
...nel/include/tensor_operation/gridwise_dynamic_gemm_v2.hpp
+36
-34
No files found.
composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_v2.hpp
View file @
0924d5e5
...
...
@@ -355,7 +355,7 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
#endif
// output: register to global memory
#if
1
#if
0
{
constexpr auto HoPerThreadx2 = HoPerThread * 2;
constexpr auto WoPerThreadx2 = WoPerThread * 2;
...
...
@@ -480,12 +480,14 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
Number
<
HoPerThreadx2
>
{},
Number
<
WoPerThreadx2
>
{}));
constexpr
auto
vector_len
=
KPerThread
*
HoPerThreadx2
*
WoPerThreadx2
;
constexpr
auto
vector_len
=
d_k_n_hox2_wox2_thread_desc
.
GetElementSpaceSize
()
*
CThreadTransferDstScalarPerVector
;
constexpr
auto
c_k_n_ho_wo_global_tensor_iterator_hacks
=
CGlobalIteratorHacks
{};
vector_type
<
int8_t
,
vector_len
>
d_vec
;
auto
d_threadwise_transfer
=
ThreadwiseDynamicTensorSliceTransfer_v2
<
constexpr
auto
c_k_n_ho_wo_global_tensor_iterator_hacks
=
CGlobalIteratorHacks
{};
ThreadwiseDynamicTensorSliceTransfer_v2
<
FloatC
,
decltype
(
d_vec
),
decltype
(
d_k_n_hox2_wox2_global_desc
),
...
...
@@ -503,35 +505,16 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
make_multi_index
(
k_thread_data_on_global_add
,
0
,
hox2_thread_data_on_global
,
wox2_thread_data_on_global
));
auto
c_threadwise_transfer
=
ThreadwiseDynamicTensorSliceTransfer_v1r3
<
decltype
(
d_vec
),
FloatC
,
decltype
(
d_k_n_hox2_wox2_thread_desc
),
decltype
(
d_k_n_hox2_wox2_global_desc
),
Sequence
<
KPerThreadAdd
,
1
,
HoPerThreadx2
,
WoPerThreadx2
>
,
CThreadTransferSrcDstAccessOrder
,
CThreadTransferSrcDstVectorDim
,
// CThreadTransferDstScalarPerVector,
1
,
AddressSpace
::
Vgpr
,
AddressSpace
::
Global
,
CGlobalMemoryDataOperation
,
1
,
true
>
(
d_k_n_hox2_wox2_global_desc
,
make_multi_index
(
k_thread_data_on_global_add
,
0
,
hox2_thread_data_on_global
,
wox2_thread_data_on_global
));
d_threadwise_transfer
.
Run2
(
d_k_n_hox2_wox2_global_desc
,
wox2_thread_data_on_global
))
.
Run2
(
d_k_n_hox2_wox2_global_desc
,
p_d_global
,
d_k_n_hox2_wox2_thread_desc
,
make_tuple
(
I0
,
I0
,
I0
,
I0
),
d_vec
,
c_k_n_ho_wo_global_tensor_iterator_hacks
);
static_assert
(
vector_len
==
256
,
""
);
static_for
<
0
,
vector_len
,
1
>
{}([
&
](
auto
i
)
{
constexpr
auto
kpack_i
=
i
%
(
CThreadTransferDstScalarPerVector
);
constexpr
auto
khw_i
=
i
/
(
CThreadTransferDstScalarPerVector
);
...
...
@@ -545,7 +528,26 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
k_i
*
CThreadTransferDstScalarPerVector
+
kpack_i
,
0
,
h_i
/
2
,
w_i
/
2
))];
});
c_threadwise_transfer
.
Run2
(
d_k_n_hox2_wox2_thread_desc
,
ThreadwiseDynamicTensorSliceTransfer_v1r3
<
decltype
(
d_vec
),
FloatC
,
decltype
(
d_k_n_hox2_wox2_thread_desc
),
decltype
(
d_k_n_hox2_wox2_global_desc
),
Sequence
<
KPerThreadAdd
,
1
,
HoPerThreadx2
,
WoPerThreadx2
>
,
CThreadTransferSrcDstAccessOrder
,
CThreadTransferSrcDstVectorDim
,
// CThreadTransferDstScalarPerVector,
1
,
AddressSpace
::
Vgpr
,
AddressSpace
::
Global
,
CGlobalMemoryDataOperation
,
1
,
true
>
(
d_k_n_hox2_wox2_global_desc
,
make_multi_index
(
k_thread_data_on_global_add
,
0
,
hox2_thread_data_on_global
,
wox2_thread_data_on_global
))
.
Run2
(
d_k_n_hox2_wox2_thread_desc
,
make_tuple
(
I0
,
I0
,
I0
,
I0
),
d_vec
,
d_k_n_hox2_wox2_global_desc
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment