Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
25b71afc
Commit
25b71afc
authored
Apr 14, 2021
by
Jing Zhang
Browse files
debugging add nest loops
parent
4500596a
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
43 additions
and
35 deletions
+43
-35
composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_v2.hpp
...nel/include/tensor_operation/gridwise_dynamic_gemm_v2.hpp
+31
-23
composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer.hpp
...or_operation/threadwise_dynamic_tensor_slice_transfer.hpp
+12
-12
No files found.
composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_v2.hpp
View file @
25b71afc
...
@@ -159,6 +159,12 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
...
@@ -159,6 +159,12 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
ABlockTransferSrcScalarPerVector
,
ABlockTransferSrcScalarPerVector
,
ABlockTransferDstScalarPerVector_K
>
{};
ABlockTransferDstScalarPerVector_K
>
{};
// register allocation for output
FloatAcc
p_c_thread
[
c_k_n_ho_wo_thread_desc
.
GetElementSpaceSize
()];
// zero out threadwise output
threadwise_matrix_set_zero_v3
(
c_k_n_ho_wo_thread_desc
,
p_c_thread
);
auto
c_thread_mtx_index
=
blockwise_gemm
.
GetBeginOfThreadMatrixC
(
get_thread_local_1d_id
());
auto
c_thread_mtx_index
=
blockwise_gemm
.
GetBeginOfThreadMatrixC
(
get_thread_local_1d_id
());
const
auto
k_thread_id
=
c_thread_mtx_index
.
k
;
const
auto
k_thread_id
=
c_thread_mtx_index
.
k
;
...
@@ -226,12 +232,6 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
...
@@ -226,12 +232,6 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
FloatAB
*
p_a_block
=
p_shared_block
;
FloatAB
*
p_a_block
=
p_shared_block
;
// register allocation for output
FloatAcc
p_c_thread
[
c_k_n_ho_wo_thread_desc
.
GetElementSpaceSize
()];
// zero out threadwise output
threadwise_matrix_set_zero_v3
(
c_k_n_ho_wo_thread_desc
,
p_c_thread
);
constexpr
auto
b_thread_slice_copy_step
=
make_multi_index
(
EPerBlock
,
0
,
0
,
0
);
constexpr
auto
b_thread_slice_copy_step
=
make_multi_index
(
EPerBlock
,
0
,
0
,
0
);
// hack to control index calculation when iterating over A and B matrix for threadwise copy
// hack to control index calculation when iterating over A and B matrix for threadwise copy
...
@@ -514,29 +514,37 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
...
@@ -514,29 +514,37 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
d_vec
,
d_vec
,
c_k_n_ho_wo_global_tensor_iterator_hacks
);
c_k_n_ho_wo_global_tensor_iterator_hacks
);
#if 1
static_for
<
0
,
KPerThreadAdd
,
1
>
{}([
&
](
auto
k_i
)
{
static_for
<
0
,
HoPerThreadx2
,
1
>
{}([
&
](
auto
h_i
)
{
static_for
<
0
,
WoPerThreadx2
,
1
>
{}([
&
](
auto
w_i
)
{
vector_type
<
int8_t
,
CThreadTransferDstScalarPerVector
>
t
;
static_for
<
0
,
d_k_n_hox2_wox2_thread_desc
.
GetElementSpaceSize
(),
1
>
{}([
&
](
auto
j
)
{
// t.template AsType<FloatC>()(Number<0>{}) = d_vec.template AsType<
vector_type
<
int8_t
,
CThreadTransferDstScalarPerVector
>
t
;
// FloatC>()[Number<d_k_n_hox2_wox2_thread_desc.CalculateOffset(
// make_tuple(k_i, 0, h_i, w_i))>{}];
constexpr
auto
k_i
=
j
/
(
HoPerThreadx2
*
WoPerThreadx2
);
t
.
template
AsType
<
FloatC
>()(
Number
<
0
>
{})
=
constexpr
auto
hw_i
=
j
%
(
HoPerThreadx2
*
WoPerThreadx2
);
d_vec
[
Number
<
d_k_n_hox2_wox2_thread_desc
.
CalculateOffset
(
constexpr
auto
h_i
=
hw_i
/
WoPerThreadx2
;
make_tuple
(
k_i
,
0
,
h_i
,
w_i
))
>
{}];
constexpr
auto
w_i
=
hw_i
%
WoPerThreadx2
;
// t.template AsType<FloatC>()(Number<0>{}) = d_vec.template AsType<FloatC>()[j];
static_for
<
0
,
CThreadTransferDstScalarPerVector
,
1
>
{}([
&
](
auto
i
)
{
t
.
template
AsType
<
FloatC
>()(
Number
<
0
>
{})
=
d_vec
[
j
];
t
.
template
AsType
<
int8_t
>()(
i
)
+=
p_c_thread
[
c_k_n_ho_wo_thread_desc
.
CalculateOffset
(
make_tuple
(
k_i
*
CThreadTransferDstScalarPerVector
+
i
,
0
,
h_i
/
2
,
w_i
/
2
))];
});
static_for
<
0
,
CThreadTransferDstScalarPerVector
,
1
>
{}([
&
](
auto
i
)
{
// d_vec.template AsType<FloatC>()(
t
.
template
AsType
<
int8_t
>()(
i
)
+=
// Number<d_k_n_hox2_wox2_thread_desc.CalculateOffset(make_tuple(
p_c_thread
[
c_k_n_ho_wo_thread_desc
.
CalculateOffset
(
make_tuple
(
// k_i, 0, h_i, w_i))>{}) = t.template AsType<FloatC>()[Number<0>{}];
k_i
*
CThreadTransferDstScalarPerVector
+
i
,
0
,
h_i
/
2
,
w_i
/
2
))];
});
// d_vec.template AsType<FloatC>()(j) = t.template AsType<FloatC>()[Number<0>{}];
d_vec
[
Number
<
d_k_n_hox2_wox2_thread_desc
.
CalculateOffset
(
make_tuple
(
d_vec
[
j
]
=
t
.
template
AsType
<
FloatC
>()[
Number
<
0
>
{}];
k_i
,
0
,
h_i
,
w_i
))
>
{}]
=
t
.
template
AsType
<
FloatC
>()[
Number
<
0
>
{}];
});
});
});
});
#endif
ThreadwiseDynamicTensorSliceTransfer_v1r3
<
ThreadwiseDynamicTensorSliceTransfer_v1r3
<
// decltype(d_vec),
// decltype(d_vec),
...
...
composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer.hpp
View file @
25b71afc
...
@@ -265,12 +265,12 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r3
...
@@ -265,12 +265,12 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r3
}
}
template
<
typename
SrcSliceOriginIdx
,
typename
DstIteratorHacks
>
template
<
typename
SrcSliceOriginIdx
,
typename
DstIteratorHacks
>
__device__
void
Run
2
(
const
SrcDesc
&
,
__device__
void
Run
(
const
SrcDesc
&
,
const
SrcSliceOriginIdx
&
,
const
SrcSliceOriginIdx
&
,
const
SrcData
&
p_src
,
const
SrcData
&
p_src
,
const
DstDesc
&
dst_desc
,
const
DstDesc
&
dst_desc
,
DstData
*
p_dst
,
DstData
*
p_dst
,
const
DstIteratorHacks
&
dst_iterator_hacks
)
const
DstIteratorHacks
&
dst_iterator_hacks
)
{
{
static_assert
(
SrcDesc
::
IsKnownAtCompileTime
(),
static_assert
(
SrcDesc
::
IsKnownAtCompileTime
(),
"wrong! SrcDesc need to known at compile-time"
);
"wrong! SrcDesc need to known at compile-time"
);
...
@@ -785,12 +785,12 @@ struct ThreadwiseDynamicTensorSliceTransfer_v2
...
@@ -785,12 +785,12 @@ struct ThreadwiseDynamicTensorSliceTransfer_v2
}
}
template
<
typename
DstSliceOriginIdx
,
typename
SrcIteratorHacks
>
template
<
typename
DstSliceOriginIdx
,
typename
SrcIteratorHacks
>
__device__
void
Run
2
(
const
SrcDesc
&
src_desc
,
__device__
void
Run
(
const
SrcDesc
&
src_desc
,
const
SrcData
*
p_src
,
const
SrcData
*
p_src
,
const
DstDesc
&
,
const
DstDesc
&
,
const
DstSliceOriginIdx
&
,
const
DstSliceOriginIdx
&
,
DstData
&
p_dst
,
DstData
&
p_dst
,
const
SrcIteratorHacks
&
src_iterator_hacks
)
const
SrcIteratorHacks
&
src_iterator_hacks
)
{
{
static_assert
(
DstDesc
::
IsKnownAtCompileTime
(),
static_assert
(
DstDesc
::
IsKnownAtCompileTime
(),
"wrong! DstDesc need to known at compile-time"
);
"wrong! DstDesc need to known at compile-time"
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment