gaoqiong / composable_kernel / Commits / b6950a3c

Commit b6950a3c, authored Dec 14, 2021 by Chao Liu

    debugging

Parent: 7b5b1c83

Showing 3 changed files with 68 additions and 11 deletions:

    composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_v3r1.hpp              +52  -9
    composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp   +2  -2
    example/4_conv2d_fwd_xdl_output_shuffle/conv2d_fwd_xdl_output_shuffle.cpp             +14  -0
composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_v3r1.hpp

@@ -10,7 +10,7 @@
 #include "threadwise_tensor_slice_transfer.hpp"
 #include "threadwise_tensor_slice_set.hpp"

-#define DEBUG_USE_C_SHUFFLE 0
+#define DEBUG_USE_C_SHUFFLE 1

 namespace ck {

@@ -290,12 +290,12 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
         const auto M = c_grid_desc_m_n.GetLength(I0);
         const auto N = c_grid_desc_m_n.GetLength(I1);

-        const auto MBlock = M / MPerBlock;
-        const auto NBlock = N / NPerBlock;

         constexpr index_t MWave = MPerBlock / (MRepeat * MPerXdl);
         constexpr index_t NWave = NPerBlock / (NRepeat * NPerXdl);

+        const index_t MBlock = M / (MWave * MPerXdl * MRepeat);
+        const index_t NBlock = N / (NWave * NPerXdl * NRepeat);

         const auto c_grid_desc_mblock_mrepeat_mwavemperxdl_nblock_nrepeat_nwavenperxdl =
             transform_tensor_descriptor(
                 c_grid_desc_m_n,
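The hunk above re-derives the block counts from the wave-level factors. Since MWave is defined as MPerBlock / (MRepeat * MPerXdl), the product MWave * MPerXdl * MRepeat equals MPerBlock whenever the division is exact, so the new MBlock/NBlock take the same values as the old ones; only the expression changes. A minimal host-side sketch of that equivalence, using assumed example tile sizes rather than anything taken from this commit:

    // Sanity check of the MBlock/NBlock rewrite above.
    // All tile sizes below are illustrative assumptions, not values from this commit.
    int main()
    {
        constexpr int MPerBlock = 256, NPerBlock = 128;
        constexpr int MRepeat = 4, NRepeat = 2;
        constexpr int MPerXdl = 32, NPerXdl = 32;
        constexpr int M = 1024, N = 1024;

        constexpr int MWave = MPerBlock / (MRepeat * MPerXdl); // 2 waves along M
        constexpr int NWave = NPerBlock / (NRepeat * NPerXdl); // 2 waves along N

        // MWave * MPerXdl * MRepeat covers MPerBlock exactly, so the new
        // expression yields the same block count as the old M / MPerBlock.
        static_assert(MWave * MPerXdl * MRepeat == MPerBlock, "tile factors must cover MPerBlock");
        static_assert(M / (MWave * MPerXdl * MRepeat) == M / MPerBlock, "same MBlock either way");
        static_assert(N / (NWave * NPerXdl * NRepeat) == N / NPerBlock, "same NBlock either way");
        return 0;
    }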
@@ -439,7 +439,11 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
         // A matrix blockwise copy
         auto a_blockwise_copy =
             BlockwiseTensorSliceTransfer_v4<BlockSize,
+#if 0
                                             AElementwiseOperation,
+#else
+                                            ck::tensor_operation::element_wise::PassThrough,
+#endif
                                             InMemoryDataOperationEnum_t::Set,
                                             Sequence<K0PerBlock, MPerBlock, K1>,
                                             ABlockTransferThreadSliceLengths_K0_M_K1,

@@ -467,7 +471,11 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
         // B matrix blockwise copy
         auto b_blockwise_copy =
             BlockwiseTensorSliceTransfer_v4<BlockSize,
+#if 0
                                             BElementwiseOperation,
+#else
+                                            ck::tensor_operation::element_wise::PassThrough,
+#endif
                                             InMemoryDataOperationEnum_t::Set,
                                             Sequence<K0PerBlock, NPerBlock, K1>,
                                             BBlockTransferThreadSliceLengths_K0_N_K1,

@@ -654,7 +662,11 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
             FloatC,
             decltype(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2),
             decltype(c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2),
+#if 0
             CElementwiseOperation,
+#else
+            ck::tensor_operation::element_wise::PassThrough,
+#endif
             Sequence<M0, N0, I1, I1, M2, I1, M4, I1>,
             CThreadTransferSrcDstAccessOrder,
             CThreadTransferSrcDstVectorDim,

@@ -739,7 +751,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
             decltype(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2),
             decltype(c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2),
             ck::tensor_operation::element_wise::PassThrough,
-            Sequence<I1, I1, I1, I1, M2, M3, M4, N2>,
+            Sequence<I1, I1, I1, I1, M2, I1, M4, I1>,
             Sequence<0, 1, 2, 3, 4, 5, 6, 7>,
             7,
             1,

@@ -758,8 +770,8 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
             ck::tensor_operation::element_wise::PassThrough{}};

         // TODO: this is hardcoded, only works for BlockSize = 256. fix it!
-        constexpr index_t MThread_CCopy = 16;
-        constexpr index_t NThread_CCopy = 16;
+        constexpr index_t MThread_CCopy = 32;
+        constexpr index_t NThread_CCopy = 8;

         constexpr index_t MPerThread_CCopy = MPerBlock_CCopy / MThread_CCopy;
         constexpr index_t NPerThread_CCopy = NPerBlock_CCopy / NThread_CCopy;
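The TODO above says the C-shuffle copy thread split is hardcoded for BlockSize = 256, and the change keeps that product intact: 32 * 8 = 256, just like the previous 16 * 16. A hedged sketch of a compile-time guard one could add for this invariant; the BlockSize value is an assumption taken from the TODO, and none of this is part of the commit:

    // Guard for the hardcoded C-copy thread split; values here are assumptions.
    constexpr int BlockSize     = 256; // block size the TODO says the split is hardcoded for
    constexpr int MThread_CCopy = 32;
    constexpr int NThread_CCopy = 8;
    static_assert(MThread_CCopy * NThread_CCopy == BlockSize,
                  "C-shuffle copy thread split must use every thread in the block");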
@@ -768,6 +780,10 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
             make_naive_tensor_descriptor_packed(make_tuple(
                 I1, I1, Number<MPerBlock_CCopy>{}, I1, I1, Number<NPerBlock_CCopy>{}));

+        static_assert(c_block_desc_mblock_mrepeat_mwavemperxdl_nblock_nrepeat_nwavenperxdl
+                              .GetElementSpaceSize() == 64 * 64,
+                      "wrong!");
+
         auto c_block_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
             static_cast<FloatAcc*>(p_shared),
             c_block_desc_mblock_mrepeat_mwavemperxdl_nblock_nrepeat_nwavenperxdl

@@ -789,8 +805,8 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
             Sequence<0, 1, 2, 3, 4, 5>, // typename DstDimAccessOrder,
             5,                          // index_t SrcVectorDim,
             5,                          // index_t DstVectorDim,
-            MThread_CCopy,              // index_t SrcScalarPerVector,
-            NThread_CCopy,              // index_t DstScalarPerVector,
+            NPerThread_CCopy,           // index_t SrcScalarPerVector,
+            NPerThread_CCopy,           // index_t DstScalarPerVector,
             1,                          // index_t SrcScalarStrideInVector,
             1,                          // index_t DstScalarStrideInVector,
             true,                       // bool ThreadTransferSrcResetCoordinateAfterRun,

@@ -827,6 +843,30 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
                               c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2,
                               c_block_buf);

+#if 0
+        if(get_thread_local_1d_id() == 0)
+        {
+            for(int mwave = 0; mwave < MWave; ++mwave)
+            {
+                for(int mperxdl = 0; mperxdl < MPerXdl; ++mperxdl)
+                {
+                    for(int nwave = 0; nwave < NWave; ++nwave)
+                    {
+                        for(int nperxdl = 0; nperxdl < NPerXdl; ++nperxdl)
+                        {
+                            int m = mwave * MPerXdl + mperxdl;
+                            int n = nwave * NPerXdl + nperxdl;
+
+                            int offset = m * NWave * NPerXdl + n;
+
+                            c_block_buf(offset) = 10 * mwave + nwave;
+                        }
+                    }
+                }
+            }
+        }
+#endif
+
         // make sure ds_write from c_thread_copy_vgpr_to_lds is completed
         block_sync_lds();
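The disabled #if 0 block above fills the per-block C tile in LDS with a recognizable debug pattern: each (MPerXdl x NPerXdl) sub-tile owned by wave (mwave, nwave) gets the tag 10 * mwave + nwave, written at the row-major offset m * (NWave * NPerXdl) + n. A host-side sketch that reproduces and prints the same pattern, with small assumed sizes rather than the kernel's real ones:

    // Reproduce the debug fill pattern on the host; sizes are illustrative assumptions.
    #include <cstdio>
    #include <vector>

    int main()
    {
        constexpr int MWave = 2, NWave = 2, MPerXdl = 4, NPerXdl = 4;
        std::vector<int> c_block(MWave * MPerXdl * NWave * NPerXdl, 0);

        for(int mwave = 0; mwave < MWave; ++mwave)
            for(int mperxdl = 0; mperxdl < MPerXdl; ++mperxdl)
                for(int nwave = 0; nwave < NWave; ++nwave)
                    for(int nperxdl = 0; nperxdl < NPerXdl; ++nperxdl)
                    {
                        const int m      = mwave * MPerXdl + mperxdl;
                        const int n      = nwave * NPerXdl + nperxdl;
                        const int offset = m * NWave * NPerXdl + n; // row-major within the tile
                        c_block[offset]  = 10 * mwave + nwave;      // per-wave tag
                    }

        // Print the tile; each MPerXdl x NPerXdl quadrant shows a constant wave tag.
        for(int m = 0; m < MWave * MPerXdl; ++m)
        {
            for(int n = 0; n < NWave * NPerXdl; ++n)
                std::printf("%3d", c_block[m * NWave * NPerXdl + n]);
            std::printf("\n");
        }
        return 0;
    }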
@@ -837,6 +877,9 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
             c_grid_desc_mblock_mrepeat_mwavemperxdl_nblock_nrepeat_nwavenperxdl,
             c_grid_buf);

+        // make sure ds_read from c_block_copy_lds_to_global is completed
+        block_sync_lds();
+
         // move on nrepeat dimension
         if constexpr(nrepeat_forward_sweep && (nrepeat < NRepeat - 1))
         {
composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp

@@ -166,7 +166,7 @@ struct ThreadwiseTensorSliceTransfer_v3r2
                 index_t tmp = ordered_src_access_idx[I0];

                 // TODO: BUG: should start at 1
-                static_for<0, i, 1>{}([&](auto j) {
+                static_for<1, i, 1>{}([&](auto j) {
                     tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_idx[j];
                 });

@@ -613,7 +613,7 @@ struct ThreadwiseTensorSliceTransfer_v3r2
                 index_t tmp = ordered_dst_access_lengths[I0] - 1;

                 // TODO: BUG: should start at 1
-                static_for<0, i, 1>{}([&](auto j) {
+                static_for<1, i, 1>{}([&](auto j) {
                     tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_lengths[j] - 1;
                 });
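Both hunks fix the fold that flattens a multi-dimensional access index into a linear count. Because tmp is seeded with dimension 0 before the loop (the "TODO: BUG: should start at 1" comments flag exactly this), the static_for must begin at 1; starting at 0 folds dimension 0 into the result twice. A standalone sketch of the same row-major fold with made-up lengths and indices, not code from this repository:

    // Illustration of the linearization the static_for loops perform.
    // Lengths and indices are assumed example values.
    #include <array>
    #include <cassert>

    int main()
    {
        constexpr std::array<int, 3> lengths = {4, 3, 5}; // assumed access lengths
        constexpr std::array<int, 3> idx     = {2, 1, 3}; // assumed access index

        int tmp = idx[0];               // seeded with dimension 0
        for(int j = 1; j < 3; ++j)      // start at 1, as in the fixed static_for<1, i, 1>
            tmp = tmp * lengths[j] + idx[j];

        // Row-major offset: ((2 * 3) + 1) * 5 + 3 = 38
        assert(tmp == (idx[0] * lengths[1] + idx[1]) * lengths[2] + idx[2]);

        int wrong = idx[0];
        for(int j = 0; j < 3; ++j)      // the pre-fix static_for<0, i, 1> behaviour
            wrong = wrong * lengths[j] + idx[j];
        assert(wrong != tmp);           // dimension 0 is folded in twice, giving a bogus offset

        return 0;
    }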
example/4_conv2d_fwd_xdl_output_shuffle/conv2d_fwd_xdl_output_shuffle.cpp

@@ -198,6 +198,10 @@ int main(int argc, char* argv[])
     {
     case 0: break;
     case 1:
+        in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_1<InDataType>{});
+        wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_1<WeiDataType>{});
+        break;
+    case 2:
         in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
         wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
         break;

@@ -272,5 +276,15 @@ int main(int argc, char* argv[])
     out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data());

     check_error(out_n_k_ho_wo_host_result, out_n_k_ho_wo_device_result);
+
+    {
+        LogRangeAsType<float>(std::cout << "in : ", in_n_c_hi_wi.mData, ",") << std::endl;
+        LogRangeAsType<float>(std::cout << "wei: ", wei_k_c_y_x.mData, ",") << std::endl;
+        LogRangeAsType<float>(std::cout << "out_host : ", out_n_k_ho_wo_host_result.mData, ",")
+            << std::endl;
+        LogRangeAsType<float>(std::cout << "out_device: ", out_n_k_ho_wo_device_result.mData, ",")
+            << std::endl;
+    }
     }
 }