gaoqiong / composable_kernel · Commits · 8e8c6ea1
"...git@developer.sourcefind.cn:modelzoo/dbnet_pytorch.git" did not exist on "97243508511f3a4922b313f831a2038084d752a9"
Commit 8e8c6ea1, authored Sep 19, 2023 by aska-0096

    Sanity Pass. Inefficient ILP

Parent: 088a4f2b

Showing 4 changed files with 141 additions and 74 deletions
example/01_gemm/gemm_xdl_fp16.cpp                                             +1    -1
example/01_gemm/run_gemm_example.inc                                          +13   -0
include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v1.hpp   +126  -73
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp        +1    -0
example/01_gemm/gemm_xdl_fp16.cpp
...
@@ -39,7 +39,7 @@ using DeviceGemmInstance1 = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffl
// ######| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
// ######| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
-    < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>;
+    < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CElementOp, GemmDefault, 1, 64, 32, 32, 32, 8, 8, 32, 32, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>;
 // clang-format on
 using DeviceGemmInstance = DeviceGemmInstance1;
...
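Read against the parameter-name header comment above, this one-line change retunes DeviceGemmInstance1 from a large tile to a very small one. A hedged, positional reading of the leading numeric arguments (the names come from the header comment; matching them by column order is my interpretation, not authoritative):

// Positional reading of the new instance's leading numeric arguments (sketch):
constexpr int BlockSize   = 64; // was 256
constexpr int MPerBlock   = 32; // was 256
constexpr int NPerBlock   = 32; // was 128
constexpr int KPerBlock   = 32; // unchanged
constexpr int AK1         = 8;  // unchanged
constexpr int BK1         = 8;  // unchanged
constexpr int MPerXDL     = 32; // unchanged
constexpr int NPerXDL     = 32; // unchanged
constexpr int MXdlPerWave = 1;  // was 4
constexpr int NXdlPerWave = 1;  // was 2

If that mapping is right, a 64-thread block on a 32x32x32 tile with one XDL repeat per wave is close to the smallest configuration this kernel supports, which is consistent with the commit message: correctness first ("Sanity Pass"), instruction-level parallelism deliberately left on the table ("Inefficient ILP").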
example/01_gemm/run_gemm_example.inc
...
@@ -76,6 +76,19 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
     Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
     Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));

+#if 0
+    printf("A Matrix\n");
+    for(int im = 0; im < M; im++)
+    {
+        for(int ik = 0; ik < K; ik++)
+        {
+            if(ik % 8 == 0)
+                printf("|");
+            printf("%04x ", *(reinterpret_cast<uint16_t*>(&(a_m_k(im, ik)))));
+        }
+        printf("\n");
+    }
+#endif

     std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
     std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
     std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
...
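The block added above (disabled behind #if 0) dumps fp16 matrix entries as raw hex, which makes bit-exact comparison of buffers easier than printing rounded floats. A standalone sketch of the same trick, assuming a Clang/GCC toolchain with the _Float16 extension (the CK Tensor type and a_m_k accessor are not reproduced here); it uses memcpy instead of reinterpret_cast to sidestep strict-aliasing concerns:

#include <cstdint>
#include <cstdio>
#include <cstring>

int main()
{
    _Float16 x = static_cast<_Float16>(1.5f);
    std::uint16_t bits;
    std::memcpy(&bits, &x, sizeof(bits)); // copy out the raw fp16 bit pattern
    std::printf("%04x\n", bits);          // prints 3e00: 1.5 in IEEE half precision
    return 0;
}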
include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v1.hpp
...
@@ -333,15 +333,10 @@ struct BlockwiseGemmXdlops_pipeline_v1
                        CThreadBuffer& c_thread_buf,
                        index_t num_loop) const
     {
-        auto a_thread_read_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatAB>(
+        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatAB>(
             a_thread_desc_.GetElementSpaceSize());
-        auto a_thread_compute_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatAB>(
-            a_thread_desc_.GetElementSpaceSize());
-        auto b_thread_read_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatAB>(
-            b_thread_desc_.GetElementSpaceSize());
-        auto b_thread_compute_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatAB>(
+        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatAB>(
             b_thread_desc_.GetElementSpaceSize());

         // preload data into LDS
         a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
         b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
...
@@ -361,8 +356,8 @@ struct BlockwiseGemmXdlops_pipeline_v1
                                make_tuple(m0, I0, I0, I0),
                                a_block_buf,
                                a_thread_desc_,
-                               make_tuple(I0, I0, I0, I0),
-                               a_thread_read_buf);
+                               make_tuple(m0, I0, I0, I0),
+                               a_thread_buf);

             static_for<0, NRepeat, 1>{}([&](auto n0) {
                 // read B
...
@@ -370,59 +365,64 @@ struct BlockwiseGemmXdlops_pipeline_v1
                                make_tuple(n0, I0, I0, I0),
                                b_block_buf,
                                b_thread_desc_,
-                               make_tuple(I0, I0, I0, I0),
-                               b_thread_read_buf);
+                               make_tuple(n0, I0, I0, I0),
+                               b_thread_buf);
             });
         });

         // Initialize C
         c_thread_buf.Clear();

         // main body
         if constexpr(HasMainLoop)
         {
+#if 0
+            if(get_thread_local_1d_id() == 0)
+            {
+                printf("HasMainLoop=True\n");
+            }
+#endif
             index_t i = 0;
             do
             {
+                __builtin_amdgcn_sched_group_barrier(0x020, 4, 0); // VMEM read
+                __builtin_amdgcn_sched_group_barrier(0x100, 2, 0); // DS read
+                __builtin_amdgcn_sched_group_barrier(0x008, 2, 0); // MFMA
+                __builtin_amdgcn_sched_group_barrier(0x200, 4, 0); // DS write
+                __builtin_amdgcn_sched_group_barrier(0x008, 2, 0); // MFMA
+                __builtin_amdgcn_sched_group_barrier(0x100, 2, 0); // DS read
                 a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
                 b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);

                 // Here only KRepeat-1 times read (1~KRepeat) & compute (0~KRepeat-1) of this k-loop
                 static_for<1, KRepeat, 1>{}([&](auto k) { // k=1,2 instead of kpack*1, ...
-                    // Switch read/compute VGPR buffer
-                    a_thread_compute_buf = a_thread_read_buf;
-                    b_thread_compute_buf = b_thread_read_buf;
                     /* Read N+1 */
                     static_for<0, MRepeat, 1>{}([&](auto m0) {
                         // read A
-                        a_thread_copy_.Run(a_block_desc_m0_m1_m2_k,
-                                           make_tuple(m0, I0, I0, Number<k * AMmaKStride>{}),
-                                           a_block_buf,
-                                           a_thread_desc_,
-                                           make_tuple(I0, I0, I0, I0),
-                                           a_thread_read_buf);
+                        a_thread_copy_.Run(a_block_desc_m0_m1_m2_k,
+                                           make_tuple(m0, I0, I0, Number<k * AMmaKStride>{}),
+                                           a_block_buf,
+                                           a_thread_desc_,
+                                           make_tuple(m0, Number<k % 2>{}, I0, I0),
+                                           a_thread_buf);

                         static_for<0, NRepeat, 1>{}([&](auto n0) {
                             // read B
-                            b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
-                                               make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
-                                               b_block_buf,
-                                               b_thread_desc_,
-                                               make_tuple(I0, I0, I0, I0),
-                                               b_thread_read_buf);
+                            b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
+                                               make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
+                                               b_block_buf,
+                                               b_thread_desc_,
+                                               make_tuple(n0, Number<k % 2>{}, I0, I0),
+                                               b_thread_buf);

                             /* Compute N */
                             vector_type<FloatAB, KPack> a_thread_vec;
                             vector_type<FloatAB, KPack> b_thread_vec;

                             static_for<0, KPack, 1>{}([&](auto ik) {
-                                a_thread_vec.template AsType<FloatAB>()(ik) = a_thread_compute_buf
-                                    [Number<a_thread_desc_.CalculateOffset(
-                                        make_tuple(0, 0, 0, ik))>{}];
+                                a_thread_vec.template AsType<FloatAB>()(ik) = a_thread_buf
+                                    [Number<a_thread_desc_.CalculateOffset(
+                                        make_tuple(m0, (k - 1) % 2, 0, ik))>{}];
-                                b_thread_vec.template AsType<FloatAB>()(ik) = b_thread_compute_buf
-                                    [Number<b_thread_desc_.CalculateOffset(
-                                        make_tuple(0, 0, 0, ik))>{}];
+                                b_thread_vec.template AsType<FloatAB>()(ik) = b_thread_buf
+                                    [Number<b_thread_desc_.CalculateOffset(
+                                        make_tuple(n0, (k - 1) % 2, 0, ik))>{}];
                             });

                             using mfma_input_type =
...
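The __builtin_amdgcn_sched_group_barrier calls added at the top of the main loop are Clang/LLVM AMDGPU scheduling hints: each call asks the backend scheduler to place a group of `size` instructions matching `mask` (0x008 = MFMA, 0x020 = VMEM read, 0x100 = DS read, 0x200 = DS write) at that point, so global loads, LDS traffic, and MFMAs interleave instead of clumping together. A minimal sketch of the pattern, assuming HIP device code on a CDNA-class GPU (the function name is illustrative; the hints describe instructions the surrounding code would emit):

// Sketch only: ask the scheduler to pair two LDS reads with two MFMAs.
// Signature per LLVM: __builtin_amdgcn_sched_group_barrier(mask, size, sync_id).
__device__ void interleave_ds_and_mfma()
{
    __builtin_amdgcn_sched_group_barrier(0x100, 2, 0); // group of 2 DS reads
    __builtin_amdgcn_sched_group_barrier(0x008, 2, 0); // then a group of 2 MFMAs
}

These are hints, not fences: they constrain instruction ordering within the scheduler's region and have no effect on memory semantics.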
@@ -452,9 +452,6 @@ struct BlockwiseGemmXdlops_pipeline_v1
                 // Wait all wave produce next k-loop data
                 block_sync_lds();

-                // switch read/compute VGPR buffer
-                a_thread_compute_buf = a_thread_read_buf;
-                b_thread_compute_buf = b_thread_read_buf;

                 // Here 1 time read(idx=0) of next K-loop & compute(idx=KRepeat) this K-loop
                 static_for<0, MRepeat, 1>{}([&](auto m0) {
                     // read A
...
@@ -462,8 +459,8 @@ struct BlockwiseGemmXdlops_pipeline_v1
                                        make_tuple(m0, I0, I0, I0),
                                        a_block_buf,
                                        a_thread_desc_,
-                                       make_tuple(I0, I0, I0, I0),
-                                       a_thread_read_buf);
+                                       make_tuple(m0, I0, I0, I0),
+                                       a_thread_buf);

                     static_for<0, NRepeat, 1>{}([&](auto n0) {
                         // read B
...
@@ -471,17 +468,19 @@ struct BlockwiseGemmXdlops_pipeline_v1
                                            make_tuple(n0, I0, I0, I0),
                                            b_block_buf,
                                            b_thread_desc_,
-                                           make_tuple(I0, I0, I0, I0),
-                                           b_thread_read_buf);
+                                           make_tuple(n0, I0, I0, I0),
+                                           b_thread_buf);

                         /* Compute N */
                         vector_type<FloatAB, KPack> a_thread_vec;
                         vector_type<FloatAB, KPack> b_thread_vec;

                         static_for<0, KPack, 1>{}([&](auto ik) {
-                            a_thread_vec.template AsType<FloatAB>()(ik) = a_thread_compute_buf
-                                [Number<a_thread_desc_.CalculateOffset(make_tuple(0, 0, 0, ik))>{}];
-                            b_thread_vec.template AsType<FloatAB>()(ik) = b_thread_compute_buf
-                                [Number<b_thread_desc_.CalculateOffset(make_tuple(0, 0, 0, ik))>{}];
+                            a_thread_vec.template AsType<FloatAB>()(ik) = a_thread_buf
+                                [Number<a_thread_desc_.CalculateOffset(
+                                    make_tuple(m0, (KRepeat - 1) % 2, 0, ik))>{}];
+                            b_thread_vec.template AsType<FloatAB>()(ik) = b_thread_buf
+                                [Number<b_thread_desc_.CalculateOffset(
+                                    make_tuple(n0, (KRepeat - 1) % 2, 0, ik))>{}];
                         });

                         using mfma_input_type =
...
@@ -503,40 +502,64 @@ struct BlockwiseGemmXdlops_pipeline_v1
         {
             // Here only KRepeat-1 times read & compute
             static_for<1, KRepeat, 1>{}([&](auto k) { // k=1,2 instead of kpack*1, ...
-                // switch read/compute VGPR buffer
-                a_thread_compute_buf = a_thread_read_buf;
-                b_thread_compute_buf = b_thread_read_buf;
                 /* Read N+1 */
                 static_for<0, MRepeat, 1>{}([&](auto m0) {
                     // read A
-                    a_thread_copy_.Run(a_block_desc_m0_m1_m2_k,
-                                       make_tuple(m0, I0, I0, Number<k * AMmaKStride>{}),
-                                       a_block_buf,
-                                       a_thread_desc_,
-                                       make_tuple(I0, I0, I0, I0),
-                                       a_thread_read_buf);
+                    a_thread_copy_.Run(a_block_desc_m0_m1_m2_k,
+                                       make_tuple(m0, I0, I0, Number<k * AMmaKStride>{}),
+                                       a_block_buf,
+                                       a_thread_desc_,
+                                       make_tuple(m0, Number<k % 2>{}, I0, I0),
+                                       a_thread_buf);

                     static_for<0, NRepeat, 1>{}([&](auto n0) {
                         // read B
-                        b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
-                                           make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
-                                           b_block_buf,
-                                           b_thread_desc_,
-                                           make_tuple(I0, I0, I0, I0),
-                                           b_thread_read_buf);
+                        b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
+                                           make_tuple(n0, I0, I0, Number<k * BMmaKStride>{}),
+                                           b_block_buf,
+                                           b_thread_desc_,
+                                           make_tuple(n0, Number<k % 2>{}, I0, I0),
+                                           b_thread_buf);

                         /* Compute N */
                         vector_type<FloatAB, KPack> a_thread_vec;
                         vector_type<FloatAB, KPack> b_thread_vec;

                         static_for<0, KPack, 1>{}([&](auto ik) {
-                            a_thread_vec.template AsType<FloatAB>()(ik) = a_thread_compute_buf
-                                [Number<a_thread_desc_.CalculateOffset(make_tuple(0, 0, 0, ik))>{}];
-                            b_thread_vec.template AsType<FloatAB>()(ik) = b_thread_compute_buf
-                                [Number<b_thread_desc_.CalculateOffset(make_tuple(0, 0, 0, ik))>{}];
+                            a_thread_vec.template AsType<FloatAB>()(ik) = a_thread_buf
+                                [Number<a_thread_desc_.CalculateOffset(
+                                    make_tuple(m0, (k - 1) % 2, 0, ik))>{}];
+                            b_thread_vec.template AsType<FloatAB>()(ik) = b_thread_buf
+                                [Number<b_thread_desc_.CalculateOffset(
+                                    make_tuple(n0, (k - 1) % 2, 0, ik))>{}];
                         });
+#if 0
+                        if(get_thread_local_1d_id() == 0)
+                        {
+                            printf("rep of m.n.k (%01d, %01d, %01d)\n",
+                                   m0.value,
+                                   n0.value,
+                                   k.value - 1);
+                        }
+                        printf(
+                            "Tid: %03d, A_compute_buf: %04x %04x %04x %04x %04x %04x %04x %04x\n",
+                            get_thread_local_1d_id(),
+                            *(reinterpret_cast<uint16_t*>(
+                                &(a_thread_vec.template AsType<FloatAB>()(Number<0>{})))),
+                            *(reinterpret_cast<uint16_t*>(
+                                &(a_thread_vec.template AsType<FloatAB>()(Number<1>{})))),
+                            *(reinterpret_cast<uint16_t*>(
+                                &(a_thread_vec.template AsType<FloatAB>()(Number<2>{})))),
+                            *(reinterpret_cast<uint16_t*>(
+                                &(a_thread_vec.template AsType<FloatAB>()(Number<3>{})))),
+                            *(reinterpret_cast<uint16_t*>(
+                                &(a_thread_vec.template AsType<FloatAB>()(Number<4>{})))),
+                            *(reinterpret_cast<uint16_t*>(
+                                &(a_thread_vec.template AsType<FloatAB>()(Number<5>{})))),
+                            *(reinterpret_cast<uint16_t*>(
+                                &(a_thread_vec.template AsType<FloatAB>()(Number<6>{})))),
+                            *(reinterpret_cast<uint16_t*>(
+                                &(a_thread_vec.template AsType<FloatAB>()(Number<7>{})))));
+#endif

                         using mfma_input_type =
                             typename vector_type<FloatAB, xdlops_gemm.K1PerXdlops>::type;
...
@@ -557,12 +580,40 @@ struct BlockwiseGemmXdlops_pipeline_v1
                     vector_type<FloatAB, KPack> b_thread_vec;

                     static_for<0, KPack, 1>{}([&](auto i) {
-                        a_thread_vec.template AsType<FloatAB>()(i) = a_thread_compute_buf
-                            [Number<a_thread_desc_.CalculateOffset(make_tuple(0, 0, 0, i))>{}];
-                        b_thread_vec.template AsType<FloatAB>()(i) = b_thread_compute_buf
-                            [Number<b_thread_desc_.CalculateOffset(make_tuple(0, 0, 0, i))>{}];
+                        a_thread_vec.template AsType<FloatAB>()(i) = a_thread_buf
+                            [Number<a_thread_desc_.CalculateOffset(
+                                make_tuple(m0, (KRepeat - 1) % 2, 0, i))>{}];
+                        b_thread_vec.template AsType<FloatAB>()(i) = b_thread_buf
+                            [Number<b_thread_desc_.CalculateOffset(
+                                make_tuple(n0, (KRepeat - 1) % 2, 0, i))>{}];
                     });
+#if 0
+                    if(get_thread_local_1d_id() == 0)
+                    {
+                        printf(
+                            "rep of m.n.k (%01d, %01d, %01d)\n", m0.value, n0.value, KRepeat - 1);
+                    }
+                    printf("Tid: %03d, A_compute_buf: %04x %04x %04x %04x %04x %04x %04x %04x\n",
+                           get_thread_local_1d_id(),
+                           *(reinterpret_cast<uint16_t*>(
+                               &(a_thread_vec.template AsType<FloatAB>()(Number<0>{})))),
+                           *(reinterpret_cast<uint16_t*>(
+                               &(a_thread_vec.template AsType<FloatAB>()(Number<1>{})))),
+                           *(reinterpret_cast<uint16_t*>(
+                               &(a_thread_vec.template AsType<FloatAB>()(Number<2>{})))),
+                           *(reinterpret_cast<uint16_t*>(
+                               &(a_thread_vec.template AsType<FloatAB>()(Number<3>{})))),
+                           *(reinterpret_cast<uint16_t*>(
+                               &(a_thread_vec.template AsType<FloatAB>()(Number<4>{})))),
+                           *(reinterpret_cast<uint16_t*>(
+                               &(a_thread_vec.template AsType<FloatAB>()(Number<5>{})))),
+                           *(reinterpret_cast<uint16_t*>(
+                               &(a_thread_vec.template AsType<FloatAB>()(Number<6>{})))),
+                           *(reinterpret_cast<uint16_t*>(
+                               &(a_thread_vec.template AsType<FloatAB>()(Number<7>{})))));
+#endif

                     using mfma_input_type =
                         typename vector_type<FloatAB, xdlops_gemm.K1PerXdlops>::type;
...
@@ -579,13 +630,15 @@ struct BlockwiseGemmXdlops_pipeline_v1
     }

     protected:
-    // Read buffer + Compute buffer
+    // M1, N1 as double buffer index
     // A[M0, M1, M2, KPack]
     static constexpr auto a_thread_desc_ =
-        make_naive_tensor_descriptor_packed(make_tuple(I1, I1, I1, Number<KPack>{}));
+        make_naive_tensor_descriptor_packed(
+            make_tuple(Number<MRepeat>{}, I2, I1, Number<KPack>{}));

     // B[N0, N1, N2, KPack]
     static constexpr auto b_thread_desc_ =
-        make_naive_tensor_descriptor_packed(make_tuple(I1, I1, I1, Number<KPack>{}));
+        make_naive_tensor_descriptor_packed(
+            make_tuple(Number<NRepeat>{}, I2, I1, Number<KPack>{}));

     // C[M, N, NumRegXdlops]
     static constexpr auto c_thread_desc_ = make_naive_tensor_descriptor_packed(
...
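This descriptor change is the heart of the commit: instead of four whole VGPR buffers (read and compute copies of A and B) swapped by buffer-to-buffer assignment, each thread descriptor now holds MRepeat (or NRepeat) slices plus a size-2 second dimension, and the copy and compute code above ping-pongs on k % 2 versus (k - 1) % 2. A host-side sketch of that indexing discipline, with plain arrays standing in for the CK static buffers (names and sizes are illustrative):

#include <array>
#include <cstdio>

int main()
{
    constexpr int KRepeat = 4, KPack = 4;
    std::array<std::array<int, KPack>, 2> buf{}; // the size-2 "M1" dimension

    for(int v = 0; v < KPack; ++v) buf[0][v] = v; // preload slice k = 0
    for(int k = 1; k < KRepeat; ++k)
    {
        // read slice k into slot k % 2 while slice k - 1 sits in the other slot
        for(int v = 0; v < KPack; ++v) buf[k % 2][v] = 10 * k + v;
        int acc = 0;
        for(int v = 0; v < KPack; ++v) acc += buf[(k - 1) % 2][v]; // compute k - 1
        std::printf("k=%d: computed on slot %d, acc=%d\n", k, (k - 1) % 2, acc);
    }
    return 0; // the final slice, slot (KRepeat - 1) % 2, is consumed after the loop
}

That last point matches the tail hunks above, which index the compute reads with (KRepeat - 1) % 2.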
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp
...
@@ -22,6 +22,7 @@ __global__ void
 #if CK_USE_LAUNCH_BOUNDS
     __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
 #endif
+        // __attribute__((amdgpu_waves_per_eu(1, 1)))
         kernel_gemm_xdl_cshuffle_v1(typename GridwiseGemm::Argument karg)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
...
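The single added line is a commented-out occupancy experiment: amdgpu_waves_per_eu is a Clang AMDGPU attribute that bounds how many waves the compiler may assume per execution unit, and pinning it to (1, 1) maximizes the per-wave VGPR budget at the cost of latency hiding. A minimal sketch of where the attribute sits on a HIP kernel (hypothetical kernel, for illustration only, not the repository's kernel):

#include <hip/hip_runtime.h>

// Assumption: with exactly one wave per EU, this wave can use the full
// VGPR file; whether that helps depends on the kernel's register pressure.
__global__ void __attribute__((amdgpu_waves_per_eu(1, 1))) scale_kernel(float* p)
{
    p[threadIdx.x] *= 2.0f;
}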