gaoqiong / composable_kernel · Commits · 77e63b34

Commit 77e63b34 authored Sep 01, 2023 by Bartlomiej Wroblewski

Review: Introduce AK0 and BK0 instead of a single K0

parent 03cd2692
Changes (8 files)

Showing 8 changed files with 191 additions and 162 deletions (+191 −162)
example/01_gemm/gemm_dpp_fp16.cpp  (+5 −5)
include/ck/tensor_operation/gpu/block/blockwise_gemm_dpp.hpp  (+2 −2)
include/ck/tensor_operation/gpu/device/impl/device_gemm_dpp.hpp  (+10 −7)
include/ck/tensor_operation/gpu/grid/gridwise_gemm_dpp.hpp  (+130 −104)
library/src/tensor_operation_instance/gpu/gemm/device_gemm_dpp_f16_f16_f16_km_kn_mn_instance.cpp  (+11 −11)
library/src/tensor_operation_instance/gpu/gemm/device_gemm_dpp_f16_f16_f16_km_nk_mn_instance.cpp  (+11 −11)
library/src/tensor_operation_instance/gpu/gemm/device_gemm_dpp_f16_f16_f16_mk_kn_mn_instance.cpp  (+11 −11)
library/src/tensor_operation_instance/gpu/gemm/device_gemm_dpp_f16_f16_f16_mk_nk_mn_instance.cpp  (+11 −11)
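The commit replaces the single K0PerBlock/K1 pair of the DPP GEMM path with one KPerBlock plus independent AK1 and BK1 (and, at the problem level, AK0 and BK0). Before the per-file diffs, here is a minimal standalone sketch of that bookkeeping; it assumes only the quantities visible in the diffs below, and any helper other than CalculateAK0/CalculateBK0 is hypothetical.

// Sketch only: the K-dimension bookkeeping this commit introduces, in plain C++
// (no composable_kernel headers). Names mirror the diff where possible.
#include <cassert>
#include <cstdio>

// Per-block tile: one KPerBlock plus independent K-vector widths for A and B.
constexpr int KPerBlock   = 64;
constexpr int AK1         = 8;
constexpr int BK1         = 2;
constexpr int AK0PerBlock = KPerBlock / AK1; // 8, as in the gridwise kernel
constexpr int BK0PerBlock = KPerBlock / BK1; // 32

// Per-problem: AK0/BK0 are derived from the full K extent, mirroring
// CalculateAK0/CalculateBK0 in gridwise_gemm_dpp.hpp.
int CalculateAK0(int K) { return K / AK1; } // integer_divide_floor(K, AK1Value)
int CalculateBK0(int K) { return K / BK1; } // integer_divide_floor(K, BK1Value)

int main()
{
    const int K   = 4096;
    const int AK0 = CalculateAK0(K);
    const int BK0 = CalculateBK0(K);
    // Both views cover the same K extent, so the K-loop trip count agrees.
    assert(AK0 / AK0PerBlock == BK0 / BK0PerBlock);
    std::printf("AK0=%d BK0=%d k-loops=%d\n", AK0, BK0, AK0 / AK0PerBlock);
    return 0;
}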
example/01_gemm/gemm_dpp_fp16.cpp

@@ -24,11 +24,11 @@ static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecializa
 // clang-format off
 using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDpp
-// ######| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MDpp| NDpp| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
-// ######| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Spacialization| Size| Block| Block| Block| | Dpp| Dpp| PerWave| PerWave| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar|
-// ######| | | | | | | | Operation| Operation| Operation| | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector|
-// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
-        < ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout, AElementOp, BElementOp, CElementOp, GemmDefault, 128, 64, 64, 8, 8, 32, 8, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>;
+// ######| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MDpp| NDpp| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
+// ######| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Spacialization| Size| Block| Block| Block| | | Dpp| Dpp| PerWave| PerWave| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar|
+// ######| | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector|
+// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
+        < ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout, AElementOp, BElementOp, CElementOp, GemmDefault, 128, 64, 64, 64, 8, 2, 32, 8, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 5, 1>;
 // clang-format on

 using ReferenceGemmInstance = ck::tensor_operation::host::
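A quick arithmetic check on the example's tuning values (illustrative, not part of the commit): the removed pair K0PerBlock = 8, K1 = 8 and the added KPerBlock = 64 describe the same 64 elements of K per block; only the per-operand vector widths now differ (AK1 = 8 for A, BK1 = 2 for B).

// Illustrative only: the example's old and new K parameters describe the same
// per-block K extent; the split values satisfy the new divisibility rules.
constexpr int old_K0PerBlock = 8, old_K1 = 8;   // values removed by the commit
constexpr int KPerBlock = 64, AK1 = 8, BK1 = 2; // values added by the commit

static_assert(old_K0PerBlock * old_K1 == KPerBlock, "same K per block as before");
static_assert(KPerBlock % AK1 == 0 && KPerBlock % BK1 == 0,
              "KPerBlock must be divisible by both AK1 and BK1");
static_assert(KPerBlock / AK1 == 8 && KPerBlock / BK1 == 32,
              "AK0PerBlock = 8, BK0PerBlock = 32");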
include/ck/tensor_operation/gpu/block/blockwise_gemm_dpp.hpp

@@ -29,7 +29,7 @@ template <index_t BlockSize,
           index_t MRepeat,
           index_t NRepeat,
           index_t KPack>
-struct BlockwiseGemmDpp_k0mk1_k0nk1_m0n0m1n1m2n2
+struct BlockwiseGemmDpp_ak0mak1_bk0nbk1_m0n0m1n1m2n2
 {
     static constexpr auto I0 = Number<0>{};
     static constexpr auto I1 = Number<1>{};

@@ -127,7 +127,7 @@ struct BlockwiseGemmDpp_k0mk1_k0nk1_m0n0m1n1m2n2
         return make_tuple(c_thread_m, c_thread_n);
     }

-    __host__ __device__ BlockwiseGemmDpp_k0mk1_k0nk1_m0n0m1n1m2n2()
+    __host__ __device__ BlockwiseGemmDpp_ak0mak1_bk0nbk1_m0n0m1n1m2n2()
     {
         static_assert(AK0MK1BlockDesc::IsKnownAtCompileTime() &&
                           BK0NK1BlockDesc::IsKnownAtCompileTime(),
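For readers following the renamed descriptors, a small self-contained sketch (not library code; the shapes are taken from the example configuration) of how the two LDS tiles are indexed once A and B carry independent K splits: the A tile is viewed as [AK0PerBlock][MPerBlock][AK1] and the B tile as [BK0PerBlock][NPerBlock][BK1], and both still cover KPerBlock elements of K.

// Minimal sketch, not composable_kernel code.
#include <cstddef>

constexpr int KPerBlock = 64, MPerBlock = 64, NPerBlock = 64;
constexpr int AK1 = 8, BK1 = 2;
constexpr int AK0PerBlock = KPerBlock / AK1;
constexpr int BK0PerBlock = KPerBlock / BK1;

// Flat index into an [AK0][M][AK1]-shaped A tile (row-major over those dims).
constexpr std::size_t a_tile_index(int ak0, int m, int ak1)
{
    return (static_cast<std::size_t>(ak0) * MPerBlock + m) * AK1 + ak1;
}

// Flat index into a [BK0][N][BK1]-shaped B tile.
constexpr std::size_t b_tile_index(int bk0, int n, int bk1)
{
    return (static_cast<std::size_t>(bk0) * NPerBlock + n) * BK1 + bk1;
}

static_assert(AK0PerBlock * MPerBlock * AK1 == KPerBlock * MPerBlock, "A tile element count");
static_assert(BK0PerBlock * NPerBlock * BK1 == KPerBlock * NPerBlock, "B tile element count");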
include/ck/tensor_operation/gpu/device/impl/device_gemm_dpp.hpp

@@ -33,8 +33,9 @@ template <typename ADataType,
           ck::index_t BlockSize,
           ck::index_t MPerBlock,
           ck::index_t NPerBlock,
-          ck::index_t K0PerBlock,
-          ck::index_t K1,
+          ck::index_t KPerBlock,
+          ck::index_t AK1,
+          ck::index_t BK1,
           ck::index_t MPerDpp,
           ck::index_t NPerDpp,
           ck::index_t MDppPerWave,

@@ -67,7 +68,7 @@ struct DeviceGemmDpp : public DeviceGemm<ALayout,
                                          BElementwiseOperation,
                                          CElementwiseOperation>
 {
-    using GridwiseGemm = GridwiseGemm_k0mk1_k0nk1_mn_dpp<
+    using GridwiseGemm = GridwiseGemm_ak0mak1_bk0nbk1_mn_dpp<
         BlockSize,
         ADataType,
         AccDataType,

@@ -82,10 +83,11 @@ struct DeviceGemmDpp : public DeviceGemm<ALayout,
         GemmSpec,
         MPerBlock,
         NPerBlock,
-        K0PerBlock,
+        KPerBlock,
         MPerDpp,
         NPerDpp,
-        K1,
+        AK1,
+        BK1,
         MDppPerWave,
         NDppPerWave,
         ABlockTransferThreadClusterLengths_K0_M_K1,

@@ -242,8 +244,9 @@ struct DeviceGemmDpp : public DeviceGemm<ALayout,
            << BlockSize << ", "
            << MPerBlock << ", "
            << NPerBlock << ", "
-           << K0PerBlock << ", "
-           << K1 << ", "
+           << KPerBlock << ", "
+           << AK1 << ", "
+           << BK1 << ", "
            << MPerDpp << ", "
            << NPerDpp << ", "
            << MDppPerWave << ", "
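One plausible reading of why A and B get separate K1 values, suggested by the instance updates further down: the usable vector width along K depends on whether K is the contiguous dimension of each operand's layout, so A and B no longer have to share one value. The heuristic below is purely illustrative and not part of composable_kernel.

// Purely illustrative heuristic, not composable_kernel code: pick the K-vector
// width per operand from whether K is the contiguous dimension of its layout.
constexpr int pick_k1(bool k_is_contiguous, int max_vector_width = 8)
{
    // Wider vectorized loads along K are only possible when K is contiguous.
    return k_is_contiguous ? max_vector_width : max_vector_width / 2;
}

// For a row-major A[M, K], K is the contiguous dimension; for a row-major
// B[K, N] it is not.
constexpr int AK1 = pick_k1(true);  // 8
constexpr int BK1 = pick_k1(false); // 4
static_assert(AK1 == 8 && BK1 == 4, "example values only");

For comparison, the updated (Row, Row) f16 instances below use AK1 = 8 and BK1 = 4, while the (Col, Col) instances use AK1 = 4 and BK1 = 8.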
include/ck/tensor_operation/gpu/grid/gridwise_gemm_dpp.hpp

@@ -31,10 +31,10 @@ __global__ void
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx1030__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];

-    const auto a_grid_desc_k0_m_k1 = amd_wave_read_first_lane(
-        GridwiseGemm::MakeAGridDescriptor_K0_M_K1(karg.M, karg.K, karg.K0, karg.StrideA));
-    const auto b_grid_desc_k0_n_k1 = amd_wave_read_first_lane(
-        GridwiseGemm::MakeBGridDescriptor_K0_N_K1(karg.K, karg.N, karg.K0, karg.StrideB));
+    const auto a_grid_desc_ak0_m_ak1 = amd_wave_read_first_lane(
+        GridwiseGemm::MakeAGridDescriptor_AK0_M_AK1(karg.M, karg.K, karg.AK0, karg.StrideA));
+    const auto b_grid_desc_bk0_n_bk1 = amd_wave_read_first_lane(
+        GridwiseGemm::MakeBGridDescriptor_BK0_N_BK1(karg.K, karg.N, karg.BK0, karg.StrideB));
     const auto c_grid_desc_m_n = amd_wave_read_first_lane(
         GridwiseGemm::MakeCGridDescriptor_M_N(karg.M, karg.N, karg.StrideC));

@@ -42,8 +42,8 @@ __global__ void
                                                   karg.p_b_grid,
                                                   karg.p_c_grid,
                                                   p_shared,
-                                                  a_grid_desc_k0_m_k1,
-                                                  b_grid_desc_k0_n_k1,
+                                                  a_grid_desc_ak0_m_ak1,
+                                                  b_grid_desc_bk0_n_bk1,
                                                   c_grid_desc_m_n);
 #else
     ignore = karg;

@@ -64,10 +64,11 @@ template <index_t BlockSize,
           tensor_operation::device::GemmSpecialization GemmSpec,
           index_t MPerBlock,
           index_t NPerBlock,
-          index_t K0PerBlock,
+          index_t KPerBlock,
           index_t MPerDpp,
           index_t NPerDpp,
-          index_t K1Value,
+          index_t AK1Value,
+          index_t BK1Value,
           index_t MDppPerWave,
           index_t NDppPerWave,
           typename ABlockTransferThreadClusterLengths_K0_M_K1,

@@ -91,7 +92,7 @@ template <index_t BlockSize,
           index_t CThreadTransferDstScalarPerVector,
           index_t NumGemmKPrefetchStage = 1,
           PipelineVersion PipelineVer   = PipelineVersion::v1>
-struct GridwiseGemm_k0mk1_k0nk1_mn_dpp
+struct GridwiseGemm_ak0mak1_bk0nbk1_mn_dpp
 {
     static constexpr auto I0 = Number<0>{};
     static constexpr auto I1 = Number<1>{};

@@ -100,7 +101,12 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_dpp
     static constexpr auto I4 = Number<4>{};
     static constexpr auto I5 = Number<5>{};

-    static constexpr auto K1 = Number<K1Value>{};
+    static constexpr auto AK1         = Number<AK1Value>{};
+    static constexpr auto BK1         = Number<BK1Value>{};
+    static constexpr auto AK0PerBlock = Number<KPerBlock / AK1Value>{};
+    static constexpr auto BK0PerBlock = Number<KPerBlock / BK1Value>{};
+
+    static constexpr auto max_lds_align = math::lcm(AK1, BK1);

     using ThisThreadBlock = ThisThreadBlock<BlockSize>;

     // return block_id to C matrix tile idx (m0, n0) mapping

@@ -121,7 +127,8 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_dpp
         return math::integer_divide_ceil(N, NPerBlock) * NPerBlock;
     }

-    __host__ static auto CalculateK0(index_t K) { return math::integer_divide_floor(K, K1Value); }
+    __host__ static auto CalculateAK0(index_t K) { return math::integer_divide_floor(K, AK1Value); }
+    __host__ static auto CalculateBK0(index_t K) { return math::integer_divide_floor(K, BK1Value); }

     // Argument
     struct Problem

@@ -140,7 +147,8 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_dpp
           StrideC{StrideC_},
           MPadded{CalculateMPadded(M_)},
           NPadded{CalculateNPadded(N_)},
-          K0{CalculateK0(K)}
+          AK0{CalculateAK0(K)},
+          BK0{CalculateBK0(K)}
     {
     }

@@ -155,7 +163,8 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_dpp
                   << "SC:" << StrideC << ", "
                   << "MP:" << MPadded << ", "
                   << "NP:" << NPadded << ", "
-                  << "K0:" << K0 << "}" << std::endl;
+                  << "AK0:" << AK0 << ", "
+                  << "BK0:" << BK0 << "}" << std::endl;
     }

     index_t M;

@@ -166,7 +175,8 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_dpp
     index_t StrideC;
     index_t MPadded;
     index_t NPadded;
-    index_t K0;
+    index_t AK0;
+    index_t BK0;
 };

 // Argument

@@ -196,69 +206,66 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_dpp
     using GridwiseGemmPipe = remove_cvref_t<
         decltype(GridwiseGemmPipeline_Selector<PipelineVer, NumGemmKPrefetchStage>())>;

-    __host__ __device__ static constexpr auto GetABlockDescriptor_K0PerBlock_MPerBlock_K1()
+    __host__ __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1()
     {
-        constexpr auto max_lds_align = K1;
-
         // A matrix in LDS memory, dst of blockwise copy
-        constexpr auto a_block_desc_k0_m_k1 = [&]() {
+        constexpr auto a_block_desc_ak0_m_ak1 = [&]() {
             if constexpr(ABlockLdsExtraM)
             {
                 return make_naive_tensor_descriptor(
-                    make_tuple(Number<K0PerBlock>{}, Number<MPerBlock>{}, K1),
-                    make_tuple(Number<MPerBlock + 1>{} * K1, K1, I1));
+                    make_tuple(Number<AK0PerBlock>{}, Number<MPerBlock>{}, AK1),
+                    make_tuple(Number<MPerBlock + 1>{} * AK1, AK1, I1));
             }
             else
             {
                 return make_naive_tensor_descriptor_aligned(
-                    make_tuple(Number<K0PerBlock>{}, Number<MPerBlock>{}, K1), max_lds_align);
+                    make_tuple(Number<AK0PerBlock>{}, Number<MPerBlock>{}, AK1), max_lds_align);
             }
         }();

-        return a_block_desc_k0_m_k1;
+        return a_block_desc_ak0_m_ak1;
     }

-    __host__ __device__ static constexpr auto GetBBlockDescriptor_K0PerBlock_NPerBlock_K1()
+    __host__ __device__ static constexpr auto GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1()
     {
-        constexpr auto max_lds_align = K1;
-
         // B matrix in LDS memory, dst of blockwise copy
-        constexpr auto b_block_desc_k0_n_k1 = [&]() {
+        constexpr auto b_block_desc_bk0_n_bk1 = [&]() {
             if constexpr(BBlockLdsExtraN)
             {
                 return make_naive_tensor_descriptor(
-                    make_tuple(Number<K0PerBlock>{}, Number<NPerBlock>{}, K1),
-                    make_tuple(Number<NPerBlock + 1>{} * K1, K1, I1));
+                    make_tuple(Number<BK0PerBlock>{}, Number<NPerBlock>{}, BK1),
+                    make_tuple(Number<NPerBlock + 1>{} * BK1, BK1, I1));
             }
             else
            {
                 return make_naive_tensor_descriptor_aligned(
-                    make_tuple(Number<K0PerBlock>{}, Number<NPerBlock>{}, K1), max_lds_align);
+                    make_tuple(Number<BK0PerBlock>{}, Number<NPerBlock>{}, BK1), max_lds_align);
             }
         }();

-        return b_block_desc_k0_n_k1;
+        return b_block_desc_bk0_n_bk1;
     }

     __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
     {
         // LDS allocation for A and B: be careful of alignment
-        constexpr auto a_block_desc_k0_m_k1 = GetABlockDescriptor_K0PerBlock_MPerBlock_K1();
-        constexpr auto b_block_desc_k0_n_k1 = GetBBlockDescriptor_K0PerBlock_NPerBlock_K1();
-
-        constexpr auto max_lds_align = K1;
+        constexpr auto a_block_desc_ak0_m_ak1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1();
+        constexpr auto b_block_desc_bk0_n_bk1 = GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1();

         constexpr auto a_block_space_size_aligned = math::integer_least_multiple(
-            a_block_desc_k0_m_k1.GetElementSpaceSize(), max_lds_align);
+            a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align);
         constexpr auto b_block_space_size_aligned = math::integer_least_multiple(
-            b_block_desc_k0_n_k1.GetElementSpaceSize(), max_lds_align);
+            b_block_desc_bk0_n_bk1.GetElementSpaceSize(), max_lds_align);

         return (a_block_space_size_aligned + b_block_space_size_aligned) * sizeof(ABDataType);
     }

     __host__ static constexpr bool CheckValidity(const Problem& problem)
     {
-        static_assert(is_known_at_compile_time<remove_cv_t<decltype(K1)>>::value,
-                      "Wrong! K1 must be known at the time of compilation.");
+        static_assert(is_known_at_compile_time<remove_cv_t<decltype(AK1)>>::value,
+                      "Wrong! AK1 must be known at the time of compilation.");
+        static_assert(is_known_at_compile_time<remove_cv_t<decltype(BK1)>>::value,
+                      "Wrong! BK1 must be known at the time of compilation.");

         static_assert(MPerBlock % (MPerDpp * MDppPerWave) == 0,

@@ -267,6 +274,18 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_dpp
         static_assert(NPerBlock % (NPerDpp * NDppPerWave) == 0,
                       "Invalid tuning parameters! NPerBlock must be divisible by NPerDpp * NDppPerWave.");

+        static_assert(KPerBlock % AK1Value == 0 && KPerBlock % BK1Value == 0,
+                      "Invalid tuning parameters! KPerBlock must be divisible by both AK1 and BK1.");
+
+        static_assert(AK1Value % ABlockTransferDstScalarPerVector_K1 == 0,
+                      "Invalid tuning parameters! AK1Value must be divisible by "
+                      "ABlockTransferDstScalarPerVector_K1");
+
+        static_assert(BK1Value % BBlockTransferDstScalarPerVector_K1 == 0,
+                      "Invalid tuning parameters! BK1Value must be divisible by "
+                      "BBlockTransferDstScalarPerVector_K1");
+
         if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::MPadding ||
                        GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding ||
                        GemmSpec == tensor_operation::device::GemmSpecialization::MKPadding ||

@@ -319,10 +338,13 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_dpp
             }
         }

-        // check gridwise gemm pipeline
-        const index_t K0       = problem.K / K1;
-        const auto num_k_loop  = K0 / K0PerBlock;
+        if(problem.K % KPerBlock != 0)
+        {
+            return false;
+        }

+        // check gridwise gemm pipeline
+        const auto num_k_loop = problem.K / KPerBlock;
         if(!GridwiseGemmPipe::IsSupported(num_k_loop))
         {
             return false;

@@ -333,7 +355,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_dpp
     __host__ static constexpr bool CalculateHasMainKBlockLoop(index_t K)
     {
-        const index_t num_loop = K / (K0PerBlock * K1);
+        const auto num_loop = K / KPerBlock;

         return GridwiseGemmPipe::CalculateHasMainLoop(num_loop);
     }

@@ -342,30 +364,33 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_dpp
     __host__ __device__ static constexpr auto
     MakeCGridDescriptor_M0_N0_M1_N1_M2_N2(const CGridDesc& c_grid_desc_m_n)
     {
-        constexpr auto a_block_desc_k0_m_k1 = GetABlockDescriptor_K0PerBlock_MPerBlock_K1();
-        constexpr auto b_block_desc_k0_n_k1 = GetBBlockDescriptor_K0PerBlock_NPerBlock_K1();
+        constexpr auto a_block_desc_ak0_m_ak1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1();
+        constexpr auto b_block_desc_bk0_n_bk1 = GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1();
+
+        constexpr index_t KPack = math::max(
+            math::lcm(AK1, BK1), DppSelector<ABDataType, MPerDpp, NPerDpp>::selected_dpp.k_per_dpp);

         using BlockwiseGemm =
-            BlockwiseGemmDpp_k0mk1_k0nk1_m0n0m1n1m2n2<BlockSize,
-                                                      ABDataType,
-                                                      AccDataType,
-                                                      decltype(a_block_desc_k0_m_k1),
-                                                      decltype(b_block_desc_k0_n_k1),
-                                                      MPerDpp,
-                                                      NPerDpp,
-                                                      MDppPerWave,
-                                                      NDppPerWave,
-                                                      K1>;
+            BlockwiseGemmDpp_ak0mak1_bk0nbk1_m0n0m1n1m2n2<BlockSize,
+                                                          ABDataType,
+                                                          AccDataType,
+                                                          decltype(a_block_desc_ak0_m_ak1),
+                                                          decltype(b_block_desc_bk0_n_bk1),
+                                                          MPerDpp,
+                                                          NPerDpp,
+                                                          MDppPerWave,
+                                                          NDppPerWave,
+                                                          KPack>;

         return BlockwiseGemm::MakeCGridDescriptor_M0_N0_M1_N1_M2_N2(c_grid_desc_m_n);
     }

     static constexpr auto matrix_padder =
         ck::tensor_operation::device::MatrixPadder<GemmSpec, index_t, index_t, index_t>{
-            MPerBlock, NPerBlock, K0PerBlock * K1};
+            MPerBlock, NPerBlock, KPerBlock};

     __device__ static auto
-    MakeAGridDescriptor_K0_M_K1(index_t M, index_t K, index_t K0, index_t StrideA)
+    MakeAGridDescriptor_AK0_M_AK1(index_t M, index_t K, index_t AK0, index_t StrideA)
     {
         const auto a_grid_desc_mraw_kraw = [&]() {
             if constexpr(is_same<tensor_layout::gemm::RowMajor, ALayout>::value)

@@ -381,14 +406,14 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_dpp
         const auto a_grid_desc_m_k = matrix_padder.PadADescriptor_M_K(a_grid_desc_mraw_kraw);

         return transform_tensor_descriptor(
             a_grid_desc_m_k,
-            make_tuple(make_unmerge_transform(make_tuple(K0, K1Value)),
+            make_tuple(make_unmerge_transform(make_tuple(AK0, AK1Value)),
                        make_pass_through_transform(M)),
             make_tuple(Sequence<1>{}, Sequence<0>{}),
             make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
     }

     __device__ static auto
-    MakeBGridDescriptor_K0_N_K1(index_t K, index_t N, index_t K0, index_t StrideB)
+    MakeBGridDescriptor_BK0_N_BK1(index_t K, index_t N, index_t BK0, index_t StrideB)
     {
         const auto b_grid_desc_nraw_kraw = [&]() {
             if constexpr(is_same<tensor_layout::gemm::RowMajor, BLayout>::value)

@@ -405,7 +430,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_dpp
         return transform_tensor_descriptor(
             b_grid_desc_n_k,
             make_tuple(make_pass_through_transform(N),
-                       make_unmerge_transform(make_tuple(K0, K1Value))),
+                       make_unmerge_transform(make_tuple(BK0, BK1Value))),
             make_tuple(Sequence<0>{}, Sequence<1>{}),
             make_tuple(Sequence<1>{}, Sequence<0, 2>{}));
     }

@@ -427,24 +452,24 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_dpp
     }

     template <bool HasMainKBlockLoop,
-              typename AGridDesc_K0_M_K1,
-              typename BGridDesc_K0_N_K1,
+              typename AGridDesc_AK0_M_AK1,
+              typename BGridDesc_BK0_N_BK1,
               typename CGridDesc_M_N>
     __device__ static void Run(const ABDataType* __restrict__ p_a_grid,
                                const ABDataType* __restrict__ p_b_grid,
                                CDataType* __restrict__ p_c_grid,
                                void* __restrict__ p_shared,
-                               const AGridDesc_K0_M_K1& a_grid_desc_k0_m_k1,
-                               const BGridDesc_K0_N_K1& b_grid_desc_k0_n_k1,
+                               const AGridDesc_AK0_M_AK1& a_grid_desc_ak0_m_ak1,
+                               const BGridDesc_BK0_N_BK1& b_grid_desc_bk0_n_bk1,
                                const CGridDesc_M_N& c_grid_desc_m_n)
     {
         const auto c_grid_desc_m0_n0_m1_n1_m2_n2 =
             MakeCGridDescriptor_M0_N0_M1_N1_M2_N2(c_grid_desc_m_n);

         const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-            p_a_grid, a_grid_desc_k0_m_k1.GetElementSpaceSize());
+            p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize());
         const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-            p_b_grid, b_grid_desc_k0_n_k1.GetElementSpaceSize());
+            p_b_grid, b_grid_desc_bk0_n_bk1.GetElementSpaceSize());
         auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_c_grid, c_grid_desc_m0_n0_m1_n1_m2_n2.GetElementSpaceSize());

@@ -473,25 +498,23 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_dpp
         const index_t n_block_data_idx_on_grid =
             __builtin_amdgcn_readfirstlane(block_work_idx[I1] * NPerBlock);

-        constexpr auto max_lds_align = K1;
-
         // A matrix in LDS memory, dst of blockwise copy
-        constexpr auto a_block_desc_k0_m_k1 = GetABlockDescriptor_K0PerBlock_MPerBlock_K1();
+        constexpr auto a_block_desc_ak0_m_ak1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1();

         // B matrix in LDS memory, dst of blockwise copy
-        constexpr auto b_block_desc_k0_n_k1 = GetBBlockDescriptor_K0PerBlock_NPerBlock_K1();
+        constexpr auto b_block_desc_bk0_n_bk1 = GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1();

         auto a_blockwise_copy =
             ThreadGroupTensorSliceTransfer_v4r1<ThisThreadBlock,
                                                 AElementwiseOperation,
                                                 ck::tensor_operation::element_wise::PassThrough,
                                                 InMemoryDataOperationEnum::Set,
-                                                Sequence<K0PerBlock, MPerBlock, K1>,
+                                                Sequence<AK0PerBlock, MPerBlock, AK1>,
                                                 ABlockTransferThreadClusterLengths_K0_M_K1,
                                                 ABlockTransferThreadClusterArrangeOrder,
                                                 ABDataType,
                                                 ABDataType,
-                                                decltype(a_grid_desc_k0_m_k1),
-                                                decltype(a_block_desc_k0_m_k1),
+                                                decltype(a_grid_desc_ak0_m_ak1),
+                                                decltype(a_block_desc_ak0_m_ak1),
                                                 ABlockTransferSrcAccessOrder,
                                                 Sequence<1, 0, 2>,
                                                 ABlockTransferSrcVectorDim,

@@ -503,10 +526,10 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_dpp
                                                 AThreadTransferSrcResetCoordinateAfterRun,
                                                 true,
                                                 NumGemmKPrefetchStage>(
-                a_grid_desc_k0_m_k1,
+                a_grid_desc_ak0_m_ak1,
                 make_multi_index(0, m_block_data_idx_on_grid, 0),
                 a_element_op,
-                a_block_desc_k0_m_k1,
+                a_block_desc_ak0_m_ak1,
                 make_multi_index(0, 0, 0),
                 ck::tensor_operation::element_wise::PassThrough{});

@@ -515,13 +538,13 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_dpp
                                                 BElementwiseOperation,
                                                 ck::tensor_operation::element_wise::PassThrough,
                                                 InMemoryDataOperationEnum::Set,
-                                                Sequence<K0PerBlock, NPerBlock, K1>,
+                                                Sequence<BK0PerBlock, NPerBlock, BK1>,
                                                 BBlockTransferThreadClusterLengths_K0_N_K1,
                                                 BBlockTransferThreadClusterArrangeOrder,
                                                 ABDataType,
                                                 ABDataType,
-                                                decltype(b_grid_desc_k0_n_k1),
-                                                decltype(b_block_desc_k0_n_k1),
+                                                decltype(b_grid_desc_bk0_n_bk1),
+                                                decltype(b_block_desc_bk0_n_bk1),
                                                 BBlockTransferSrcAccessOrder,
                                                 Sequence<1, 0, 2>,
                                                 BBlockTransferSrcVectorDim,

@@ -533,59 +556,62 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_dpp
                                                 BThreadTransferSrcResetCoordinateAfterRun,
                                                 true,
                                                 NumGemmKPrefetchStage>(
-                b_grid_desc_k0_n_k1,
+                b_grid_desc_bk0_n_bk1,
                 make_multi_index(0, n_block_data_idx_on_grid, 0),
                 b_element_op,
-                b_block_desc_k0_n_k1,
+                b_block_desc_bk0_n_bk1,
                 make_multi_index(0, 0, 0),
                 ck::tensor_operation::element_wise::PassThrough{});

         // GEMM definition
         // c_mtx += transpose(a_mtx) * b_mtx
-        // a_mtx[K0PerBlock, MPerBlock] is in LDS
-        // b_mtx[K0PerBlock, NPerBlock] is in LDS
+        // a_mtx[AK0PerBlock, MPerBlock] is in LDS
+        // b_mtx[BK0PerBlock, NPerBlock] is in LDS
         // c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in
         // register
+        constexpr index_t KPack = math::max(
+            math::lcm(AK1, BK1), DppSelector<ABDataType, MPerDpp, NPerDpp>::selected_dpp.k_per_dpp);
+
         auto blockwise_gemm =
-            BlockwiseGemmDpp_k0mk1_k0nk1_m0n0m1n1m2n2<BlockSize,
-                                                      ABDataType,
-                                                      AccDataType,
-                                                      decltype(a_block_desc_k0_m_k1),
-                                                      decltype(b_block_desc_k0_n_k1),
-                                                      MPerDpp,
-                                                      NPerDpp,
-                                                      MDppPerWave,
-                                                      NDppPerWave,
-                                                      K1>();
+            BlockwiseGemmDpp_ak0mak1_bk0nbk1_m0n0m1n1m2n2<BlockSize,
+                                                          ABDataType,
+                                                          AccDataType,
+                                                          decltype(a_block_desc_ak0_m_ak1),
+                                                          decltype(b_block_desc_bk0_n_bk1),
+                                                          MPerDpp,
+                                                          NPerDpp,
+                                                          MDppPerWave,
+                                                          NDppPerWave,
+                                                          KPack>();

         auto c_thread_buf = blockwise_gemm.GetCThreadBuffer();

         // LDS allocation for A and B: be careful of alignment
         constexpr auto a_block_space_size_aligned = math::integer_least_multiple(
-            a_block_desc_k0_m_k1.GetElementSpaceSize(), max_lds_align);
+            a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align);

         auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            static_cast<ABDataType*>(p_shared), a_block_desc_k0_m_k1.GetElementSpaceSize());
+            static_cast<ABDataType*>(p_shared), a_block_desc_ak0_m_ak1.GetElementSpaceSize());

         auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
             static_cast<ABDataType*>(p_shared) + a_block_space_size_aligned,
-            b_block_desc_k0_n_k1.GetElementSpaceSize());
+            b_block_desc_bk0_n_bk1.GetElementSpaceSize());

-        constexpr auto a_block_slice_copy_step = make_multi_index(K0PerBlock, 0, 0);
-        constexpr auto b_block_slice_copy_step = make_multi_index(K0PerBlock, 0, 0);
+        constexpr auto a_block_slice_copy_step = make_multi_index(AK0PerBlock, 0, 0);
+        constexpr auto b_block_slice_copy_step = make_multi_index(BK0PerBlock, 0, 0);

         // gridwise GEMM pipeline
-        const auto K0 = a_grid_desc_k0_m_k1.GetLength(I0);
-        const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane(K0 / K0PerBlock);
+        const auto AK0 = a_grid_desc_ak0_m_ak1.GetLength(I0);
+        // (AK0 / AK0PerBlock) is always equal to (BK0 / BK0PerBlock)
+        const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane(AK0 / AK0PerBlock);

-        GridwiseGemmPipe::template Run<HasMainKBlockLoop>(a_grid_desc_k0_m_k1,
-                                                          a_block_desc_k0_m_k1,
+        GridwiseGemmPipe::template Run<HasMainKBlockLoop>(a_grid_desc_ak0_m_ak1,
+                                                          a_block_desc_ak0_m_ak1,
                                                           a_blockwise_copy,
                                                           a_grid_buf,
                                                           a_block_buf,
                                                           a_block_slice_copy_step,
-                                                          b_grid_desc_k0_n_k1,
-                                                          b_block_desc_k0_n_k1,
+                                                          b_grid_desc_bk0_n_bk1,
+                                                          b_block_desc_bk0_n_bk1,
                                                           b_blockwise_copy,
                                                           b_grid_buf,
                                                           b_block_buf,
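To make the two new compile-time quantities concrete, a standalone sketch (plain C++, no CK headers) of the KPack and main-loop arithmetic used above; k_per_dpp stands in for DppSelector<...>::selected_dpp.k_per_dpp and its value here is an assumption.

// Standalone sketch of the KPack / main-loop arithmetic.
#include <algorithm> // std::max
#include <cassert>
#include <numeric>   // std::lcm

constexpr int KPerBlock   = 64, AK1 = 8, BK1 = 2;
constexpr int AK0PerBlock = KPerBlock / AK1; // 8
constexpr int BK0PerBlock = KPerBlock / BK1; // 32
constexpr int k_per_dpp   = 2;               // assumption for illustration

constexpr int KPack = std::max(std::lcm(AK1, BK1), k_per_dpp);
static_assert(KPack == 8, "lcm(8, 2) = 8 dominates k_per_dpp here");

int main()
{
    const int K   = 2048;
    const int AK0 = K / AK1; // 256
    const int BK0 = K / BK1; // 1024
    // A advances AK0PerBlock along its AK0 axis per iteration while B advances
    // BK0PerBlock along BK0, so both views agree on the number of K iterations.
    assert(AK0 / AK0PerBlock == BK0 / BK0PerBlock); // 32 == 32
    return 0;
}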
library/src/tensor_operation_instance/gpu/gemm/device_gemm_dpp_f16_f16_f16_km_kn_mn_instance.cpp

@@ -30,17 +30,17 @@ static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecializa
 // Compilation parameters for a[k, m] * b[k, n] = c[m, n]
 // clang-format off
 using device_gemm_dpp_f16_f16_f16_km_kn_mn_instances = std::tuple<
-    // ########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MDpp| NDpp| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
+    // ########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MDpp| NDpp| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
-    DeviceGemmDpp<F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128,  8, 8,    16, 16, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, 5, 1>,
+    DeviceGemmDpp<F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 64, 4, 4, 16, 16, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, 5, 1>,
-    DeviceGemmDpp<F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128,  8, 8,    32, 8, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, 5, 1>,
+    DeviceGemmDpp<F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 64, 4, 4, 32, 8, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, 5, 1>,
-    DeviceGemmDpp<F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128,  64,  64,  8, 8,    32, 8, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, 5, 1>,
+    DeviceGemmDpp<F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128,  64,  64, 64, 4, 4, 32, 8, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, 5, 1>,
-    DeviceGemmDpp<F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128,  32,  32,  4, 8,    32, 8, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, true, 5, 1>,
+    DeviceGemmDpp<F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128,  32,  32, 32, 4, 4, 32, 8, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, 5, 1>,
-    DeviceGemmDpp<F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault,  64,  64,  64,  8, 8,    32, 8, 2, 4, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, 5, 1>,
+    DeviceGemmDpp<F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault,  64,  64,  64, 64, 4, 4, 32, 8, 2, 4, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, true, 5, 1>,
-    DeviceGemmDpp<F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault,  32,  32,  32,  4, 8,    32, 8, 1, 4, S<4,  8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, S<4,  8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, 5, 1>,
+    DeviceGemmDpp<F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault,  32,  32,  32, 32, 4, 4, 32, 8, 1, 4, S<4,  8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, true, S<4,  8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, true, 5, 1>,
-    DeviceGemmDpp<F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault,  32,  16,  16,  2, 8,    16, 16, 1, 1, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, true, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, true, 5, 1>
+    DeviceGemmDpp<F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault,  32,  16,  16, 16, 4, 4, 16, 16, 1, 1, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, 5, 1>
     >;
 // clang-format on
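As a sanity check on one updated instance (illustrative only): the first (Col, Row, Row) instance above uses KPerBlock = 64 with AK1 = BK1 = 4 and DstScalarPerVector_K1 = 4 on both sides, which satisfies the divisibility static_asserts added in gridwise_gemm_dpp.hpp.

// Illustrative check of the first updated (Col, Row, Row) instance's K parameters.
constexpr int KPerBlock = 64, AK1 = 4, BK1 = 4;
constexpr int ABlockTransferDstScalarPerVector_K1 = 4; // the instance's A-side PerVector_K1
constexpr int BBlockTransferDstScalarPerVector_K1 = 4; // the instance's B-side PerVector_K1

static_assert(KPerBlock % AK1 == 0 && KPerBlock % BK1 == 0,
              "KPerBlock must be divisible by both AK1 and BK1");
static_assert(AK1 % ABlockTransferDstScalarPerVector_K1 == 0, "A-side vector width fits AK1");
static_assert(BK1 % BBlockTransferDstScalarPerVector_K1 == 0, "B-side vector width fits BK1");
static_assert(KPerBlock / AK1 == 16 && KPerBlock / BK1 == 16, "AK0PerBlock == BK0PerBlock == 16");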
library/src/tensor_operation_instance/gpu/gemm/device_gemm_dpp_f16_f16_f16_km_nk_mn_instance.cpp

@@ -30,17 +30,17 @@ static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecializa
 // Compilation parameters for a[k, m] * b[n, k] = c[m, n]
 // clang-format off
 using device_gemm_dpp_f16_f16_f16_km_nk_mn_instances = std::tuple<
-    // ########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MDpp| NDpp| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
+    // ########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MDpp| NDpp| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
-    DeviceGemmDpp<F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128,  8, 8,    16, 16, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
+    DeviceGemmDpp<F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 64, 4, 8, 16, 16, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
-    DeviceGemmDpp<F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128,  8, 8,    32, 8, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
+    DeviceGemmDpp<F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 64, 4, 8, 32, 8, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
-    DeviceGemmDpp<F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128,  64,  64,  8, 8,    32, 8, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
+    DeviceGemmDpp<F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128,  64,  64, 64, 4, 8, 32, 8, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
-    DeviceGemmDpp<F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128,  32,  32,  4, 8,    32, 8, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
+    DeviceGemmDpp<F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128,  32,  32, 32, 4, 8, 32, 8, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
-    DeviceGemmDpp<F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault,  64,  64,  64,  8, 8,    32, 8, 2, 4, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
+    DeviceGemmDpp<F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault,  64,  64,  64, 64, 4, 8, 32, 8, 2, 4, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
-    DeviceGemmDpp<F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault,  32,  32,  32,  4, 8,    32, 8, 1, 4, S<4,  8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, S<4,  8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
+    DeviceGemmDpp<F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault,  32,  32,  32, 32, 4, 8, 32, 8, 1, 4, S<4,  8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, true, S<4,  8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
-    DeviceGemmDpp<F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault,  32,  16,  16,  2, 8,    16, 16, 1, 1, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, true, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>
+    DeviceGemmDpp<F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault,  32,  16,  16, 16, 4, 8, 16, 16, 1, 1, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>
     >;
 // clang-format on
library/src/tensor_operation_instance/gpu/gemm/device_gemm_dpp_f16_f16_f16_mk_kn_mn_instance.cpp
View file @
77e63b34
...
@@ -29,17 +29,17 @@ static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecializa
...
@@ -29,17 +29,17 @@ static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecializa
// Compilation parameters for a[m, k] * b[k, n] = c[m, n]
// Compilation parameters for a[m, k] * b[k, n] = c[m, n]
// clang-format off
// clang-format off
using
device_gemm_dpp_f16_f16_f16_mk_kn_mn_instances
=
std
::
tuple
<
using
device_gemm_dpp_f16_f16_f16_mk_kn_mn_instances
=
std
::
tuple
<
// ########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K
0
Per| K1| MPer| NPer| MDpp| NDpp| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
// ########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer|
KPer|
AK1| B
K1| MPer| NPer| MDpp| NDpp| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
// ########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | Dpp| Dpp| PerWave| PerWave| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar|
// ########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block|
|
| Dpp| Dpp| PerWave| PerWave| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar|
// ########| | | | | | | | Operation| Operation| Operation| | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector|
// ########| | | | | | | | Operation| Operation| Operation| | | | | |
|
| | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector|
// ########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
// ########| | | | | | | | | | | | | | | |
|
| | | | | | | | | | | | | | | | | | | | |
DeviceGemmDpp
<
F16
,
F16
,
F16
,
F32
,
Row
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
256
,
128
,
128
,
8
,
8
,
16
,
16
,
2
,
4
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
true
,
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
2
,
2
,
true
,
5
,
1
>
,
DeviceGemmDpp
<
F16
,
F16
,
F16
,
F32
,
Row
,
Row
,
Row
,
PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 64, 8, 4, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, 5, 1>,
DeviceGemmDpp<F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 8, 8, 32, 8, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, 5, 1>,
DeviceGemmDpp<F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 64, 8, 4, 32, 8, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, 5, 1>,
DeviceGemmDpp<F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 64, 8, 8, 32, 8, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, 5, 1>,
DeviceGemmDpp<F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 64, 64, 8, 4, 32, 8, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, 5, 1>,
DeviceGemmDpp<F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 32, 32, 4, 8, 32, 8, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, true, 5, 1>,
DeviceGemmDpp<F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 32, 32, 32, 8, 4, 32, 8, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, 5, 1>,
DeviceGemmDpp<F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 64, 64, 8, 8, 32, 8, 2, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, 5, 1>,
DeviceGemmDpp<F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 64, 64, 64, 8, 4, 32, 8, 2, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, true, 5, 1>,
DeviceGemmDpp<F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 32, 32, 32, 4, 8, 32, 8, 1, 4, S<4, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, 5, 1>,
DeviceGemmDpp<F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 32, 32, 32, 32, 8, 4, 32, 8, 1, 4, S<4, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, true, 5, 1>,
DeviceGemmDpp<F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 32, 16, 16, 2, 8, 16, 16, 1, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, true, 5, 1>,
DeviceGemmDpp<F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 32, 16, 16, 16, 8, 4, 16, 16, 1, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, 5, 1>
>;
// clang-format on
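The instance list above shows why a single K0 no longer works once A and B get their own K1: with KPerBlock = 64 but AK1 = 8 and BK1 = 4, the two operands need different K0 values. Below is a minimal sketch of that arithmetic, assuming the usual CK convention that K0 * K1 spans the K tile; it is an illustration only, not code from this commit.

#include <cstdio>

int main()
{
    // Values taken from the 256-thread, 128x128x64 instance above.
    constexpr int KPerBlock = 64;
    constexpr int AK1       = 8; // K-vector length for the A block copy
    constexpr int BK1       = 4; // K-vector length for the B block copy

    // Assumed relation: K0 * K1 == KPerBlock for each operand.
    constexpr int AK0 = KPerBlock / AK1; // 8
    constexpr int BK0 = KPerBlock / BK1; // 16

    static_assert(AK0 * AK1 == KPerBlock && BK0 * BK1 == KPerBlock,
                  "each operand's K0 * K1 must cover the whole K tile");

    // AK0 != BK0, so the old single K0 template parameter could not describe both operands.
    std::printf("AK0 = %d, BK0 = %d\n", AK0, BK0);
    return 0;
}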
...
library/src/tensor_operation_instance/gpu/gemm/device_gemm_dpp_f16_f16_f16_mk_nk_mn_instance.cpp
View file @ 77e63b34
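Before the diff itself: the file below instantiates GEMMs for a[m, k] * b[n, k] = c[m, n] (see the comment in the hunk). Since b[n, k] is B transposed, B is column-major in the (k, n) view, which is why these instances use the Row/Col/Row layout triple and why both operands can be read with K-contiguous vectors (AK1 = BK1 = 8 below, unlike the row-major-B file above where BK1 drops to 4). A minimal stride sketch under those assumptions, with made-up problem sizes (illustration only):

#include <cstdio>

int main()
{
    // Hypothetical problem sizes; only the stride pattern matters here.
    constexpr int M = 3840, N = 4096, K = 4096;

    constexpr int StrideA = K; // a[m, k], row-major: k is the fast axis
    constexpr int StrideB = K; // b[n, k] = B^T: k is the fast axis, hence BLayout = Col
    constexpr int StrideC = N; // c[m, n], row-major

    std::printf("M=%d N=%d K=%d StrideA=%d StrideB=%d StrideC=%d\n",
                M, N, K, StrideA, StrideB, StrideC);
    return 0;
}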
...
@@ -30,17 +30,17 @@ static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecializa
// Compilation parameters for a[m, k] * b[n, k] = c[m, n]
// clang-format off
using device_gemm_dpp_f16_f16_f16_mk_nk_mn_instances = std::tuple<
// ########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MDpp| NDpp| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
// ########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MDpp| NDpp| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
// ########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | Dpp| Dpp| PerWave| PerWave| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar|
// ########| | | | | | | | Operation| Operation| Operation| | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector|
// ########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
DeviceGemmDpp<F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 8, 8, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp<F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 64, 8, 8, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp<F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 8, 8, 32, 8, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp<F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 64, 8, 8, 32, 8, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp<F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 64, 8, 8, 32, 8, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp<F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 64, 64, 8, 8, 32, 8, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp<F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 32, 32, 4, 8, 32, 8, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp<F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 32, 32, 32, 8, 8, 32, 8, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp<F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 64, 64, 8, 8, 32, 8, 2, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp<F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 64, 64, 64, 8, 8, 32, 8, 2, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp<F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 32, 32, 32, 4, 8, 32, 8, 1, 4, S<4, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp<F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 32, 32, 32, 32, 8, 8, 32, 8, 1, 4, S<4, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp<F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 32, 16, 16, 2, 8, 16, 16, 1, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp<F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 32, 16, 16, 16, 8, 8, 16, 16, 1, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>
>;
// clang-format on
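For context on how one of the entries collected in these tuples is actually exercised, the sketch below follows the pattern used by the 01_gemm examples (MakeArgument / MakeInvoker / IsSupportedArgument / Run). It is an illustration only: the type aliases (F16, F32, Row, Col, PassThrough, GemmDefault, S) are assumed to be the ones declared at the top of this instance file, the template arguments are copied from one mk_nk_mn instance above, and the buffer handling is a placeholder rather than this repository's test harness.

#include <hip/hip_runtime.h>

int run_one_dpp_instance()
{
    // Hypothetical problem; strides follow the Row/Col/Row layouts of this file.
    const ck::index_t M = 3840, N = 4096, K = 4096;
    const ck::index_t StrideA = K, StrideB = K, StrideC = N;

    // One instance copied from the tuple above (KPerBlock = 64, AK1 = BK1 = 8).
    using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDpp<
        F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough,
        GemmDefault, 256, 128, 128, 64, 8, 8, 16, 16, 2, 4,
        S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
        S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>;

    // Placeholder device buffers (a real harness would also fill and verify them).
    F16* a_dev = nullptr;
    F16* b_dev = nullptr;
    F16* c_dev = nullptr;
    hipMalloc(&a_dev, sizeof(F16) * M * K);
    hipMalloc(&b_dev, sizeof(F16) * K * N);
    hipMalloc(&c_dev, sizeof(F16) * M * N);

    auto gemm     = DeviceGemmInstance{};
    auto invoker  = gemm.MakeInvoker();
    auto argument = gemm.MakeArgument(a_dev, b_dev, c_dev, M, N, K,
                                      StrideA, StrideB, StrideC,
                                      PassThrough{}, PassThrough{}, PassThrough{});

    if(!gemm.IsSupportedArgument(argument))
    {
        return 1; // this tile shape does not support the requested problem
    }

    invoker.Run(argument, StreamConfig{nullptr, false});

    hipFree(a_dev);
    hipFree(b_dev);
    hipFree(c_dev);
    return 0;
}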
...