"include/ck/utility/math.hpp" did not exist on "21f7e9f103231b01889ff30ed9f016fc89d3a669"
Unverified Commit 547dbcfb authored by Bartlomiej Wroblewski's avatar Bartlomiej Wroblewski Committed by GitHub
Browse files

Add new instances and support for small cases in DPP8 GEMM (#896)

parent 85e2e1e2
......@@ -11,8 +11,14 @@ namespace ck {
enum struct DppInstr
{
dpp8_f16_16x16x2 = 0,
dpp8_f16_1x32x2 = 0,
dpp8_f16_2x16x2,
dpp8_f16_2x32x2,
dpp8_f16_4x16x2,
dpp8_f16_4x32x2,
dpp8_f16_8x16x2,
dpp8_f16_8x32x2,
dpp8_f16_16x16x2,
dpp8_f16_32x8x2
};
......@@ -101,6 +107,36 @@ struct dpp_type<DppInstr::dpp8_f16_8x32x2>
}
};
template <>
struct dpp_type<DppInstr::dpp8_f16_8x16x2>
{
static constexpr index_t wave_size = 32;
static constexpr index_t lanegroup_size = 8;
static constexpr index_t m_per_wave = 8;
static constexpr index_t n_per_wave = 16;
static constexpr index_t m_per_lanegroup = 4;
static constexpr index_t n_per_lanegroup = 8;
static constexpr index_t m_per_thread = 4;
static constexpr index_t n_per_thread = 1;
static constexpr index_t k_per_dpp = 2;
static constexpr bool share_a = true;
using BaseType = half_t;
template <index_t MPerDpp, index_t NPerDpp, class ADataType, class BDataType, class CDataType>
__device__ void run(const ADataType& a, const BDataType& b, CDataType& reg_c) const
{
dpp8::DppLanegroupGemm<m_per_thread,
n_per_thread,
k_per_dpp,
BaseType,
ADataType,
BDataType,
CDataType,
share_a>{}
.Run(a, b, reg_c);
}
};
template <>
struct dpp_type<DppInstr::dpp8_f16_16x16x2>
{
......@@ -131,6 +167,156 @@ struct dpp_type<DppInstr::dpp8_f16_16x16x2>
}
};
template <>
struct dpp_type<DppInstr::dpp8_f16_4x32x2>
{
static constexpr index_t wave_size = 32;
static constexpr index_t lanegroup_size = 8;
static constexpr index_t m_per_wave = 4;
static constexpr index_t n_per_wave = 32;
static constexpr index_t m_per_lanegroup = 4;
static constexpr index_t n_per_lanegroup = 8;
static constexpr index_t m_per_thread = 4;
static constexpr index_t n_per_thread = 1;
static constexpr index_t k_per_dpp = 2;
static constexpr bool share_a = true;
using BaseType = half_t;
template <index_t MPerDpp, index_t NPerDpp, class ADataType, class BDataType, class CDataType>
__device__ void run(const ADataType& a, const BDataType& b, CDataType& reg_c) const
{
dpp8::DppLanegroupGemm<m_per_thread,
n_per_thread,
k_per_dpp,
BaseType,
ADataType,
BDataType,
CDataType,
share_a>{}
.Run(a, b, reg_c);
}
};
template <>
struct dpp_type<DppInstr::dpp8_f16_4x16x2>
{
static constexpr index_t wave_size = 32;
static constexpr index_t lanegroup_size = 8;
static constexpr index_t m_per_wave = 4;
static constexpr index_t n_per_wave = 16;
static constexpr index_t m_per_lanegroup = 2;
static constexpr index_t n_per_lanegroup = 8;
static constexpr index_t m_per_thread = 2;
static constexpr index_t n_per_thread = 1;
static constexpr index_t k_per_dpp = 2;
static constexpr bool share_a = true;
using BaseType = half_t;
template <index_t MPerDpp, index_t NPerDpp, class ADataType, class BDataType, class CDataType>
__device__ void run(const ADataType& a, const BDataType& b, CDataType& reg_c) const
{
dpp8::DppLanegroupGemm<m_per_thread,
n_per_thread,
k_per_dpp,
BaseType,
ADataType,
BDataType,
CDataType,
share_a>{}
.Run(a, b, reg_c);
}
};
template <>
struct dpp_type<DppInstr::dpp8_f16_1x32x2>
{
static constexpr index_t wave_size = 32;
static constexpr index_t lanegroup_size = 8;
static constexpr index_t m_per_wave = 1;
static constexpr index_t n_per_wave = 32;
static constexpr index_t m_per_lanegroup = 1;
static constexpr index_t n_per_lanegroup = 8;
static constexpr index_t m_per_thread = 1;
static constexpr index_t n_per_thread = 1;
static constexpr index_t k_per_dpp = 2;
static constexpr bool share_a = true;
using BaseType = half_t;
template <index_t MPerDpp, index_t NPerDpp, class ADataType, class BDataType, class CDataType>
__device__ void run(const ADataType& a, const BDataType& b, CDataType& reg_c) const
{
dpp8::DppLanegroupGemm<m_per_thread,
n_per_thread,
k_per_dpp,
BaseType,
ADataType,
BDataType,
CDataType,
share_a>{}
.Run(a, b, reg_c);
}
};
template <>
struct dpp_type<DppInstr::dpp8_f16_2x32x2>
{
static constexpr index_t wave_size = 32;
static constexpr index_t lanegroup_size = 8;
static constexpr index_t m_per_wave = 2;
static constexpr index_t n_per_wave = 32;
static constexpr index_t m_per_lanegroup = 2;
static constexpr index_t n_per_lanegroup = 8;
static constexpr index_t m_per_thread = 2;
static constexpr index_t n_per_thread = 1;
static constexpr index_t k_per_dpp = 2;
static constexpr bool share_a = true;
using BaseType = half_t;
template <index_t MPerDpp, index_t NPerDpp, class ADataType, class BDataType, class CDataType>
__device__ void run(const ADataType& a, const BDataType& b, CDataType& reg_c) const
{
dpp8::DppLanegroupGemm<m_per_thread,
n_per_thread,
k_per_dpp,
BaseType,
ADataType,
BDataType,
CDataType,
share_a>{}
.Run(a, b, reg_c);
}
};
template <>
struct dpp_type<DppInstr::dpp8_f16_2x16x2>
{
static constexpr index_t wave_size = 32;
static constexpr index_t lanegroup_size = 8;
static constexpr index_t m_per_wave = 2;
static constexpr index_t n_per_wave = 16;
static constexpr index_t m_per_lanegroup = 1;
static constexpr index_t n_per_lanegroup = 8;
static constexpr index_t m_per_thread = 1;
static constexpr index_t n_per_thread = 1;
static constexpr index_t k_per_dpp = 2;
static constexpr bool share_a = true;
using BaseType = half_t;
template <index_t MPerDpp, index_t NPerDpp, class ADataType, class BDataType, class CDataType>
__device__ void run(const ADataType& a, const BDataType& b, CDataType& reg_c) const
{
dpp8::DppLanegroupGemm<m_per_thread,
n_per_thread,
k_per_dpp,
BaseType,
ADataType,
BDataType,
CDataType,
share_a>{}
.Run(a, b, reg_c);
}
};
template <typename BaseType, index_t MPerDpp, index_t NPerDpp>
struct DppSelector
{
......@@ -143,6 +329,12 @@ struct DppSelector
return DppInstr::dpp8_f16_8x32x2;
}
template <>
static constexpr auto GetDpp<half_t, 8, 16>()
{
return DppInstr::dpp8_f16_8x16x2;
}
template <>
static constexpr auto GetDpp<half_t, 16, 16>()
{
......@@ -155,6 +347,36 @@ struct DppSelector
return DppInstr::dpp8_f16_32x8x2;
}
template <>
static constexpr auto GetDpp<half_t, 1, 32>()
{
return DppInstr::dpp8_f16_1x32x2;
}
template <>
static constexpr auto GetDpp<half_t, 2, 32>()
{
return DppInstr::dpp8_f16_2x32x2;
}
template <>
static constexpr auto GetDpp<half_t, 2, 16>()
{
return DppInstr::dpp8_f16_2x16x2;
}
template <>
static constexpr auto GetDpp<half_t, 4, 16>()
{
return DppInstr::dpp8_f16_4x16x2;
}
template <>
static constexpr auto GetDpp<half_t, 4, 32>()
{
return DppInstr::dpp8_f16_4x32x2;
}
static constexpr auto selected_dpp = dpp_type<GetDpp<BaseType, MPerDpp, NPerDpp>()>{};
__host__ __device__ constexpr DppSelector()
......@@ -191,7 +413,6 @@ struct DppSelector
// in the future when the implementation is more generalized.
static_assert(selected_dpp.share_a);
static_assert(selected_dpp.n_per_thread == 1);
static_assert(selected_dpp.m_per_thread == selected_dpp.lanegroup_size);
static_assert(selected_dpp.m_per_lanegroup == selected_dpp.m_per_thread);
static_assert(selected_dpp.n_per_lanegroup ==
selected_dpp.n_per_thread * selected_dpp.lanegroup_size);
......@@ -215,11 +436,6 @@ struct DppGemm
__host__ __device__ constexpr DppGemm()
{
static_assert(MPerDpp == 8 || MPerDpp == 16 || MPerDpp == 32,
"MPerDpp must be either 8, 16 or 32.");
static_assert(NPerDpp == 8 || NPerDpp == 16 || NPerDpp == 32,
"NPerDpp must be either 8, 16 or 32.");
static_assert(KPack % dpp_instr.k_per_dpp == 0, "KPack must be divisible by k_per_dpp.");
}
......
......@@ -23,12 +23,17 @@ void add_device_gemm_dl_f16_f16_f16_km_kn_mn_instances(
DeviceGemm<Col, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
instances);
void add_device_gemm_dl_f16_f16_f16_km_kn_mn_irregular_instances(
std::vector<std::unique_ptr<
DeviceGemm<Col, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
instances);
void add_device_gemm_dpp_f16_f16_f16_km_kn_mn_instances(
std::vector<std::unique_ptr<
DeviceGemm<Col, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
instances);
void add_device_gemm_dl_f16_f16_f16_km_kn_mn_irregular_instances(
void add_device_gemm_dpp_f16_f16_f16_km_kn_mn_irregular_instances(
std::vector<std::unique_ptr<
DeviceGemm<Col, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
instances);
......@@ -38,12 +43,17 @@ void add_device_gemm_dl_f16_f16_f16_km_nk_mn_instances(
DeviceGemm<Col, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
instances);
void add_device_gemm_dl_f16_f16_f16_km_nk_mn_irregular_instances(
std::vector<std::unique_ptr<
DeviceGemm<Col, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
instances);
void add_device_gemm_dpp_f16_f16_f16_km_nk_mn_instances(
std::vector<std::unique_ptr<
DeviceGemm<Col, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
instances);
void add_device_gemm_dl_f16_f16_f16_km_nk_mn_irregular_instances(
void add_device_gemm_dpp_f16_f16_f16_km_nk_mn_irregular_instances(
std::vector<std::unique_ptr<
DeviceGemm<Col, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
instances);
......@@ -53,12 +63,17 @@ void add_device_gemm_dl_f16_f16_f16_mk_kn_mn_instances(
DeviceGemm<Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
instances);
void add_device_gemm_dl_f16_f16_f16_mk_kn_mn_irregular_instances(
std::vector<std::unique_ptr<
DeviceGemm<Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
instances);
void add_device_gemm_dpp_f16_f16_f16_mk_kn_mn_instances(
std::vector<std::unique_ptr<
DeviceGemm<Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
instances);
void add_device_gemm_dl_f16_f16_f16_mk_kn_mn_irregular_instances(
void add_device_gemm_dpp_f16_f16_f16_mk_kn_mn_irregular_instances(
std::vector<std::unique_ptr<
DeviceGemm<Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
instances);
......@@ -68,12 +83,17 @@ void add_device_gemm_dl_f16_f16_f16_mk_nk_mn_instances(
DeviceGemm<Row, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
instances);
void add_device_gemm_dl_f16_f16_f16_mk_nk_mn_irregular_instances(
std::vector<std::unique_ptr<
DeviceGemm<Row, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
instances);
void add_device_gemm_dpp_f16_f16_f16_mk_nk_mn_instances(
std::vector<std::unique_ptr<
DeviceGemm<Row, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
instances);
void add_device_gemm_dl_f16_f16_f16_mk_nk_mn_irregular_instances(
void add_device_gemm_dpp_f16_f16_f16_mk_nk_mn_irregular_instances(
std::vector<std::unique_ptr<
DeviceGemm<Row, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
instances);
......@@ -375,6 +395,7 @@ struct DeviceOperationInstanceFactory<
add_device_gemm_dl_f16_f16_f16_mk_kn_mn_instances(op_ptrs);
add_device_gemm_dl_f16_f16_f16_mk_kn_mn_irregular_instances(op_ptrs);
add_device_gemm_dpp_f16_f16_f16_mk_kn_mn_instances(op_ptrs);
add_device_gemm_dpp_f16_f16_f16_mk_kn_mn_irregular_instances(op_ptrs);
#endif
add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances(op_ptrs);
}
......@@ -386,6 +407,7 @@ struct DeviceOperationInstanceFactory<
add_device_gemm_dl_f16_f16_f16_mk_nk_mn_instances(op_ptrs);
add_device_gemm_dl_f16_f16_f16_mk_nk_mn_irregular_instances(op_ptrs);
add_device_gemm_dpp_f16_f16_f16_mk_nk_mn_instances(op_ptrs);
add_device_gemm_dpp_f16_f16_f16_mk_nk_mn_irregular_instances(op_ptrs);
#endif
add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances(op_ptrs);
add_device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instances(op_ptrs);
......@@ -398,6 +420,7 @@ struct DeviceOperationInstanceFactory<
add_device_gemm_dl_f16_f16_f16_km_kn_mn_instances(op_ptrs);
add_device_gemm_dl_f16_f16_f16_km_kn_mn_irregular_instances(op_ptrs);
add_device_gemm_dpp_f16_f16_f16_km_kn_mn_instances(op_ptrs);
add_device_gemm_dpp_f16_f16_f16_km_kn_mn_irregular_instances(op_ptrs);
#endif
add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances(op_ptrs);
}
......@@ -409,6 +432,7 @@ struct DeviceOperationInstanceFactory<
add_device_gemm_dl_f16_f16_f16_km_nk_mn_instances(op_ptrs);
add_device_gemm_dl_f16_f16_f16_km_nk_mn_irregular_instances(op_ptrs);
add_device_gemm_dpp_f16_f16_f16_km_nk_mn_instances(op_ptrs);
add_device_gemm_dpp_f16_f16_f16_km_nk_mn_irregular_instances(op_ptrs);
#endif
add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances(op_ptrs);
}
......
......@@ -32,9 +32,13 @@ if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
list(APPEND GEMM_INSTANCES device_gemm_dl_f16_f16_f16_km_nk_mn_instance.cpp)
list(APPEND GEMM_INSTANCES device_gemm_dl_f16_f16_f16_km_nk_mn_irregular_instance.cpp)
list(APPEND GEMM_INSTANCES device_gemm_dpp_f16_f16_f16_km_kn_mn_instance.cpp)
list(APPEND GEMM_INSTANCES device_gemm_dpp_f16_f16_f16_km_kn_mn_irregular_instance.cpp)
list(APPEND GEMM_INSTANCES device_gemm_dpp_f16_f16_f16_km_nk_mn_instance.cpp)
list(APPEND GEMM_INSTANCES device_gemm_dpp_f16_f16_f16_km_nk_mn_irregular_instance.cpp)
list(APPEND GEMM_INSTANCES device_gemm_dpp_f16_f16_f16_mk_kn_mn_instance.cpp)
list(APPEND GEMM_INSTANCES device_gemm_dpp_f16_f16_f16_mk_kn_mn_irregular_instance.cpp)
list(APPEND GEMM_INSTANCES device_gemm_dpp_f16_f16_f16_mk_nk_mn_instance.cpp)
list(APPEND GEMM_INSTANCES device_gemm_dpp_f16_f16_f16_mk_nk_mn_irregular_instance.cpp)
endif()
list(APPEND GEMM_INSTANCES device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp)
list(APPEND GEMM_INSTANCES device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp)
......
......@@ -36,11 +36,21 @@ using device_gemm_dpp_f16_f16_f16_km_kn_mn_instances = std::tuple<
// ########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
DeviceGemmDpp< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 64, 4, 4, 16, 16, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 64, 4, 4, 32, 8, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 64, 64, 64, 4, 4, 16, 16, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 64, 64, 4, 4, 32, 8, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 32, 64, 64, 4, 4, 16, 16, 1, 2, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 32, 32, 32, 4, 4, 32, 8, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 64, 64, 64, 4, 4, 32, 8, 2, 4, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 32, 32, 32, 32, 4, 4, 32, 8, 1, 4, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, true, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 32, 16, 16, 16, 4, 4, 16, 16, 1, 1, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, 5, 1>
DeviceGemmDpp< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 1, 128, 64, 4, 4, 1, 32, 1, 1, S<4, 1, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 16, 32, 64, 4, 4, 8, 32, 1, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 16, 16, 64, 4, 4, 8, 16, 1, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 4, 32, 64, 4, 4, 2, 32, 1, 1, S<4, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 2, 64, 64, 4, 4, 2, 32, 1, 1, S<4, 2, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 2, 32, 64, 4, 4, 1, 32, 1, 1, S<4, 2, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 1, 64, 64, 4, 4, 1, 32, 1, 1, S<4, 1, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 32, 8, 16, 64, 4, 4, 4, 16, 2, 1, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 32, 8, 16, 32, 4, 4, 8, 16, 1, 1, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 32, 4, 32, 32, 4, 4, 4, 32, 1, 1, S<4, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 32, 2, 16, 32, 4, 4, 2, 16, 1, 1, S<4, 2, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, 5, 1>
>;
// clang-format on
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_dpp.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
using F16 = ck::half_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
static constexpr auto MNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding;
// Compilation parameters for a[k, m] * b[k, n] = c[m, n]
// clang-format off
using device_gemm_dpp_f16_f16_f16_km_kn_mn_irregular_instances = std::tuple<
// ########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MDpp| NDpp| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
// ########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | | Dpp| Dpp| PerWave| PerWave| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar|
// ########| | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector|
// ########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
DeviceGemmDpp< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, MNPadding, 256, 128, 128, 64, 4, 4, 16, 16, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, MNPadding, 256, 64, 64, 64, 4, 4, 16, 16, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, MNPadding, 128, 64, 64, 64, 4, 4, 32, 8, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, MNPadding, 128, 32, 32, 64, 4, 4, 32, 8, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, MNPadding, 128, 4, 32, 64, 4, 4, 1, 32, 1, 1, S<4, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, MNPadding, 64, 64, 64, 32, 4, 4, 32, 8, 2, 4, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, MNPadding, 64, 16, 32, 64, 4, 4, 8, 32, 1, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, MNPadding, 64, 16, 16, 64, 4, 4, 8, 16, 1, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, MNPadding, 64, 2, 32, 64, 4, 4, 1, 32, 1, 1, S<4, 2, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, MNPadding, 32, 32, 16, 64, 4, 4, 16, 16, 2, 1, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, true, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, MNPadding, 32, 8, 16, 64, 4, 4, 4, 16, 2, 1, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, MNPadding, 32, 1, 32, 64, 4, 4, 1, 32, 1, 1, S<4, 1, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, true, 5, 1>
>;
// clang-format on
void add_device_gemm_dpp_f16_f16_f16_km_kn_mn_irregular_instances(
std::vector<std::unique_ptr<
DeviceGemm<Col, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
instances)
{
add_device_operation_instances(instances,
device_gemm_dpp_f16_f16_f16_km_kn_mn_irregular_instances{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
......@@ -36,11 +36,21 @@ using device_gemm_dpp_f16_f16_f16_km_nk_mn_instances = std::tuple<
// ########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
DeviceGemmDpp< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 64, 4, 8, 16, 16, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 64, 4, 8, 32, 8, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 64, 64, 64, 4, 8, 16, 16, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 64, 64, 4, 8, 32, 8, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 32, 64, 64, 4, 8, 16, 16, 1, 2, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 32, 32, 32, 4, 8, 32, 8, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 64, 64, 64, 4, 8, 32, 8, 2, 4, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 32, 32, 32, 32, 4, 8, 32, 8, 1, 4, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, true, S<4, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 32, 16, 16, 16, 4, 8, 16, 16, 1, 1, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>
DeviceGemmDpp< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 1, 128, 64, 4, 8, 1, 32, 1, 1, S<4, 1, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 16, 32, 64, 4, 8, 8, 32, 1, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 16, 16, 64, 4, 8, 8, 16, 1, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 4, 32, 64, 4, 8, 2, 32, 1, 1, S<4, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 2, 64, 64, 4, 8, 2, 32, 1, 1, S<4, 2, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 2, 32, 64, 4, 8, 1, 32, 1, 1, S<4, 2, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 1, 64, 64, 4, 8, 1, 32, 1, 1, S<4, 1, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 32, 8, 16, 64, 4, 8, 4, 16, 2, 1, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, S<4, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 32, 8, 16, 32, 4, 8, 8, 16, 1, 1, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, S<4, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 32, 4, 32, 32, 4, 8, 4, 32, 1, 1, S<4, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 32, 2, 16, 32, 4, 8, 2, 16, 1, 1, S<4, 2, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>
>;
// clang-format on
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_dpp.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
using F16 = ck::half_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
static constexpr auto MNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding;
// Compilation parameters for a[k, m] * b[n, k] = c[m, n]
// clang-format off
using device_gemm_dpp_f16_f16_f16_km_nk_mn_irregular_instances = std::tuple<
// ########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MDpp| NDpp| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
// ########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | | Dpp| Dpp| PerWave| PerWave| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar|
// ########| | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector|
// ########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
DeviceGemmDpp< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, MNPadding, 256, 128, 128, 64, 4, 8, 16, 16, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, MNPadding, 128, 64, 64, 64, 4, 8, 32, 8, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, MNPadding, 128, 32, 64, 64, 4, 8, 16, 16, 1, 2, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, MNPadding, 128, 32, 32, 64, 4, 8, 32, 8, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, MNPadding, 128, 4, 32, 64, 4, 8, 1, 32, 1, 1, S<4, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, MNPadding, 64, 64, 64, 32, 4, 8, 32, 8, 2, 4, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, MNPadding, 64, 16, 32, 64, 4, 8, 8, 32, 1, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, MNPadding, 64, 16, 16, 64, 4, 8, 8, 16, 1, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, MNPadding, 64, 2, 32, 64, 4, 8, 1, 32, 1, 1, S<4, 2, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, MNPadding, 32, 32, 16, 64, 4, 8, 16, 16, 2, 1, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, true, S<4, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, MNPadding, 32, 8, 16, 64, 4, 8, 4, 16, 2, 1, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, S<4, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, MNPadding, 32, 1, 32, 64, 4, 8, 1, 32, 1, 1, S<4, 1, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, S<4, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>
>;
// clang-format on
void add_device_gemm_dpp_f16_f16_f16_km_nk_mn_irregular_instances(
std::vector<std::unique_ptr<
DeviceGemm<Col, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
instances)
{
add_device_operation_instances(instances,
device_gemm_dpp_f16_f16_f16_km_nk_mn_irregular_instances{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
......@@ -35,11 +35,21 @@ using device_gemm_dpp_f16_f16_f16_mk_kn_mn_instances = std::tuple<
// ########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
DeviceGemmDpp< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 64, 8, 4, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 64, 8, 4, 32, 8, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 64, 64, 64, 8, 4, 16, 16, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 64, 64, 8, 4, 32, 8, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 32, 64, 64, 8, 4, 16, 16, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 32, 32, 32, 8, 4, 32, 8, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 64, 64, 64, 8, 4, 32, 8, 2, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 32, 32, 32, 32, 8, 4, 32, 8, 1, 4, S<4, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 32, 16, 16, 16, 8, 4, 16, 16, 1, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, 5, 1>
DeviceGemmDpp< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 1, 128, 64, 8, 4, 1, 32, 1, 1, S<4, 1, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 16, 32, 64, 8, 4, 8, 32, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 16, 16, 64, 8, 4, 8, 16, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 4, 32, 64, 8, 4, 2, 32, 1, 1, S<4, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 2, 64, 64, 8, 4, 2, 32, 1, 1, S<4, 2, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 2, 32, 64, 8, 4, 1, 32, 1, 1, S<4, 2, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 1, 64, 64, 8, 4, 1, 32, 1, 1, S<4, 1, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 32, 8, 16, 64, 8, 4, 4, 16, 2, 1, S<4, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 32, 8, 16, 32, 8, 4, 8, 16, 1, 1, S<4, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 32, 4, 32, 32, 8, 4, 4, 32, 1, 1, S<4, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 32, 2, 16, 32, 8, 4, 2, 16, 1, 1, S<4, 2, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, 5, 1>
>;
// clang-format on
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_dpp.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
using F16 = ck::half_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
static constexpr auto MNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding;
// Compilation parameters for a[m, k] * b[k, n] = c[m, n]
// clang-format off
using device_gemm_dpp_f16_f16_f16_mk_kn_mn_irregular_instances = std::tuple<
// ########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MDpp| NDpp| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
// ########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | | Dpp| Dpp| PerWave| PerWave| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar|
// ########| | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector|
// ########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
DeviceGemmDpp< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, MNPadding, 256, 128, 128, 64, 8, 4, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, MNPadding, 256, 64, 64, 64, 8, 4, 16, 16, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, MNPadding, 128, 64, 64, 64, 8, 4, 32, 8, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, MNPadding, 128, 32, 32, 64, 8, 4, 32, 8, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, MNPadding, 128, 4, 32, 64, 8, 4, 1, 32, 1, 1, S<4, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, MNPadding, 64, 64, 64, 32, 8, 4, 32, 8, 2, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, MNPadding, 64, 16, 32, 64, 8, 4, 8, 32, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, MNPadding, 64, 16, 16, 64, 8, 4, 8, 16, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, MNPadding, 64, 2, 32, 64, 8, 4, 1, 32, 1, 1, S<4, 2, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, MNPadding, 32, 32, 16, 64, 8, 4, 16, 16, 2, 1, S<4, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, MNPadding, 32, 8, 16, 64, 8, 4, 4, 16, 2, 1, S<4, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, MNPadding, 32, 1, 32, 64, 8, 4, 1, 32, 1, 1, S<4, 1, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, true, 5, 1>
>;
// clang-format on
void add_device_gemm_dpp_f16_f16_f16_mk_kn_mn_irregular_instances(
std::vector<std::unique_ptr<
DeviceGemm<Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
instances)
{
add_device_operation_instances(instances,
device_gemm_dpp_f16_f16_f16_mk_kn_mn_irregular_instances{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
......@@ -36,11 +36,21 @@ using device_gemm_dpp_f16_f16_f16_mk_nk_mn_instances = std::tuple<
// ########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
DeviceGemmDpp< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 64, 8, 8, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 64, 8, 8, 32, 8, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 64, 64, 64, 8, 8, 16, 16, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 64, 64, 8, 8, 32, 8, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 32, 64, 64, 8, 8, 16, 16, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 32, 32, 32, 8, 8, 32, 8, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 64, 64, 64, 8, 8, 32, 8, 2, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 32, 32, 32, 32, 8, 8, 32, 8, 1, 4, S<4, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 32, 16, 16, 16, 8, 8, 16, 16, 1, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>
DeviceGemmDpp< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 1, 128, 64, 8, 8, 1, 32, 1, 1, S<4, 1, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 16, 32, 64, 8, 8, 8, 32, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 16, 16, 64, 8, 8, 8, 16, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 4, 32, 64, 8, 8, 2, 32, 1, 1, S<4, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 2, 64, 64, 8, 8, 2, 32, 1, 1, S<4, 2, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 2, 32, 64, 8, 8, 1, 32, 1, 1, S<4, 2, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 1, 64, 64, 8, 8, 1, 32, 1, 1, S<4, 1, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 32, 8, 16, 64, 8, 8, 4, 16, 2, 1, S<4, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 32, 8, 16, 32, 8, 8, 8, 16, 1, 1, S<4, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 32, 4, 32, 32, 8, 8, 4, 32, 1, 1, S<4, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 32, 2, 16, 32, 8, 8, 2, 16, 1, 1, S<4, 2, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>
>;
// clang-format on
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_dpp.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
using F16 = ck::half_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
static constexpr auto MNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding;
// Compilation parameters for a[m, k] * b[n, k] = c[m, n]
// clang-format off
using device_gemm_dpp_f16_f16_f16_mk_nk_mn_irregular_instances = std::tuple<
// ########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MDpp| NDpp| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
// ########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | | Dpp| Dpp| PerWave| PerWave| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar|
// ########| | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector|
// ########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
DeviceGemmDpp< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, MNPadding, 256, 128, 128, 64, 8, 8, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, MNPadding, 256, 64, 64, 64, 8, 8, 16, 16, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, MNPadding, 128, 64, 64, 64, 8, 8, 32, 8, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, MNPadding, 128, 32, 32, 64, 8, 8, 32, 8, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, MNPadding, 128, 4, 32, 64, 8, 8, 1, 32, 1, 1, S<4, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, MNPadding, 64, 64, 64, 32, 8, 8, 32, 8, 2, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, MNPadding, 64, 16, 32, 64, 8, 8, 8, 32, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, MNPadding, 64, 16, 16, 64, 8, 8, 8, 16, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, MNPadding, 64, 2, 32, 64, 8, 8, 1, 32, 1, 1, S<4, 2, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, MNPadding, 32, 32, 16, 64, 8, 8, 16, 16, 2, 1, S<4, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, MNPadding, 32, 8, 16, 32, 8, 8, 8, 16, 1, 1, S<4, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, MNPadding, 32, 1, 32, 64, 8, 8, 1, 32, 1, 1, S<4, 1, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>
>;
// clang-format on
void add_device_gemm_dpp_f16_f16_f16_mk_nk_mn_irregular_instances(
std::vector<std::unique_ptr<
DeviceGemm<Row, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
instances)
{
add_device_operation_instances(instances,
device_gemm_dpp_f16_f16_f16_mk_nk_mn_irregular_instances{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment