"...composable_kernel_rocm.git" did not exist on "50b96745c68d17c3c03b4492d23867eb5e859aa7"
Commit b6116d2f authored by Jing Zhang
Browse files

clean

parent abd9c245
......@@ -331,37 +331,10 @@ void device_gemm_xdlops_mk_kn_mn(const Tensor<ABType>& a_m_k,
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 1
constexpr index_t BlockSize = 64;
constexpr index_t MPerBlock = 48;
constexpr index_t NPerBlock = 32;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 16;
constexpr index_t NPerXDL = 16;
constexpr index_t K1 = 8;
constexpr index_t MRepeat = 3;
constexpr index_t NRepeat = 2;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<4, 1, 8>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<1, 48, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 8;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<4, 1, 8>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<1, 32, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_N = 1;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 1
constexpr index_t BlockSize = 64;
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 48;
constexpr index_t NPerBlock = 16;
constexpr index_t MPerBlock = 96;
constexpr index_t NPerBlock = 128;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 16;
......@@ -369,18 +342,18 @@ void device_gemm_xdlops_mk_kn_mn(const Tensor<ABType>& a_m_k,
constexpr index_t K1 = 8;
constexpr index_t MRepeat = 3;
constexpr index_t NRepeat = 1;
constexpr index_t NRepeat = 4;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<4, 1, 8>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<1, 48, 1>;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 3, 8>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 32, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 8;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<4, 1, 8>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<1, 16, 1>;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 8>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_N = 1;
constexpr index_t BBlockTransferSrcScalarPerVector_N = 2;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
constexpr index_t CThreadTransferDstScalarPerVector = 1;
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment