yangql / composable_kernel

Commit b79df771, authored Jul 12, 2022 by carlushuang

Merge remote-tracking branch 'origin/develop' into cpu_avx2

parents 05d38218 63914743

Changes 466
Showing 20 changed files with 0 additions and 6908 deletions (+0 -6908)
library/include/ck/library/obselete_driver_offline/device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp  +0 -228
library/include/ck/library/obselete_driver_offline/device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp  +0 -600
library/include/ck/library/obselete_driver_offline/device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp  +0 -196
library/include/ck/library/obselete_driver_offline/device_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp  +0 -241
library/include/ck/library/obselete_driver_offline/device_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp  +0 -212
library/include/ck/library/obselete_driver_offline/device_gemm_xdlops_km_kn_mn.hpp  +0 -463
library/include/ck/library/obselete_driver_offline/device_gemm_xdlops_km_kn_nm.hpp  +0 -263
library/include/ck/library/obselete_driver_offline/device_gemm_xdlops_km_nk_mn.hpp  +0 -463
library/include/ck/library/obselete_driver_offline/device_gemm_xdlops_km_nk_nm.hpp  +0 -263
library/include/ck/library/obselete_driver_offline/device_gemm_xdlops_mk_kn_mn.hpp  +0 -463
library/include/ck/library/obselete_driver_offline/device_gemm_xdlops_mk_kn_nm.hpp  +0 -291
library/include/ck/library/obselete_driver_offline/device_gemm_xdlops_mk_nk_mn.hpp  +0 -564
library/include/ck/library/obselete_driver_offline/device_gemm_xdlops_mk_nk_nm.hpp  +0 -347
library/include/ck/library/obselete_driver_offline/driver_contraction_dlops_v1r2.hpp  +0 -286
library/include/ck/library/obselete_driver_offline/driver_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp  +0 -429
library/include/ck/library/obselete_driver_offline/driver_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp  +0 -386
library/include/ck/library/obselete_driver_offline/driver_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp  +0 -440
library/include/ck/library/obselete_driver_offline/driver_gemm_dlops_v1r2.hpp  +0 -278
library/include/ck/library/obselete_driver_offline/driver_gemm_dlops_v1r3.hpp  +0 -275
library/include/ck/library/obselete_driver_offline/driver_gemm_xdlops_v2r3.hpp  +0 -220
library/include/ck/library/obselete_driver_offline/device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp
deleted 100644 → 0
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp"
#include "driver_gemm_xdlops_v2r3.hpp"
template <typename TInWei,
          typename TAcc,
          typename TOut,
          typename InLengths,
          typename WeiLengths,
          typename OutLengths,
          typename ConvStrides,
          typename ConvDilations,
          typename InLeftPads,
          typename InRightPads>
void device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw(
    const InLengths& in_n_c_hi_wi_lengths,
    const WeiLengths& wei_k_c_y_x_lengths,
    const OutLengths& out_n_k_ho_wo_lengths,
    const ConvStrides& conv_strides,
    const ConvDilations& conv_dilations,
    const InLeftPads& in_left_pads,
    const InRightPads& in_right_pads,
    const Tensor<TInWei>& in_n_c_hi_wi,
    const Tensor<TInWei>& wei_k_c_y_x,
    Tensor<TOut>& out_n_k_ho_wo,
    ck::index_t nrepeat)
{
    using namespace ck;

    std::cout << __func__ << std::endl;

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};

    DeviceMem in_n_c_hi_wi_device_buf(sizeof(TInWei) * in_n_c_hi_wi.mDesc.GetElementSpace());
    DeviceMem wei_k_c_y_x_device_buf(sizeof(TInWei) * wei_k_c_y_x.mDesc.GetElementSpace());
    DeviceMem out_n_k_ho_wo_device_buf(sizeof(TOut) * out_n_k_ho_wo.mDesc.GetElementSpace());

    in_n_c_hi_wi_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
    wei_k_c_y_x_device_buf.ToDevice(wei_k_c_y_x.mData.data());
    out_n_k_ho_wo_device_buf.ToDevice(out_n_k_ho_wo.mData.data());

    const auto in_n_c_hi_wi_desc  = make_naive_tensor_descriptor_packed(in_n_c_hi_wi_lengths);
    const auto wei_k_c_y_x_desc   = make_naive_tensor_descriptor_packed(wei_k_c_y_x_lengths);
    const auto out_n_k_ho_wo_desc = make_naive_tensor_descriptor_packed(out_n_k_ho_wo_lengths);
#if 0
// [M, N, K0, K1] = [128, 128, 4, 8] for fp16
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 128;
constexpr index_t GemmNPerBlock = 128;
constexpr index_t GemmKPerBlock = 4;
constexpr index_t GemmMPerWave = 32;
constexpr index_t GemmNPerWave = 32;
constexpr index_t GemmK1 = 8;
constexpr index_t MRepeat = 2;
constexpr index_t NRepeat = 2;
using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 2, 8>;
using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 8;
constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 8;
using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 2, 8>;
using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN = 1;
constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8;
constexpr index_t GemmCThreadTransferDstScalarPerVector = 1;
#elif 1
// [M, N, K0, K1] = [256, 128, 4, 8] for fp16
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 256;
constexpr index_t GemmNPerBlock = 128;
constexpr index_t GemmKPerBlock = 4;
constexpr index_t GemmMPerWave = 32;
constexpr index_t GemmNPerWave = 32;
constexpr index_t GemmK1 = 8;
constexpr index_t MRepeat = 4;
constexpr index_t NRepeat = 2;
using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 4, 8>;
using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 8;
constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 8;
using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 2, 8>;
using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN = 1;
constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8;
constexpr index_t GemmCThreadTransferDstScalarPerVector = 1;
#endif
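    // Note: with this configuration the block tile decomposes into
    // (GemmMPerBlock / (MRepeat * GemmMPerWave)) * (GemmNPerBlock / (NRepeat * GemmNPerWave))
    // = (256 / (4 * 32)) * (128 / (2 * 32)) = 2 * 2 = 4 waves; at the 64-lane
    // wavefront size this xdlops path assumes, that is 4 * 64 = 256 threads,
    // matching BlockSize.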
    const auto descs =
        transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw_pad(wei_k_c_y_x_desc,
                                                                          in_n_c_hi_wi_desc,
                                                                          out_n_k_ho_wo_desc,
                                                                          conv_strides,
                                                                          conv_dilations,
                                                                          in_left_pads,
                                                                          in_right_pads,
                                                                          Number<GemmK1>{});

    const auto wei_gemmk0_gemmm_gemmk1_grid_desc = descs[I0];
    const auto in_gemmk0_gemmn_gemmk1_grid_desc  = descs[I1];
    const auto out_gemmm_gemmn_grid_desc         = descs[I2];

    // HACK: hacks that control index calculation when iterating over A, B, C matrix
    constexpr auto wei_gemmk0_gemmm_gemmk1_grid_step_hacks =
        make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},   // 0+: GemmK0
                              Sequence<0, 0, 0, 0, 0>{},   // 1+: GemmM
                              Sequence<0, 0, 0, 0, 0>{}),  // 2+: GemmK1
                   make_tuple(Sequence<0, 0, 0, 0, 0>{},   // 0-: GemmK0
                              Sequence<0, 0, 0, 0, 0>{},   // 1-: GemmM
                              Sequence<0, 0, 0, 0, 0>{})); // 2-: GemmK1

    constexpr auto in_gemmk0_gemmn_gemmk1_grid_step_hacks = make_tuple(
        make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{},   // 0+: GemmK0
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{},   // 1+: GemmN
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}),  // 2+: GemmK1
        make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{},   // 0-: GemmK0
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{},   // 1-: GemmN
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{})); // 2-: GemmK1

    constexpr auto out_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks = make_tuple(
        make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 0+: M0
                   Sequence<0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 1+: N0
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 2+: M1
                   Sequence<0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 3+: N1
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 4+: M2
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 5+: M3
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 6+: M4
                   Sequence<0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0>{}),  // 7+: N2
        make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 0-: M0
                   Sequence<0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 1-: N0
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 2-: M1
                   Sequence<0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 3-: N1
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 4-: M2
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 5-: M3
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 6-: M4
                   Sequence<0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0>{})); // 7-: N2

    constexpr auto wei_gemmk0_gemmm_gemmk1_grid_move_slice_window_step_hacks =
        Sequence<0, 0, 0, 0, 0>{};

    constexpr auto in_gemmk0_gemmn_gemmk1_grid_move_slice_window_step_hacks =
        Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0>{};

    for(index_t i = 0; i < 5; ++i)
    {
        float ave_time = driver_gemm_xdlops_v2r3<
            BlockSize,
            TInWei,
            TAcc,
            TOut,
            InMemoryDataOperationEnum::Set,
            decltype(wei_gemmk0_gemmm_gemmk1_grid_desc),
            decltype(in_gemmk0_gemmn_gemmk1_grid_desc),
            decltype(out_gemmm_gemmn_grid_desc),
            GemmMPerBlock,
            GemmNPerBlock,
            GemmKPerBlock,
            GemmMPerWave,
            GemmNPerWave,
            GemmK1,
            MRepeat,
            NRepeat,
            GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1,
            GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1,
            Sequence<1, 0, 2>,
            Sequence<1, 0, 2>,
            2,
            GemmABlockTransferSrcScalarPerVector_GemmK1,
            GemmABlockTransferDstScalarPerVector_GemmK1,
            false, // don't move back src coordinate after threadwise copy
            GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1,
            GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1,
            Sequence<0, 2, 1>,
            Sequence<1, 0, 2>,
            1,
            GemmBBlockTransferSrcScalarPerVector_GemmN,
            GemmBBlockTransferDstScalarPerVector_GemmK1,
            false, // don't move back src coordinate after threadwise copy
            Sequence<3, 0, 1, 2, 7, 5, 4, 6>,
            7,
            GemmCThreadTransferDstScalarPerVector,
            decltype(wei_gemmk0_gemmm_gemmk1_grid_step_hacks),
            decltype(in_gemmk0_gemmn_gemmk1_grid_step_hacks),
            decltype(out_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks),
            decltype(wei_gemmk0_gemmm_gemmk1_grid_move_slice_window_step_hacks),
            decltype(in_gemmk0_gemmn_gemmk1_grid_move_slice_window_step_hacks),
            false>(static_cast<TInWei*>(wei_k_c_y_x_device_buf.GetDeviceBuffer()),
                   static_cast<TInWei*>(in_n_c_hi_wi_device_buf.GetDeviceBuffer()),
                   static_cast<TOut*>(out_n_k_ho_wo_device_buf.GetDeviceBuffer()),
                   wei_gemmk0_gemmm_gemmk1_grid_desc,
                   in_gemmk0_gemmn_gemmk1_grid_desc,
                   out_gemmm_gemmn_grid_desc,
                   wei_gemmk0_gemmm_gemmk1_grid_step_hacks,
                   in_gemmk0_gemmn_gemmk1_grid_step_hacks,
                   out_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks,
                   wei_gemmk0_gemmm_gemmk1_grid_move_slice_window_step_hacks,
                   in_gemmk0_gemmn_gemmk1_grid_move_slice_window_step_hacks,
                   nrepeat);

        float perf = static_cast<float>(calculate_convolution_flops(
                         in_n_c_hi_wi_desc, wei_k_c_y_x_desc, out_n_k_ho_wo_desc)) /
                     (std::size_t(1000) * 1000 * 1000) / ave_time;

        std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" << std::endl;
    }

    // copy result back to host
    out_n_k_ho_wo_device_buf.FromDevice(out_n_k_ho_wo.mData.data());
}
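For reference, a minimal sketch of how a deleted driver like the one above was invoked from the offline host tools. The tensor construction, shapes, and type choices below are illustrative assumptions, not code from this commit:

// Hypothetical call site (shapes and HostTensorDescriptor usage are assumptions).
// NCHW input 128x192x28x28, 3x3 filters, stride 1, dilation 1, pad 1 keeps 28x28 output.
Tensor<ck::half_t> in(HostTensorDescriptor(std::vector<std::size_t>{128, 192, 28, 28}));
Tensor<ck::half_t> wei(HostTensorDescriptor(std::vector<std::size_t>{256, 192, 3, 3}));
Tensor<ck::half_t> out(HostTensorDescriptor(std::vector<std::size_t>{128, 256, 28, 28}));

device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw<ck::half_t, float, ck::half_t>(
    ck::make_tuple(128, 192, 28, 28), // in_n_c_hi_wi_lengths
    ck::make_tuple(256, 192, 3, 3),   // wei_k_c_y_x_lengths
    ck::make_tuple(128, 256, 28, 28), // out_n_k_ho_wo_lengths
    ck::make_tuple(1, 1),             // conv_strides
    ck::make_tuple(1, 1),             // conv_dilations
    ck::make_tuple(1, 1),             // in_left_pads
    ck::make_tuple(1, 1),             // in_right_pads
    in,
    wei,
    out,
    /*nrepeat=*/100);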
library/include/ck/library/obselete_driver_offline/device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp
deleted 100644 → 0
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp"
#include "driver_gemm_xdlops_v2r3.hpp"
#if 0
__host__ __device__ static constexpr auto
MakePaddedGridDescriptors(const AGridDesc_K0Raw_MRaw_K1& a_grid_desc_k0raw_mraw_k1,
const BGridDesc_K0Raw_NRaw_K1& b_grid_desc_k0raw_nraw_k1,
const CGridDesc_MRaw_NRaw& c_grid_desc_mraw_nraw)
{
const auto K0Raw = a_grid_desc_k0raw_mraw_k1.GetLength(I0);
const auto K1 = a_grid_desc_k0raw_mraw_k1.GetLength(I2);
const auto MRaw = c_grid_desc_mraw_nraw.GetLength(I0);
const auto NRaw = c_grid_desc_mraw_nraw.GetLength(I1);
const auto K0Pad = math::integer_least_multiple(K0Raw, K0PerBlock) - K0Raw;
const auto MPad = math::integer_least_multiple(MRaw, MPerBlock) - MRaw;
const auto NPad = math::integer_least_multiple(NRaw, NPerBlock) - NRaw;
// A
const auto a_grid_desc_k0_m_k1 = [&]() {
if constexpr(DoPad_K0 && DoPad_M)
{
return transform_tensor_descriptor(
a_grid_desc_k0raw_mraw_k1,
make_tuple(make_right_pad_transform(K0Raw, K0Pad),
make_right_pad_transform(MRaw, MPad),
make_pass_through_transform(K1)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
}
else if constexpr(DoPad_K0 && !DoPad_M)
{
return transform_tensor_descriptor(
a_grid_desc_k0raw_mraw_k1,
make_tuple(make_right_pad_transform(K0Raw, K0Pad),
make_pass_through_transform(MRaw),
make_pass_through_transform(K1)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
}
else if constexpr(!DoPad_K0 && DoPad_M)
{
return transform_tensor_descriptor(
a_grid_desc_k0raw_mraw_k1,
make_tuple(make_pass_through_transform(K0Raw),
make_right_pad_transform(MRaw, MPad),
make_pass_through_transform(K1)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
}
else
{
return a_grid_desc_k0raw_mraw_k1;
}
}();
// B
const auto b_grid_desc_k0_n_k1 = [&]() {
if constexpr(DoPad_K0 && DoPad_N)
{
return transform_tensor_descriptor(
b_grid_desc_k0raw_nraw_k1,
make_tuple(make_right_pad_transform(K0Raw, K0Pad),
make_right_pad_transform(NRaw, NPad),
make_pass_through_transform(K1)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
}
else if constexpr(DoPad_K0 && !DoPad_N)
{
return transform_tensor_descriptor(
b_grid_desc_k0raw_nraw_k1,
make_tuple(make_right_pad_transform(K0Raw, K0Pad),
make_pass_through_transform(NRaw),
make_pass_through_transform(K1)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
}
else if constexpr(!DoPad_K0 && DoPad_N)
{
return transform_tensor_descriptor(
b_grid_desc_k0raw_nraw_k1,
make_tuple(make_pass_through_transform(K0Raw),
make_right_pad_transform(NRaw, NPad),
make_pass_through_transform(K1)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
}
else
{
return b_grid_desc_k0raw_nraw_k1;
}
}();
// C
const auto c_grid_desc_m_n = [&]() {
if constexpr(DoPad_M && DoPad_N)
{
return transform_tensor_descriptor(c_grid_desc_mraw_nraw,
make_tuple(make_right_pad_transform(MRaw, MPad),
make_right_pad_transform(NRaw, NPad)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
}
else if constexpr(DoPad_M && !DoPad_N)
{
return transform_tensor_descriptor(
c_grid_desc_mraw_nraw,
make_tuple(make_right_pad_transform(MRaw, MPad), make_pass_through_transform(NRaw)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
}
else if constexpr(!DoPad_M && DoPad_N)
{
return transform_tensor_descriptor(
c_grid_desc_mraw_nraw,
make_tuple(make_pass_through_transform(MRaw), make_right_pad_transform(NRaw, NPad)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
}
else
{
return c_grid_desc_mraw_nraw;
}
}();
}
#endif
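The disabled helper above rounds each raw GEMM dimension up to the next multiple of its block tile before applying a right-pad transform. A standalone sketch of that padding arithmetic, with a stand-in for the assumed round-up behavior of math::integer_least_multiple:

// Stand-in for math::integer_least_multiple (assumed round-up semantics).
constexpr int integer_least_multiple_sketch(int x, int multiple)
{
    return ((x + multiple - 1) / multiple) * multiple;
}

// E.g. MRaw = 1000 with MPerBlock = 256 rounds up to 1024, so the right-pad
// transform extends the M axis by MPad = 1024 - 1000 = 24 rows.
static_assert(integer_least_multiple_sketch(1000, 256) - 1000 == 24, "MPad check");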
template <typename TInWei,
          typename TAcc,
          typename TOut,
          typename InLengths,
          typename WeiLengths,
          typename OutLengths,
          typename ConvStrides,
          typename ConvDilations,
          typename InLeftPads,
          typename InRightPads>
void device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk(
    const InLengths& in_n_hi_wi_c_lengths,
    const WeiLengths& wei_k_y_x_c_lengths,
    const OutLengths& out_n_ho_wo_k_lengths,
    const ConvStrides& conv_strides,
    const ConvDilations& conv_dilations,
    const InLeftPads& in_left_pads,
    const InRightPads& in_right_pads,
    const Tensor<TInWei>& in_n_hi_wi_c,
    const Tensor<TInWei>& wei_k_y_x_c,
    Tensor<TOut>& out_n_ho_wo_k,
    ck::index_t nrepeat)
{
    using namespace ck;

    std::cout << __func__ << std::endl;

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    DeviceMem in_n_hi_wi_c_device_buf(sizeof(TInWei) * in_n_hi_wi_c.mDesc.GetElementSpace());
    DeviceMem wei_k_y_x_c_device_buf(sizeof(TInWei) * wei_k_y_x_c.mDesc.GetElementSpace());
    DeviceMem out_n_ho_wo_k_device_buf(sizeof(TOut) * out_n_ho_wo_k.mDesc.GetElementSpace());

    in_n_hi_wi_c_device_buf.ToDevice(in_n_hi_wi_c.mData.data());
    wei_k_y_x_c_device_buf.ToDevice(wei_k_y_x_c.mData.data());
    out_n_ho_wo_k_device_buf.ToDevice(out_n_ho_wo_k.mData.data());

    const auto in_n_hi_wi_c_desc  = make_naive_tensor_descriptor_packed(in_n_hi_wi_c_lengths);
    const auto wei_k_y_x_c_desc   = make_naive_tensor_descriptor_packed(wei_k_y_x_c_lengths);
    const auto out_n_ho_wo_k_desc = make_naive_tensor_descriptor_packed(out_n_ho_wo_k_lengths);
#if 0
// [M, N, K0, K1] = [256, 128, 4, 4], C = 128, for fp32
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 256;
constexpr index_t GemmNPerBlock = 128;
constexpr index_t GemmKPerBlock = 4;
constexpr index_t GemmMPerXDL = 32;
constexpr index_t GemmNPerXDL = 32;
constexpr index_t GemmK1 = 4;
constexpr index_t MRepeat = 4;
constexpr index_t NRepeat = 2;
using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 4, 4>;
using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 4;
constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 4;
using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 2, 4>;
using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK1 = 4;
constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 4;
constexpr index_t GemmCThreadTransferDstScalarPerVector = 1;
#elif 0
// [M, N, K0, K1] = [128, 128, 4, 4], C = 128, for fp32
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 128;
constexpr index_t GemmNPerBlock = 128;
constexpr index_t GemmKPerBlock = 4;
constexpr index_t GemmMPerXDL = 32;
constexpr index_t GemmNPerXDL = 32;
constexpr index_t GemmK1 = 4;
constexpr index_t MRepeat = 2;
constexpr index_t NRepeat = 2;
using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 2, 4>;
using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 4;
constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 4;
using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 2, 4>;
using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK1 = 4;
constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 4;
constexpr index_t GemmCThreadTransferDstScalarPerVector = 1;
#elif 0
// [M, N, K0, K1] = [256, 256, 4, 8], C = 256, for fp16
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 256;
constexpr index_t GemmNPerBlock = 256;
constexpr index_t GemmKPerBlock = 4;
constexpr index_t GemmMPerXDL = 32;
constexpr index_t GemmNPerXDL = 32;
constexpr index_t GemmK1 = 8;
constexpr index_t MRepeat = 4;
constexpr index_t NRepeat = 4;
using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 4, 8>;
using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 8;
constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 8;
using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 4, 8>;
using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK1 = 8;
constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8;
constexpr index_t GemmCThreadTransferDstScalarPerVector = 1;
#elif 0
// [M, N, K0, K1] = [256, 128, 4, 8], C = 128, for fp16
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 256;
constexpr index_t GemmNPerBlock = 128;
constexpr index_t GemmKPerBlock = 4;
constexpr index_t GemmMPerXDL = 32;
constexpr index_t GemmNPerXDL = 32;
constexpr index_t GemmK1 = 8;
constexpr index_t MRepeat = 4;
constexpr index_t NRepeat = 2;
using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 4, 8>;
using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 8;
constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 8;
using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 2, 8>;
using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK1 = 8;
constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8;
constexpr index_t GemmCThreadTransferDstScalarPerVector = 1;
#elif 1
// [M, N, K0, K1] = [128, 256, 4, 8], C = 128, for fp16
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 128;
constexpr index_t GemmNPerBlock = 256;
constexpr index_t GemmKPerBlock = 4;
constexpr index_t GemmMPerXDL = 32;
constexpr index_t GemmNPerXDL = 32;
constexpr index_t GemmK1 = 8;
constexpr index_t MRepeat = 2;
constexpr index_t NRepeat = 4;
using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 2, 8>;
using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 8;
constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 8;
using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 4, 8>;
using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK1 = 8;
constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8;
constexpr index_t GemmCThreadTransferDstScalarPerVector = 1;
#elif 0
// [M, N, K0, K1] = [128, 128, 4, 8], C = 64, for fp16
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 128;
constexpr index_t GemmNPerBlock = 128;
constexpr index_t GemmKPerBlock = 4;
constexpr index_t GemmMPerXDL = 32;
constexpr index_t GemmNPerXDL = 32;
constexpr index_t GemmK1 = 8;
constexpr index_t MRepeat = 2;
constexpr index_t NRepeat = 2;
using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 2, 8>;
using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 8;
constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 8;
using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 2, 8>;
using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK1 = 8;
constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8;
constexpr index_t GemmCThreadTransferDstScalarPerVector = 1;
#elif 0
// [M, N, K0, K1] = [128, 64, 4, 8], C = 64, for fp16
constexpr index_t BlockSize = 128;
constexpr index_t GemmMPerBlock = 128;
constexpr index_t GemmNPerBlock = 64;
constexpr index_t GemmKPerBlock = 4;
constexpr index_t GemmMPerXDL = 32;
constexpr index_t GemmNPerXDL = 32;
constexpr index_t GemmK1 = 8;
constexpr index_t MRepeat = 2;
constexpr index_t NRepeat = 2;
using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 4, 8>;
using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 32, 1>;
constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 8;
constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 8;
using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 2, 8>;
using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 32, 1>;
constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK1 = 8;
constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8;
constexpr index_t GemmCThreadTransferDstScalarPerVector = 1;
#elif 1
// [M, N, K0, K1] = [128, 64, 4, 8], C = 32, for fp16
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 128;
constexpr index_t GemmNPerBlock = 64;
constexpr index_t GemmKPerBlock = 4;
constexpr index_t GemmMPerXDL = 32;
constexpr index_t GemmNPerXDL = 32;
constexpr index_t GemmK1 = 8;
constexpr index_t MRepeat = 2;
constexpr index_t NRepeat = 1;
using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 2, 8>;
using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 8;
constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 8;
using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 1, 8>;
using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK1 = 8;
constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8;
constexpr index_t GemmCThreadTransferDstScalarPerVector = 1;
#endif
    const auto descs =
        transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk(in_n_hi_wi_c_desc,
                                                                      wei_k_y_x_c_desc,
                                                                      out_n_ho_wo_k_desc,
                                                                      conv_strides,
                                                                      conv_dilations,
                                                                      in_left_pads,
                                                                      in_right_pads,
                                                                      Number<GemmK1>{});
#if 0 // debug
const auto in_gemmk0_gemmm_gemmk1_grid_desc = descs[I0];
// HACK: hacks that control index calculation when iterating over A matrix
constexpr auto in_gemmk0_gemmm_gemmk1_grid_step_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}, // 0+: GemmK0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{}, // 1+: GemmM
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}), // 2+: GemmK1
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}, // 0-: GemmK0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{}, // 1-: GemmM
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{})); // 2-: GemmK1
constexpr auto in_gemmk0_gemmm_gemmk1_grid_move_slice_window_step_hacks =
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0>{};
#else
    const auto in_gemmk0_gemmmraw_gemmk1_grid_desc = descs[I0];

    const auto GemmK0   = in_gemmk0_gemmmraw_gemmk1_grid_desc.GetLength(I0);
    const auto GemmMRaw = in_gemmk0_gemmmraw_gemmk1_grid_desc.GetLength(I1);

    const auto GemmMPad = math::integer_least_multiple(GemmMRaw, GemmMPerBlock) - GemmMRaw;

    const auto in_gemmk0_gemmm_gemmk1_grid_desc =
        transform_tensor_descriptor(in_gemmk0_gemmmraw_gemmk1_grid_desc,
                                    make_tuple(make_pass_through_transform(GemmK0),
                                               make_right_pad_transform(GemmMRaw, GemmMPad),
                                               make_pass_through_transform(GemmK1)),
                                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
                                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));

    // HACK: hacks that control index calculation when iterating over A matrix
    constexpr auto in_gemmk0_gemmm_gemmk1_grid_step_hacks = make_tuple(
        make_tuple(
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0>{},   // 0+: GemmK0
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0>{},   // 1+: GemmM
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0>{}),  // 2+: GemmK1
        make_tuple(
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0>{},   // 0-: GemmK0
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0>{},   // 1-: GemmM
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0>{})); // 2-: GemmK1

    constexpr auto in_gemmk0_gemmm_gemmk1_grid_move_slice_window_step_hacks =
        Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0>{};
#endif
    const auto wei_gemmk0_gemmn_gemmk1_grid_desc = descs[I1];

    const auto wei_gemmk0_gemmn_gemmk1_grid_step_hacks =
        make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},   // 0+: GemmK0
                              Sequence<0, 0, 0, 0, 0>{},   // 1+: GemmN
                              Sequence<0, 0, 0, 0, 0>{}),  // 2+: GemmK1
                   make_tuple(Sequence<0, 0, 0, 0, 0>{},   // 0-: GemmK0
                              Sequence<0, 0, 0, 0, 0>{},   // 1-: GemmN
                              Sequence<0, 0, 0, 0, 0>{})); // 2-: GemmK1

    constexpr auto wei_gemmk0_gemmn_gemmk1_grid_move_slice_window_step_hacks =
        Sequence<0, 0, 0, 0, 0>{};
#if 0
const auto out_gemmm_gemmn_grid_desc = descs[I2];
constexpr auto out_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 0+: M0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 1+: N0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 2+: M1
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 3+: N1
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 4+: M2
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 5+: M3
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 6+: M4
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}), // 7+: N2
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 0-: M0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 1-: N0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 2-: M1
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 3-: N1
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 4-: M2
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 5-: M3
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 6-: M4
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{})); // 7-: N2
#else
    const auto out_gemmmraw_gemmn_grid_desc = descs[I2];

    const auto GemmN = out_gemmmraw_gemmn_grid_desc.GetLength(I1);

    const auto out_gemmm_gemmn_grid_desc =
        transform_tensor_descriptor(out_gemmmraw_gemmn_grid_desc,
                                    make_tuple(make_right_pad_transform(GemmMRaw, GemmMPad),
                                               make_pass_through_transform(GemmN)),
                                    make_tuple(Sequence<0>{}, Sequence<1>{}),
                                    make_tuple(Sequence<0>{}, Sequence<1>{}));

    constexpr auto out_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks = make_tuple(
        make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 0+: M0
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 1+: N0
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 2+: M1
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 3+: N1
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 4+: M2
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 5+: M3
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 6+: M4
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}),  // 7+: N2
        make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 0-: M0
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 1-: N0
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 2-: M1
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 3-: N1
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 4-: M2
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 5-: M3
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 6-: M4
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{})); // 7-: N2
#endif
    for(index_t i = 0; i < 5; ++i)
    {
        float ave_time = driver_gemm_xdlops_v2r3<
            BlockSize,
            TInWei,
            TAcc,
            TOut,
            InMemoryDataOperationEnum::Set,
            decltype(in_gemmk0_gemmm_gemmk1_grid_desc),
            decltype(wei_gemmk0_gemmn_gemmk1_grid_desc),
            decltype(out_gemmm_gemmn_grid_desc),
            GemmMPerBlock,
            GemmNPerBlock,
            GemmKPerBlock,
            GemmMPerXDL,
            GemmNPerXDL,
            GemmK1,
            MRepeat,
            NRepeat,
            GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1,
            GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1,
            Sequence<1, 0, 2>,
            Sequence<1, 0, 2>,
            2,
            GemmABlockTransferSrcScalarPerVector_GemmK1,
            GemmABlockTransferDstScalarPerVector_GemmK1,
            false, // don't move back src coordinate after threadwise copy
            GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1,
            GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1,
            Sequence<1, 0, 2>,
            Sequence<1, 0, 2>,
            2,
            GemmBBlockTransferSrcScalarPerVector_GemmK1,
            GemmBBlockTransferDstScalarPerVector_GemmK1,
            false, // don't move back src coordinate after threadwise copy
            Sequence<2, 3, 0, 1, 7, 5, 4, 6>,
            7,
            GemmCThreadTransferDstScalarPerVector,
            decltype(in_gemmk0_gemmm_gemmk1_grid_step_hacks),
            decltype(wei_gemmk0_gemmn_gemmk1_grid_step_hacks),
            decltype(out_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks),
            decltype(in_gemmk0_gemmm_gemmk1_grid_move_slice_window_step_hacks),
            decltype(wei_gemmk0_gemmn_gemmk1_grid_move_slice_window_step_hacks),
            false, // CAccessOrderMRepeatNRepeat
            true,  // ABlockLdsExtraM
            true   // BBlockLdsExtraN
            >(static_cast<TInWei*>(in_n_hi_wi_c_device_buf.GetDeviceBuffer()),
              static_cast<TInWei*>(wei_k_y_x_c_device_buf.GetDeviceBuffer()),
              static_cast<TOut*>(out_n_ho_wo_k_device_buf.GetDeviceBuffer()),
              in_gemmk0_gemmm_gemmk1_grid_desc,
              wei_gemmk0_gemmn_gemmk1_grid_desc,
              out_gemmm_gemmn_grid_desc,
              debug::debug_driver_gemm_xdlops_v2r3::M01,
              debug::debug_driver_gemm_xdlops_v2r3::N01,
              in_gemmk0_gemmm_gemmk1_grid_step_hacks,
              wei_gemmk0_gemmn_gemmk1_grid_step_hacks,
              out_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks,
              in_gemmk0_gemmm_gemmk1_grid_move_slice_window_step_hacks,
              wei_gemmk0_gemmn_gemmk1_grid_move_slice_window_step_hacks,
              nrepeat);

        {
            const auto N = out_n_ho_wo_k_lengths[I0];
            const auto K = out_n_ho_wo_k_lengths[I3];
            const auto C = wei_k_y_x_c_lengths[I3];

            const auto Ho = out_n_ho_wo_k_lengths[I1];
            const auto Wo = out_n_ho_wo_k_lengths[I2];

            const auto Y = wei_k_y_x_c_lengths[I1];
            const auto X = wei_k_y_x_c_lengths[I2];

            float perf = static_cast<float>((std::size_t(2) * N * K * Ho * Wo * C * Y * X)) /
                         (std::size_t(1000) * 1000 * 1000) / ave_time;

            std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
                      << std::endl;
        }
    }

    // copy result back to host
    out_n_ho_wo_k_device_buf.FromDevice(out_n_ho_wo_k.mData.data());
}
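A quick check of the TFlop/s arithmetic used in the timing loops above: forward convolution does one multiply and one add per MAC, hence the factor of 2, and since ave_time is in milliseconds, dividing GFLOP by ms yields TFlop/s directly. A self-contained sketch with illustrative shapes (the numbers are assumptions, not measurements from this commit):

#include <cstddef>
#include <cstdio>

int main()
{
    // Illustrative shapes and timing (not from this commit).
    const std::size_t N = 128, K = 256, Ho = 28, Wo = 28, C = 256, Y = 3, X = 3;
    const float ave_time_ms = 1.25f; // hypothetical measured average

    // flops / 1e9 [GFLOP] divided by milliseconds gives TFlop/s.
    const float perf = static_cast<float>(std::size_t(2) * N * K * Ho * Wo * C * Y * X) /
                       (std::size_t(1000) * 1000 * 1000) / ave_time_ms;

    std::printf("%.1f TFlop/s\n", perf); // ~94.7 for these numbers
    return 0;
}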
library/include/ck/library/obselete_driver_offline/device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp
deleted 100644 → 0
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "driver_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp"
template <typename TInWei,
          typename TAcc,
          typename TOut,
          ck::ActivTypeEnum activ_type,
          typename InLengths,
          typename WeiLengths,
          typename OutLengths,
          typename ConvStrides,
          typename ConvDilations,
          typename InLeftPads,
          typename InRightPads>
void device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1(
    const InLengths& in_n_c0_hi_wi_c1_lengths,
    const WeiLengths& wei_k_c0_y_x_c1_lengths,
    const OutLengths& out_n_k0_ho_wo_k1_lengths,
    const ConvStrides& conv_strides,
    const ConvDilations& conv_dilations,
    const InLeftPads& in_left_pads,
    const InRightPads& in_right_pads,
    const Tensor<TInWei>& in_n_c0_hi_wi_c1,
    const Tensor<TInWei>& wei_k_c0_y_x_c1,
    const Tensor<TOut>& bias_k0_k1,
    Tensor<TOut>& out_n_k0_ho_wo_k1,
    ck::index_t nrepeat)
{
    using namespace ck;

    std::cout << __func__ << std::endl;

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};
    constexpr auto I4 = Number<4>{};

    const auto N  = out_n_k0_ho_wo_k1_lengths[I0];
    const auto K0 = out_n_k0_ho_wo_k1_lengths[I1];
    const auto Ho = out_n_k0_ho_wo_k1_lengths[I2];
    const auto Wo = out_n_k0_ho_wo_k1_lengths[I3];
    const auto K1 = out_n_k0_ho_wo_k1_lengths[I4];

    const auto C0 = in_n_c0_hi_wi_c1_lengths[I1];
    const auto Hi = in_n_c0_hi_wi_c1_lengths[I2];
    const auto Wi = in_n_c0_hi_wi_c1_lengths[I3];
    const auto C1 = in_n_c0_hi_wi_c1_lengths[I4];

    const auto K = wei_k_c0_y_x_c1_lengths[I0];
    const auto Y = wei_k_c0_y_x_c1_lengths[I2];
    const auto X = wei_k_c0_y_x_c1_lengths[I3];

    DeviceMem in_n_c0_hi_wi_c1_device_buf(sizeof(TInWei) *
                                          in_n_c0_hi_wi_c1.mDesc.GetElementSpace());
    DeviceMem wei_k_c0_y_x_c1_device_buf(sizeof(TInWei) *
                                         wei_k_c0_y_x_c1.mDesc.GetElementSpace());
    DeviceMem bias_k0_k1_device_buf(sizeof(TOut) * bias_k0_k1.mDesc.GetElementSpace());
    DeviceMem out_n_k0_ho_wo_k1_device_buf(sizeof(TOut) *
                                           out_n_k0_ho_wo_k1.mDesc.GetElementSpace());

    in_n_c0_hi_wi_c1_device_buf.ToDevice(in_n_c0_hi_wi_c1.mData.data());
    wei_k_c0_y_x_c1_device_buf.ToDevice(wei_k_c0_y_x_c1.mData.data());
    bias_k0_k1_device_buf.ToDevice(bias_k0_k1.mData.data());

    constexpr index_t InWeiVectorSize = 8;

    if(C1 % InWeiVectorSize != 0)
    {
        throw std::runtime_error("wrong! C1 cannot be divided by InWeiVectorSize");
    }
#if 0
constexpr index_t BlockSize = 256;
constexpr index_t KPerBlock = 32;
constexpr index_t HoPerBlock = 8;
constexpr index_t WoPerBlock = 64;
constexpr index_t E1 = C0 * 9;
constexpr index_t E2 = 1;
constexpr index_t E1PerBlock = C0;
constexpr index_t KPerThread = 16;
constexpr index_t HoPerThread = 2;
constexpr index_t WoPerThread = 2;
constexpr index_t EPerThread = 1;
using ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2 = Sequence<1, 9, 1, E2>;
using ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2 = Sequence<1, E1PerBlock, KPerBlock, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_E2 = E2;
constexpr index_t ABlockTransferDstScalarPerVector_E2 = E2;
constexpr index_t BThreadTransferSrcScalarPerVector_E2 = E2;
constexpr index_t CThreadTransferDstScalarPerVector_K = K1;
#elif 1
constexpr index_t BlockSize = 64;
constexpr index_t KPerBlock = 8;
constexpr index_t HoPerBlock = 8;
constexpr index_t WoPerBlock = 32;
constexpr index_t E1 = 2 * 9;
constexpr index_t E2 = 1;
constexpr index_t K2 = 2;
constexpr index_t E1PerBlock = 2;
constexpr index_t KPerThread = KPerBlock;
constexpr index_t HoPerThread = 2;
constexpr index_t WoPerThread = 2;
constexpr index_t EPerThread = 1;
using ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2 = Sequence<1, 9, 1, 1, E2>;
using ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2 = Sequence<1, E1PerBlock, 1, KPerBlock, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_E2 = E2;
constexpr index_t ABlockTransferDstScalarPerVector_E2 = E2;
constexpr index_t BThreadTransferSrcScalarPerVector_E2 = E2;
constexpr index_t CThreadTransferDstScalarPerVector_K = InWeiVectorSize;
#endif
    if(KPerThread % InWeiVectorSize != 0)
    {
        throw std::runtime_error("wrong! KPerThread cannot be divided by InWeiVectorSize");
    }

    const auto in_n_c0_hi_wi_c1_desc =
        make_naive_tensor_descriptor_packed(make_tuple(N, C0, Hi, Wi, E2));
    const auto wei_k_c0_y_x_c1_desc =
        make_naive_tensor_descriptor_packed(make_tuple(K, C0, Y, X, E2));
    const auto out_n_k0_ho_wo_k1_desc =
        make_naive_tensor_descriptor_packed(make_tuple(N, K0, Ho, Wo, K1));

    constexpr auto conv_driver =
        DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_outpad<
            BlockSize,
            typename vector_type<TInWei, InWeiVectorSize>::type,
            TAcc,
            TOut,
            E1,
            E2,
            K2,
            KPerBlock,
            HoPerBlock,
            WoPerBlock,
            E1PerBlock,
            KPerThread,
            HoPerThread,
            WoPerThread,
            EPerThread,
            ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2,
            ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2,
            ABlockTransferSrcScalarPerVector_E2,
            ABlockTransferDstScalarPerVector_E2,
            BThreadTransferSrcScalarPerVector_E2,
            CThreadTransferDstScalarPerVector_K,
            activ_type>{};

    std::cerr << "conv_bias_activ_input_"
              << "n" << N << "c" << C0 << "h" << Hi << "w" << Wi << "c" << C1 << "_filter_k" << K
              << "c" << C0 << "y" << Y << "x" << X << "c" << C1 << "_convout_n" << N << "k" << K0
              << "h" << Ho << "w" << Wo << "k" << K1 << std::endl;

    for(int i = 0; i < 5; i++)
    {
        const auto ave_time =
            conv_driver.Run(wei_k_c0_y_x_c1_desc,
                            in_n_c0_hi_wi_c1_desc,
                            out_n_k0_ho_wo_k1_desc,
                            conv_strides,
                            conv_dilations,
                            in_left_pads,
                            in_right_pads,
                            static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
                                wei_k_c0_y_x_c1_device_buf.GetDeviceBuffer()),
                            static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
                                in_n_c0_hi_wi_c1_device_buf.GetDeviceBuffer()),
                            static_cast<TOut*>(bias_k0_k1_device_buf.GetDeviceBuffer()),
                            static_cast<TOut*>(out_n_k0_ho_wo_k1_device_buf.GetDeviceBuffer()),
                            nrepeat);

        {
            float perf = static_cast<float>(std::size_t(2) * N * K * Ho * Wo * C0 * C1 * Y * X) /
                         (std::size_t(1000) * 1000 * 1000) / ave_time;

            std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
                      << std::endl;
        }
    }

    out_n_k0_ho_wo_k1_device_buf.FromDevice(out_n_k0_ho_wo_k1.mData.data());
}
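The NC0HWC1 layout used by this dlops driver splits the channel dimension as C = C0 * C1 and keeps C1 innermost, so each pixel contributes C1 contiguous scalars (which is why the driver requires C1 to be divisible by InWeiVectorSize). A sketch of the corresponding offset computation, assuming a packed layout (an illustrative helper, not from this commit):

#include <cstddef>

// Offset of NCHW coordinate (n, c, h, w) inside a packed NC0HWC1 buffer,
// assuming C = C0 * C1.
std::size_t offset_nc0hwc1(std::size_t n, std::size_t c, std::size_t h, std::size_t w,
                           std::size_t C0, std::size_t Hi, std::size_t Wi, std::size_t C1)
{
    const std::size_t c0 = c / C1; // coarse channel block
    const std::size_t c1 = c % C1; // position within the vectorized tail
    return (((n * C0 + c0) * Hi + h) * Wi + w) * C1 + c1;
}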
library/include/ck/library/obselete_driver_offline/device_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp
deleted 100644 → 0
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "transform_forward_convolution_into_gemm_v6r1_nchw_kcyx_nkhw.hpp"
#include "driver_contraction_dlops_v1r2.hpp"
template <typename TInWei,
          typename TAcc,
          typename TOut,
          typename InLengths,
          typename WeiLengths,
          typename OutLengths,
          typename ConvStrides,
          typename ConvDilations,
          typename InLeftPads,
          typename InRightPads>
void device_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw(
    const InLengths& in_n_c_hi_wi_lengths,
    const WeiLengths& wei_k_c_y_x_lengths,
    const OutLengths& out_n_k_ho_wo_lengths,
    const ConvStrides& conv_strides,
    const ConvDilations& conv_dilations,
    const InLeftPads& in_left_pads,
    const InRightPads& in_right_pads,
    const Tensor<TInWei>& in_n_c_hi_wi,
    const Tensor<TInWei>& wei_k_c_y_x,
    Tensor<TOut>& out_n_k_ho_wo,
    ck::index_t nrepeat)
{
    using namespace ck;

    std::cout << __func__ << std::endl;

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};

    DeviceMem in_n_c_hi_wi_device_buf(sizeof(TInWei) * in_n_c_hi_wi.mDesc.GetElementSpace());
    DeviceMem wei_k_c_y_x_device_buf(sizeof(TInWei) * wei_k_c_y_x.mDesc.GetElementSpace());
    DeviceMem out_n_k_ho_wo_device_buf(sizeof(TOut) * out_n_k_ho_wo.mDesc.GetElementSpace());

    in_n_c_hi_wi_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
    wei_k_c_y_x_device_buf.ToDevice(wei_k_c_y_x.mData.data());
    out_n_k_ho_wo_device_buf.ToDevice(out_n_k_ho_wo.mData.data());

    const auto in_desc_n_c_hi_wi  = make_naive_tensor_descriptor_packed(in_n_c_hi_wi_lengths);
    const auto wei_desc_k_c_y_x   = make_naive_tensor_descriptor_packed(wei_k_c_y_x_lengths);
    const auto out_desc_n_k_ho_wo = make_naive_tensor_descriptor_packed(out_n_k_ho_wo_lengths);
#if 1
// [8, 1, 128, 1] * [8, 4, 32, 1] = [1, 128, 4, 32] for fp32
// cdata = 64, BlockSize = 256
constexpr index_t BlockSize = 256;
constexpr index_t GN0 = 4;
constexpr index_t GK1 = 1;
constexpr index_t GM1PerBlockGM11 = 128;
constexpr index_t GN1PerBlockGN11 = 32;
constexpr index_t GK0PerBlock = 8;
constexpr index_t BM1PerThreadBM11 = 4;
constexpr index_t BN1PerThreadBN11 = 4;
constexpr index_t BK0PerThread = 1;
using BM10BN10ThreadClusterBM10Xs = Sequence<8, 2>;
using BM10BN10ThreadClusterBN10Xs = Sequence<8, 2>;
using ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1 = Sequence<4, 1, 1, 1, 1>;
using ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1 = Sequence<2, 1, 1, 128, 1>;
using ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 = Sequence<4, 1, 1, 1, 1>;
using ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 = Sequence<1, 1, 1, 1, 1>;
using BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1 = Sequence<1, 4, 1, 1, 1>;
using BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1 = Sequence<8, 1, 1, 32, 1>;
using BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 = Sequence<1, 1, 1, 1, 1>;
using BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 = Sequence<1, 1, 1, 1, 1>;
constexpr index_t CThreadTransferDstScalarPerVector_BN1 = 1;
#elif 1
// [8, 1, 128, 2] * [8, 4, 32, 2] = [1, 128, 4, 32] for fp16
// cdata = 64, BlockSize = 256
constexpr index_t BlockSize = 256;
constexpr index_t GN0 = 4;
constexpr index_t GK1 = 2;
constexpr index_t GM1PerBlockGM11 = 128;
constexpr index_t GN1PerBlockGN11 = 32;
constexpr index_t GK0PerBlock = 8;
constexpr index_t BM1PerThreadBM11 = 4;
constexpr index_t BN1PerThreadBN11 = 4;
constexpr index_t BK0PerThread = 1;
using BM10BN10ThreadClusterBM10Xs = Sequence<8, 2>;
using BM10BN10ThreadClusterBN10Xs = Sequence<8, 2>;
using ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1 = Sequence<4, 1, 1, 1, 2>;
using ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1 = Sequence<2, 1, 1, 128, 1>;
using ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 = Sequence<4, 1, 1, 1, 1>;
using ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 = Sequence<1, 1, 1, 1, 2>;
using BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1 = Sequence<1, 4, 1, 1, 2>;
using BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1 = Sequence<8, 1, 1, 32, 1>;
using BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 = Sequence<1, 1, 1, 1, 1>;
using BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 = Sequence<1, 1, 1, 1, 2>;
constexpr index_t CThreadTransferDstScalarPerVector_BN1 = 1;
#endif
    const auto descs =
        transform_forward_convolution_into_contraction_v6r1_nchw_kcyx_nkhw_pad(wei_desc_k_c_y_x,
                                                                               in_desc_n_c_hi_wi,
                                                                               out_desc_n_k_ho_wo,
                                                                               conv_strides,
                                                                               conv_dilations,
                                                                               in_left_pads,
                                                                               in_right_pads,
                                                                               Number<GN0>{},
                                                                               Number<GK1>{});

    const auto wei_grid_desc_gk0_gm0_gm1_gk1 = descs[I0];
    const auto in_grid_desc_gk0_gn0_gn1_gk1  = descs[I1];
    const auto out_grid_desc_gm0_gm1_gn0_gn1 = descs[I2];

    // HACK: hacks that control index calculation when iterating over A, B, C matrix
    constexpr auto wei_grid_step_hacks =
        make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0>{},   // 0+: GK0
                              Sequence<0, 0, 0, 0, 0, 0, 0>{},   // 1+: GM0
                              Sequence<0, 0, 0, 0, 0, 0, 0>{},   // 2+: GM10
                              Sequence<0, 0, 0, 0, 0, 0, 0>{},   // 3+: GM11
                              Sequence<0, 0, 0, 0, 0, 0, 0>{}),  // 4+: GK1
                   make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0>{},   // 0-: GK0
                              Sequence<0, 0, 0, 0, 0, 0, 0>{},   // 1-: GM0
                              Sequence<0, 0, 0, 0, 0, 0, 0>{},   // 2-: GM10
                              Sequence<0, 0, 0, 0, 0, 0, 0>{},   // 3-: GM11
                              Sequence<0, 0, 0, 0, 0, 0, 0>{})); // 4-: GK1

    constexpr auto in_grid_step_hacks = make_tuple(
        make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0>{},   // 0+: GK0
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0>{},   // 1+: GN0
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0>{},   // 2+: GN10
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0>{},   // 3+: GN11
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}),  // 4+: GK1
        make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0>{},   // 0-: GK0
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0>{},   // 1-: GN0
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0>{},   // 2-: GN10
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0>{},   // 3-: GN11
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{})); // 4-: GK1

    constexpr auto out_grid_step_hacks = make_tuple(
        make_tuple(
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 0+: GM10
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0>{},   // 1+: BM0
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0>{},   // 2+: BM1
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 3+: GN10
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0>{},   // 4+: BN0
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0>{}),  // 5+: GN1
        make_tuple(
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 0-: GM10
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0>{},   // 1-: BM0
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0>{},   // 2-: BM1
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 3-: GN10
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0>{},   // 4-: BN0
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0>{})); // 5-: GN1

    constexpr auto wei_grid_move_slice_window_step_hacks = Sequence<0, 0, 0, 0, 0, 0, 0>{};

    constexpr auto in_grid_move_slice_window_step_hacks =
        Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 0, 0, 0>{};
    for(index_t i = 0; i < 5; ++i)
    {
        float ave_time = driver_contraction_dlops_v1r2<
            BlockSize,
            TInWei,
            TAcc,
            TOut,
            InMemoryDataOperationEnum::Set,
            decltype(wei_grid_desc_gk0_gm0_gm1_gk1),
            decltype(in_grid_desc_gk0_gn0_gn1_gk1),
            decltype(out_grid_desc_gm0_gm1_gn0_gn1),
            GM1PerBlockGM11,
            GN1PerBlockGN11,
            GK0PerBlock,
            BM1PerThreadBM11,
            BN1PerThreadBN11,
            BK0PerThread,
            BM10BN10ThreadClusterBM10Xs,
            BM10BN10ThreadClusterBN10Xs,
            ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1,
            ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1,
            Sequence<1, 2, 3, 0, 4>, // ABlockTransferThreadClusterArrangeOrder
            Sequence<3, 2, 1, 0, 4>, // ABlockTransferSrcAccessOrder
            ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1,
            ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1,
            Sequence<0, 1, 2, 3, 4>, // ABlockTransferSrcVectorTensorContiguousDimOrder
            BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1,
            BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1,
            Sequence<0, 4, 1, 2, 3>, // BBlockTransferThreadClusterArrangeOrder
            Sequence<4, 3, 2, 0, 1>, // BBlockTransferSrcAccessOrder
            BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1,
            BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1,
            Sequence<0, 1, 2, 3, 4>,    // BBlockTransferSrcVectorTensorContiguousDimOrder
            Sequence<3, 4, 5, 0, 1, 2>, // CThreadTransferSrcDstAccessOrder
            5,                          // CThreadTransferSrcDstVectorDim
            CThreadTransferDstScalarPerVector_BN1,
            decltype(wei_grid_step_hacks),
            decltype(in_grid_step_hacks),
            decltype(out_grid_step_hacks),
            decltype(wei_grid_move_slice_window_step_hacks),
            decltype(in_grid_move_slice_window_step_hacks)>(
            static_cast<TInWei*>(wei_k_c_y_x_device_buf.GetDeviceBuffer()),
            static_cast<TInWei*>(in_n_c_hi_wi_device_buf.GetDeviceBuffer()),
            static_cast<TOut*>(out_n_k_ho_wo_device_buf.GetDeviceBuffer()),
            wei_grid_desc_gk0_gm0_gm1_gk1,
            in_grid_desc_gk0_gn0_gn1_gk1,
            out_grid_desc_gm0_gm1_gn0_gn1,
            wei_grid_step_hacks,
            in_grid_step_hacks,
            out_grid_step_hacks,
            wei_grid_move_slice_window_step_hacks,
            in_grid_move_slice_window_step_hacks,
            nrepeat);

        float perf = static_cast<float>(calculate_convolution_flops(
                         in_desc_n_c_hi_wi, wei_desc_k_c_y_x, out_desc_n_k_ho_wo)) /
                     (std::size_t(1000) * 1000 * 1000) / ave_time;

        std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" << std::endl;
    }

    // copy result back to host
    out_n_k_ho_wo_device_buf.FromDevice(out_n_k_ho_wo.mData.data());
}
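For orientation, the contraction performed by driver_contraction_dlops_v1r2 above can be read as a two-level GEMM. A naive reference of the assumed semantics (GK0 and GK1 contracted, packed row-major-style indexing), matching shape comments like [8, 1, 128, 2] * [8, 4, 32, 2] = [1, 128, 4, 32]; this is an illustrative sketch, not code from this commit:

#include <vector>

// C[gm0][gm1][gn0][gn1] = sum over gk0, gk1 of
//     A[gk0][gm0][gm1][gk1] * B[gk0][gn0][gn1][gk1]
void reference_contraction(const std::vector<float>& A,
                           const std::vector<float>& B,
                           std::vector<float>& C,
                           int GK0, int GM0, int GM1, int GN0, int GN1, int GK1)
{
    for(int gm0 = 0; gm0 < GM0; ++gm0)
        for(int gm1 = 0; gm1 < GM1; ++gm1)
            for(int gn0 = 0; gn0 < GN0; ++gn0)
                for(int gn1 = 0; gn1 < GN1; ++gn1)
                {
                    float acc = 0.f;
                    for(int gk0 = 0; gk0 < GK0; ++gk0)
                        for(int gk1 = 0; gk1 < GK1; ++gk1)
                            acc += A[((gk0 * GM0 + gm0) * GM1 + gm1) * GK1 + gk1] *
                                   B[((gk0 * GN0 + gn0) * GN1 + gn1) * GK1 + gk1];
                    C[((gm0 * GM1 + gm1) * GN0 + gn0) * GN1 + gn1] = acc;
                }
}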
library/include/ck/library/obselete_driver_offline/device_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp
deleted 100644 → 0
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "driver_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp"
template <typename TInWei,
          typename TAcc,
          typename TOut,
          ck::ActivTypeEnum activ_type,
          typename InLengths,
          typename WeiLengths,
          typename MaxLengths,
          typename OutLengths,
          typename ConvStrides,
          typename ConvDilations,
          typename InLeftPads,
          typename InRightPads>
void device_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1(
    const InLengths& in_n_c0_hi_wi_c1_lengths,
    const WeiLengths& wei_k_c0_y_x_c1_lengths,
    const MaxLengths& max_n_k0_hx_wx_k1_lengths,
    const OutLengths& out_n_k0_ho_wo_k1_lengths,
    const ConvStrides& conv_strides,
    const ConvDilations& conv_dilations,
    const InLeftPads& in_left_pads,
    const InRightPads& in_right_pads,
    const Tensor<TInWei>& in_n_c0_hi_wi_c1,
    const Tensor<TInWei>& wei_k_c0_y_x_c1,
    const Tensor<TOut>& bias_k0_k1,
    Tensor<TOut>& out_n_k0_ho_wo_k1,
    Tensor<TOut>& max_n_k0_hx_wx_k1,
    ck::index_t nrepeat)
{
    using namespace ck;

    std::cout << __func__ << std::endl;

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};
    constexpr auto I4 = Number<4>{};

    const auto N  = out_n_k0_ho_wo_k1_lengths[I0];
    const auto K0 = out_n_k0_ho_wo_k1_lengths[I1];
    const auto Ho = out_n_k0_ho_wo_k1_lengths[I2];
    const auto Wo = out_n_k0_ho_wo_k1_lengths[I3];
    const auto K1 = out_n_k0_ho_wo_k1_lengths[I4];

    const auto C0 = in_n_c0_hi_wi_c1_lengths[I1];
    const auto Hi = in_n_c0_hi_wi_c1_lengths[I2];
    const auto Wi = in_n_c0_hi_wi_c1_lengths[I3];
    const auto C1 = in_n_c0_hi_wi_c1_lengths[I4];

    const auto K = wei_k_c0_y_x_c1_lengths[I0];
    const auto Y = wei_k_c0_y_x_c1_lengths[I2];
    const auto X = wei_k_c0_y_x_c1_lengths[I3];

    const auto Hx = max_n_k0_hx_wx_k1_lengths[I2];
    const auto Wx = max_n_k0_hx_wx_k1_lengths[I3];

    DeviceMem in_n_c0_hi_wi_c1_device_buf(sizeof(TInWei) *
                                          in_n_c0_hi_wi_c1.mDesc.GetElementSpace());
    DeviceMem wei_k_c0_y_x_c1_device_buf(sizeof(TInWei) *
                                         wei_k_c0_y_x_c1.mDesc.GetElementSpace());
    DeviceMem bias_k0_k1_device_buf(sizeof(TOut) * bias_k0_k1.mDesc.GetElementSpace());
    DeviceMem out_n_k0_ho_wo_k1_device_buf(sizeof(TOut) *
                                           out_n_k0_ho_wo_k1.mDesc.GetElementSpace());
    DeviceMem max_n_k0_hx_wx_k1_device_buf(sizeof(TOut) *
                                           max_n_k0_hx_wx_k1.mDesc.GetElementSpace());

    in_n_c0_hi_wi_c1_device_buf.ToDevice(in_n_c0_hi_wi_c1.mData.data());
    wei_k_c0_y_x_c1_device_buf.ToDevice(wei_k_c0_y_x_c1.mData.data());
    bias_k0_k1_device_buf.ToDevice(bias_k0_k1.mData.data());
    max_n_k0_hx_wx_k1_device_buf.ToDevice(max_n_k0_hx_wx_k1.mData.data());

    constexpr index_t InWeiVectorSize = 8;

    if(C1 % InWeiVectorSize != 0)
    {
        throw std::runtime_error("wrong! C1 cannot be divided by InWeiVectorSize");
    }
#if 0
constexpr index_t BlockSize = 256;
constexpr index_t KPerBlock = 32;
constexpr index_t HoPerBlock = 8;
constexpr index_t WoPerBlock = 64;
constexpr index_t E1 = C0 * 9;
constexpr index_t E2 = 1;
constexpr index_t E1PerBlock = C0;
constexpr index_t KPerThread = 16;
constexpr index_t HoPerThread = 2;
constexpr index_t WoPerThread = 2;
constexpr index_t EPerThread = 1;
using ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2 = Sequence<1, 9, 1, E2>;
using ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2 = Sequence<1, E1PerBlock, KPerBlock, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_E2 = E2;
constexpr index_t ABlockTransferDstScalarPerVector_E2 = E2;
constexpr index_t BThreadTransferSrcScalarPerVector_E2 = E2;
constexpr index_t CThreadTransferDstScalarPerVector_K = K1;
#elif
1
constexpr
index_t
BlockSize
=
64
;
constexpr
index_t
KPerBlock
=
8
;
constexpr
index_t
HoPerBlock
=
8
;
constexpr
index_t
WoPerBlock
=
32
;
constexpr
index_t
E1
=
2
*
9
;
constexpr
index_t
E2
=
1
;
constexpr
index_t
K2
=
2
;
constexpr
index_t
E1PerBlock
=
2
;
constexpr
index_t
KPerThread
=
KPerBlock
;
constexpr
index_t
HoPerThread
=
2
;
constexpr
index_t
WoPerThread
=
2
;
constexpr
index_t
EPerThread
=
1
;
using
ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2
=
Sequence
<
1
,
9
,
1
,
1
,
E2
>
;
using
ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2
=
Sequence
<
1
,
E1PerBlock
,
1
,
KPerBlock
,
1
>
;
constexpr
index_t
ABlockTransferSrcScalarPerVector_E2
=
E2
;
constexpr
index_t
ABlockTransferDstScalarPerVector_E2
=
E2
;
constexpr
index_t
BThreadTransferSrcScalarPerVector_E2
=
E2
;
constexpr
index_t
CThreadTransferDstScalarPerVector_K
=
InWeiVectorSize
;
#endif
if
(
KPerThread
%
InWeiVectorSize
!=
0
)
{
throw
std
::
runtime_error
(
"wrong! C1 cannot be divided by InWeiVectorSize"
);
}
const
auto
in_n_c0_hi_wi_c1_desc
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
N
,
C0
,
Hi
,
Wi
,
E2
));
const
auto
wei_k_c0_y_x_c1_desc
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
K
,
C0
,
Y
,
X
,
E2
));
const
auto
max_n_k0_hx_wx_k1_desc
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
N
,
K0
,
Hx
,
Wx
,
K1
));
const
auto
out_n_k0_ho_wo_k1_desc
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
N
,
K0
,
Ho
,
Wo
,
K1
));
constexpr
auto
conv_driver
=
DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_maxpool
<
BlockSize
,
typename
vector_type
<
TInWei
,
InWeiVectorSize
>::
type
,
TAcc
,
TOut
,
E1
,
E2
,
K2
,
KPerBlock
,
HoPerBlock
,
WoPerBlock
,
E1PerBlock
,
KPerThread
,
HoPerThread
,
WoPerThread
,
EPerThread
,
ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2
,
ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2
,
ABlockTransferSrcScalarPerVector_E2
,
ABlockTransferDstScalarPerVector_E2
,
BThreadTransferSrcScalarPerVector_E2
,
CThreadTransferDstScalarPerVector_K
,
activ_type
>
{};
std
::
cerr
<<
"conv_bias_activ_maxpool_input_"
<<
"n"
<<
N
<<
"c"
<<
C0
<<
"h"
<<
Hi
<<
"w"
<<
Wi
<<
"c"
<<
C1
<<
"_filter_k"
<<
K
<<
"c"
<<
C0
<<
"y"
<<
Y
<<
"x"
<<
X
<<
"c"
<<
C1
<<
"_convout_n"
<<
N
<<
"k"
<<
K0
<<
"h"
<<
Ho
<<
"w"
<<
Wo
<<
"k"
<<
K1
<<
"_maxpoolout_n"
<<
N
<<
"k"
<<
K0
<<
"h"
<<
Ho
/
2
<<
"w"
<<
Wo
/
2
<<
"k"
<<
K1
<<
std
::
endl
;
for
(
int
i
=
0
;
i
<
5
;
i
++
)
{
const
auto
ave_time
=
conv_driver
.
Run
(
wei_k_c0_y_x_c1_desc
,
in_n_c0_hi_wi_c1_desc
,
out_n_k0_ho_wo_k1_desc
,
max_n_k0_hx_wx_k1_desc
,
conv_strides
,
conv_dilations
,
in_left_pads
,
in_right_pads
,
static_cast
<
typename
vector_type
<
TInWei
,
InWeiVectorSize
>::
type
*>
(
wei_k_c0_y_x_c1_device_buf
.
GetDeviceBuffer
()),
static_cast
<
typename
vector_type
<
TInWei
,
InWeiVectorSize
>::
type
*>
(
in_n_c0_hi_wi_c1_device_buf
.
GetDeviceBuffer
()),
static_cast
<
TOut
*>
(
bias_k0_k1_device_buf
.
GetDeviceBuffer
()),
static_cast
<
TOut
*>
(
out_n_k0_ho_wo_k1_device_buf
.
GetDeviceBuffer
()),
static_cast
<
TOut
*>
(
max_n_k0_hx_wx_k1_device_buf
.
GetDeviceBuffer
()),
nrepeat
);
{
float
perf
=
static_cast
<
float
>
(
std
::
size_t
(
2
)
*
N
*
K
*
Ho
*
Wo
*
C0
*
C1
*
Y
*
X
)
/
(
std
::
size_t
(
1000
)
*
1000
*
1000
)
/
ave_time
;
std
::
cout
<<
"Average time : "
<<
ave_time
<<
" ms, "
<<
perf
<<
" TFlop/s"
<<
std
::
endl
;
}
}
out_n_k0_ho_wo_k1_device_buf
.
FromDevice
(
out_n_k0_ho_wo_k1
.
mData
.
data
());
max_n_k0_hx_wx_k1_device_buf
.
FromDevice
(
max_n_k0_hx_wx_k1
.
mData
.
data
());
}
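The file above reinterprets the TInWei buffers as `vector_type<TInWei, 8>` before handing them to the driver, which is why it insists that C1 and KPerThread divide evenly by InWeiVectorSize. A standalone analogue of that check (names are local to this sketch, not from the commit):

```cpp
#include <cstddef>
#include <stdexcept>

// A length can only be reinterpreted as packed W-wide vectors if it splits evenly.
void check_vector_packing(std::size_t length, std::size_t vector_width)
{
    if(length % vector_width != 0)
        throw std::runtime_error("length cannot be divided by the vector width");
}

int main()
{
    check_vector_packing(16, 8); // ok: two 8-wide vectors per slice
    // check_vector_packing(12, 8); // would throw: 12 scalars don't pack into 8-wide vectors
}
```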
library/include/ck/library/obselete_driver_offline/device_gemm_xdlops_km_kn_mn.hpp deleted 100644 → 0
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "driver_gemm_xdlops_v2r3.hpp"

template <typename ABType, typename AccType, typename CType>
void device_gemm_xdlops_km_kn_mn(const Tensor<ABType>& a_k_m,
                                 const Tensor<ABType>& b_k_n,
                                 Tensor<CType>& c_m_n,
                                 ck::index_t nrepeat)
{
    using namespace ck;

    std::cout << __func__ << std::endl;

    DeviceMem a_k_m_device_buf(sizeof(ABType) * a_k_m.mDesc.GetElementSpace());
    DeviceMem b_k_n_device_buf(sizeof(ABType) * b_k_n.mDesc.GetElementSpace());
    DeviceMem c_m_n_device_buf(sizeof(CType) * c_m_n.mDesc.GetElementSpace());

    a_k_m_device_buf.ToDevice(a_k_m.mData.data());
    b_k_n_device_buf.ToDevice(b_k_n.mData.data());
    c_m_n_device_buf.ToDevice(c_m_n.mData.data());

#if 0
    // [M, N, K0, K1] = [256, 128, 4, 4] for fp32
    constexpr index_t BlockSize = 256;
    constexpr index_t MPerBlock = 256;
    constexpr index_t NPerBlock = 128;
    constexpr index_t KPerBlock = 4;
    constexpr index_t MPerXDL = 32;
    constexpr index_t NPerXDL = 32;
    constexpr index_t K1 = 4;
    constexpr index_t MRepeat = 4;
    constexpr index_t NRepeat = 2;
    using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 4, 4>;
    using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
    constexpr index_t ABlockTransferSrcScalarPerVector_M = 4;
    constexpr index_t ABlockTransferDstScalarPerVector_K1 = 4;
    using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 4>;
    using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
    constexpr index_t BBlockTransferSrcScalarPerVector_N = 2;
    constexpr index_t BBlockTransferDstScalarPerVector_K1 = 4;
    constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 0
    // [M, N, K0, K1] = [128, 256, 4, 4], C = 128, for fp32
    constexpr index_t BlockSize = 256;
    constexpr index_t MPerBlock = 128;
    constexpr index_t NPerBlock = 256;
    constexpr index_t KPerBlock = 4;
    constexpr index_t MPerXDL = 32;
    constexpr index_t NPerXDL = 32;
    constexpr index_t K1 = 4;
    constexpr index_t MRepeat = 2;
    constexpr index_t NRepeat = 4;
    using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 2, 4>;
    using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
    constexpr index_t ABlockTransferSrcScalarPerVector_M = 2;
    constexpr index_t ABlockTransferDstScalarPerVector_K1 = 4;
    using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 4, 4>;
    using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
    constexpr index_t BBlockTransferSrcScalarPerVector_N = 4;
    constexpr index_t BBlockTransferDstScalarPerVector_K1 = 4;
    constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 0
    // [M, N, K0, K1] = [128, 128, 4, 4], C = 64, for fp32
    constexpr index_t BlockSize = 256;
    constexpr index_t MPerBlock = 128;
    constexpr index_t NPerBlock = 128;
    constexpr index_t KPerBlock = 4;
    constexpr index_t MPerXDL = 32;
    constexpr index_t NPerXDL = 32;
    constexpr index_t K1 = 4;
    constexpr index_t MRepeat = 2;
    constexpr index_t NRepeat = 2;
    using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 2, 4>;
    using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
    constexpr index_t ABlockTransferSrcScalarPerVector_M = 2;
    constexpr index_t ABlockTransferDstScalarPerVector_K1 = 4;
    using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 4>;
    using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
    constexpr index_t BBlockTransferSrcScalarPerVector_N = 2;
    constexpr index_t BBlockTransferDstScalarPerVector_K1 = 4;
    constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 0
    // [M, N, K0, K1] = [128, 64, 4, 4], C = 32, for fp32
    constexpr index_t BlockSize = 256;
    constexpr index_t MPerBlock = 128;
    constexpr index_t NPerBlock = 64;
    constexpr index_t KPerBlock = 4;
    constexpr index_t MPerXDL = 32;
    constexpr index_t NPerXDL = 32;
    constexpr index_t K1 = 4;
    constexpr index_t MRepeat = 2;
    constexpr index_t NRepeat = 1;
    using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 2, 4>;
    using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
    constexpr index_t ABlockTransferSrcScalarPerVector_M = 2;
    constexpr index_t ABlockTransferDstScalarPerVector_K1 = 4;
    using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 1, 4>;
    using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
    constexpr index_t BBlockTransferSrcScalarPerVector_N = 1;
    constexpr index_t BBlockTransferDstScalarPerVector_K1 = 4;
    constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 0
    // [M, N, K0, K1] = [64, 128, 4, 4], C = 32, for fp32
    constexpr index_t BlockSize = 256;
    constexpr index_t MPerBlock = 64;
    constexpr index_t NPerBlock = 128;
    constexpr index_t KPerBlock = 4;
    constexpr index_t MPerXDL = 32;
    constexpr index_t NPerXDL = 32;
    constexpr index_t K1 = 4;
    constexpr index_t MRepeat = 1;
    constexpr index_t NRepeat = 2;
    using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 1, 4>;
    using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
    constexpr index_t ABlockTransferSrcScalarPerVector_M = 1;
    constexpr index_t ABlockTransferDstScalarPerVector_K1 = 4;
    using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 4>;
    using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
    constexpr index_t BBlockTransferSrcScalarPerVector_N = 2;
    constexpr index_t BBlockTransferDstScalarPerVector_K1 = 4;
    constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 1
    // [M, N, K0, K1] = [256, 128, 4, 8], C = 128, for fp16
    constexpr index_t BlockSize = 256;
    constexpr index_t MPerBlock = 256;
    constexpr index_t NPerBlock = 128;
    constexpr index_t KPerBlock = 4;
    constexpr index_t MPerXDL = 32;
    constexpr index_t NPerXDL = 32;
    constexpr index_t K1 = 8;
    constexpr index_t MRepeat = 4;
    constexpr index_t NRepeat = 2;
    using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 4, 8>;
    using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
    constexpr index_t ABlockTransferSrcScalarPerVector_M = 4;
    constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;
    using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 8>;
    using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
    constexpr index_t BBlockTransferSrcScalarPerVector_N = 2;
    constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
    constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 0
    // [M, N, K0, K1] = [128, 256, 4, 8] for fp16
    constexpr index_t BlockSize = 256;
    constexpr index_t MPerBlock = 128;
    constexpr index_t NPerBlock = 256;
    constexpr index_t KPerBlock = 4;
    constexpr index_t MPerXDL = 32;
    constexpr index_t NPerXDL = 32;
    constexpr index_t K1 = 8;
    constexpr index_t MRepeat = 2;
    constexpr index_t NRepeat = 4;
    using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 2, 8>;
    using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
    constexpr index_t ABlockTransferSrcScalarPerVector_M = 2;
    constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;
    using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 4, 8>;
    using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
    constexpr index_t BBlockTransferSrcScalarPerVector_N = 4;
    constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
    constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 0
    // [M, N, K0, K1] = [128, 128, 4, 8], C = 128, for fp16
    constexpr index_t BlockSize = 128;
    constexpr index_t MPerBlock = 128;
    constexpr index_t NPerBlock = 128;
    constexpr index_t KPerBlock = 4;
    constexpr index_t MPerXDL = 32;
    constexpr index_t NPerXDL = 32;
    constexpr index_t K1 = 8;
    constexpr index_t MRepeat = 4;
    constexpr index_t NRepeat = 2;
    using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 4, 8>;
    using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 32, 1>;
    constexpr index_t ABlockTransferSrcScalarPerVector_M = 4;
    constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;
    using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 4, 8>;
    using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 32, 1>;
    constexpr index_t BBlockTransferSrcScalarPerVector_N = 4;
    constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
    constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 0
    // [M, N, K0, K1] = [128, 128, 4, 8], C = 64, for fp16
    constexpr index_t BlockSize = 256;
    constexpr index_t MPerBlock = 128;
    constexpr index_t NPerBlock = 128;
    constexpr index_t KPerBlock = 4;
    constexpr index_t MPerXDL = 32;
    constexpr index_t NPerXDL = 32;
    constexpr index_t K1 = 8;
    constexpr index_t MRepeat = 2;
    constexpr index_t NRepeat = 2;
    using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 2, 8>;
    using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
    constexpr index_t ABlockTransferSrcScalarPerVector_M = 2;
    constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;
    using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 8>;
    using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
    constexpr index_t BBlockTransferSrcScalarPerVector_N = 2;
    constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
    constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 1
    // [M, N, K0, K1] = [128, 64, 4, 8], C = 32, for fp16
    constexpr index_t BlockSize = 256;
    constexpr index_t MPerBlock = 128;
    constexpr index_t NPerBlock = 64;
    constexpr index_t KPerBlock = 4;
    constexpr index_t MPerXDL = 32;
    constexpr index_t NPerXDL = 32;
    constexpr index_t K1 = 8;
    constexpr index_t MRepeat = 2;
    constexpr index_t NRepeat = 1;
    using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 2, 8>;
    using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
    constexpr index_t ABlockTransferSrcScalarPerVector_M = 2;
    constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;
    using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 1, 8>;
    using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
    constexpr index_t BBlockTransferSrcScalarPerVector_N = 1;
    constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
    constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 0
    // [M, N, K0, K1] = [64, 128, 4, 8], C = 32, for fp16
    constexpr index_t BlockSize = 256;
    constexpr index_t MPerBlock = 64;
    constexpr index_t NPerBlock = 128;
    constexpr index_t KPerBlock = 4;
    constexpr index_t MPerXDL = 32;
    constexpr index_t NPerXDL = 32;
    constexpr index_t K1 = 8;
    constexpr index_t MRepeat = 1;
    constexpr index_t NRepeat = 2;
    using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 1, 8>;
    using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
    constexpr index_t ABlockTransferSrcScalarPerVector_M = 1;
    constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;
    using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 8>;
    using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
    constexpr index_t BBlockTransferSrcScalarPerVector_N = 2;
    constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
    constexpr index_t CThreadTransferDstScalarPerVector = 1;
#endif

    const auto K = a_k_m.mDesc.GetLengths()[0];
    const auto M = a_k_m.mDesc.GetLengths()[1];
    const auto N = b_k_n.mDesc.GetLengths()[1];

    constexpr auto K1Number = Number<K1>{};

    const auto K0 = K / K1Number;

    const auto a_k0_m_k1_grid_desc =
        make_naive_tensor_descriptor(make_tuple(K0, M, K1Number),
                                     make_tuple(K1Number * a_k_m.mDesc.GetStrides()[0],
                                                a_k_m.mDesc.GetStrides()[1],
                                                a_k_m.mDesc.GetStrides()[0]));

    const auto b_k0_n_k1_grid_desc =
        make_naive_tensor_descriptor(make_tuple(K0, N, K1Number),
                                     make_tuple(K1Number * b_k_n.mDesc.GetStrides()[0],
                                                b_k_n.mDesc.GetStrides()[1],
                                                b_k_n.mDesc.GetStrides()[0]));

    const auto c_m_n_grid_desc = make_naive_tensor_descriptor(
        make_tuple(M, N), make_tuple(c_m_n.mDesc.GetStrides()[0], c_m_n.mDesc.GetStrides()[1]));

    // HACK: hacks that control index calculation when iterating over A, B, C matrix
    constexpr auto a_k0_m_k1_grid_step_hacks =
        make_tuple(make_tuple(Sequence<0>{},   // 0+: K0
                              Sequence<0>{},   // 1+: M
                              Sequence<0>{}),  // 2+: K1
                   make_tuple(Sequence<0>{},   // 0-: K0
                              Sequence<0>{},   // 1-: M
                              Sequence<0>{})); // 2-: K1

    constexpr auto b_k0_n_k1_grid_step_hacks =
        make_tuple(make_tuple(Sequence<0>{},   // 0+: K0
                              Sequence<0>{},   // 1+: N
                              Sequence<0>{}),  // 2+: K1
                   make_tuple(Sequence<0>{},   // 0-: K0
                              Sequence<0>{},   // 1-: N
                              Sequence<0>{})); // 2-: K1

    constexpr auto c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks =
        make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 0+: M0
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 1+: N0
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 2+: M1
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 3+: N1
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 4+: M2
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 5+: M3
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 6+: M4
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}),  // 7+: N2
                   make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 0-: M0
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 1-: N0
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 2-: M1
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 3-: N1
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 4-: M2
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 5-: M3
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 6-: M4
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{})); // 7-: N2

    constexpr auto a_k0_m_k1_grid_move_slice_window_step_hacks = Sequence<0>{};
    constexpr auto b_k0_n_k1_grid_move_slice_window_step_hacks = Sequence<0>{};

    for(index_t i = 0; i < 5; ++i)
    {
        float ave_time = driver_gemm_xdlops_v2r3<
            BlockSize,
            ABType,
            AccType,
            CType,
            InMemoryDataOperationEnum::Set,
            decltype(a_k0_m_k1_grid_desc),
            decltype(b_k0_n_k1_grid_desc),
            decltype(c_m_n_grid_desc),
            MPerBlock,
            NPerBlock,
            KPerBlock,
            MPerXDL,
            NPerXDL,
            K1,
            MRepeat,
            NRepeat,
            ABlockTransferThreadSliceLengths_K0_M_K1,
            ABlockTransferThreadClusterLengths_K0_M_K1,
            Sequence<0, 2, 1>,
            Sequence<0, 2, 1>,
            1,
            ABlockTransferSrcScalarPerVector_M,
            ABlockTransferDstScalarPerVector_K1,
            false, // don't move back src coordinate after threadwise copy
            BBlockTransferThreadSliceLengths_K0_N_K1,
            BBlockTransferThreadClusterLengths_K0_N_K1,
            Sequence<0, 2, 1>,
            Sequence<0, 2, 1>,
            1,
            BBlockTransferSrcScalarPerVector_N,
            BBlockTransferDstScalarPerVector_K1,
            false, // don't move back src coordinate after threadwise copy
            Sequence<0, 2, 4, 5, 6, 1, 3, 7>,
            7,
            CThreadTransferDstScalarPerVector,
            decltype(a_k0_m_k1_grid_step_hacks),
            decltype(b_k0_n_k1_grid_step_hacks),
            decltype(c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks),
            decltype(a_k0_m_k1_grid_move_slice_window_step_hacks),
            decltype(b_k0_n_k1_grid_move_slice_window_step_hacks),
            false, // CAccessOrderMRepeatNRepeat
            true,  // ABlockLdsExtraM
            true   // BBlockLdsExtraN
            >(static_cast<ABType*>(a_k_m_device_buf.GetDeviceBuffer()),
              static_cast<ABType*>(b_k_n_device_buf.GetDeviceBuffer()),
              static_cast<CType*>(c_m_n_device_buf.GetDeviceBuffer()),
              a_k0_m_k1_grid_desc,
              b_k0_n_k1_grid_desc,
              c_m_n_grid_desc,
              debug::debug_driver_gemm_xdlops_v2r3::M01,
              debug::debug_driver_gemm_xdlops_v2r3::N01,
              a_k0_m_k1_grid_step_hacks,
              b_k0_n_k1_grid_step_hacks,
              c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks,
              a_k0_m_k1_grid_move_slice_window_step_hacks,
              b_k0_n_k1_grid_move_slice_window_step_hacks,
              nrepeat);

        float perf = static_cast<float>((std::size_t(2) * M * N * K)) /
                     (std::size_t(1000) * 1000 * 1000) / ave_time;

        std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" << std::endl;
    }

    // copy result back to host
    c_m_n_device_buf.FromDevice(c_m_n.mData.data());
}
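For context, a host-side harness for the file above would look roughly like the sketch below. It is not part of this commit; the `HostTensorDescriptor` construction and the `ck::half_t` element type are assumptions based on the ck host_tensor utilities, and the sizes are chosen to divide evenly by the selected fp16 tile (MPerBlock = 256, NPerBlock = 128, KPerBlock * K1 = 32):

```cpp
#include <cstddef>
#include <vector>
#include "host_tensor.hpp"
#include "device_gemm_xdlops_km_kn_mn.hpp"

void run_km_kn_mn_example()
{
    const std::size_t M = 3840, N = 4096, K = 4096;

    // A is stored as [K, M], B as [K, N], C as [M, N]
    Tensor<ck::half_t> a_k_m(HostTensorDescriptor(std::vector<std::size_t>{K, M}));
    Tensor<ck::half_t> b_k_n(HostTensorDescriptor(std::vector<std::size_t>{K, N}));
    Tensor<float> c_m_n(HostTensorDescriptor(std::vector<std::size_t>{M, N}));

    // ... fill a_k_m.mData / b_k_n.mData with test values ...

    // ABType = half, AccType = float, CType = float; time the kernel over 100 repeats
    device_gemm_xdlops_km_kn_mn<ck::half_t, float, float>(a_k_m, b_k_n, c_m_n, 100);
}
```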
library/include/ck/library/obselete_driver_offline/device_gemm_xdlops_km_kn_nm.hpp deleted 100644 → 0
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "driver_gemm_xdlops_v2r3.hpp"

template <typename ABType, typename AccType, typename CType>
void device_gemm_xdlops_km_kn_nm(const Tensor<ABType>& a_k_m,
                                 const Tensor<ABType>& b_k_n,
                                 Tensor<CType>& c_n_m,
                                 ck::index_t nrepeat)
{
    using namespace ck;

    std::cout << __func__ << std::endl;

    DeviceMem a_k_m_device_buf(sizeof(ABType) * a_k_m.mDesc.GetElementSpace());
    DeviceMem b_k_n_device_buf(sizeof(ABType) * b_k_n.mDesc.GetElementSpace());
    DeviceMem c_n_m_device_buf(sizeof(CType) * c_n_m.mDesc.GetElementSpace());

    a_k_m_device_buf.ToDevice(a_k_m.mData.data());
    b_k_n_device_buf.ToDevice(b_k_n.mData.data());
    c_n_m_device_buf.ToDevice(c_n_m.mData.data());

#if 0
    // [M, N, K0, K1] = [256, 128, 4, 4] for fp32
    constexpr index_t BlockSize = 256;
    constexpr index_t MPerBlock = 256;
    constexpr index_t NPerBlock = 128;
    constexpr index_t KPerBlock = 4;
    constexpr index_t MPerXDL = 32;
    constexpr index_t NPerXDL = 32;
    constexpr index_t K1 = 4;
    constexpr index_t MRepeat = 4;
    constexpr index_t NRepeat = 2;
    using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 4, 4>;
    using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
    constexpr index_t ABlockTransferSrcScalarPerVector_M = 4;
    constexpr index_t ABlockTransferDstScalarPerVector_K1 = 4;
    using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 4>;
    using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
    constexpr index_t BBlockTransferSrcScalarPerVector_N = 2;
    constexpr index_t BBlockTransferDstScalarPerVector_K1 = 4;
    constexpr index_t CThreadTransferDstScalarPerVector = 4;
#elif 0
    // [M, N, K0, K1] = [128, 256, 4, 4] for fp32
    constexpr index_t BlockSize = 256;
    constexpr index_t MPerBlock = 128;
    constexpr index_t NPerBlock = 256;
    constexpr index_t KPerBlock = 4;
    constexpr index_t MPerXDL = 32;
    constexpr index_t NPerXDL = 32;
    constexpr index_t K1 = 4;
    constexpr index_t MRepeat = 2;
    constexpr index_t NRepeat = 4;
    using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 2, 4>;
    using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
    constexpr index_t ABlockTransferSrcScalarPerVector_M = 2;
    constexpr index_t ABlockTransferDstScalarPerVector_K1 = 4;
    using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 4, 4>;
    using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
    constexpr index_t BBlockTransferSrcScalarPerVector_N = 4;
    constexpr index_t BBlockTransferDstScalarPerVector_K1 = 4;
    constexpr index_t CThreadTransferDstScalarPerVector = 4;
#elif 1
    // [M, N, K0, K1] = [256, 128, 4, 8] for fp16
    constexpr index_t BlockSize = 256;
    constexpr index_t MPerBlock = 256;
    constexpr index_t NPerBlock = 128;
    constexpr index_t KPerBlock = 4;
    constexpr index_t MPerXDL = 32;
    constexpr index_t NPerXDL = 32;
    constexpr index_t K1 = 8;
    constexpr index_t MRepeat = 4;
    constexpr index_t NRepeat = 2;
    using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 4, 8>;
    using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
    constexpr index_t ABlockTransferSrcScalarPerVector_M = 4;
    constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;
    using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 8>;
    using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
    constexpr index_t BBlockTransferSrcScalarPerVector_N = 2;
    constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
    constexpr index_t CThreadTransferDstScalarPerVector = 4;
#elif 1
    // [M, N, K0, K1] = [128, 128, 4, 8] for fp16
    constexpr index_t BlockSize = 128;
    constexpr index_t MPerBlock = 128;
    constexpr index_t NPerBlock = 128;
    constexpr index_t KPerBlock = 4;
    constexpr index_t MPerXDL = 32;
    constexpr index_t NPerXDL = 32;
    constexpr index_t K1 = 8;
    constexpr index_t MRepeat = 4;
    constexpr index_t NRepeat = 2;
    using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 4, 8>;
    using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 32, 1>;
    constexpr index_t ABlockTransferSrcScalarPerVector_M = 4;
    constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;
    using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 4, 8>;
    using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 32, 1>;
    constexpr index_t BBlockTransferSrcScalarPerVector_N = 4;
    constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
    constexpr index_t CThreadTransferDstScalarPerVector = 4;
#endif

    const auto K = a_k_m.mDesc.GetLengths()[0];
    const auto M = a_k_m.mDesc.GetLengths()[1];
    const auto N = b_k_n.mDesc.GetLengths()[1];

    constexpr auto K1Number = Number<K1>{};

    const auto K0 = K / K1Number;

    const auto a_k0_m_k1_grid_desc =
        make_naive_tensor_descriptor(make_tuple(K0, M, K1Number),
                                     make_tuple(K1Number * a_k_m.mDesc.GetStrides()[0],
                                                a_k_m.mDesc.GetStrides()[1],
                                                a_k_m.mDesc.GetStrides()[0]));

    const auto b_k0_n_k1_grid_desc =
        make_naive_tensor_descriptor(make_tuple(K0, N, K1Number),
                                     make_tuple(K1Number * b_k_n.mDesc.GetStrides()[0],
                                                b_k_n.mDesc.GetStrides()[1],
                                                b_k_n.mDesc.GetStrides()[0]));

    const auto c_m_n_grid_desc = make_naive_tensor_descriptor(
        make_tuple(M, N), make_tuple(c_n_m.mDesc.GetStrides()[1], c_n_m.mDesc.GetStrides()[0]));

    // HACK: hacks that control index calculation when iterating over A, B, C matrix
    constexpr auto a_k0_m_k1_grid_step_hacks =
        make_tuple(make_tuple(Sequence<0>{},   // 0+: K0
                              Sequence<0>{},   // 1+: M
                              Sequence<0>{}),  // 2+: K1
                   make_tuple(Sequence<0>{},   // 0-: K0
                              Sequence<0>{},   // 1-: M
                              Sequence<0>{})); // 2-: K1

    constexpr auto b_k0_n_k1_grid_step_hacks =
        make_tuple(make_tuple(Sequence<0>{},   // 0+: K0
                              Sequence<0>{},   // 1+: N
                              Sequence<0>{}),  // 2+: K1
                   make_tuple(Sequence<0>{},   // 0-: K0
                              Sequence<0>{},   // 1-: N
                              Sequence<0>{})); // 2-: K1

    constexpr auto c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks =
        make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 0+: M0
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 1+: N0
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 2+: M1
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 3+: N1
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 4+: M2
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 5+: M3
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 6+: M4
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}),  // 7+: N2
                   make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 0-: M0
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 1-: N0
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 2-: M1
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 3-: N1
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 4-: M2
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 5-: M3
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 6-: M4
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{})); // 7-: N2

    constexpr auto a_k0_m_k1_grid_move_slice_window_step_hacks = Sequence<0>{};
    constexpr auto b_k0_n_k1_grid_move_slice_window_step_hacks = Sequence<0>{};

    for(index_t i = 0; i < 5; ++i)
    {
        float ave_time = driver_gemm_xdlops_v2r3<
            BlockSize,
            ABType,
            AccType,
            CType,
            InMemoryDataOperationEnum::Set,
            decltype(a_k0_m_k1_grid_desc),
            decltype(b_k0_n_k1_grid_desc),
            decltype(c_m_n_grid_desc),
            MPerBlock,
            NPerBlock,
            KPerBlock,
            MPerXDL,
            NPerXDL,
            K1,
            MRepeat,
            NRepeat,
            ABlockTransferThreadSliceLengths_K0_M_K1,
            ABlockTransferThreadClusterLengths_K0_M_K1,
            Sequence<0, 2, 1>,
            Sequence<0, 2, 1>,
            1,
            ABlockTransferSrcScalarPerVector_M,
            ABlockTransferDstScalarPerVector_K1,
            false, // don't move back src coordinate after threadwise copy
            BBlockTransferThreadSliceLengths_K0_N_K1,
            BBlockTransferThreadClusterLengths_K0_N_K1,
            Sequence<0, 2, 1>,
            Sequence<0, 2, 1>,
            1,
            BBlockTransferSrcScalarPerVector_N,
            BBlockTransferDstScalarPerVector_K1,
            false, // don't move back src coordinate after threadwise copy
            Sequence<2, 3, 0, 1, 7, 5, 4, 6>,
            6,
            CThreadTransferDstScalarPerVector,
            decltype(a_k0_m_k1_grid_step_hacks),
            decltype(b_k0_n_k1_grid_step_hacks),
            decltype(c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks),
            decltype(a_k0_m_k1_grid_move_slice_window_step_hacks),
            decltype(b_k0_n_k1_grid_move_slice_window_step_hacks),
            false // CAccessOrderMRepeatNRepeat
            >(static_cast<ABType*>(a_k_m_device_buf.GetDeviceBuffer()),
              static_cast<ABType*>(b_k_n_device_buf.GetDeviceBuffer()),
              static_cast<CType*>(c_n_m_device_buf.GetDeviceBuffer()),
              a_k0_m_k1_grid_desc,
              b_k0_n_k1_grid_desc,
              c_m_n_grid_desc,
              a_k0_m_k1_grid_step_hacks,
              b_k0_n_k1_grid_step_hacks,
              c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks,
              a_k0_m_k1_grid_move_slice_window_step_hacks,
              b_k0_n_k1_grid_move_slice_window_step_hacks,
              nrepeat);

        float perf = static_cast<float>((std::size_t(2) * M * N * K)) /
                     (std::size_t(1000) * 1000 * 1000) / ave_time;

        std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" << std::endl;
    }

    // copy result back to host
    c_n_m_device_buf.FromDevice(c_n_m.mData.data());
}
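All of these drivers build the K-split descriptors the same way: the K0 stride is K1 times the original K stride and the K1 stride is the original K stride, so coordinate (k0, m, k1) addresses exactly the element (k0 * K1 + k1, m) of the flat 2-D tensor. A standalone check of that arithmetic (self-contained, no ck dependencies):

```cpp
#include <cassert>
#include <cstddef>

int main()
{
    const std::size_t K = 16, M = 8, K1 = 4, K0 = K / K1;
    const std::size_t stride_K = M, stride_M = 1; // row-major [K, M] layout

    for(std::size_t k0 = 0; k0 < K0; ++k0)
        for(std::size_t m = 0; m < M; ++m)
            for(std::size_t k1 = 0; k1 < K1; ++k1)
            {
                // offset produced by the (K0, M, K1) split descriptor
                const std::size_t split_offset =
                    k0 * (K1 * stride_K) + m * stride_M + k1 * stride_K;
                // offset of the same element in the original (K, M) tensor
                const std::size_t flat_offset = (k0 * K1 + k1) * stride_K + m * stride_M;
                assert(split_offset == flat_offset);
            }
}
```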
library/include/ck/library/obselete_driver_offline/device_gemm_xdlops_km_nk_mn.hpp deleted 100644 → 0
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "driver_gemm_xdlops_v2r3.hpp"

template <typename ABType, typename AccType, typename CType>
void device_gemm_xdlops_km_nk_mn(const Tensor<ABType>& a_k_m,
                                 const Tensor<ABType>& b_n_k,
                                 Tensor<CType>& c_m_n,
                                 ck::index_t nrepeat)
{
    using namespace ck;

    std::cout << __func__ << std::endl;

    DeviceMem a_k_m_device_buf(sizeof(ABType) * a_k_m.mDesc.GetElementSpace());
    DeviceMem b_n_k_device_buf(sizeof(ABType) * b_n_k.mDesc.GetElementSpace());
    DeviceMem c_m_n_device_buf(sizeof(CType) * c_m_n.mDesc.GetElementSpace());

    a_k_m_device_buf.ToDevice(a_k_m.mData.data());
    b_n_k_device_buf.ToDevice(b_n_k.mData.data());
    c_m_n_device_buf.ToDevice(c_m_n.mData.data());

#if 0
    // [M, N, K0, K1] = [256, 128, 4, 4] for fp32
    constexpr index_t BlockSize = 256;
    constexpr index_t MPerBlock = 256;
    constexpr index_t NPerBlock = 128;
    constexpr index_t KPerBlock = 4;
    constexpr index_t MPerXDL = 32;
    constexpr index_t NPerXDL = 32;
    constexpr index_t K1 = 4;
    constexpr index_t MRepeat = 4;
    constexpr index_t NRepeat = 2;
    using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 4, 4>;
    using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
    constexpr index_t ABlockTransferSrcScalarPerVector_M = 4;
    constexpr index_t ABlockTransferDstScalarPerVector_K1 = 4;
    using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 4>;
    using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
    constexpr index_t BBlockTransferSrcScalarPerVector_K1 = 4;
    constexpr index_t BBlockTransferDstScalarPerVector_K1 = 4;
    constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 0
    // [M, N, K0, K1] = [128, 256, 4, 4] for fp32
    constexpr index_t BlockSize = 256;
    constexpr index_t MPerBlock = 128;
    constexpr index_t NPerBlock = 256;
    constexpr index_t KPerBlock = 4;
    constexpr index_t MPerXDL = 32;
    constexpr index_t NPerXDL = 32;
    constexpr index_t K1 = 4;
    constexpr index_t MRepeat = 2;
    constexpr index_t NRepeat = 4;
    using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 2, 4>;
    using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
    constexpr index_t ABlockTransferSrcScalarPerVector_M = 2;
    constexpr index_t ABlockTransferDstScalarPerVector_K1 = 4;
    using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 4, 4>;
    using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
    constexpr index_t BBlockTransferSrcScalarPerVector_K1 = 4;
    constexpr index_t BBlockTransferDstScalarPerVector_K1 = 4;
    constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 0
    // [M, N, K0, K1] = [128, 128, 4, 4], C = 64, for fp32
    constexpr index_t BlockSize = 256;
    constexpr index_t MPerBlock = 128;
    constexpr index_t NPerBlock = 128;
    constexpr index_t KPerBlock = 4;
    constexpr index_t MPerXDL = 32;
    constexpr index_t NPerXDL = 32;
    constexpr index_t K1 = 4;
    constexpr index_t MRepeat = 2;
    constexpr index_t NRepeat = 2;
    using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 2, 4>;
    using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
    constexpr index_t ABlockTransferSrcScalarPerVector_M = 2;
    constexpr index_t ABlockTransferDstScalarPerVector_K1 = 4;
    using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 4>;
    using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
    constexpr index_t BBlockTransferSrcScalarPerVector_K1 = 4;
    constexpr index_t BBlockTransferDstScalarPerVector_K1 = 4;
    constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 0
    // [M, N, K0, K1] = [128, 64, 4, 4], C = 32, for fp32
    constexpr index_t BlockSize = 256;
    constexpr index_t MPerBlock = 128;
    constexpr index_t NPerBlock = 64;
    constexpr index_t KPerBlock = 4;
    constexpr index_t MPerXDL = 32;
    constexpr index_t NPerXDL = 32;
    constexpr index_t K1 = 4;
    constexpr index_t MRepeat = 2;
    constexpr index_t NRepeat = 1;
    using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 2, 4>;
    using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
    constexpr index_t ABlockTransferSrcScalarPerVector_M = 2;
    constexpr index_t ABlockTransferDstScalarPerVector_K1 = 4;
    using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 1, 4>;
    using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
    constexpr index_t BBlockTransferSrcScalarPerVector_K1 = 4;
    constexpr index_t BBlockTransferDstScalarPerVector_K1 = 4;
    constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 0
    // [M, N, K0, K1] = [64, 128, 4, 4], C = 32, for fp32
    constexpr index_t BlockSize = 256;
    constexpr index_t MPerBlock = 64;
    constexpr index_t NPerBlock = 128;
    constexpr index_t KPerBlock = 4;
    constexpr index_t MPerXDL = 32;
    constexpr index_t NPerXDL = 32;
    constexpr index_t K1 = 4;
    constexpr index_t MRepeat = 1;
    constexpr index_t NRepeat = 2;
    using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 1, 4>;
    using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
    constexpr index_t ABlockTransferSrcScalarPerVector_M = 1;
    constexpr index_t ABlockTransferDstScalarPerVector_K1 = 4;
    using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 4>;
    using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
    constexpr index_t BBlockTransferSrcScalarPerVector_K1 = 4;
    constexpr index_t BBlockTransferDstScalarPerVector_K1 = 4;
    constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 1
    // [M, N, K0, K1] = [256, 128, 4, 8], C = 128, for fp16
    constexpr index_t BlockSize = 256;
    constexpr index_t MPerBlock = 256;
    constexpr index_t NPerBlock = 128;
    constexpr index_t KPerBlock = 4;
    constexpr index_t MPerXDL = 32;
    constexpr index_t NPerXDL = 32;
    constexpr index_t K1 = 8;
    constexpr index_t MRepeat = 4;
    constexpr index_t NRepeat = 2;
    using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 4, 8>;
    using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
    constexpr index_t ABlockTransferSrcScalarPerVector_M = 4;
    constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;
    using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 8>;
    using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
    constexpr index_t BBlockTransferSrcScalarPerVector_K1 = 8;
    constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
    constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 0
    // [M, N, K0, K1] = [128, 256, 4, 8] for fp16
    constexpr index_t BlockSize = 256;
    constexpr index_t MPerBlock = 128;
    constexpr index_t NPerBlock = 256;
    constexpr index_t KPerBlock = 4;
    constexpr index_t MPerXDL = 32;
    constexpr index_t NPerXDL = 32;
    constexpr index_t K1 = 8;
    constexpr index_t MRepeat = 2;
    constexpr index_t NRepeat = 4;
    using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 2, 8>;
    using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
    constexpr index_t ABlockTransferSrcScalarPerVector_M = 2;
    constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;
    using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 4, 8>;
    using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
    constexpr index_t BBlockTransferSrcScalarPerVector_K1 = 8;
    constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
    constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 0
    // [M, N, K0, K1] = [128, 128, 4, 8], C = 128, for fp16
    constexpr index_t BlockSize = 128;
    constexpr index_t MPerBlock = 128;
    constexpr index_t NPerBlock = 128;
    constexpr index_t KPerBlock = 4;
    constexpr index_t MPerXDL = 32;
    constexpr index_t NPerXDL = 32;
    constexpr index_t K1 = 8;
    constexpr index_t MRepeat = 4;
    constexpr index_t NRepeat = 2;
    using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 4, 8>;
    using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 32, 1>;
    constexpr index_t ABlockTransferSrcScalarPerVector_M = 4;
    constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;
    using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 4, 8>;
    using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 32, 1>;
    constexpr index_t BBlockTransferSrcScalarPerVector_K1 = 8;
    constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
    constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 0
    // [M, N, K0, K1] = [128, 128, 4, 8], C = 64, for fp16
    constexpr index_t BlockSize = 256;
    constexpr index_t MPerBlock = 128;
    constexpr index_t NPerBlock = 128;
    constexpr index_t KPerBlock = 4;
    constexpr index_t MPerXDL = 32;
    constexpr index_t NPerXDL = 32;
    constexpr index_t K1 = 8;
    constexpr index_t MRepeat = 2;
    constexpr index_t NRepeat = 2;
    using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 2, 8>;
    using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
    constexpr index_t ABlockTransferSrcScalarPerVector_M = 2;
    constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;
    using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 8>;
    using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
    constexpr index_t BBlockTransferSrcScalarPerVector_K1 = 8;
    constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
    constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 1
    // [M, N, K0, K1] = [128, 64, 4, 8], C = 32, for fp16
    constexpr index_t BlockSize = 256;
    constexpr index_t MPerBlock = 128;
    constexpr index_t NPerBlock = 64;
    constexpr index_t KPerBlock = 4;
    constexpr index_t MPerXDL = 32;
    constexpr index_t NPerXDL = 32;
    constexpr index_t K1 = 8;
    constexpr index_t MRepeat = 2;
    constexpr index_t NRepeat = 1;
    using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 2, 8>;
    using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
    constexpr index_t ABlockTransferSrcScalarPerVector_M = 2;
    constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;
    using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 1, 8>;
    using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
    constexpr index_t BBlockTransferSrcScalarPerVector_K1 = 8;
    constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
    constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 0
    // [M, N, K0, K1] = [64, 128, 4, 8], C = 32, for fp16
    constexpr index_t BlockSize = 256;
    constexpr index_t MPerBlock = 64;
    constexpr index_t NPerBlock = 128;
    constexpr index_t KPerBlock = 4;
    constexpr index_t MPerXDL = 32;
    constexpr index_t NPerXDL = 32;
    constexpr index_t K1 = 8;
    constexpr index_t MRepeat = 1;
    constexpr index_t NRepeat = 2;
    using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 1, 8>;
    using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
    constexpr index_t ABlockTransferSrcScalarPerVector_M = 1;
    constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;
    using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 8>;
    using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
    constexpr index_t BBlockTransferSrcScalarPerVector_K1 = 8;
    constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
    constexpr index_t CThreadTransferDstScalarPerVector = 1;
#endif

    const auto K = a_k_m.mDesc.GetLengths()[0];
    const auto M = a_k_m.mDesc.GetLengths()[1];
    const auto N = b_n_k.mDesc.GetLengths()[0];

    constexpr auto K1Number = Number<K1>{};

    const auto K0 = K / K1Number;

    const auto a_k0_m_k1_grid_desc =
        make_naive_tensor_descriptor(make_tuple(K0, M, K1Number),
                                     make_tuple(K1Number * a_k_m.mDesc.GetStrides()[0],
                                                a_k_m.mDesc.GetStrides()[1],
                                                a_k_m.mDesc.GetStrides()[0]));

    const auto b_k0_n_k1_grid_desc =
        make_naive_tensor_descriptor(make_tuple(K0, N, K1Number),
                                     make_tuple(K1Number * b_n_k.mDesc.GetStrides()[1],
                                                b_n_k.mDesc.GetStrides()[0],
                                                b_n_k.mDesc.GetStrides()[1]));

    const auto c_m_n_grid_desc = make_naive_tensor_descriptor(
        make_tuple(M, N), make_tuple(c_m_n.mDesc.GetStrides()[0], c_m_n.mDesc.GetStrides()[1]));

    // HACK: hacks that control index calculation when iterating over A, B, C matrix
    constexpr auto a_k0_m_k1_grid_step_hacks =
        make_tuple(make_tuple(Sequence<0>{},   // 0+: K0
                              Sequence<0>{},   // 1+: M
                              Sequence<0>{}),  // 2+: K1
                   make_tuple(Sequence<0>{},   // 0-: K0
                              Sequence<0>{},   // 1-: M
                              Sequence<0>{})); // 2-: K1

    constexpr auto b_k0_n_k1_grid_step_hacks =
        make_tuple(make_tuple(Sequence<0>{},   // 0+: K0
                              Sequence<0>{},   // 1+: N
                              Sequence<0>{}),  // 2+: K1
                   make_tuple(Sequence<0>{},   // 0-: K0
                              Sequence<0>{},   // 1-: N
                              Sequence<0>{})); // 2-: K1

    constexpr auto c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks =
        make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 0+: M0
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 1+: N0
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 2+: M1
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 3+: N1
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 4+: M2
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 5+: M3
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 6+: M4
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}),  // 7+: N2
                   make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 0-: M0
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 1-: N0
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 2-: M1
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 3-: N1
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 4-: M2
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 5-: M3
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 6-: M4
                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{})); // 7-: N2

    constexpr auto a_k0_m_k1_grid_move_slice_window_step_hacks = Sequence<0>{};
    constexpr auto b_k0_n_k1_grid_move_slice_window_step_hacks = Sequence<0>{};

    for(index_t i = 0; i < 5; ++i)
    {
        float ave_time = driver_gemm_xdlops_v2r3<
            BlockSize,
            ABType,
            AccType,
            CType,
            InMemoryDataOperationEnum::Set,
            decltype(a_k0_m_k1_grid_desc),
            decltype(b_k0_n_k1_grid_desc),
            decltype(c_m_n_grid_desc),
            MPerBlock,
            NPerBlock,
            KPerBlock,
            MPerXDL,
            NPerXDL,
            K1,
            MRepeat,
            NRepeat,
            ABlockTransferThreadSliceLengths_K0_M_K1,
            ABlockTransferThreadClusterLengths_K0_M_K1,
            Sequence<0, 2, 1>,
            Sequence<0, 2, 1>,
            1,
            ABlockTransferSrcScalarPerVector_M,
            ABlockTransferDstScalarPerVector_K1,
            false, // don't move back src coordinate after threadwise copy
            BBlockTransferThreadSliceLengths_K0_N_K1,
            BBlockTransferThreadClusterLengths_K0_N_K1,
            Sequence<1, 0, 2>,
            Sequence<1, 0, 2>,
            2,
            BBlockTransferSrcScalarPerVector_K1,
            BBlockTransferDstScalarPerVector_K1,
            false, // don't move back src coordinate after threadwise copy
            Sequence<0, 2, 4, 5, 6, 1, 3, 7>,
            7,
            CThreadTransferDstScalarPerVector,
            decltype(a_k0_m_k1_grid_step_hacks),
            decltype(b_k0_n_k1_grid_step_hacks),
            decltype(c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks),
            decltype(a_k0_m_k1_grid_move_slice_window_step_hacks),
            decltype(b_k0_n_k1_grid_move_slice_window_step_hacks),
            false, // CAccessOrderMRepeatNRepeat
            true,  // ABlockLdsExtraM
            true   // BBlockLdsExtraN
            >(static_cast<ABType*>(a_k_m_device_buf.GetDeviceBuffer()),
              static_cast<ABType*>(b_n_k_device_buf.GetDeviceBuffer()),
              static_cast<CType*>(c_m_n_device_buf.GetDeviceBuffer()),
              a_k0_m_k1_grid_desc,
              b_k0_n_k1_grid_desc,
              c_m_n_grid_desc,
              debug::debug_driver_gemm_xdlops_v2r3::M01,
              debug::debug_driver_gemm_xdlops_v2r3::N01,
              a_k0_m_k1_grid_step_hacks,
              b_k0_n_k1_grid_step_hacks,
              c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks,
              a_k0_m_k1_grid_move_slice_window_step_hacks,
              b_k0_n_k1_grid_move_slice_window_step_hacks,
              nrepeat);

        float perf = static_cast<float>((std::size_t(2) * M * N * K)) /
                     (std::size_t(1000) * 1000 * 1000) / ave_time;

        std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" << std::endl;
    }

    // copy result back to host
    c_m_n_device_buf.FromDevice(c_m_n.mData.data());
}
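The only substantive differences from device_gemm_xdlops_km_kn_mn.hpp are the B-side transfer parameters: with B stored as [N, K] the unit-stride axis is K, so the source access order becomes Sequence<1, 0, 2> with vector dimension 2 (K1), and the scalar-per-vector constant is named ..._K1 instead of ..._N. A small standalone illustration of why (strides are for row-major storage; the sizes are hypothetical):

```cpp
#include <cstdio>

int main()
{
    const long K = 4096, N = 4096, K1 = 8;

    // b_k_n stored row-major [K, N] -> strides {N, 1}; after the K -> (K0, K1) split
    // the (K0, N, K1) strides are:
    const long kn_split[3] = {K1 * N, 1, N}; // unit stride on N  -> vectorize along N
    // b_n_k stored row-major [N, K] -> strides {K, 1}; after the split:
    const long nk_split[3] = {K1 * 1, K, 1}; // unit stride on K1 -> vectorize along K1

    std::printf("b_k_n: N stride = %ld, K1 stride = %ld\n", kn_split[1], kn_split[2]);
    std::printf("b_n_k: N stride = %ld, K1 stride = %ld\n", nk_split[1], nk_split[2]);
}
```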
library/include/ck/library/obselete_driver_offline/device_gemm_xdlops_km_nk_nm.hpp deleted 100644 → 0
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "driver_gemm_xdlops_v2r3.hpp"
template
<
typename
ABType
,
typename
AccType
,
typename
CType
>
void
device_gemm_xdlops_km_nk_nm
(
const
Tensor
<
ABType
>&
a_k_m
,
const
Tensor
<
ABType
>&
b_n_k
,
Tensor
<
CType
>&
c_n_m
,
ck
::
index_t
nrepeat
)
{
using
namespace
ck
;
std
::
cout
<<
__func__
<<
std
::
endl
;
DeviceMem
a_k_m_device_buf
(
sizeof
(
ABType
)
*
a_k_m
.
mDesc
.
GetElementSpace
());
DeviceMem
b_n_k_device_buf
(
sizeof
(
ABType
)
*
b_n_k
.
mDesc
.
GetElementSpace
());
DeviceMem
c_n_m_device_buf
(
sizeof
(
CType
)
*
c_n_m
.
mDesc
.
GetElementSpace
());
a_k_m_device_buf
.
ToDevice
(
a_k_m
.
mData
.
data
());
b_n_k_device_buf
.
ToDevice
(
b_n_k
.
mData
.
data
());
c_n_m_device_buf
.
ToDevice
(
c_n_m
.
mData
.
data
());
#if 0
// [M, N, K0, K1] = [256, 128, 4, 4] for fp32
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 256;
constexpr index_t NPerBlock = 128;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 4;
constexpr index_t MRepeat = 4;
constexpr index_t NRepeat = 2;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 4, 4>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_M = 4;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 4;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 4>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_K1 = 4;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 4;
constexpr index_t CThreadTransferDstScalarPerVector = 4;
#elif
0
// [M, N, K0, K1] = [128, 256, 4, 4] for fp32
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
MPerBlock
=
128
;
constexpr
index_t
NPerBlock
=
256
;
constexpr
index_t
KPerBlock
=
4
;
constexpr
index_t
MPerXDL
=
32
;
constexpr
index_t
NPerXDL
=
32
;
constexpr
index_t
K1
=
4
;
constexpr
index_t
MRepeat
=
2
;
constexpr
index_t
NRepeat
=
4
;
using
ABlockTransferThreadSliceLengths_K0_M_K1
=
Sequence
<
1
,
2
,
4
>
;
using
ABlockTransferThreadClusterLengths_K0_M_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
ABlockTransferSrcScalarPerVector_M
=
2
;
constexpr
index_t
ABlockTransferDstScalarPerVector_K1
=
4
;
using
BBlockTransferThreadSliceLengths_K0_N_K1
=
Sequence
<
1
,
4
,
4
>
;
using
BBlockTransferThreadClusterLengths_K0_N_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
BBlockTransferSrcScalarPerVector_K1
=
4
;
constexpr
index_t
BBlockTransferDstScalarPerVector_K1
=
4
;
constexpr
index_t
CThreadTransferDstScalarPerVector
=
4
;
#elif 1
// [M, N, K0, K1] = [256, 128, 4, 8] for fp16
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
MPerBlock
=
256
;
constexpr
index_t
NPerBlock
=
128
;
constexpr
index_t
KPerBlock
=
4
;
constexpr
index_t
MPerXDL
=
32
;
constexpr
index_t
NPerXDL
=
32
;
constexpr
index_t
K1
=
8
;
constexpr
index_t
MRepeat
=
4
;
constexpr
index_t
NRepeat
=
2
;
using
ABlockTransferThreadSliceLengths_K0_M_K1
=
Sequence
<
1
,
4
,
8
>
;
using
ABlockTransferThreadClusterLengths_K0_M_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
ABlockTransferSrcScalarPerVector_M
=
4
;
constexpr
index_t
ABlockTransferDstScalarPerVector_K1
=
8
;
using
BBlockTransferThreadSliceLengths_K0_N_K1
=
Sequence
<
1
,
2
,
8
>
;
using
BBlockTransferThreadClusterLengths_K0_N_K1
=
Sequence
<
4
,
64
,
1
>
;
constexpr
index_t
BBlockTransferSrcScalarPerVector_K1
=
8
;
constexpr
index_t
BBlockTransferDstScalarPerVector_K1
=
8
;
constexpr
index_t
CThreadTransferDstScalarPerVector
=
4
;
#elif 1
// [M, N, K0, K1] = [128, 128, 4, 8] for fp16
constexpr
index_t
BlockSize
=
128
;
constexpr
index_t
MPerBlock
=
128
;
constexpr
index_t
NPerBlock
=
128
;
constexpr
index_t
KPerBlock
=
4
;
constexpr
index_t
MPerXDL
=
32
;
constexpr
index_t
NPerXDL
=
32
;
constexpr
index_t
K1
=
8
;
constexpr
index_t
MRepeat
=
4
;
constexpr
index_t
NRepeat
=
2
;
using
ABlockTransferThreadSliceLengths_K0_M_K1
=
Sequence
<
1
,
4
,
8
>
;
using
ABlockTransferThreadClusterLengths_K0_M_K1
=
Sequence
<
4
,
32
,
1
>
;
constexpr
index_t
ABlockTransferSrcScalarPerVector_M
=
4
;
constexpr
index_t
ABlockTransferDstScalarPerVector_K1
=
8
;
using
BBlockTransferThreadSliceLengths_K0_N_K1
=
Sequence
<
1
,
4
,
8
>
;
using
BBlockTransferThreadClusterLengths_K0_N_K1
=
Sequence
<
4
,
32
,
1
>
;
constexpr
index_t
BBlockTransferSrcScalarPerVector_K1
=
8
;
constexpr
index_t
BBlockTransferDstScalarPerVector_K1
=
8
;
constexpr
index_t
CThreadTransferDstScalarPerVector
=
4
;
#endif
const
auto
K
=
a_k_m
.
mDesc
.
GetLengths
()[
0
];
const
auto
M
=
a_k_m
.
mDesc
.
GetLengths
()[
1
];
const
auto
N
=
b_n_k
.
mDesc
.
GetLengths
()[
0
];
constexpr
auto
K1Number
=
Number
<
K1
>
{};
const
auto
K0
=
K
/
K1Number
;
const
auto
a_k0_m_k1_grid_desc
=
make_naive_tensor_descriptor
(
make_tuple
(
K0
,
M
,
K1Number
),
make_tuple
(
K1Number
*
a_k_m
.
mDesc
.
GetStrides
()[
0
],
a_k_m
.
mDesc
.
GetStrides
()[
1
],
a_k_m
.
mDesc
.
GetStrides
()[
0
]));
const
auto
b_k0_n_k1_grid_desc
=
make_naive_tensor_descriptor
(
make_tuple
(
K0
,
N
,
K1Number
),
make_tuple
(
K1Number
*
b_n_k
.
mDesc
.
GetStrides
()[
1
],
b_n_k
.
mDesc
.
GetStrides
()[
0
],
b_n_k
.
mDesc
.
GetStrides
()[
1
]));
const
auto
c_m_n_grid_desc
=
make_naive_tensor_descriptor
(
make_tuple
(
M
,
N
),
make_tuple
(
c_n_m
.
mDesc
.
GetStrides
()[
1
],
c_n_m
.
mDesc
.
GetStrides
()[
0
]));
// HACK: hacks that control index calculation when iterating over A, B, C matrix
constexpr
auto
a_k0_m_k1_grid_step_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
>
{},
// 0+: K0
Sequence
<
0
>
{},
// 1+: M
Sequence
<
0
>
{}),
// 2+: K1
make_tuple
(
Sequence
<
0
>
{},
// 0-: K0
Sequence
<
0
>
{},
// 1-: M
Sequence
<
0
>
{}));
// 2-: K1
constexpr
auto
b_k0_n_k1_grid_step_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
>
{},
// 0+: K0
Sequence
<
0
>
{},
// 1+: N
Sequence
<
0
>
{}),
// 2+: K1
make_tuple
(
Sequence
<
0
>
{},
// 0-: K0
Sequence
<
0
>
{},
// 1-: N
Sequence
<
0
>
{}));
// 2-: K1
constexpr
auto
c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 0+: M0
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 1+: N0
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 2+: M1
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 3+: N1
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 4+: M2
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 5+: M3
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 6+: M4
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{}),
// 7+: N2
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 0-: M0
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 1-: N0
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 2-: M1
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 3-: N1
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 4-: M2
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 5-: M3
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 6-: M4
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{}));
// 7-: N2
constexpr
auto
a_k0_m_k1_grid_move_slice_window_step_hacks
=
Sequence
<
0
>
{};
constexpr
auto
b_k0_n_k1_grid_move_slice_window_step_hacks
=
Sequence
<
0
>
{};
for(index_t i = 0; i < 5; ++i)
{
    float ave_time = driver_gemm_xdlops_v2r3<
        BlockSize,
        ABType,
        AccType,
        CType,
        InMemoryDataOperationEnum::Set,
        decltype(a_k0_m_k1_grid_desc),
        decltype(b_k0_n_k1_grid_desc),
        decltype(c_m_n_grid_desc),
        MPerBlock,
        NPerBlock,
        KPerBlock,
        MPerXDL,
        NPerXDL,
        K1,
        MRepeat,
        NRepeat,
        ABlockTransferThreadSliceLengths_K0_M_K1,
        ABlockTransferThreadClusterLengths_K0_M_K1,
        Sequence<0, 2, 1>,
        Sequence<0, 2, 1>,
        1,
        ABlockTransferSrcScalarPerVector_M,
        ABlockTransferDstScalarPerVector_K1,
        false, // don't move back src coordinate after threadwise copy
        BBlockTransferThreadSliceLengths_K0_N_K1,
        BBlockTransferThreadClusterLengths_K0_N_K1,
        Sequence<1, 0, 2>,
        Sequence<1, 0, 2>,
        2,
        BBlockTransferSrcScalarPerVector_K1,
        BBlockTransferDstScalarPerVector_K1,
        false, // don't move back src coordinate after threadwise copy
        Sequence<2, 3, 0, 1, 7, 5, 4, 6>,
        6,
        CThreadTransferDstScalarPerVector,
        decltype(a_k0_m_k1_grid_step_hacks),
        decltype(b_k0_n_k1_grid_step_hacks),
        decltype(c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks),
        decltype(a_k0_m_k1_grid_move_slice_window_step_hacks),
        decltype(b_k0_n_k1_grid_move_slice_window_step_hacks),
        false // CAccessOrderMRepeatNRepeat
        >(static_cast<ABType*>(a_k_m_device_buf.GetDeviceBuffer()),
          static_cast<ABType*>(b_n_k_device_buf.GetDeviceBuffer()),
          static_cast<CType*>(c_n_m_device_buf.GetDeviceBuffer()),
          a_k0_m_k1_grid_desc,
          b_k0_n_k1_grid_desc,
          c_m_n_grid_desc,
          a_k0_m_k1_grid_step_hacks,
          b_k0_n_k1_grid_step_hacks,
          c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks,
          a_k0_m_k1_grid_move_slice_window_step_hacks,
          b_k0_n_k1_grid_move_slice_window_step_hacks,
          nrepeat);

    float perf = static_cast<float>(std::size_t(2) * M * N * K) /
                 (std::size_t(1000) * 1000 * 1000) / ave_time;

    std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" << std::endl;
}

// copy result back to host
c_n_m_device_buf.FromDevice(c_n_m.mData.data());
}
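All of these drivers report performance with the same conversion: a GEMM performs 2 * M * N * K floating point operations, and dividing by 1e9 and by the kernel time in milliseconds yields TFlop/s (1e9 FLOP per ms equals 1 TFlop/s). A standalone sketch of that arithmetic; the helper name is illustrative and not part of the deleted headers:

#include <cstddef>
#include <iostream>

// Hypothetical helper mirroring the perf computation used in these drivers:
// 2*M*N*K FLOPs, time in milliseconds, result in TFlop/s.
float gemm_tflops(std::size_t M, std::size_t N, std::size_t K, float time_ms)
{
    return static_cast<float>(2 * M * N * K) / (1000.f * 1000.f * 1000.f) / time_ms;
}

int main()
{
    // e.g. M = N = K = 4096 finishing in 10 ms -> about 13.7 TFlop/s
    std::cout << gemm_tflops(4096, 4096, 4096, 10.f) << std::endl;
}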
library/include/ck/library/obselete_driver_offline/device_gemm_xdlops_mk_kn_mn.hpp
deleted 100644 → 0
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "driver_gemm_xdlops_v2r3.hpp"
template <typename ABType, typename AccType, typename CType>
void device_gemm_xdlops_mk_kn_mn(const Tensor<ABType>& a_m_k,
                                 const Tensor<ABType>& b_k_n,
                                 Tensor<CType>& c_m_n,
                                 ck::index_t nrepeat)
{
using namespace ck;

std::cout << __func__ << std::endl;

DeviceMem a_m_k_device_buf(sizeof(ABType) * a_m_k.mDesc.GetElementSpace());
DeviceMem b_k_n_device_buf(sizeof(ABType) * b_k_n.mDesc.GetElementSpace());
DeviceMem c_m_n_device_buf(sizeof(CType) * c_m_n.mDesc.GetElementSpace());

a_m_k_device_buf.ToDevice(a_m_k.mData.data());
b_k_n_device_buf.ToDevice(b_k_n.mData.data());
c_m_n_device_buf.ToDevice(c_m_n.mData.data());
#if 0
// [M, N, K0, K1] = [256, 128, 4, 4] for fp32
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 256;
constexpr index_t NPerBlock = 128;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 4;
constexpr index_t MRepeat = 4;
constexpr index_t NRepeat = 2;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 4, 4>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 4;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 4;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 4>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_N = 2;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 4;
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 0
// [M, N, K0, K1] = [128, 256, 4, 4] for fp32
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 128;
constexpr index_t NPerBlock = 256;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 4;
constexpr index_t MRepeat = 2;
constexpr index_t NRepeat = 4;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 2, 4>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 4;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 4;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 4, 4>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_N = 4;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 4;
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 0
// [M, N, K0, K1] = [128, 128, 4, 4], C = 64, for fp32
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 128;
constexpr index_t NPerBlock = 128;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 4;
constexpr index_t MRepeat = 2;
constexpr index_t NRepeat = 2;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 2, 4>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 4;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 4;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 4>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_N = 2;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 4;
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 0
// [M, N, K0, K1] = [128, 64, 4, 4], C = 32, for fp32
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 128;
constexpr index_t NPerBlock = 64;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 4;
constexpr index_t MRepeat = 2;
constexpr index_t NRepeat = 1;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 2, 4>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 4;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 4;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 1, 4>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_N = 1;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 4;
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 0
// [M, N, K0, K1] = [64, 128, 4, 4], C = 32, for fp32
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 64;
constexpr index_t NPerBlock = 128;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 4;
constexpr index_t MRepeat = 1;
constexpr index_t NRepeat = 2;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 1, 4>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 4;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 4;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 4>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_N = 2;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 4;
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 1
// [M, N, K0, K1] = [256, 128, 4, 8], C = 128, for fp16
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 256;
constexpr index_t NPerBlock = 128;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 8;
constexpr index_t MRepeat = 4;
constexpr index_t NRepeat = 2;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 4, 8>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 8;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 8>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_N = 2;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 0
// [M, N, K0, K1] = [128, 256, 4, 8] for fp16
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 128;
constexpr index_t NPerBlock = 256;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 8;
constexpr index_t MRepeat = 2;
constexpr index_t NRepeat = 4;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 2, 8>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 8;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 4, 8>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_N = 4;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 0
// [M, N, K0, K1] = [128, 128, 4, 8], C = 128, for fp16
constexpr index_t BlockSize = 128;
constexpr index_t MPerBlock = 128;
constexpr index_t NPerBlock = 128;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 8;
constexpr index_t MRepeat = 4;
constexpr index_t NRepeat = 2;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 4, 8>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 32, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 8;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 4, 8>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 32, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_N = 4;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 0
// [M, N, K0, K1] = [128, 128, 4, 8], C = 64, for fp16
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 128;
constexpr index_t NPerBlock = 128;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 8;
constexpr index_t MRepeat = 2;
constexpr index_t NRepeat = 2;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 2, 8>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 8;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 8>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_N = 2;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 1
// [M, N, K0, K1] = [128, 64, 4, 8], C = 32, for fp16
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 128;
constexpr index_t NPerBlock = 64;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 8;
constexpr index_t MRepeat = 2;
constexpr index_t NRepeat = 1;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 2, 8>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 8;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 1, 8>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_N = 1;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 1
// [M, N, K0, K1] = [64, 128, 4, 8], C = 32, for fp16
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 64;
constexpr index_t NPerBlock = 128;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 8;
constexpr index_t MRepeat = 1;
constexpr index_t NRepeat = 2;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 1, 8>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 8;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 8>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_N = 2;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#endif
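One invariant every tuning block above should satisfy: elementwise, the thread slice lengths times the thread cluster lengths must cover the block copy tile [KPerBlock, MPerBlock, K1] (for A; NPerBlock in place of MPerBlock for B), and the cluster product should match BlockSize. A quick standalone check against the active fp16 [256, 128, 4, 8] block; this is a sanity sketch, not code from the deleted file:

#include <cassert>

int main()
{
    // fp16 [M, N, K0, K1] = [256, 128, 4, 8] config from the block above
    const int slice[3]   = {1, 4, 8};   // ABlockTransferThreadSliceLengths_K0_M_K1
    const int cluster[3] = {4, 64, 1};  // ABlockTransferThreadClusterLengths_K0_M_K1
    const int tile[3]    = {4, 256, 8}; // [KPerBlock, MPerBlock, K1]

    for(int d = 0; d < 3; ++d)
        assert(slice[d] * cluster[d] == tile[d]); // each thread covers its share

    assert(cluster[0] * cluster[1] * cluster[2] == 256); // one slot per thread in the block
}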
const auto K = a_m_k.mDesc.GetLengths()[1];
const auto M = a_m_k.mDesc.GetLengths()[0];
const auto N = b_k_n.mDesc.GetLengths()[1];

constexpr auto K1Number = Number<K1>{};

const auto K0 = K / K1Number;

const auto a_k0_m_k1_grid_desc = make_naive_tensor_descriptor(
    make_tuple(K0, M, K1Number),
    make_tuple(K1Number * a_m_k.mDesc.GetStrides()[1],
               a_m_k.mDesc.GetStrides()[0],
               a_m_k.mDesc.GetStrides()[1]));

const auto b_k0_n_k1_grid_desc = make_naive_tensor_descriptor(
    make_tuple(K0, N, K1Number),
    make_tuple(K1Number * b_k_n.mDesc.GetStrides()[0],
               b_k_n.mDesc.GetStrides()[1],
               b_k_n.mDesc.GetStrides()[0]));

const auto c_m_n_grid_desc = make_naive_tensor_descriptor(
    make_tuple(M, N),
    make_tuple(c_m_n.mDesc.GetStrides()[0], c_m_n.mDesc.GetStrides()[1]));
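The descriptor built above reinterprets row-major A[M][K] as a 3-D view (K0, M, K1) with K = K0 * K1, purely by stride arithmetic; no data moves. A minimal sketch of the equivalence, assuming a dense row-major tensor (all names are local to the sketch):

#include <cassert>
#include <cstddef>

int main()
{
    // Hypothetical row-major A[M][K] with K = 16, K1 = 8, so K0 = 2.
    const std::size_t K = 16, K1 = 8;
    const std::size_t stride_m = K, stride_k = 1; // strides of the 2-D view

    // 3-D view (K0, M, K1) as built by make_naive_tensor_descriptor above:
    // strides are (K1 * stride_k, stride_m, stride_k).
    auto offset_3d = [&](std::size_t k0, std::size_t m, std::size_t k1) {
        return k0 * (K1 * stride_k) + m * stride_m + k1 * stride_k;
    };
    auto offset_2d = [&](std::size_t m, std::size_t k) {
        return m * stride_m + k * stride_k;
    };

    // Same element, two coordinate systems: k = k0 * K1 + k1.
    assert(offset_3d(1, 3, 5) == offset_2d(3, 1 * K1 + 5));
}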
// HACK: hacks that control index calculation when iterating over A, B, C matrix
constexpr auto a_k0_m_k1_grid_step_hacks =
    make_tuple(make_tuple(Sequence<0>{},   // 0+: K0
                          Sequence<0>{},   // 1+: M
                          Sequence<0>{}),  // 2+: K1
               make_tuple(Sequence<0>{},   // 0-: K0
                          Sequence<0>{},   // 1-: M
                          Sequence<0>{})); // 2-: K1

constexpr auto b_k0_n_k1_grid_step_hacks =
    make_tuple(make_tuple(Sequence<0>{},   // 0+: K0
                          Sequence<0>{},   // 1+: N
                          Sequence<0>{}),  // 2+: K1
               make_tuple(Sequence<0>{},   // 0-: K0
                          Sequence<0>{},   // 1-: N
                          Sequence<0>{})); // 2-: K1

constexpr auto c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks = make_tuple(
    make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 0+: M0
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 1+: N0
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 2+: M1
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 3+: N1
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 4+: M2
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 5+: M3
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 6+: M4
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}),  // 7+: N2
    make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 0-: M0
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 1-: N0
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 2-: M1
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 3-: N1
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 4-: M2
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 5-: M3
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 6-: M4
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{})); // 7-: N2

constexpr auto a_k0_m_k1_grid_move_slice_window_step_hacks = Sequence<0>{};
constexpr auto b_k0_n_k1_grid_move_slice_window_step_hacks = Sequence<0>{};
for(index_t i = 0; i < 5; ++i)
{
    float ave_time = driver_gemm_xdlops_v2r3<
        BlockSize,
        ABType,
        AccType,
        CType,
        InMemoryDataOperationEnum::Set,
        decltype(a_k0_m_k1_grid_desc),
        decltype(b_k0_n_k1_grid_desc),
        decltype(c_m_n_grid_desc),
        MPerBlock,
        NPerBlock,
        KPerBlock,
        MPerXDL,
        NPerXDL,
        K1,
        MRepeat,
        NRepeat,
        ABlockTransferThreadSliceLengths_K0_M_K1,
        ABlockTransferThreadClusterLengths_K0_M_K1,
        Sequence<1, 0, 2>,
        Sequence<1, 0, 2>,
        2,
        ABlockTransferSrcScalarPerVector_K1,
        ABlockTransferDstScalarPerVector_K1,
        false, // don't move back src coordinate after threadwise copy
        BBlockTransferThreadSliceLengths_K0_N_K1,
        BBlockTransferThreadClusterLengths_K0_N_K1,
        Sequence<0, 2, 1>,
        Sequence<0, 2, 1>,
        1,
        BBlockTransferSrcScalarPerVector_N,
        BBlockTransferDstScalarPerVector_K1,
        false, // don't move back src coordinate after threadwise copy
        Sequence<0, 2, 4, 5, 6, 1, 3, 7>,
        7,
        CThreadTransferDstScalarPerVector,
        decltype(a_k0_m_k1_grid_step_hacks),
        decltype(b_k0_n_k1_grid_step_hacks),
        decltype(c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks),
        decltype(a_k0_m_k1_grid_move_slice_window_step_hacks),
        decltype(b_k0_n_k1_grid_move_slice_window_step_hacks),
        false, // CAccessOrderMRepeatNRepeat
        true,  // ABlockLdsExtraM
        true   // BBlockLdsExtraN
        >(static_cast<ABType*>(a_m_k_device_buf.GetDeviceBuffer()),
          static_cast<ABType*>(b_k_n_device_buf.GetDeviceBuffer()),
          static_cast<CType*>(c_m_n_device_buf.GetDeviceBuffer()),
          a_k0_m_k1_grid_desc,
          b_k0_n_k1_grid_desc,
          c_m_n_grid_desc,
          debug::debug_driver_gemm_xdlops_v2r3::M01,
          debug::debug_driver_gemm_xdlops_v2r3::N01,
          a_k0_m_k1_grid_step_hacks,
          b_k0_n_k1_grid_step_hacks,
          c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks,
          a_k0_m_k1_grid_move_slice_window_step_hacks,
          b_k0_n_k1_grid_move_slice_window_step_hacks,
          nrepeat);

    float perf = static_cast<float>(std::size_t(2) * M * N * K) /
                 (std::size_t(1000) * 1000 * 1000) / ave_time;

    std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" << std::endl;
}

// copy result back to host
c_m_n_device_buf.FromDevice(c_m_n.mData.data());
}
library/include/ck/library/obselete_driver_offline/device_gemm_xdlops_mk_kn_nm.hpp
deleted 100644 → 0
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "driver_gemm_xdlops_v2r3.hpp"
template <typename ABType, typename AccType, typename CType>
void device_gemm_xdlops_mk_kn_nm(const Tensor<ABType>& a_m_k,
                                 const Tensor<ABType>& b_k_n,
                                 Tensor<CType>& c_n_m,
                                 ck::index_t nrepeat)
{
using namespace ck;

std::cout << __func__ << std::endl;

DeviceMem a_m_k_device_buf(sizeof(ABType) * a_m_k.mDesc.GetElementSpace());
DeviceMem b_k_n_device_buf(sizeof(ABType) * b_k_n.mDesc.GetElementSpace());
DeviceMem c_n_m_device_buf(sizeof(CType) * c_n_m.mDesc.GetElementSpace());

a_m_k_device_buf.ToDevice(a_m_k.mData.data());
b_k_n_device_buf.ToDevice(b_k_n.mData.data());
c_n_m_device_buf.ToDevice(c_n_m.mData.data());
#if 0
// [M, N, K0, K1] = [256, 128, 4, 4] for fp32
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 256;
constexpr index_t NPerBlock = 128;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 4;
constexpr index_t MRepeat = 4;
constexpr index_t NRepeat = 2;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 4, 4>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 4;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 4;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 4>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_N = 2;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 4;
constexpr index_t CThreadTransferDstScalarPerVector = 4;
#elif 0
// [M, N, K0, K1] = [128, 256, 4, 4] for fp32
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 128;
constexpr index_t NPerBlock = 256;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 4;
constexpr index_t MRepeat = 2;
constexpr index_t NRepeat = 4;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 2, 4>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 4;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 4;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 4, 4>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_N = 4;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 4;
constexpr index_t CThreadTransferDstScalarPerVector = 4;
#elif 1
// [M, N, K0, K1] = [256, 128, 4, 8] for fp16
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 256;
constexpr index_t NPerBlock = 128;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 8;
constexpr index_t MRepeat = 4;
constexpr index_t NRepeat = 2;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 4, 8>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 8;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 8>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_N = 2;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
constexpr index_t CThreadTransferDstScalarPerVector = 4;
#elif 1
// [M, N, K0, K1] = [128, 256, 4, 8] for fp16
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 128;
constexpr index_t NPerBlock = 256;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 8;
constexpr index_t MRepeat = 2;
constexpr index_t NRepeat = 4;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 2, 8>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 8;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 4, 8>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_N = 4;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
constexpr index_t CThreadTransferDstScalarPerVector = 4;
#elif 1
// [M, N, K0, K1] = [128, 128, 4, 8] for fp16
constexpr index_t BlockSize = 128;
constexpr index_t MPerBlock = 128;
constexpr index_t NPerBlock = 128;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 8;
constexpr index_t MRepeat = 4;
constexpr index_t NRepeat = 2;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 4, 8>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 32, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 8;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 4, 8>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 32, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_N = 4;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
constexpr index_t CThreadTransferDstScalarPerVector = 4;
#endif
const auto K = a_m_k.mDesc.GetLengths()[1];
const auto M = a_m_k.mDesc.GetLengths()[0];
const auto N = b_k_n.mDesc.GetLengths()[1];

constexpr auto K1Number = Number<K1>{};

const auto K0 = K / K1Number;

const auto a_k0_m_k1_grid_desc = make_naive_tensor_descriptor(
    make_tuple(K0, M, K1Number),
    make_tuple(K1Number * a_m_k.mDesc.GetStrides()[1],
               a_m_k.mDesc.GetStrides()[0],
               a_m_k.mDesc.GetStrides()[1]));

const auto b_k0_n_k1_grid_desc = make_naive_tensor_descriptor(
    make_tuple(K0, N, K1Number),
    make_tuple(K1Number * b_k_n.mDesc.GetStrides()[0],
               b_k_n.mDesc.GetStrides()[1],
               b_k_n.mDesc.GetStrides()[0]));

const auto c_m_n_grid_desc = make_naive_tensor_descriptor(
    make_tuple(M, N),
    make_tuple(c_n_m.mDesc.GetStrides()[1], c_n_m.mDesc.GetStrides()[0]));
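The only difference from the mn variant is the C descriptor: it is still an (M, N) view, but it is fed the strides of c_n_m in swapped order, so a write to C(m, n) lands at c_n_m(n, m). A small stride sketch under the assumption of a dense N-major host tensor (all names local to the sketch):

#include <cassert>
#include <cstddef>

int main()
{
    // Hypothetical dense c_n_m[N][M] with N = 4, M = 8: strides (M, 1).
    const std::size_t M = 8;
    const std::size_t stride_n = M, stride_m = 1;

    // The descriptor above passes (GetStrides()[1], GetStrides()[0]),
    // i.e. the (M, N) view uses strides (stride_m, stride_n) = (1, M).
    auto offset_mn_view = [&](std::size_t m, std::size_t n) {
        return m * stride_m + n * stride_n;
    };
    auto offset_nm_host = [&](std::size_t n, std::size_t m) {
        return n * stride_n + m * stride_m;
    };

    // Same element: C(m, n) in the kernel view == c_n_m(n, m) on the host.
    assert(offset_mn_view(5, 2) == offset_nm_host(2, 5));
}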
// HACK: hacks that control index calculation when iterating over A, B, C matrix
constexpr auto a_k0_m_k1_grid_step_hacks =
    make_tuple(make_tuple(Sequence<0>{},   // 0+: K0
                          Sequence<0>{},   // 1+: M
                          Sequence<0>{}),  // 2+: K1
               make_tuple(Sequence<0>{},   // 0-: K0
                          Sequence<0>{},   // 1-: M
                          Sequence<0>{})); // 2-: K1

constexpr auto b_k0_n_k1_grid_step_hacks =
    make_tuple(make_tuple(Sequence<0>{},   // 0+: K0
                          Sequence<0>{},   // 1+: N
                          Sequence<0>{}),  // 2+: K1
               make_tuple(Sequence<0>{},   // 0-: K0
                          Sequence<0>{},   // 1-: N
                          Sequence<0>{})); // 2-: K1

constexpr auto c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks = make_tuple(
    make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 0+: M0
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 1+: N0
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 2+: M1
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 3+: N1
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 4+: M2
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 5+: M3
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 6+: M4
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}),  // 7+: N2
    make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 0-: M0
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 1-: N0
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 2-: M1
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 3-: N1
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 4-: M2
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 5-: M3
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 6-: M4
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{})); // 7-: N2

constexpr auto a_k0_m_k1_grid_move_slice_window_step_hacks = Sequence<0>{};
constexpr auto b_k0_n_k1_grid_move_slice_window_step_hacks = Sequence<0>{};
for(index_t i = 0; i < 5; ++i)
{
    float ave_time = driver_gemm_xdlops_v2r3<
        BlockSize,
        ABType,
        AccType,
        CType,
        InMemoryDataOperationEnum::Set,
        decltype(a_k0_m_k1_grid_desc),
        decltype(b_k0_n_k1_grid_desc),
        decltype(c_m_n_grid_desc),
        MPerBlock,
        NPerBlock,
        KPerBlock,
        MPerXDL,
        NPerXDL,
        K1,
        MRepeat,
        NRepeat,
        ABlockTransferThreadSliceLengths_K0_M_K1,
        ABlockTransferThreadClusterLengths_K0_M_K1,
        Sequence<1, 0, 2>,
        Sequence<1, 0, 2>,
        2,
        ABlockTransferSrcScalarPerVector_K1,
        ABlockTransferDstScalarPerVector_K1,
        false, // don't move back src coordinate after threadwise copy
        BBlockTransferThreadSliceLengths_K0_N_K1,
        BBlockTransferThreadClusterLengths_K0_N_K1,
        Sequence<0, 2, 1>,
        Sequence<0, 2, 1>,
        1,
        BBlockTransferSrcScalarPerVector_N,
        BBlockTransferDstScalarPerVector_K1,
        false, // don't move back src coordinate after threadwise copy
        Sequence<2, 3, 0, 1, 7, 5, 4, 6>,
        6,
        CThreadTransferDstScalarPerVector,
        decltype(a_k0_m_k1_grid_step_hacks),
        decltype(b_k0_n_k1_grid_step_hacks),
        decltype(c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks),
        decltype(a_k0_m_k1_grid_move_slice_window_step_hacks),
        decltype(b_k0_n_k1_grid_move_slice_window_step_hacks),
        false // CAccessOrderMRepeatNRepeat
        >(static_cast<ABType*>(a_m_k_device_buf.GetDeviceBuffer()),
          static_cast<ABType*>(b_k_n_device_buf.GetDeviceBuffer()),
          static_cast<CType*>(c_n_m_device_buf.GetDeviceBuffer()),
          a_k0_m_k1_grid_desc,
          b_k0_n_k1_grid_desc,
          c_m_n_grid_desc,
          a_k0_m_k1_grid_step_hacks,
          b_k0_n_k1_grid_step_hacks,
          c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks,
          a_k0_m_k1_grid_move_slice_window_step_hacks,
          b_k0_n_k1_grid_move_slice_window_step_hacks,
          nrepeat);

    float perf = static_cast<float>(std::size_t(2) * M * N * K) /
                 (std::size_t(1000) * 1000 * 1000) / ave_time;

    std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" << std::endl;
}

// copy result back to host
c_n_m_device_buf.FromDevice(c_n_m.mData.data());
}
library/include/ck/library/obselete_driver_offline/device_gemm_xdlops_mk_nk_mn.hpp
deleted 100644 → 0
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "driver_gemm_xdlops_v2r3.hpp"
template <typename ABType, typename AccType, typename CType>
void device_gemm_xdlops_mk_nk_mn(const Tensor<ABType>& a_m_k,
                                 const Tensor<ABType>& b_n_k,
                                 Tensor<CType>& c_m_n,
                                 ck::index_t nrepeat)
{
using namespace ck;

std::cout << __func__ << std::endl;

DeviceMem a_m_k_device_buf(sizeof(ABType) * a_m_k.mDesc.GetElementSpace());
DeviceMem b_n_k_device_buf(sizeof(ABType) * b_n_k.mDesc.GetElementSpace());
DeviceMem c_m_n_device_buf(sizeof(CType) * c_m_n.mDesc.GetElementSpace());

a_m_k_device_buf.ToDevice(a_m_k.mData.data());
b_n_k_device_buf.ToDevice(b_n_k.mData.data());
c_m_n_device_buf.ToDevice(c_m_n.mData.data());
#if 0
// [M, N, K0, K1] = [256, 128, 4, 4] for fp32
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 256;
constexpr index_t NPerBlock = 128;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 4;
constexpr index_t MRepeat = 4;
constexpr index_t NRepeat = 2;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 4, 4>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 4;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 4;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 4>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_K1 = 4;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 4;
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 0
// [M, N, K0, K1] = [128, 256, 4, 4] for fp32
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 128;
constexpr index_t NPerBlock = 256;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 4;
constexpr index_t MRepeat = 2;
constexpr index_t NRepeat = 4;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 2, 4>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 4;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 4;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 4, 4>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_K1 = 4;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 4;
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 0
// [M, N, K0, K1] = [128, 128, 4, 4], C = 64, for fp32
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 128;
constexpr index_t NPerBlock = 128;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 4;
constexpr index_t MRepeat = 2;
constexpr index_t NRepeat = 2;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 2, 4>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 4;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 4;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 4>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_K1 = 4;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 4;
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 0
// [M, N, K0, K1] = [128, 64, 4, 4], C = 32, for fp32
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 128;
constexpr index_t NPerBlock = 64;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 4;
constexpr index_t MRepeat = 2;
constexpr index_t NRepeat = 1;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 2, 4>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 4;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 4;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 1, 4>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_K1 = 4;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 4;
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 0
// [M, N, K0, K1] = [64, 128, 4, 4], C = 32, for fp32
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 64;
constexpr index_t NPerBlock = 128;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 4;
constexpr index_t MRepeat = 1;
constexpr index_t NRepeat = 2;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 1, 4>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 4;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 4;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 4>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_K1 = 4;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 4;
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 1
// [M, N, K0, K1] = [256, 128, 4, 8], C = 128, for fp16
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 256;
constexpr index_t NPerBlock = 128;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 8;
constexpr index_t MRepeat = 4;
constexpr index_t NRepeat = 2;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 4, 8>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 8;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 8>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_K1 = 8;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 0
// [M, N, K0, K1] = [128, 256, 4, 8], C = 128, for fp16
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 128;
constexpr index_t NPerBlock = 256;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 8;
constexpr index_t MRepeat = 2;
constexpr index_t NRepeat = 4;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 2, 8>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 8;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 4, 8>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_K1 = 8;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 0
// [M, N, K0, K1] = [128, 128, 4, 8], C = 128, for fp16
constexpr index_t BlockSize = 128;
constexpr index_t MPerBlock = 128;
constexpr index_t NPerBlock = 128;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 8;
constexpr index_t MRepeat = 4;
constexpr index_t NRepeat = 2;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 4, 8>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 32, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 8;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 4, 8>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 32, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_K1 = 8;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 0
// [M, N, K0, K1] = [128, 128, 4, 8], C = 64, for fp16
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 128;
constexpr index_t NPerBlock = 128;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 8;
constexpr index_t MRepeat = 2;
constexpr index_t NRepeat = 2;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 2, 8>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 8;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 8>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_K1 = 8;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 0
// [M, N, K0, K1] = [64, 128, 4, 8], C = 64, for fp16
constexpr index_t BlockSize = 128;
constexpr index_t MPerBlock = 64;
constexpr index_t NPerBlock = 128;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 8;
constexpr index_t MRepeat = 2;
constexpr index_t NRepeat = 2;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 2, 8>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 32, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 8;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 4, 8>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 32, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_K1 = 8;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 1
// [M, N, K0, K1] = [128, 64, 4, 8], C = 32, for fp16
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 128;
constexpr index_t NPerBlock = 64;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 8;
constexpr index_t MRepeat = 2;
constexpr index_t NRepeat = 1;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 2, 8>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 8;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 1, 8>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_K1 = 8;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#elif 1
// [M, N, K0, K1] = [64, 128, 4, 8], C = 32, for fp16
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 64;
constexpr index_t NPerBlock = 128;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 8;
constexpr index_t MRepeat = 1;
constexpr index_t NRepeat = 2;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 1, 8>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 8;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 8>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_K1 = 8;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
constexpr index_t CThreadTransferDstScalarPerVector = 1;
#endif
const auto K = a_m_k.mDesc.GetLengths()[1];
const auto M = a_m_k.mDesc.GetLengths()[0];
const auto N = b_n_k.mDesc.GetLengths()[0];

constexpr auto K1Number = Number<K1>{};

const auto K0 = K / K1Number;
#if 1
// non-padded GEMM
const auto a_k0_m_k1_grid_desc = make_naive_tensor_descriptor(
    make_tuple(K0, M, K1Number),
    make_tuple(K1Number * a_m_k.mDesc.GetStrides()[1],
               a_m_k.mDesc.GetStrides()[0],
               a_m_k.mDesc.GetStrides()[1]));

const auto b_k0_n_k1_grid_desc = make_naive_tensor_descriptor(
    make_tuple(K0, N, K1Number),
    make_tuple(K1Number * b_n_k.mDesc.GetStrides()[1],
               b_n_k.mDesc.GetStrides()[0],
               b_n_k.mDesc.GetStrides()[1]));

const auto c_m_n_grid_desc = make_naive_tensor_descriptor(
    make_tuple(M, N),
    make_tuple(c_m_n.mDesc.GetStrides()[0], c_m_n.mDesc.GetStrides()[1]));

// HACK: hacks that control index calculation when iterating over A, B, C matrix
constexpr auto a_k0_m_k1_grid_step_hacks =
    make_tuple(make_tuple(Sequence<0>{},   // 0+: K0
                          Sequence<0>{},   // 1+: M
                          Sequence<0>{}),  // 2+: K1
               make_tuple(Sequence<0>{},   // 0-: K0
                          Sequence<0>{},   // 1-: M
                          Sequence<0>{})); // 2-: K1

constexpr auto b_k0_n_k1_grid_step_hacks =
    make_tuple(make_tuple(Sequence<0>{},   // 0+: K0
                          Sequence<0>{},   // 1+: N
                          Sequence<0>{}),  // 2+: K1
               make_tuple(Sequence<0>{},   // 0-: K0
                          Sequence<0>{},   // 1-: N
                          Sequence<0>{})); // 2-: K1

constexpr auto c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks = make_tuple(
    make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 0+: M0
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 1+: N0
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 2+: M1
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 3+: N1
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 4+: M2
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 5+: M3
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 6+: M4
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}),  // 7+: N2
    make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 0-: M0
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 1-: N0
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 2-: M1
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 3-: N1
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 4-: M2
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 5-: M3
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 6-: M4
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{})); // 7-: N2

constexpr auto a_k0_m_k1_grid_move_slice_window_step_hacks = Sequence<0>{};
constexpr auto b_k0_n_k1_grid_move_slice_window_step_hacks = Sequence<0>{};
#else
// padded GEMM
const auto a_k0_m_k1_grid_desc_tmp = make_naive_tensor_descriptor(
    make_tuple(K0, M, K1Number),
    make_tuple(K1Number * a_m_k.mDesc.GetStrides()[1],
               a_m_k.mDesc.GetStrides()[0],
               a_m_k.mDesc.GetStrides()[1]));

const auto MRightPad = math::integer_divide_ceil(M, MPerBlock) * MPerBlock - M;

const auto a_k0_m_k1_grid_desc = transform_tensor_descriptor(
    a_k0_m_k1_grid_desc_tmp,
    make_tuple(make_pass_through_transform(K0),
               make_right_pad_transform(M, MRightPad),
               make_pass_through_transform(K1Number)),
    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));

const auto b_k0_n_k1_grid_desc = make_naive_tensor_descriptor(
    make_tuple(K0, N, K1Number),
    make_tuple(K1Number * b_n_k.mDesc.GetStrides()[1],
               b_n_k.mDesc.GetStrides()[0],
               b_n_k.mDesc.GetStrides()[1]));

const auto c_m_n_grid_desc_tmp = make_naive_tensor_descriptor(
    make_tuple(M, N),
    make_tuple(c_m_n.mDesc.GetStrides()[0], c_m_n.mDesc.GetStrides()[1]));

const auto c_m_n_grid_desc = transform_tensor_descriptor(
    c_m_n_grid_desc_tmp,
    make_tuple(make_right_pad_transform(M, MRightPad), make_pass_through_transform(N)),
    make_tuple(Sequence<0>{}, Sequence<1>{}),
    make_tuple(Sequence<0>{}, Sequence<1>{}));

// HACK: hacks that control index calculation when iterating over A, B, C matrix
constexpr auto a_k0_m_k1_grid_step_hacks =
    make_tuple(make_tuple(Sequence<0, 0, 0, 0>{},   // 0+: K0
                          Sequence<0, 0, 0, 0>{},   // 1+: M
                          Sequence<0, 0, 0, 0>{}),  // 2+: K1
               make_tuple(Sequence<0, 0, 0, 0>{},   // 0-: K0
                          Sequence<0, 0, 0, 0>{},   // 1-: M
                          Sequence<0, 0, 0, 0>{})); // 2-: K1

constexpr auto b_k0_n_k1_grid_step_hacks =
    make_tuple(make_tuple(Sequence<0>{},   // 0+: K0
                          Sequence<0>{},   // 1+: N
                          Sequence<0>{}),  // 2+: K1
               make_tuple(Sequence<0>{},   // 0-: K0
                          Sequence<0>{},   // 1-: N
                          Sequence<0>{})); // 2-: K1

constexpr auto c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks = make_tuple(
    make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 0+: M0
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 1+: N0
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 2+: M1
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 3+: N1
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 4+: M2
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 5+: M3
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 6+: M4
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}),  // 7+: N2
    make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 0-: M0
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 1-: N0
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 2-: M1
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 3-: N1
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 4-: M2
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 5-: M3
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 6-: M4
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{})); // 7-: N2

constexpr auto a_k0_m_k1_grid_move_slice_window_step_hacks = Sequence<0, 0, 0, 0>{};
constexpr auto b_k0_n_k1_grid_move_slice_window_step_hacks = Sequence<0>{};
#endif
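The padded branch rounds M up to the next multiple of MPerBlock and right-pads the descriptor by the difference, so the grid always decomposes into whole tiles; the pad region is carried by the right-pad transform rather than handled by the caller. The arithmetic as a standalone sketch (the problem sizes here are hypothetical, not taken from the diff):

#include <cassert>

int main()
{
    // MRightPad = integer_divide_ceil(M, MPerBlock) * MPerBlock - M
    const int MPerBlock = 64;
    const int M         = 100; // hypothetical problem size

    const int MRightPad = (M + MPerBlock - 1) / MPerBlock * MPerBlock - M;

    assert(MRightPad == 28);                  // 100 is padded up to 128
    assert((M + MRightPad) % MPerBlock == 0); // padded extent tiles evenly
}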
for(index_t i = 0; i < 5; ++i)
{
    float ave_time = driver_gemm_xdlops_v2r3<
        BlockSize,
        ABType,
        AccType,
        CType,
        InMemoryDataOperationEnum::Set,
        decltype(a_k0_m_k1_grid_desc),
        decltype(b_k0_n_k1_grid_desc),
        decltype(c_m_n_grid_desc),
        MPerBlock,
        NPerBlock,
        KPerBlock,
        MPerXDL,
        NPerXDL,
        K1,
        MRepeat,
        NRepeat,
        ABlockTransferThreadSliceLengths_K0_M_K1,
        ABlockTransferThreadClusterLengths_K0_M_K1,
        Sequence<1, 0, 2>,
        Sequence<1, 0, 2>,
        2,
        ABlockTransferSrcScalarPerVector_K1,
        ABlockTransferDstScalarPerVector_K1,
        false, // don't move back src coordinate after threadwise copy
        BBlockTransferThreadSliceLengths_K0_N_K1,
        BBlockTransferThreadClusterLengths_K0_N_K1,
        Sequence<1, 0, 2>,
        Sequence<1, 0, 2>,
        2,
        BBlockTransferSrcScalarPerVector_K1,
        BBlockTransferDstScalarPerVector_K1,
        false, // don't move back src coordinate after threadwise copy
        Sequence<0, 2, 4, 5, 6, 1, 3, 7>,
        7,
        CThreadTransferDstScalarPerVector,
        decltype(a_k0_m_k1_grid_step_hacks),
        decltype(b_k0_n_k1_grid_step_hacks),
        decltype(c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks),
        decltype(a_k0_m_k1_grid_move_slice_window_step_hacks),
        decltype(b_k0_n_k1_grid_move_slice_window_step_hacks),
        false, // CAccessOrderMRepeatNRepeat
        true,  // ABlockLdsExtraM
        true   // BBlockLdsExtraN
        >(static_cast<ABType*>(a_m_k_device_buf.GetDeviceBuffer()),
          static_cast<ABType*>(b_n_k_device_buf.GetDeviceBuffer()),
          static_cast<CType*>(c_m_n_device_buf.GetDeviceBuffer()),
          a_k0_m_k1_grid_desc,
          b_k0_n_k1_grid_desc,
          c_m_n_grid_desc,
          debug::debug_driver_gemm_xdlops_v2r3::M01,
          debug::debug_driver_gemm_xdlops_v2r3::N01,
          a_k0_m_k1_grid_step_hacks,
          b_k0_n_k1_grid_step_hacks,
          c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks,
          a_k0_m_k1_grid_move_slice_window_step_hacks,
          b_k0_n_k1_grid_move_slice_window_step_hacks,
          nrepeat);

    float perf = static_cast<float>(std::size_t(2) * M * N * K) /
                 (std::size_t(1000) * 1000 * 1000) / ave_time;

    std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" << std::endl;
}

// copy result back to host
c_m_n_device_buf.FromDevice(c_m_n.mData.data());
}
library/include/ck/library/obselete_driver_offline/device_gemm_xdlops_mk_nk_nm.hpp
deleted 100644 → 0
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "driver_gemm_xdlops_v2r3.hpp"
template <typename ABType, typename AccType, typename CType>
void device_gemm_xdlops_mk_nk_nm(const Tensor<ABType>& a_m_k,
                                 const Tensor<ABType>& b_n_k,
                                 Tensor<CType>& c_n_m,
                                 ck::index_t nrepeat)
{
using namespace ck;

std::cout << __func__ << std::endl;

DeviceMem a_m_k_device_buf(sizeof(ABType) * a_m_k.mDesc.GetElementSpace());
DeviceMem b_n_k_device_buf(sizeof(ABType) * b_n_k.mDesc.GetElementSpace());
DeviceMem c_n_m_device_buf(sizeof(CType) * c_n_m.mDesc.GetElementSpace());

a_m_k_device_buf.ToDevice(a_m_k.mData.data());
b_n_k_device_buf.ToDevice(b_n_k.mData.data());
c_n_m_device_buf.ToDevice(c_n_m.mData.data());
#if 0
// [M, N, K0, K1] = [256, 128, 4, 4] for fp32
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 256;
constexpr index_t NPerBlock = 128;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 4;
constexpr index_t MRepeat = 4;
constexpr index_t NRepeat = 2;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 4, 4>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 4;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 4;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 4>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_K1 = 4;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 4;
constexpr index_t CThreadTransferDstScalarPerVector = 4;
#elif 0
// [M, N, K0, K1] = [128, 256, 4, 4] for fp32
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 128;
constexpr index_t NPerBlock = 256;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 4;
constexpr index_t MRepeat = 2;
constexpr index_t NRepeat = 4;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 2, 4>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 4;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 4;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 4, 4>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_K1 = 4;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 4;
constexpr index_t CThreadTransferDstScalarPerVector = 4;
#elif 0
// [M, N, K0, K1] = [256, 128, 4, 8] for fp16
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 256;
constexpr index_t NPerBlock = 128;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 8;
constexpr index_t MRepeat = 4;
constexpr index_t NRepeat = 2;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 4, 8>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 8;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 8>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_K1 = 8;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
constexpr index_t CThreadTransferDstScalarPerVector = 4;
#elif 0
// [M, N, K0, K1] = [128, 256, 4, 8] for fp16
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 128;
constexpr index_t NPerBlock = 256;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 8;
constexpr index_t MRepeat = 2;
constexpr index_t NRepeat = 4;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 2, 8>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 8;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 4, 8>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_K1 = 8;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
constexpr index_t CThreadTransferDstScalarPerVector = 4;
#elif 0
// [M, N, K0, K1] = [128, 128, 4, 8], C = 128, for fp16
constexpr index_t BlockSize = 128;
constexpr index_t MPerBlock = 128;
constexpr index_t NPerBlock = 128;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 8;
constexpr index_t MRepeat = 4;
constexpr index_t NRepeat = 2;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 4, 8>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 32, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 8;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 4, 8>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 32, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_K1 = 8;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
constexpr index_t CThreadTransferDstScalarPerVector = 4;
#elif 0
// [M, N, K0, K1] = [128, 128, 4, 8], C = 64, for fp16
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 128;
constexpr index_t NPerBlock = 128;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 8;
constexpr index_t MRepeat = 2;
constexpr index_t NRepeat = 2;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 2, 8>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 8;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 8>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_K1 = 8;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
constexpr index_t CThreadTransferDstScalarPerVector = 4;
#elif 1
// [M, N, K0, K1] = [64, 128, 4, 8], C = 32, for fp16
constexpr index_t BlockSize = 256;
constexpr index_t MPerBlock = 64;
constexpr index_t NPerBlock = 128;
constexpr index_t KPerBlock = 4;
constexpr index_t MPerXDL = 32;
constexpr index_t NPerXDL = 32;
constexpr index_t K1 = 8;
constexpr index_t MRepeat = 1;
constexpr index_t NRepeat = 2;
using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 1, 8>;
using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_K1 = 8;
constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;
using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 8>;
using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
constexpr index_t BBlockTransferSrcScalarPerVector_K1 = 8;
constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
constexpr index_t CThreadTransferDstScalarPerVector = 4;
#endif
const auto K = a_m_k.mDesc.GetLengths()[1];
const auto M = a_m_k.mDesc.GetLengths()[0];
const auto N = b_n_k.mDesc.GetLengths()[0];

constexpr auto K1Number = Number<K1>{};

const auto K0 = K / K1Number;

const auto a_k0_m_k1_grid_desc = make_naive_tensor_descriptor(
    make_tuple(K0, M, K1Number),
    make_tuple(K1Number * a_m_k.mDesc.GetStrides()[1],
               a_m_k.mDesc.GetStrides()[0],
               a_m_k.mDesc.GetStrides()[1]));

const auto b_k0_n_k1_grid_desc = make_naive_tensor_descriptor(
    make_tuple(K0, N, K1Number),
    make_tuple(K1Number * b_n_k.mDesc.GetStrides()[1],
               b_n_k.mDesc.GetStrides()[0],
               b_n_k.mDesc.GetStrides()[1]));

const auto c_m_n_grid_desc = make_naive_tensor_descriptor(
    make_tuple(M, N),
    make_tuple(c_n_m.mDesc.GetStrides()[1], c_n_m.mDesc.GetStrides()[0]));
// HACK: hacks that control index calculation when iterating over A, B, C matrix
constexpr auto a_k0_m_k1_grid_step_hacks =
    make_tuple(make_tuple(Sequence<0>{},   // 0+: K0
                          Sequence<0>{},   // 1+: M
                          Sequence<0>{}),  // 2+: K1
               make_tuple(Sequence<0>{},   // 0-: K0
                          Sequence<0>{},   // 1-: M
                          Sequence<0>{})); // 2-: K1

constexpr auto b_k0_n_k1_grid_step_hacks =
    make_tuple(make_tuple(Sequence<0>{},   // 0+: K0
                          Sequence<0>{},   // 1+: N
                          Sequence<0>{}),  // 2+: K1
               make_tuple(Sequence<0>{},   // 0-: K0
                          Sequence<0>{},   // 1-: N
                          Sequence<0>{})); // 2-: K1

constexpr auto c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks = make_tuple(
    make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 0+: M0
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 1+: N0
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 2+: M1
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 3+: N1
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 4+: M2
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 5+: M3
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 6+: M4
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}),  // 7+: N2
    make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 0-: M0
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 1-: N0
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 2-: M1
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 3-: N1
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 4-: M2
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 5-: M3
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 6-: M4
               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{})); // 7-: N2

constexpr auto a_k0_m_k1_grid_move_slice_window_step_hacks = Sequence<0>{};
constexpr auto b_k0_n_k1_grid_move_slice_window_step_hacks = Sequence<0>{};
for(index_t i = 0; i < 5; ++i)
{
    float ave_time =
        driver_gemm_xdlops_v2r3<BlockSize,
                                ABType,
                                AccType,
                                CType,
                                InMemoryDataOperationEnum::Set,
                                decltype(a_k0_m_k1_grid_desc),
                                decltype(b_k0_n_k1_grid_desc),
                                decltype(c_m_n_grid_desc),
                                MPerBlock,
                                NPerBlock,
                                KPerBlock,
                                MPerXDL,
                                NPerXDL,
                                K1,
                                MRepeat,
                                NRepeat,
                                ABlockTransferThreadSliceLengths_K0_M_K1,
                                ABlockTransferThreadClusterLengths_K0_M_K1,
                                Sequence<1, 0, 2>,
                                Sequence<1, 0, 2>,
                                2,
                                ABlockTransferSrcScalarPerVector_K1,
                                ABlockTransferDstScalarPerVector_K1,
                                false, // don't move back src coordinate after threadwise copy
                                BBlockTransferThreadSliceLengths_K0_N_K1,
                                BBlockTransferThreadClusterLengths_K0_N_K1,
                                Sequence<1, 0, 2>,
                                Sequence<1, 0, 2>,
                                2,
                                BBlockTransferSrcScalarPerVector_K1,
                                BBlockTransferDstScalarPerVector_K1,
                                false, // don't move back src coordinate after threadwise copy
                                Sequence<2, 3, 0, 1, 7, 5, 4, 6>,
                                6,
                                CThreadTransferDstScalarPerVector,
                                decltype(a_k0_m_k1_grid_step_hacks),
                                decltype(b_k0_n_k1_grid_step_hacks),
                                decltype(c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks),
                                decltype(a_k0_m_k1_grid_move_slice_window_step_hacks),
                                decltype(b_k0_n_k1_grid_move_slice_window_step_hacks),
                                false // CAccessOrderMRepeatNRepeat
                                >(static_cast<ABType*>(a_m_k_device_buf.GetDeviceBuffer()),
                                  static_cast<ABType*>(b_n_k_device_buf.GetDeviceBuffer()),
                                  static_cast<CType*>(c_n_m_device_buf.GetDeviceBuffer()),
                                  a_k0_m_k1_grid_desc,
                                  b_k0_n_k1_grid_desc,
                                  c_m_n_grid_desc,
                                  a_k0_m_k1_grid_step_hacks,
                                  b_k0_n_k1_grid_step_hacks,
                                  c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks,
                                  a_k0_m_k1_grid_move_slice_window_step_hacks,
                                  b_k0_n_k1_grid_move_slice_window_step_hacks,
                                  nrepeat);

    float perf = static_cast<float>((std::size_t(2) * M * N * K)) /
                 (std::size_t(1000) * 1000 * 1000) / ave_time;

    std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" << std::endl;
}

// copy result back to host
c_n_m_device_buf.FromDevice(c_n_m.mData.data());
}
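The perf expression in the loop above is the standard GEMM operation count, 2 * M * N * K flops (one multiply plus one add per inner-product term), divided by 10^9 and by the kernel time in milliseconds, which lands directly in TFlop/s. A minimal standalone sketch of the same arithmetic (function and numbers are illustrative, not part of the library):

#include <cstddef>
#include <cstdio>

// tflops: throughput of an M x N x K GEMM that averaged ave_time_ms milliseconds.
// A multiply-accumulate counts as 2 floating-point operations.
float tflops(std::size_t M, std::size_t N, std::size_t K, float ave_time_ms)
{
    const double flop = 2.0 * static_cast<double>(M) * N * K; // total flops of the GEMM
    return static_cast<float>(flop / (ave_time_ms * 1.0e9));  // ms * 1e9 -> TFlop/s
}

int main()
{
    // e.g. a 3840 x 4096 x 4096 GEMM finishing in 1.2 ms is ~107 TFlop/s
    std::printf("%.1f TFlop/s\n", tflops(3840, 4096, 4096, 1.2f));
    return 0;
}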
library/include/ck/library/obselete_driver_offline/driver_contraction_dlops_v1r2.hpp
deleted 100644 → 0
#ifndef DRIVER_CONTRACTION_DLOPS_V1R2_HPP
#define DRIVER_CONTRACTION_DLOPS_V1R2_HPP

#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "gridwise_contraction_dlops_v1r2.hpp"

template <ck::index_t BlockSize,
          typename FloatAB,
          typename FloatAcc,
          typename FloatC,
          ck::InMemoryDataOperationEnum CGlobalMemoryDataOperation,
          typename AGridDesc_GK0_GM0_GM1_GK1,
          typename BGridDesc_GK0_GN0_GN1_GK1,
          typename CGridDesc_GM0_GM1_GN0_GN1,
          ck::index_t GM1PerBlockGM11,
          ck::index_t GN1PerBlockGN11,
          ck::index_t GK0PerBlock,
          ck::index_t BM1PerThreadBM11,
          ck::index_t BN1PerThreadBN11,
          ck::index_t BK0PerThread,
          typename BM10BN10ThreadClusterBM10Xs,
          typename BM10BN10ThreadClusterBN10Xs,
          typename ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1,
          typename ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1,
          typename ABlockTransferThreadClusterArrangeOrder,
          typename ABlockTransferSrcAccessOrder,
          typename ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1,
          typename ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1,
          typename ABlockTransferSrcVectorTensorContiguousDimOrder,
          typename BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1,
          typename BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1,
          typename BBlockTransferThreadClusterArrangeOrder,
          typename BBlockTransferSrcAccessOrder,
          typename BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1,
          typename BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1,
          typename BBlockTransferSrcVectorTensorContiguousDimOrder,
          typename CThreadTransferSrcDstAccessOrder,
          ck::index_t CThreadTransferSrcDstVectorDim,
          ck::index_t CThreadTransferDstScalarPerVector,
          typename AGridStepHacks,
          typename BGridStepHacks,
          typename CGridStepHacks,
          typename AGridMoveSliceWindowStepHacks,
          typename BGridMoveSliceWindowStepHacks>
__host__ float
driver_contraction_dlops_v1r2(const FloatAB* p_a_grid,
                              const FloatAB* p_b_grid,
                              FloatC* p_c_grid,
                              const AGridDesc_GK0_GM0_GM1_GK1& a_grid_desc_gk0_gm0_gm1_gk1,
                              const BGridDesc_GK0_GN0_GN1_GK1& b_grid_desc_gk0_gn0_gn1_gk1,
                              const CGridDesc_GM0_GM1_GN0_GN1& c_grid_desc_gm0_gm1_gn0_gn1,
                              AGridStepHacks,
                              BGridStepHacks,
                              CGridStepHacks,
                              AGridMoveSliceWindowStepHacks,
                              BGridMoveSliceWindowStepHacks,
                              ck::index_t nrepeat)
{
    using namespace ck;

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};
    constexpr auto I4 = Number<4>{};
    constexpr auto I5 = Number<5>{};

    // GEMM
    using GridwiseContraction =
        GridwiseContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0_GM1_GN0_GN1<
            BlockSize,
            FloatAB,
            FloatAcc,
            FloatC,
            CGlobalMemoryDataOperation,
            AGridDesc_GK0_GM0_GM1_GK1,
            BGridDesc_GK0_GN0_GN1_GK1,
            CGridDesc_GM0_GM1_GN0_GN1,
            GM1PerBlockGM11,
            GN1PerBlockGN11,
            GK0PerBlock,
            BM1PerThreadBM11,
            BN1PerThreadBN11,
            BK0PerThread,
            BM10BN10ThreadClusterBM10Xs,
            BM10BN10ThreadClusterBN10Xs,
            ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1,
            ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1,
            ABlockTransferThreadClusterArrangeOrder,
            ABlockTransferSrcAccessOrder,
            ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1,
            ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1,
            ABlockTransferSrcVectorTensorContiguousDimOrder,
            BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1,
            BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1,
            BBlockTransferThreadClusterArrangeOrder,
            BBlockTransferSrcAccessOrder,
            BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1,
            BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1,
            BBlockTransferSrcVectorTensorContiguousDimOrder,
            CThreadTransferSrcDstAccessOrder,
            CThreadTransferSrcDstVectorDim,
            CThreadTransferDstScalarPerVector,
            AGridStepHacks,
            BGridStepHacks,
            CGridStepHacks,
            AGridMoveSliceWindowStepHacks,
            BGridMoveSliceWindowStepHacks>;
    const auto GK0 = a_grid_desc_gk0_gm0_gm1_gk1.GetLength(I0);

    if(!GridwiseContraction::CheckValidity(
           a_grid_desc_gk0_gm0_gm1_gk1, b_grid_desc_gk0_gn0_gn1_gk1, c_grid_desc_gm0_gm1_gn0_gn1))
    {
        throw std::runtime_error("wrong! "
                                 "GridwiseContraction_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_"
                                 "GM0_GM1_GN0_GN1 has invalid setting");
    }

    const auto a_grid_desc_gk0_gm0_gm10_gm11_gk1 =
        GridwiseContraction::MakeAGridDescriptor_GK0_GM0_GM10_GM11_GK1(a_grid_desc_gk0_gm0_gm1_gk1);
    const auto b_grid_desc_gk0_gn0_gn10_gn11_gk1 =
        GridwiseContraction::MakeBGridDescriptor_GK0_GN0_GN10_GN11_GK1(b_grid_desc_gk0_gn0_gn1_gk1);

    using AGridDesc_GK0_GM0_GM10_GM11_GK1 = decltype(a_grid_desc_gk0_gm0_gm10_gm11_gk1);
    using BGridDesc_GK0_GN0_GN10_GN11_GK1 = decltype(b_grid_desc_gk0_gn0_gn10_gn11_gk1);

    // c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1
    const auto c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1 =
        GridwiseContraction::MakeCGridDescriptor_GM10_BM0_BM1_GN10_BN0_BN1(
            c_grid_desc_gm0_gm1_gn0_gn1);

    using CGridDesc_GM10_BM0_BM1_GN10_BN0_BN1 = decltype(c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1);

    // c_grid_block_cluster_blockid_to_gm10_gn10
    const auto c_grid_block_cluster_blockid_to_gm10_gn10 =
        GridwiseContraction::MakeCGridBlockCluster_BlockId_To_GM10_GN10(c_grid_desc_gm0_gm1_gn0_gn1);

    using CGridBlockCluster_BlockId_To_GM10_GN10 =
        decltype(c_grid_block_cluster_blockid_to_gm10_gn10);

    const index_t grid_size = GridwiseContraction::CalculateGridSize(c_grid_desc_gm0_gm1_gn0_gn1);

    const bool has_main_k_block_loop = GridwiseContraction::CalculateHasMainKBlockLoop(GK0);

    const bool has_double_tail_k_block_loop =
        GridwiseContraction::CalculateHasDoubleTailKBlockLoop(GK0);
    {
        std::cout << "a_grid_desc_gk0_gm0_gm10_gm11_gk1{"
                  << a_grid_desc_gk0_gm0_gm10_gm11_gk1.GetLength(I0) << ", "
                  << a_grid_desc_gk0_gm0_gm10_gm11_gk1.GetLength(I1) << ", "
                  << a_grid_desc_gk0_gm0_gm10_gm11_gk1.GetLength(I2) << ", "
                  << a_grid_desc_gk0_gm0_gm10_gm11_gk1.GetLength(I3) << ", "
                  << a_grid_desc_gk0_gm0_gm10_gm11_gk1.GetLength(I4) << "}" << std::endl;

        std::cout << "b_grid_desc_gk0_gn0_gn10_gn11_gk1{"
                  << b_grid_desc_gk0_gn0_gn10_gn11_gk1.GetLength(I0) << ", "
                  << b_grid_desc_gk0_gn0_gn10_gn11_gk1.GetLength(I1) << ", "
                  << b_grid_desc_gk0_gn0_gn10_gn11_gk1.GetLength(I2) << ", "
                  << b_grid_desc_gk0_gn0_gn10_gn11_gk1.GetLength(I3) << ", "
                  << b_grid_desc_gk0_gn0_gn10_gn11_gk1.GetLength(I4) << "}" << std::endl;

        std::cout << "c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1{ "
                  << c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1.GetLength(I0) << ", "
                  << c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1.GetLength(I1) << ", "
                  << c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1.GetLength(I2) << ", "
                  << c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1.GetLength(I3) << ", "
                  << c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1.GetLength(I4) << ", "
                  << c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1.GetLength(I5) << "}" << std::endl;
    }
    float ave_time = 0;

    if(has_main_k_block_loop && has_double_tail_k_block_loop)
    {
        const auto kernel =
            kernel_contraction_dlops_v1r2<GridwiseContraction,
                                          FloatAB,
                                          FloatC,
                                          remove_reference_t<AGridDesc_GK0_GM0_GM10_GM11_GK1>,
                                          remove_reference_t<BGridDesc_GK0_GN0_GN10_GN11_GK1>,
                                          remove_reference_t<CGridDesc_GM10_BM0_BM1_GN10_BN0_BN1>,
                                          remove_reference_t<CGridBlockCluster_BlockId_To_GM10_GN10>,
                                          true,
                                          true>;

        ave_time = launch_and_time_kernel(kernel,
                                          nrepeat,
                                          dim3(grid_size),
                                          dim3(BlockSize),
                                          0,
                                          p_a_grid,
                                          p_b_grid,
                                          p_c_grid,
                                          a_grid_desc_gk0_gm0_gm10_gm11_gk1,
                                          b_grid_desc_gk0_gn0_gn10_gn11_gk1,
                                          c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1,
                                          c_grid_block_cluster_blockid_to_gm10_gn10);
    }
    else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
    {
        const auto kernel =
            kernel_contraction_dlops_v1r2<GridwiseContraction,
                                          FloatAB,
                                          FloatC,
                                          remove_reference_t<AGridDesc_GK0_GM0_GM10_GM11_GK1>,
                                          remove_reference_t<BGridDesc_GK0_GN0_GN10_GN11_GK1>,
                                          remove_reference_t<CGridDesc_GM10_BM0_BM1_GN10_BN0_BN1>,
                                          remove_reference_t<CGridBlockCluster_BlockId_To_GM10_GN10>,
                                          true,
                                          false>;

        ave_time = launch_and_time_kernel(kernel,
                                          nrepeat,
                                          dim3(grid_size),
                                          dim3(BlockSize),
                                          0,
                                          p_a_grid,
                                          p_b_grid,
                                          p_c_grid,
                                          a_grid_desc_gk0_gm0_gm10_gm11_gk1,
                                          b_grid_desc_gk0_gn0_gn10_gn11_gk1,
                                          c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1,
                                          c_grid_block_cluster_blockid_to_gm10_gn10);
    }
    else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
    {
        const auto kernel =
            kernel_contraction_dlops_v1r2<GridwiseContraction,
                                          FloatAB,
                                          FloatC,
                                          remove_reference_t<AGridDesc_GK0_GM0_GM10_GM11_GK1>,
                                          remove_reference_t<BGridDesc_GK0_GN0_GN10_GN11_GK1>,
                                          remove_reference_t<CGridDesc_GM10_BM0_BM1_GN10_BN0_BN1>,
                                          remove_reference_t<CGridBlockCluster_BlockId_To_GM10_GN10>,
                                          false,
                                          true>;

        ave_time = launch_and_time_kernel(kernel,
                                          nrepeat,
                                          dim3(grid_size),
                                          dim3(BlockSize),
                                          0,
                                          p_a_grid,
                                          p_b_grid,
                                          p_c_grid,
                                          a_grid_desc_gk0_gm0_gm10_gm11_gk1,
                                          b_grid_desc_gk0_gn0_gn10_gn11_gk1,
                                          c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1,
                                          c_grid_block_cluster_blockid_to_gm10_gn10);
    }
    else
    {
        const auto kernel =
            kernel_contraction_dlops_v1r2<GridwiseContraction,
                                          FloatAB,
                                          FloatC,
                                          remove_reference_t<AGridDesc_GK0_GM0_GM10_GM11_GK1>,
                                          remove_reference_t<BGridDesc_GK0_GN0_GN10_GN11_GK1>,
                                          remove_reference_t<CGridDesc_GM10_BM0_BM1_GN10_BN0_BN1>,
                                          remove_reference_t<CGridBlockCluster_BlockId_To_GM10_GN10>,
                                          false,
                                          false>;

        ave_time = launch_and_time_kernel(kernel,
                                          nrepeat,
                                          dim3(grid_size),
                                          dim3(BlockSize),
                                          0,
                                          p_a_grid,
                                          p_b_grid,
                                          p_c_grid,
                                          a_grid_desc_gk0_gm0_gm10_gm11_gk1,
                                          b_grid_desc_gk0_gn0_gn10_gn11_gk1,
                                          c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1,
                                          c_grid_block_cluster_blockid_to_gm10_gn10);
    }

    return ave_time;
}
#endif
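The four-way branch in this driver is a recurring CK pattern: two runtime facts about the K loop (whether a main loop exists, and whether the tail is double-buffered) are lifted into compile-time template booleans, so four distinct kernels are instantiated and the selected one contains no runtime branching in its hot loop. A minimal sketch of the pattern, with hypothetical names:

#include <cstdio>

// One kernel body per (MainLoop, DoubleTail) combination; the bools are
// template parameters, so each instantiation is branch-free inside.
template <bool MainLoop, bool DoubleTail>
void run_kernel()
{
    std::printf("main_loop=%d double_tail=%d\n", MainLoop, DoubleTail);
}

// Host-side dispatch: collapse two runtime bools onto the four instantiations.
void dispatch(bool has_main_loop, bool has_double_tail)
{
    if(has_main_loop && has_double_tail) { run_kernel<true, true>(); }
    else if(has_main_loop && !has_double_tail) { run_kernel<true, false>(); }
    else if(!has_main_loop && has_double_tail) { run_kernel<false, true>(); }
    else { run_kernel<false, false>(); }
}

int main() { dispatch(true, false); }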
library/include/ck/library/obselete_driver_offline/driver_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp
deleted 100644 → 0
#ifndef DRIVER_CONVOLUTION_ADD_FORWARD_IMPLICIT_GEMM_V5R1_DLOPS_NC0HWc1_KC0YXC1_NK0HWK1_HPP
#define DRIVER_CONVOLUTION_ADD_FORWARD_IMPLICIT_GEMM_V5R1_DLOPS_NC0HWc1_KC0YXC1_NK0HWK1_HPP

#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "gridwise_gemm_dlops_v3.hpp"

template <ck::index_t BlockSize,
          typename FloatAB,
          typename FloatAcc,
          typename FloatC,
          ck::index_t E1_,
          ck::index_t E2_,
          ck::index_t K2_,
          ck::index_t KPerBlock,
          ck::index_t HoPerBlock,
          ck::index_t WoPerBlock,
          ck::index_t E1PerBlock,
          ck::index_t KPerThread,
          ck::index_t HoPerThread,
          ck::index_t WoPerThread,
          ck::index_t EPerThread,
          typename ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2,
          typename ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2,
          ck::index_t ABlockTransferSrcScalarPerVector_E2,
          ck::index_t ABlockTransferDstScalarPerVector_E2,
          ck::index_t BThreadTransferSrcScalarPerVector_E2,
          ck::index_t CThreadTransferDstScalarPerVector_K,
          ck::ActivTypeEnum activ_type>
struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_add
{
    template <typename... Wei,
              typename... In,
              typename... Add,
              typename... Out,
              typename ConvStrides,
              typename ConvDilations,
              typename InLeftPads,
              typename InRightPads>
    __host__ float Run(const ck::TensorDescriptor<Wei...>& wei_k_c0_y_x_c1_global_desc,
                       const ck::TensorDescriptor<In...>& in_n_c0_hi_wi_c1_global_desc,
                       const ck::TensorDescriptor<Out...>& out_n_k0_ho_wo_k1_global_desc,
                       const ck::TensorDescriptor<Add...>& add_n_k0_hox2_wox2_k1_global_desc,
                       const ConvStrides& conv_strides,
                       const ConvDilations& conv_dilations,
                       const InLeftPads& in_left_pads,
                       const InRightPads& in_right_pads,
                       const FloatAB* __restrict__ p_a_grid,
                       const FloatAB* __restrict__ p_b_grid,
                       const FloatC* __restrict__ p_bias_grid,
                       FloatC* __restrict__ p_d_grid,
                       const int nrepeat) const
    {
        using namespace ck;

        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};
        constexpr auto I2 = Number<2>{};
        constexpr auto I3 = Number<3>{};
        constexpr auto I4 = Number<4>{};
        const auto N  = in_n_c0_hi_wi_c1_global_desc.GetLength(I0);
        const auto C0 = in_n_c0_hi_wi_c1_global_desc.GetLength(I1);
        const auto Hi = in_n_c0_hi_wi_c1_global_desc.GetLength(I2);
        const auto Wi = in_n_c0_hi_wi_c1_global_desc.GetLength(I3);
        // const auto C1 = in_n_c0_hi_wi_c1_global_desc.GetLength(I4);

        const auto K0 = out_n_k0_ho_wo_k1_global_desc.GetLength(I1);
        const auto Ho = out_n_k0_ho_wo_k1_global_desc.GetLength(I2);
        const auto Wo = out_n_k0_ho_wo_k1_global_desc.GetLength(I3);
        const auto K1 = out_n_k0_ho_wo_k1_global_desc.GetLength(I4);

        const auto Hox2 = add_n_k0_hox2_wox2_k1_global_desc.GetLength(I2);
        const auto Wox2 = add_n_k0_hox2_wox2_k1_global_desc.GetLength(I3);

        const auto K = wei_k_c0_y_x_c1_global_desc.GetLength(I0);
        const auto Y = wei_k_c0_y_x_c1_global_desc.GetLength(I2);
        const auto X = wei_k_c0_y_x_c1_global_desc.GetLength(I3);

        const auto ConvStrideH = conv_strides[I0];
        const auto ConvStrideW = conv_strides[I1];

        const auto ConvDilationH = conv_dilations[I0];
        const auto ConvDilationW = conv_dilations[I1];

        const auto Hop = (Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock;
        const auto Wop = (Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock;

        const auto OutRightPadH = Hop - Ho;
        const auto OutRightPadW = Wop - Wo;

        const auto OutRightPadHx = OutRightPadH * 2;
        const auto OutRightPadWx = OutRightPadW * 2;

        const auto InLeftPadH = in_left_pads[I0];
        const auto InLeftPadW = in_left_pads[I1];

        const auto InRightPadH = in_right_pads[I0] + OutRightPadH * ConvStrideH;
        const auto InRightPadW = in_right_pads[I1] + OutRightPadW * ConvStrideW;

        const auto E = C0 * Y * X;

        constexpr auto E1 = Number<E1_>{};
        constexpr auto E2 = Number<E2_>{};
        constexpr auto K2 = Number<K2_>{};

        const auto E0 = E / E1;
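        // Worked example (illustrative): with Ho = Wo = 28 and HoPerBlock = WoPerBlock = 16,
        // Hop = (28 + 16 - 1) / 16 * 16 = 32, so OutRightPadH = 4 extra output rows make the
        // output tile-divisible, and InRightPadH grows by OutRightPadH * ConvStrideH so those
        // padded outputs still read in-bounds (padded) input.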
        // weight tensor
        const auto a_e_k_e2_grid_desc = transform_tensor_descriptor(
            make_naive_tensor_descriptor_packed(make_tuple(K, C0 * Y * X, E2)),
            make_tuple(make_pass_through_transform(K),
                       make_pass_through_transform(C0 * Y * X),
                       make_pass_through_transform(E2)),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
            make_tuple(Sequence<1>{}, Sequence<0>{}, Sequence<2>{}));

        const auto a_e0_e1_k_e2_grid_desc =
            transform_tensor_descriptor(a_e_k_e2_grid_desc,
                                        make_tuple(make_unmerge_transform(make_tuple(E0, E1)),
                                                   make_pass_through_transform(K),
                                                   make_pass_through_transform(E2)),
                                        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
                                        make_tuple(Sequence<0, 1>{}, Sequence<2>{}, Sequence<3>{}));

        // input tensor
        const auto in_n_c0_hip_wip_e2_global_desc = transform_tensor_descriptor(
            make_naive_tensor_descriptor_packed(make_tuple(N, C0, Hi, Wi, E2)),
            make_tuple(make_pass_through_transform(N),
                       make_pass_through_transform(C0),
                       make_pad_transform(Hi, InLeftPadH, InRightPadH),
                       make_pad_transform(Wi, InLeftPadW, InRightPadW),
                       make_pass_through_transform(E2)),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}));

        const auto in_n_c0_y_ho_x_wo_e2_global_desc = transform_tensor_descriptor(
            in_n_c0_hip_wip_e2_global_desc,
            make_tuple(
                make_pass_through_transform(N),
                make_pass_through_transform(C0),
                make_embed_transform(make_tuple(Y, Hop), make_tuple(ConvDilationH, ConvStrideH)),
                make_embed_transform(make_tuple(X, Wop), make_tuple(ConvDilationW, ConvStrideW)),
                make_pass_through_transform(E2)),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
            make_tuple(
                Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}, Sequence<6>{}));

        const auto in_e_n_ho_wo_e2_grid_desc = transform_tensor_descriptor(
            in_n_c0_y_ho_x_wo_e2_global_desc,
            make_tuple(make_merge_transform(make_tuple(C0, Y, X)),
                       make_pass_through_transform(N),
                       make_pass_through_transform(Hop),
                       make_pass_through_transform(Wop),
                       make_pass_through_transform(E2)),
            make_tuple(
                Sequence<1, 2, 4>{}, Sequence<0>{}, Sequence<3>{}, Sequence<5>{}, Sequence<6>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}));

        const auto b_e0_e1_n_ho_wo_e2_grid_desc = transform_tensor_descriptor(
            in_e_n_ho_wo_e2_grid_desc,
            make_tuple(make_unmerge_transform(make_tuple(E0, E1)),
                       make_pass_through_transform(N),
                       make_pass_through_transform(Hop),
                       make_pass_through_transform(Wop),
                       make_pass_through_transform(E2)),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
            make_tuple(
                Sequence<0, 1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}, Sequence<5>{}));
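        // The chain above is the implicit-GEMM (im2col-style) view of the convolution: the
        // gather axis E = C0 * Y * X enumerates filter taps (c0, y, x), and the embed + pad
        // transforms map an output pixel to its input coordinate on the fly, e.g.
        // hi = ho * ConvStrideH + y * ConvDilationH - InLeftPadH (likewise for wi), so no
        // im2col buffer is ever materialized.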
        // output tensor
        const auto c_k_n_hop_wop_grid_desc = transform_tensor_descriptor(
            make_naive_tensor_descriptor_packed(make_tuple(N, K0, Ho, Wo, K1)),
            make_tuple(make_merge_transform(make_tuple(K0, K1)),
                       make_pass_through_transform(N),
                       make_pad_transform(Ho, I0, OutRightPadH),
                       make_pad_transform(Wo, I0, OutRightPadW)),
            make_tuple(Sequence<1, 4>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));

        // add tensor
        const auto d_k_n_hopx2_wopx2_grid_desc = transform_tensor_descriptor(
            make_naive_tensor_descriptor_packed(make_tuple(N, K0, Hox2, Wox2, K1)),
            make_tuple(make_merge_transform(make_tuple(K0, K1)),
                       make_pass_through_transform(N),
                       make_pad_transform(Hox2, I0, OutRightPadHx),
                       make_pad_transform(Wox2, I0, OutRightPadWx)),
            make_tuple(Sequence<1, 4>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
        std::cerr << "Hop = " << Hop << " Wop = " << Wop << std::endl;

        if(!((K % KPerBlock) == 0 && (Hop % HoPerBlock) == 0 && (Wop % WoPerBlock) == 0 &&
             (E1 % E1PerBlock) == 0))
        {
            throw std::runtime_error("wrong! GEMM size not divisible");
        }
        // clang-format off
        // hack to control index calculation when iterating over a_e0_e1_k_e2_global tensor
        constexpr auto a_e0_e1_k_e2_global_step_hacks =
            make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}),
                       make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}));

        constexpr auto a_e0_e1_k_e2_global_move_slice_window_step_hack =
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{};

        // hack to control index calculation when iterating over b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global tensor
        constexpr auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks = make_tuple(
            make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}),
            make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}));

        constexpr auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_move_slice_window_step_hack =
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{};
        // hack to control index calculation when iterating over c_k0_k1_n_h0_h1_h2_w0_w1_w2_global tensor
        constexpr auto c_k0_k1_n_h0_h1_h2_w0_w1_w2_global_tensor_step_hacks =
            make_tuple(make_tuple(Sequence<0, 1, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 1, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}),
                       make_tuple(Sequence<0, 2, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 2, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}));

        constexpr auto d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_global_tensor_step_hacks =
            make_tuple(make_tuple(Sequence<0, 1, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 1, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}),
                       make_tuple(Sequence<0, 2, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 2, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}));
        // clang-format on

        // GEMM
        using GridwiseGemm = GridwiseGemmDlops_km_kn_mn_v3<
            BlockSize,
            FloatAB,
            FloatAcc,
            FloatC,
            InMemoryDataOperationEnum::Set,
            decltype(a_e0_e1_k_e2_grid_desc),
            decltype(b_e0_e1_n_ho_wo_e2_grid_desc),
            decltype(c_k_n_hop_wop_grid_desc),
            decltype(d_k_n_hopx2_wopx2_grid_desc),
            E1,
            E2,
            K2,
            KPerBlock,
            HoPerBlock,
            WoPerBlock,
            E1PerBlock,
            KPerThread,
            HoPerThread,
            WoPerThread,
            EPerThread,
            ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2,
            ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2,
            Sequence<2, 3, 0, 1, 4>,
            Sequence<0, 1, 2, 3, 4>,
            4,
            ABlockTransferSrcScalarPerVector_E2,
            ABlockTransferDstScalarPerVector_E2,
            false, // don't move back src coordinate after threadwise copy
            Sequence<0, 1, 2, 3, 4, 5, 6, 7, 8, 9>, // E0, E1, N, H0, H1, H2, W0, W1, W2, E2
            9,
            BThreadTransferSrcScalarPerVector_E2,
            false, // don't move back src coordinate after threadwise copy, which will be fused with
                   // MoveSrcSliceWindow() to save addr computation
            Sequence<0, 1, 2, 3, 4, 5, 6, 7, 8>, // K0, K1, N, H0, H1, H2, W0, W1, W2
            1,
            CThreadTransferDstScalarPerVector_K,
            decltype(a_e0_e1_k_e2_global_step_hacks),
            decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks),
            decltype(c_k0_k1_n_h0_h1_h2_w0_w1_w2_global_tensor_step_hacks),
            decltype(d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_global_tensor_step_hacks),
            decltype(a_e0_e1_k_e2_global_move_slice_window_step_hack),
            decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_move_slice_window_step_hack)>;
        const auto a_e0_e1_k0_k1_e2_grid_desc =
            GridwiseGemm::MakeAE0E1K0K1E2GridDescriptor(a_e0_e1_k_e2_grid_desc);
        const auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc =
            GridwiseGemm::MakeBE0E1NH0H1H2W0W1W2E2GridDescriptor(b_e0_e1_n_ho_wo_e2_grid_desc);
        const auto c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc =
            GridwiseGemm::MakeCK0K1NH0H1H2W0W1W2GridDescriptor(c_k_n_hop_wop_grid_desc);
        const auto d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc =
            GridwiseGemm::MakeDK0K1NH0H1HxW0W1WxGridDescriptorResizeAdd(d_k_n_hopx2_wopx2_grid_desc);

        using AGridDesc_E0_E1_K0_K1_E2 = decltype(a_e0_e1_k0_k1_e2_grid_desc);
        using BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2 =
            decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc);
        using CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2 = decltype(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc);
        using DGridDesc_K0_K1_N_H0_H1_H2x2_W0_W1_W2x2 =
            decltype(d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc);

        const auto grid_size = (K / KPerBlock) * (Hop / HoPerBlock) * (Wop / WoPerBlock) * N;

        const bool has_main_e0_block_loop = E0 > 1;

        std::cerr << "has_main_e0_block_loop = " << has_main_e0_block_loop << std::endl;

        const auto cblockid_to_k_n_h_w_block_cluster_adaptor =
            GridwiseGemm::MakeCBlockIdToKNHoWoBlockClusterAdaptor(c_k_n_hop_wop_grid_desc);

        using CBlockIdToBlockClusterAdaptor_K_N_H_W =
            decltype(cblockid_to_k_n_h_w_block_cluster_adaptor);

        float ave_time = 0;
        if(has_main_e0_block_loop)
        {
            const auto kernel = kernel_gemm_dlops_v3_resize_add<
                GridwiseGemm,
                FloatAB,
                FloatC,
                remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
                remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>,
                remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
                remove_reference_t<DGridDesc_K0_K1_N_H0_H1_H2x2_W0_W1_W2x2>,
                remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
                true,
                activ_type>;

            ave_time = launch_and_time_kernel(kernel,
                                              nrepeat,
                                              dim3(grid_size),
                                              dim3(BlockSize),
                                              0,
                                              p_a_grid,
                                              p_b_grid,
                                              p_bias_grid,
                                              p_d_grid,
                                              a_e0_e1_k0_k1_e2_grid_desc,
                                              b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
                                              c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
                                              d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc,
                                              cblockid_to_k_n_h_w_block_cluster_adaptor);
        }
        else
        {
            const auto kernel = kernel_gemm_dlops_v3_resize_add<
                GridwiseGemm,
                FloatAB,
                FloatC,
                remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
                remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>,
                remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
                remove_reference_t<DGridDesc_K0_K1_N_H0_H1_H2x2_W0_W1_W2x2>,
                remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
                false,
                activ_type>;

            ave_time = launch_and_time_kernel(kernel,
                                              nrepeat,
                                              dim3(grid_size),
                                              dim3(BlockSize),
                                              0,
                                              p_a_grid,
                                              p_b_grid,
                                              p_bias_grid,
                                              p_d_grid,
                                              a_e0_e1_k0_k1_e2_grid_desc,
                                              b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
                                              c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
                                              d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc,
                                              cblockid_to_k_n_h_w_block_cluster_adaptor);
        }

        return ave_time;
    }
};
#endif
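In these v5r1 drivers, grid_size is one workgroup per output block tile: the product of tile counts along K, Ho, Wo, and the batch dimension. A small worked check under assumed sizes (all numbers hypothetical):

#include <cassert>

int main()
{
    // assumed shapes: K = 256 filters, padded output 32 x 32, batch N = 2
    const int K = 256, Hop = 32, Wop = 32, N = 2;
    const int KPerBlock = 16, HoPerBlock = 16, WoPerBlock = 16;

    // one workgroup per block tile of the K x Hop x Wop x N output
    const int grid_size = (K / KPerBlock) * (Hop / HoPerBlock) * (Wop / WoPerBlock) * N;
    assert(grid_size == 16 * 2 * 2 * 2); // 128 workgroups
    return 0;
}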
library/include/ck/library/obselete_driver_offline/driver_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp
deleted 100644 → 0
#ifndef DRIVER_CONVOLUTION_FORWARD_IMPLICIT_GEMM_V5R1_DLOPS_NC0HWc1_KC0YXC1_NK0HWK1_HPP
#define DRIVER_CONVOLUTION_FORWARD_IMPLICIT_GEMM_V5R1_DLOPS_NC0HWc1_KC0YXC1_NK0HWK1_HPP

#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "gridwise_gemm_dlops_v3.hpp"

template <ck::index_t BlockSize,
          typename FloatAB,
          typename FloatAcc,
          typename FloatC,
          ck::index_t E1_,
          ck::index_t E2_,
          ck::index_t K2_,
          ck::index_t KPerBlock,
          ck::index_t HoPerBlock,
          ck::index_t WoPerBlock,
          ck::index_t E1PerBlock,
          ck::index_t KPerThread,
          ck::index_t HoPerThread,
          ck::index_t WoPerThread,
          ck::index_t EPerThread,
          typename ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2,
          typename ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2,
          ck::index_t ABlockTransferSrcScalarPerVector_E2,
          ck::index_t ABlockTransferDstScalarPerVector_E2,
          ck::index_t BThreadTransferSrcScalarPerVector_E2,
          ck::index_t CThreadTransferDstScalarPerVector_K,
          ck::ActivTypeEnum activ_type>
struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_outpad
{
    template <typename... Wei,
              typename... In,
              typename... Out,
              typename ConvStrides,
              typename ConvDilations,
              typename InLeftPads,
              typename InRightPads>
    __host__ float Run(const ck::TensorDescriptor<Wei...>& wei_k_c0_y_x_c1_global_desc,
                       const ck::TensorDescriptor<In...>& in_n_c0_hi_wi_c1_global_desc,
                       const ck::TensorDescriptor<Out...>& out_n_k0_ho_wo_k1_global_desc,
                       const ConvStrides& conv_strides,
                       const ConvDilations& conv_dilations,
                       const InLeftPads& in_left_pads,
                       const InRightPads& in_right_pads,
                       const FloatAB* __restrict__ p_a_grid,
                       const FloatAB* __restrict__ p_b_grid,
                       const FloatC* __restrict__ p_bias_grid,
                       FloatC* __restrict__ p_c_grid,
                       const int nrepeat) const
    {
        using namespace ck;

        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};
        constexpr auto I2 = Number<2>{};
        constexpr auto I3 = Number<3>{};
        constexpr auto I4 = Number<4>{};
        const auto N  = in_n_c0_hi_wi_c1_global_desc.GetLength(I0);
        const auto C0 = in_n_c0_hi_wi_c1_global_desc.GetLength(I1);
        const auto Hi = in_n_c0_hi_wi_c1_global_desc.GetLength(I2);
        const auto Wi = in_n_c0_hi_wi_c1_global_desc.GetLength(I3);
        // const auto C1 = in_n_c0_hi_wi_c1_global_desc.GetLength(I4);

        const auto K0 = out_n_k0_ho_wo_k1_global_desc.GetLength(I1);
        const auto Ho = out_n_k0_ho_wo_k1_global_desc.GetLength(I2);
        const auto Wo = out_n_k0_ho_wo_k1_global_desc.GetLength(I3);
        const auto K1 = out_n_k0_ho_wo_k1_global_desc.GetLength(I4);

        const auto K = wei_k_c0_y_x_c1_global_desc.GetLength(I0);
        const auto Y = wei_k_c0_y_x_c1_global_desc.GetLength(I2);
        const auto X = wei_k_c0_y_x_c1_global_desc.GetLength(I3);

        const auto ConvStrideH = conv_strides[I0];
        const auto ConvStrideW = conv_strides[I1];

        const auto ConvDilationH = conv_dilations[I0];
        const auto ConvDilationW = conv_dilations[I1];

#if CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR
        const auto Hop = Number<(Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock>{};
        const auto Wop = Number<(Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock>{};
#else
        const auto Hop = (Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock;
        const auto Wop = (Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock;
#endif

        const auto OutRightPadH = Hop - Ho;
        const auto OutRightPadW = Wop - Wo;

        const auto InLeftPadH = in_left_pads[I0];
        const auto InLeftPadW = in_left_pads[I1];

        const auto InRightPadH = in_right_pads[I0] + OutRightPadH * ConvStrideH;
        const auto InRightPadW = in_right_pads[I1] + OutRightPadW * ConvStrideW;

        const auto E = C0 * Y * X;

        constexpr auto E1 = Number<E1_>{};
        constexpr auto E2 = Number<E2_>{};
        constexpr auto K2 = Number<K2_>{};

        const auto E0 = E / E1;
        // weight tensor
        const auto a_e_k_e2_grid_desc = transform_tensor_descriptor(
            make_naive_tensor_descriptor_packed(make_tuple(K, C0 * Y * X, E2)),
            make_tuple(make_pass_through_transform(K),
                       make_pass_through_transform(C0 * Y * X),
                       make_pass_through_transform(E2)),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
            make_tuple(Sequence<1>{}, Sequence<0>{}, Sequence<2>{}));

        const auto a_e0_e1_k_e2_grid_desc =
            transform_tensor_descriptor(a_e_k_e2_grid_desc,
                                        make_tuple(make_unmerge_transform(make_tuple(E0, E1)),
                                                   make_pass_through_transform(K),
                                                   make_pass_through_transform(E2)),
                                        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
                                        make_tuple(Sequence<0, 1>{}, Sequence<2>{}, Sequence<3>{}));

        // input tensor
        const auto in_n_c0_hip_wip_e2_global_desc = transform_tensor_descriptor(
            make_naive_tensor_descriptor_packed(make_tuple(N, C0, Hi, Wi, E2)),
            make_tuple(make_pass_through_transform(N),
                       make_pass_through_transform(C0),
                       make_pad_transform(Hi, InLeftPadH, InRightPadH),
                       make_pad_transform(Wi, InLeftPadW, InRightPadW),
                       make_pass_through_transform(E2)),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}));

        const auto in_n_c0_y_ho_x_wo_e2_global_desc = transform_tensor_descriptor(
            in_n_c0_hip_wip_e2_global_desc,
            make_tuple(
                make_pass_through_transform(N),
                make_pass_through_transform(C0),
                make_embed_transform(make_tuple(Y, Hop), make_tuple(ConvDilationH, ConvStrideH)),
                make_embed_transform(make_tuple(X, Wop), make_tuple(ConvDilationW, ConvStrideW)),
                make_pass_through_transform(E2)),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
            make_tuple(
                Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}, Sequence<6>{}));

        const auto in_e_n_ho_wo_e2_grid_desc = transform_tensor_descriptor(
            in_n_c0_y_ho_x_wo_e2_global_desc,
            make_tuple(make_merge_transform(make_tuple(C0, Y, X)),
                       make_pass_through_transform(N),
                       make_pass_through_transform(Hop),
                       make_pass_through_transform(Wop),
                       make_pass_through_transform(E2)),
            make_tuple(
                Sequence<1, 2, 4>{}, Sequence<0>{}, Sequence<3>{}, Sequence<5>{}, Sequence<6>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}));

        const auto b_e0_e1_n_ho_wo_e2_grid_desc = transform_tensor_descriptor(
            in_e_n_ho_wo_e2_grid_desc,
            make_tuple(make_unmerge_transform(make_tuple(E0, E1)),
                       make_pass_through_transform(N),
                       make_pass_through_transform(Hop),
                       make_pass_through_transform(Wop),
                       make_pass_through_transform(E2)),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
            make_tuple(
                Sequence<0, 1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}, Sequence<5>{}));

        // output tensor
        const auto c_k_n_hop_wop_grid_desc = transform_tensor_descriptor(
            make_naive_tensor_descriptor_packed(make_tuple(N, K0, Ho, Wo, K1)),
            make_tuple(make_merge_transform(make_tuple(K0, K1)),
                       make_pass_through_transform(N),
                       make_pad_transform(Ho, I0, OutRightPadH),
                       make_pad_transform(Wo, I0, OutRightPadW)),
            make_tuple(Sequence<1, 4>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
        std::cerr << "Hop = " << Hop << " Wop = " << Wop << std::endl;

        if(!((K % KPerBlock) == 0 && (Hop % HoPerBlock) == 0 && (Wop % WoPerBlock) == 0 &&
             (E1 % E1PerBlock) == 0))
        {
            throw std::runtime_error("wrong! GEMM size not divisible");
        }
        // clang-format off
        // hack to control index calculation when iterating over a_e0_e1_k_e2_global tensor
        constexpr auto a_e0_e1_k_e2_global_step_hacks =
            make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}),
                       make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}));

        constexpr auto a_e0_e1_k_e2_global_move_slice_window_step_hack =
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{};

        // hack to control index calculation when iterating over b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global tensor
        constexpr auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks = make_tuple(
            make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}),
            make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}));

        constexpr auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_move_slice_window_step_hack =
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{};
        // hack to control index calculation when iterating over c_k0_k1_n_h0_h1_h2_w0_w1_w2_global tensor
        constexpr auto c_k0_k1_n_h0_h1_h2_w0_w1_w2_global_tensor_step_hacks =
            make_tuple(make_tuple(Sequence<0, 1, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 1, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}),
                       make_tuple(Sequence<0, 2, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 2, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}));
        // clang-format on

        // GEMM
        using GridwiseGemm = GridwiseGemmDlops_km_kn_mn_v3<
            BlockSize,
            FloatAB,
            FloatAcc,
            FloatC,
            InMemoryDataOperationEnum::Set,
            decltype(a_e0_e1_k_e2_grid_desc),
            decltype(b_e0_e1_n_ho_wo_e2_grid_desc),
            decltype(c_k_n_hop_wop_grid_desc),
            decltype(c_k_n_hop_wop_grid_desc),
            E1,
            E2,
            K2,
            KPerBlock,
            HoPerBlock,
            WoPerBlock,
            E1PerBlock,
            KPerThread,
            HoPerThread,
            WoPerThread,
            EPerThread,
            ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2,
            ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2,
            Sequence<2, 3, 0, 1, 4>,
            Sequence<0, 1, 2, 3, 4>,
            4,
            ABlockTransferSrcScalarPerVector_E2,
            ABlockTransferDstScalarPerVector_E2,
            false, // don't move back src coordinate after threadwise copy
            Sequence<0, 1, 2, 3, 4, 5, 6, 7, 8, 9>, // E0, E1, N, H0, H1, H2, W0, W1, W2, E2
            9,
            BThreadTransferSrcScalarPerVector_E2,
            false, // don't move back src coordinate after threadwise copy, which will be fused with
                   // MoveSrcSliceWindow() to save addr computation
            Sequence<0, 1, 2, 3, 4, 5, 6, 7, 8>, // K0, K1, N, H0, H1, H2, W0, W1, W2
            1,
            CThreadTransferDstScalarPerVector_K,
            decltype(a_e0_e1_k_e2_global_step_hacks),
            decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks),
            decltype(c_k0_k1_n_h0_h1_h2_w0_w1_w2_global_tensor_step_hacks),
            decltype(c_k0_k1_n_h0_h1_h2_w0_w1_w2_global_tensor_step_hacks),
            decltype(a_e0_e1_k_e2_global_move_slice_window_step_hack),
            decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_move_slice_window_step_hack)>;
        const auto a_e0_e1_k0_k1_e2_grid_desc =
            GridwiseGemm::MakeAE0E1K0K1E2GridDescriptor(a_e0_e1_k_e2_grid_desc);
        const auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc =
            GridwiseGemm::MakeBE0E1NH0H1H2W0W1W2E2GridDescriptor(b_e0_e1_n_ho_wo_e2_grid_desc);
        const auto c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc =
            GridwiseGemm::MakeCK0K1NH0H1H2W0W1W2GridDescriptor(c_k_n_hop_wop_grid_desc);

        using AGridDesc_E0_E1_K0_K1_E2 = decltype(a_e0_e1_k0_k1_e2_grid_desc);
        using BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2 =
            decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc);
        using CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2 = decltype(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc);

        const auto grid_size = (K / KPerBlock) * (Hop / HoPerBlock) * (Wop / WoPerBlock) * N;

        const bool has_main_e0_block_loop = E0 > 1;

        std::cerr << "has_main_e0_block_loop = " << has_main_e0_block_loop << std::endl;

        const auto cblockid_to_k_n_h_w_block_cluster_adaptor =
            GridwiseGemm::MakeCBlockIdToKNHoWoBlockClusterAdaptor(c_k_n_hop_wop_grid_desc);

        using CBlockIdToBlockClusterAdaptor_K_N_H_W =
            decltype(cblockid_to_k_n_h_w_block_cluster_adaptor);

        float ave_time = 0;
        if(has_main_e0_block_loop)
        {
            const auto kernel =
                kernel_gemm_dlops_v3<GridwiseGemm,
                                     FloatAB,
                                     FloatC,
                                     remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
                                     remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>,
                                     remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
                                     remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
                                     true,
                                     activ_type>;

            ave_time = launch_and_time_kernel(kernel,
                                              nrepeat,
                                              dim3(grid_size),
                                              dim3(BlockSize),
                                              0,
                                              p_a_grid,
                                              p_b_grid,
                                              p_bias_grid,
                                              p_c_grid,
                                              a_e0_e1_k0_k1_e2_grid_desc,
                                              b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
                                              c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
                                              cblockid_to_k_n_h_w_block_cluster_adaptor);
        }
        else
        {
            const auto kernel =
                kernel_gemm_dlops_v3<GridwiseGemm,
                                     FloatAB,
                                     FloatC,
                                     remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
                                     remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>,
                                     remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
                                     remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
                                     false,
                                     activ_type>;

            ave_time = launch_and_time_kernel(kernel,
                                              nrepeat,
                                              dim3(grid_size),
                                              dim3(BlockSize),
                                              0,
                                              p_a_grid,
                                              p_b_grid,
                                              p_bias_grid,
                                              p_c_grid,
                                              a_e0_e1_k0_k1_e2_grid_desc,
                                              b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
                                              c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
                                              cblockid_to_k_n_h_w_block_cluster_adaptor);
        }

        return ave_time;
    }
};
#endif
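These v5r1 drivers all factor the reduction axis the same way: E = C0 * Y * X filter taps are split into E0 * E1, with E2 the innermost vector width, and has_main_e0_block_loop simply asks whether more than one E0 iteration exists. A worked check with assumed sizes (numbers hypothetical):

#include <cassert>

int main()
{
    // assumed convolution: C0 = 16 channel groups and a 3 x 3 filter window,
    // with E1 = 16 taken from a tuning config (a compile-time constant in the real driver)
    const int C0 = 16, Y = 3, X = 3, E1 = 16;

    const int E  = C0 * Y * X; // 144 gather positions per E2-vector
    const int E0 = E / E1;     // 9 iterations of the outer E0 loop
    assert(E0 == 9 && E % E1 == 0);

    const bool has_main_e0_block_loop = E0 > 1; // true here
    assert(has_main_e0_block_loop);
    return 0;
}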
library/include/ck/library/obselete_driver_offline/driver_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp
deleted 100644 → 0
#ifndef DRIVER_CONVOLUTION_MAXPOOL_FORWARD_IMPLICIT_GEMM_V5R1_DLOPS_NC0HWc1_KC0YXC1_NK0HWK1_HPP
#define DRIVER_CONVOLUTION_MAXPOOL_FORWARD_IMPLICIT_GEMM_V5R1_DLOPS_NC0HWc1_KC0YXC1_NK0HWK1_HPP

#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "gridwise_gemm_dlops_v3.hpp"

template <ck::index_t BlockSize,
          typename FloatAB,
          typename FloatAcc,
          typename FloatC,
          ck::index_t E1_,
          ck::index_t E2_,
          ck::index_t K2_,
          ck::index_t KPerBlock,
          ck::index_t HoPerBlock,
          ck::index_t WoPerBlock,
          ck::index_t E1PerBlock,
          ck::index_t KPerThread,
          ck::index_t HoPerThread,
          ck::index_t WoPerThread,
          ck::index_t EPerThread,
          typename ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2,
          typename ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2,
          ck::index_t ABlockTransferSrcScalarPerVector_E2,
          ck::index_t ABlockTransferDstScalarPerVector_E2,
          ck::index_t BThreadTransferSrcScalarPerVector_E2,
          ck::index_t CThreadTransferDstScalarPerVector_K,
          ck::ActivTypeEnum activ_type>
struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_maxpool
{
    template <typename... Wei,
              typename... In,
              typename... MaxPool,
              typename... Out,
              typename ConvStrides,
              typename ConvDilations,
              typename InLeftPads,
              typename InRightPads>
    __host__ float Run(const ck::TensorDescriptor<Wei...>& wei_k_c0_y_x_c1_global_desc,
                       const ck::TensorDescriptor<In...>& in_n_c0_hi_wi_c1_global_desc,
                       const ck::TensorDescriptor<Out...>& out_n_k0_ho_wo_k1_global_desc,
                       const ck::TensorDescriptor<MaxPool...>& max_n_k0_hx_wx_k1_global_desc,
                       const ConvStrides& conv_strides,
                       const ConvDilations& conv_dilations,
                       const InLeftPads& in_left_pads,
                       const InRightPads& in_right_pads,
                       const FloatAB* __restrict__ p_a_grid,
                       const FloatAB* __restrict__ p_b_grid,
                       const FloatC* __restrict__ p_bias_grid,
                       FloatC* __restrict__ p_c_grid,
                       FloatC* __restrict__ p_d_grid,
                       const int nrepeat) const
    {
        using namespace ck;

        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};
        constexpr auto I2 = Number<2>{};
        constexpr auto I3 = Number<3>{};
        constexpr auto I4 = Number<4>{};
        const auto N  = in_n_c0_hi_wi_c1_global_desc.GetLength(I0);
        const auto C0 = in_n_c0_hi_wi_c1_global_desc.GetLength(I1);
        const auto Hi = in_n_c0_hi_wi_c1_global_desc.GetLength(I2);
        const auto Wi = in_n_c0_hi_wi_c1_global_desc.GetLength(I3);
        // const auto C1 = in_n_c0_hi_wi_c1_global_desc.GetLength(I4);

        const auto K0 = out_n_k0_ho_wo_k1_global_desc.GetLength(I1);
        const auto Ho = out_n_k0_ho_wo_k1_global_desc.GetLength(I2);
        const auto Wo = out_n_k0_ho_wo_k1_global_desc.GetLength(I3);
        const auto K1 = out_n_k0_ho_wo_k1_global_desc.GetLength(I4);

        const auto Hx = max_n_k0_hx_wx_k1_global_desc.GetLength(I2);
        const auto Wx = max_n_k0_hx_wx_k1_global_desc.GetLength(I3);

        const auto K = wei_k_c0_y_x_c1_global_desc.GetLength(I0);
        const auto Y = wei_k_c0_y_x_c1_global_desc.GetLength(I2);
        const auto X = wei_k_c0_y_x_c1_global_desc.GetLength(I3);

        const auto ConvStrideH = conv_strides[I0];
        const auto ConvStrideW = conv_strides[I1];

        const auto ConvDilationH = conv_dilations[I0];
        const auto ConvDilationW = conv_dilations[I1];

#if CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR
        const auto Hop = Number<(Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock>{};
        const auto Wop = Number<(Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock>{};

        const auto OutRightPadH = Hop - Ho;
        const auto OutRightPadW = Wop - Wo;

        const auto OutRightPadHx = Number<OutRightPadH / 2>{};
        const auto OutRightPadWx = Number<OutRightPadW / 2>{};
#else
        const auto Hop = (Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock;
        const auto Wop = (Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock;

        const auto OutRightPadH = Hop - Ho;
        const auto OutRightPadW = Wop - Wo;

        const auto OutRightPadHx = OutRightPadH / 2;
        const auto OutRightPadWx = OutRightPadW / 2;
#endif

        const auto InLeftPadH = in_left_pads[I0];
        const auto InLeftPadW = in_left_pads[I1];

        const auto InRightPadH = in_right_pads[I0] + OutRightPadH * ConvStrideH;
        const auto InRightPadW = in_right_pads[I1] + OutRightPadW * ConvStrideW;

        const auto E = C0 * Y * X;

        constexpr auto E1 = Number<E1_>{};
        constexpr auto E2 = Number<E2_>{};
        constexpr auto K2 = Number<K2_>{};

        const auto E0 = E / E1;
// weight tensor
const
auto
a_e_k_e2_grid_desc
=
transform_tensor_descriptor
(
make_naive_tensor_descriptor_packed
(
make_tuple
(
K
,
C0
*
Y
*
X
,
E2
)),
make_tuple
(
make_pass_through_transform
(
K
),
make_pass_through_transform
(
C0
*
Y
*
X
),
make_pass_through_transform
(
E2
)),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{}),
make_tuple
(
Sequence
<
1
>
{},
Sequence
<
0
>
{},
Sequence
<
2
>
{}));
const
auto
a_e0_e1_k_e2_grid_desc
=
transform_tensor_descriptor
(
a_e_k_e2_grid_desc
,
make_tuple
(
make_unmerge_transform
(
make_tuple
(
E0
,
E1
)),
make_pass_through_transform
(
K
),
make_pass_through_transform
(
E2
)),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{}),
make_tuple
(
Sequence
<
0
,
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}));
// input tensor
const
auto
in_n_c0_hip_wip_e2_global_desc
=
transform_tensor_descriptor
(
make_naive_tensor_descriptor_packed
(
make_tuple
(
N
,
C0
,
Hi
,
Wi
,
E2
)),
make_tuple
(
make_pass_through_transform
(
N
),
make_pass_through_transform
(
C0
),
make_pad_transform
(
Hi
,
InLeftPadH
,
InRightPadH
),
make_pad_transform
(
Wi
,
InLeftPadW
,
InRightPadW
),
make_pass_through_transform
(
E2
)),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{},
Sequence
<
4
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{},
Sequence
<
4
>
{}));
const
auto
in_n_c0_y_ho_x_wo_e2_global_desc
=
transform_tensor_descriptor
(
in_n_c0_hip_wip_e2_global_desc
,
make_tuple
(
make_pass_through_transform
(
N
),
make_pass_through_transform
(
C0
),
make_embed_transform
(
make_tuple
(
Y
,
Hop
),
make_tuple
(
ConvDilationH
,
ConvStrideH
)),
make_embed_transform
(
make_tuple
(
X
,
Wop
),
make_tuple
(
ConvDilationW
,
ConvStrideW
)),
make_pass_through_transform
(
E2
)),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{},
Sequence
<
4
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
,
3
>
{},
Sequence
<
4
,
5
>
{},
Sequence
<
6
>
{}));
const
auto
in_e_n_ho_wo_e2_grid_desc
=
transform_tensor_descriptor
(
in_n_c0_y_ho_x_wo_e2_global_desc
,
make_tuple
(
make_merge_transform
(
make_tuple
(
C0
,
Y
,
X
)),
make_pass_through_transform
(
N
),
make_pass_through_transform
(
Hop
),
make_pass_through_transform
(
Wop
),
make_pass_through_transform
(
E2
)),
make_tuple
(
Sequence
<
1
,
2
,
4
>
{},
Sequence
<
0
>
{},
Sequence
<
3
>
{},
Sequence
<
5
>
{},
Sequence
<
6
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{},
Sequence
<
4
>
{}));
const
auto
b_e0_e1_n_ho_wo_e2_grid_desc
=
transform_tensor_descriptor
(
in_e_n_ho_wo_e2_grid_desc
,
make_tuple
(
make_unmerge_transform
(
make_tuple
(
E0
,
E1
)),
make_pass_through_transform
(
N
),
make_pass_through_transform
(
Hop
),
make_pass_through_transform
(
Wop
),
make_pass_through_transform
(
E2
)),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{},
Sequence
<
4
>
{}),
make_tuple
(
Sequence
<
0
,
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{},
Sequence
<
4
>
{},
Sequence
<
5
>
{}));
        // output tensor
        const auto c_k_n_hop_wop_grid_desc = transform_tensor_descriptor(
            make_naive_tensor_descriptor_packed(make_tuple(N, K0, Ho, Wo, K1)),
            make_tuple(make_merge_transform(make_tuple(K0, K1)),
                       make_pass_through_transform(N),
                       make_pad_transform(Ho, I0, OutRightPadH),
                       make_pad_transform(Wo, I0, OutRightPadW)),
            make_tuple(Sequence<1, 4>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
        // max tensor
        const auto d_k_n_hx_wx_grid_desc = transform_tensor_descriptor(
            make_naive_tensor_descriptor_packed(make_tuple(N, K0, Hx, Wx, K1)),
            make_tuple(make_merge_transform(make_tuple(K0, K1)),
                       make_pass_through_transform(N),
                       make_pad_transform(Hx, I0, OutRightPadHx),
                       make_pad_transform(Wx, I0, OutRightPadWx)),
            make_tuple(Sequence<1, 4>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
        std::cerr << "Hop = " << Hop << " Wop = " << Wop << std::endl;

        if(!((K % KPerBlock) == 0 && (Hop % HoPerBlock) == 0 && (Wop % WoPerBlock) == 0 &&
             (E1 % E1PerBlock) == 0))
        {
            throw std::runtime_error("wrong! GEMM size not divisible");
        }
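This check can only fail on K and E1, because Hop and Wop are the output extents already padded up to the block tile (OutRightPadH/W are derived from HoPerBlock/WoPerBlock earlier in this file). A sketch of that round-up, with illustrative names:

    // Round an output extent up to a multiple of the block tile (sketch);
    // the result is divisible by ho_per_block by construction, and
    // OutRightPadH would be round_up(Ho, HoPerBlock) - Ho.
    int round_up(int ho, int ho_per_block)
    {
        return ((ho + ho_per_block - 1) / ho_per_block) * ho_per_block;
    }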
        // clang-format off
        // hack to control index calculation when iterating over a_e0_e1_k_e2_global tensor
        constexpr auto a_e0_e1_k_e2_global_step_hacks =
            make_tuple(make_tuple(Sequence<0,0,0,0,0,0,0,0,0,0,0>{},
                                  Sequence<0,0,0,0,0,0,0,0,0,0,0>{},
                                  Sequence<0,0,0,0,0,0,0,0,0,0,0>{},
                                  Sequence<0,0,0,0,0,0,0,0,0,0,0>{},
                                  Sequence<0,0,0,0,0,0,0,0,0,0,0>{}),
                       make_tuple(Sequence<0,0,0,0,0,0,0,0,0,0,0>{},
                                  Sequence<0,0,0,0,0,0,0,0,0,0,0>{},
                                  Sequence<0,0,0,0,0,0,0,0,0,0,0>{},
                                  Sequence<0,0,0,0,0,0,0,0,0,0,0>{},
                                  Sequence<0,0,0,0,0,0,0,0,0,0,0>{}));

        constexpr auto a_e0_e1_k_e2_global_move_slice_window_step_hack =
            Sequence<0,0,0,0,0,0,0,0,0,0,0>{};

        // hack to control index calculation when iterating over b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global tensor
        constexpr auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks = make_tuple(
            make_tuple(
                Sequence<0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0>{},
                Sequence<0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0>{},
                Sequence<0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0>{},
                Sequence<0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0>{},
                Sequence<0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0>{},
                Sequence<0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0>{},
                Sequence<0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0>{},
                Sequence<0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0>{},
                Sequence<0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0>{},
                Sequence<0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0>{}),
            make_tuple(
                Sequence<0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0>{},
                Sequence<0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0>{},
                Sequence<0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0>{},
                Sequence<0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0>{},
                Sequence<0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0>{},
                Sequence<0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0>{},
                Sequence<0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0>{},
                Sequence<0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0>{},
                Sequence<0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0>{},
                Sequence<0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0>{}));

        constexpr auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_move_slice_window_step_hack =
            Sequence<0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0>{};

        constexpr auto c_k0_k1_n_h0_h1_h2_w0_w1_w2_global_tensor_step_hacks = make_tuple(
            make_tuple(Sequence<0,1,0,0,0,0,0,0,0>{},
                       Sequence<0,1,0,0,0,0,0,0,0>{},
                       Sequence<0,0,0,0,0,0,0,0,0>{},
                       Sequence<0,0,0,0,0,0,0,0,0>{},
                       Sequence<0,0,0,0,0,0,0,0,0>{},
                       Sequence<0,0,0,0,0,0,0,0,0>{},
                       Sequence<0,0,0,0,0,0,0,0,0>{},
                       Sequence<0,0,0,0,0,0,0,0,0>{},
                       Sequence<0,0,0,0,0,0,0,0,0>{}),
            make_tuple(Sequence<0,2,0,0,0,0,0,0,0>{},
                       Sequence<0,2,0,0,0,0,0,0,0>{},
                       Sequence<0,0,0,0,0,0,0,0,0>{},
                       Sequence<0,0,0,0,0,0,0,0,0>{},
                       Sequence<0,0,0,0,0,0,0,0,0>{},
                       Sequence<0,0,0,0,0,0,0,0,0>{},
                       Sequence<0,0,0,0,0,0,0,0,0>{},
                       Sequence<0,0,0,0,0,0,0,0,0>{},
                       Sequence<0,0,0,0,0,0,0,0,0>{}));

        constexpr auto d_k0_k1_n_h0_h1_hx_w0_w1_wx_global_tensor_step_hacks = make_tuple(
            make_tuple(Sequence<0,1,0,0,0,0,0,0,0>{},
                       Sequence<0,1,0,0,0,0,0,0,0>{},
                       Sequence<0,0,0,0,0,0,0,0,0>{},
                       Sequence<0,0,0,0,0,0,0,0,0>{},
                       Sequence<0,0,0,0,0,0,0,0,0>{},
                       Sequence<0,0,0,0,0,0,0,0,0>{},
                       Sequence<0,0,0,0,0,0,0,0,0>{},
                       Sequence<0,0,0,0,0,0,0,0,0>{},
                       Sequence<0,0,0,0,0,0,0,0,0>{}),
            make_tuple(Sequence<0,2,0,0,0,0,0,0,0>{},
                       Sequence<0,2,0,0,0,0,0,0,0>{},
                       Sequence<0,0,0,0,0,0,0,0,0>{},
                       Sequence<0,0,0,0,0,0,0,0,0>{},
                       Sequence<0,0,0,0,0,0,0,0,0>{},
                       Sequence<0,0,0,0,0,0,0,0,0>{},
                       Sequence<0,0,0,0,0,0,0,0,0>{},
                       Sequence<0,0,0,0,0,0,0,0,0>{},
                       Sequence<0,0,0,0,0,0,0,0,0>{}));
        // clang-format on
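These step-hack bundles carry no run-time data: they are compile-time integer sequences passed down by type so the gridwise GEMM can specialize its coordinate updates (the two inner tuples appear to cover the two stepping directions over the hidden transform dimensions); they affect address arithmetic only, never results. A self-contained sketch of the general mechanism, with illustrative names:

    #include <cstddef>

    // Minimal stand-in for ck::Sequence: the values live only in the type.
    template <int... Is>
    struct StepHackSeq
    {
        static constexpr std::size_t size = sizeof...(Is);
    };

    // A kernel template can inspect the hack entirely at compile time.
    template <typename Hack>
    constexpr bool has_hack_info() { return Hack::size > 0; }

    static_assert(has_hack_info<StepHackSeq<0, 0, 1>>(), "compile-time only");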
        // GEMM
        using GridwiseGemm = GridwiseGemmDlops_km_kn_mn_v3<
            BlockSize,
            FloatAB,
            FloatAcc,
            FloatC,
            InMemoryDataOperationEnum::Set,
            decltype(a_e0_e1_k_e2_grid_desc),
            decltype(b_e0_e1_n_ho_wo_e2_grid_desc),
            decltype(c_k_n_hop_wop_grid_desc),
            decltype(d_k_n_hx_wx_grid_desc),
            E1,
            E2,
            K2,
            KPerBlock,
            HoPerBlock,
            WoPerBlock,
            E1PerBlock,
            KPerThread,
            HoPerThread,
            WoPerThread,
            EPerThread,
            ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2,
            ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2,
            Sequence<2, 3, 0, 1, 4>,
            Sequence<0, 1, 2, 3, 4>,
            4,
            ABlockTransferSrcScalarPerVector_E2,
            ABlockTransferDstScalarPerVector_E2,
            false, // don't move back src coordinate after threadwise copy
            Sequence<0, 1, 2, 3, 4, 5, 6, 7, 8, 9>, // E0, E1, N, H0, H1, H2, W0, W1, W2, E2
            9,
            BThreadTransferSrcScalarPerVector_E2,
            false, // don't move back src coordinate after threadwise copy, which will be fused
                   // with MoveSrcSliceWindow() to save addr computation
            Sequence<0, 1, 2, 3, 4, 5, 6, 7, 8>, // K0, K1, N, H0, H1, H2, W0, W1, W2
            1,
            CThreadTransferDstScalarPerVector_K,
            decltype(a_e0_e1_k_e2_global_step_hacks),
            decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks),
            decltype(c_k0_k1_n_h0_h1_h2_w0_w1_w2_global_tensor_step_hacks),
            decltype(d_k0_k1_n_h0_h1_hx_w0_w1_wx_global_tensor_step_hacks),
            decltype(a_e0_e1_k_e2_global_move_slice_window_step_hack),
            decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_move_slice_window_step_hack)>;
        const auto a_e0_e1_k0_k1_e2_grid_desc =
            GridwiseGemm::MakeAE0E1K0K1E2GridDescriptor(a_e0_e1_k_e2_grid_desc);
        const auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc =
            GridwiseGemm::MakeBE0E1NH0H1H2W0W1W2E2GridDescriptor(b_e0_e1_n_ho_wo_e2_grid_desc);
        const auto c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc =
            GridwiseGemm::MakeCK0K1NH0H1H2W0W1W2GridDescriptor(c_k_n_hop_wop_grid_desc);
        const auto d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc =
            GridwiseGemm::MakeDK0K1NH0H1HxW0W1WxGridDescriptorMaxPool(d_k_n_hx_wx_grid_desc);

        using AGridDesc_E0_E1_K0_K1_E2 = decltype(a_e0_e1_k0_k1_e2_grid_desc);
        using BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2 =
            decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc);
        using CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2 =
            decltype(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc);
        using DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx =
            decltype(d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc);
        const auto grid_size = (K / KPerBlock) * (Hop / HoPerBlock) * (Wop / WoPerBlock) * N;

        const bool has_main_e0_block_loop = E0 > 1;

        std::cerr << "has_main_e0_block_loop = " << has_main_e0_block_loop << std::endl;
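One workgroup is launched per output tile, so the grid covers the K, padded-H, padded-W, and batch dimensions exactly once; the divisibility check above guarantees every division here is exact. A worked example with illustrative sizes:

    // Illustrative sizes only.
    constexpr int K = 256, Hop = 28, Wop = 28, N = 4;
    constexpr int KPerBlock = 64, HoPerBlock = 14, WoPerBlock = 14;
    constexpr int grid_size =
        (K / KPerBlock) * (Hop / HoPerBlock) * (Wop / WoPerBlock) * N;
    static_assert(grid_size == 4 * 2 * 2 * 4, "64 workgroups");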
        const auto cblockid_to_k_n_h_w_block_cluster_adaptor =
            GridwiseGemm::MakeCBlockIdToKNHoWoBlockClusterAdaptor(c_k_n_hop_wop_grid_desc);

        using CBlockIdToBlockClusterAdaptor_K_N_H_W =
            decltype(cblockid_to_k_n_h_w_block_cluster_adaptor);

        float ave_time = 0;
        if(has_main_e0_block_loop)
        {
            const auto kernel = kernel_gemm_dlops_v3_maxpool<
                GridwiseGemm,
                FloatAB,
                FloatC,
                remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
                remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>,
                remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
                remove_reference_t<DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx>,
                remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
                true,
                activ_type>;

            ave_time = launch_and_time_kernel(kernel,
                                              nrepeat,
                                              dim3(grid_size),
                                              dim3(BlockSize),
                                              0,
                                              p_a_grid,
                                              p_b_grid,
                                              p_bias_grid,
                                              p_c_grid,
                                              p_d_grid,
                                              a_e0_e1_k0_k1_e2_grid_desc,
                                              b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
                                              c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
                                              d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc,
                                              cblockid_to_k_n_h_w_block_cluster_adaptor);
        }
        else
        {
            const auto kernel = kernel_gemm_dlops_v3_maxpool<
                GridwiseGemm,
                FloatAB,
                FloatC,
                remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
                remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>,
                remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
                remove_reference_t<DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx>,
                remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
                false,
                activ_type>;

            ave_time = launch_and_time_kernel(kernel,
                                              nrepeat,
                                              dim3(grid_size),
                                              dim3(BlockSize),
                                              0,
                                              p_a_grid,
                                              p_b_grid,
                                              p_bias_grid,
                                              p_c_grid,
                                              p_d_grid,
                                              a_e0_e1_k0_k1_e2_grid_desc,
                                              b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
                                              c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
                                              d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc,
                                              cblockid_to_k_n_h_w_block_cluster_adaptor);
        }

        return ave_time;
    }
};
#endif
library/include/ck/library/obselete_driver_offline/driver_gemm_dlops_v1r2.hpp deleted 100644 → 0
#ifndef DRIVER_GEMM_DLOPS_V1R2
#define DRIVER_GEMM_DLOPS_V1R2

#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "gridwise_gemm_dlops_v1r2.hpp"

template <ck::index_t BlockSize,
          typename FloatAB,
          typename FloatAcc,
          typename FloatC,
          ck::InMemoryDataOperationEnum CGlobalMemoryDataOperation,
          typename AKMGridDesc,
          typename BKNGridDesc,
          typename CMNGridDesc,
          ck::index_t MPerBlock,
          ck::index_t NPerBlock,
          ck::index_t KPerBlock,
          ck::index_t M1PerThread,
          ck::index_t N1PerThread,
          ck::index_t KPerThread,
          ck::index_t M1N1ThreadClusterM10,
          ck::index_t M1N1ThreadClusterN10,
          ck::index_t M1N1ThreadClusterM11,
          ck::index_t M1N1ThreadClusterN11,
          typename ABlockTransferThreadSliceLengths_K_M0_M1,
          typename ABlockTransferThreadClusterLengths_K_M0_M1,
          typename ABlockTransferThreadClusterArrangeOrder,
          typename ABlockTransferSrcAccessOrder,
          ck::index_t ABlockTransferSrcVectorDim,
          ck::index_t ABlockTransferSrcScalarPerVector,
          ck::index_t ABlockTransferDstScalarPerVector_M1,
          bool AThreadTransferSrcResetCoordinateAfterRun,
          typename BBlockTransferThreadSliceLengths_K_N0_N1,
          typename BBlockTransferThreadClusterLengths_K_N0_N1,
          typename BBlockTransferThreadClusterArrangeOrder,
          typename BBlockTransferSrcAccessOrder,
          ck::index_t BBlockTransferSrcVectorDim,
          ck::index_t BBlockTransferSrcScalarPerVector,
          ck::index_t BBlockTransferDstScalarPerVector_N1,
          bool BThreadTransferSrcResetCoordinateAfterRun,
          typename CThreadTransferSrcDstAccessOrder,
          ck::index_t CThreadTransferSrcDstVectorDim,
          ck::index_t CThreadTransferDstScalarPerVector,
          typename AGridStepHacks,
          typename BGridStepHacks,
          typename CGridStepHacks,
          typename AGridMoveSliceWindowStepHacks,
          typename BGridMoveSliceWindowStepHacks>
__host__ float driver_gemm_dlops_v1r2(const FloatAB* p_a_grid,
                                      const FloatAB* p_b_grid,
                                      FloatC* p_c_grid,
                                      const AKMGridDesc& a_k_m_grid_desc,
                                      const BKNGridDesc& b_k_n_grid_desc,
                                      const CMNGridDesc& c_m_n_grid_desc,
                                      AGridStepHacks,
                                      BGridStepHacks,
                                      CGridStepHacks,
                                      AGridMoveSliceWindowStepHacks,
                                      BGridMoveSliceWindowStepHacks,
                                      ck::index_t nrepeat)
{
    using namespace ck;

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};
    constexpr auto I4 = Number<4>{};
    constexpr auto I5 = Number<5>{};
    // GEMM
    using GridwiseGemm =
        GridwiseGemmDlops_km_kn_mn_v1r2<BlockSize,
                                        FloatAB,
                                        FloatAcc,
                                        FloatC,
                                        CGlobalMemoryDataOperation,
                                        AKMGridDesc,
                                        BKNGridDesc,
                                        CMNGridDesc,
                                        MPerBlock,
                                        NPerBlock,
                                        KPerBlock,
                                        M1PerThread,
                                        N1PerThread,
                                        KPerThread,
                                        M1N1ThreadClusterM10,
                                        M1N1ThreadClusterN10,
                                        M1N1ThreadClusterM11,
                                        M1N1ThreadClusterN11,
                                        ABlockTransferThreadSliceLengths_K_M0_M1,
                                        ABlockTransferThreadClusterLengths_K_M0_M1,
                                        ABlockTransferThreadClusterArrangeOrder,
                                        ABlockTransferSrcAccessOrder,
                                        ABlockTransferSrcVectorDim,
                                        ABlockTransferSrcScalarPerVector,
                                        ABlockTransferDstScalarPerVector_M1,
                                        AThreadTransferSrcResetCoordinateAfterRun,
                                        BBlockTransferThreadSliceLengths_K_N0_N1,
                                        BBlockTransferThreadClusterLengths_K_N0_N1,
                                        BBlockTransferThreadClusterArrangeOrder,
                                        BBlockTransferSrcAccessOrder,
                                        BBlockTransferSrcVectorDim,
                                        BBlockTransferSrcScalarPerVector,
                                        BBlockTransferDstScalarPerVector_N1,
                                        BThreadTransferSrcResetCoordinateAfterRun,
                                        CThreadTransferSrcDstAccessOrder,
                                        CThreadTransferSrcDstVectorDim,
                                        CThreadTransferDstScalarPerVector,
                                        AGridStepHacks,
                                        BGridStepHacks,
                                        CGridStepHacks,
                                        AGridMoveSliceWindowStepHacks,
                                        BGridMoveSliceWindowStepHacks>;
    const auto M = a_k_m_grid_desc.GetLength(I1);
    const auto N = b_k_n_grid_desc.GetLength(I1);
    const auto K = a_k_m_grid_desc.GetLength(I0);

    if(!GridwiseGemm::CheckValidity(a_k_m_grid_desc, b_k_n_grid_desc, c_m_n_grid_desc))
    {
        throw std::runtime_error("wrong! GridwiseGemmDlops_km_kn_mn_v1r2 has invalid setting");
    }
    const auto a_k_m0_m1_grid_desc = GridwiseGemm::MakeAKM0M1GridDescriptor(a_k_m_grid_desc);
    const auto b_k_n0_n1_grid_desc = GridwiseGemm::MakeBKN0N1GridDescriptor(b_k_n_grid_desc);

    using AKM0M1GridDesc = decltype(a_k_m0_m1_grid_desc);
    using BKN0N1GridDesc = decltype(b_k_n0_n1_grid_desc);

    // c_m0_m10_m11_n0_n10_n11_grid_desc
    const auto c_m0_m10_m11_n0_n10_n11_grid_desc =
        GridwiseGemm::MakeCM0M10M11N0N10N11GridDescriptor(c_m_n_grid_desc);

    using CM0M10M11N0N10N11GridDesc = decltype(c_m0_m10_m11_n0_n10_n11_grid_desc);

    // cblockid_to_m0_n0_block_cluster_adaptor
    const auto cblockid_to_m0_n0_block_cluster_adaptor =
        GridwiseGemm::MakeCBlockIdToM0N0BlockClusterAdaptor(c_m_n_grid_desc);

    using CBlockIdToM0N0BlockClusterAdaptor = decltype(cblockid_to_m0_n0_block_cluster_adaptor);
    const index_t grid_size = GridwiseGemm::CalculateGridSize(M, N);

    const bool has_main_k_block_loop = GridwiseGemm::CalculateHasMainKBlockLoop(K);
    const bool has_double_tail_k_block_loop = GridwiseGemm::CalculateHasDoubleTailKBlockLoop(K);
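The two predicates are lifted into the kernel's boolean template parameters below, which is why four nearly identical branches follow: each combination compiles its own loop structure, and the run-time test merely selects among the prebuilt kernels. A self-contained sketch of the pattern, with illustrative names:

    #include <cstdio>

    // Each instantiation bakes its loop structure in at compile time.
    template <bool HasMainLoop, bool HasDoubleTail>
    void run_variant() { std::printf("%d %d\n", HasMainLoop, HasDoubleTail); }

    void dispatch(bool main_loop, bool double_tail)
    {
        if(main_loop && double_tail)       run_variant<true, true>();
        else if(main_loop && !double_tail) run_variant<true, false>();
        else if(!main_loop && double_tail) run_variant<false, true>();
        else                               run_variant<false, false>();
    }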
    {
        std::cout << "a_k_m0_m1_grid_desc{" << a_k_m0_m1_grid_desc.GetLength(I0) << ", "
                  << a_k_m0_m1_grid_desc.GetLength(I1) << ", "
                  << a_k_m0_m1_grid_desc.GetLength(I2) << "}" << std::endl;

        std::cout << "b_k_n0_n1_grid_desc{" << b_k_n0_n1_grid_desc.GetLength(I0) << ", "
                  << b_k_n0_n1_grid_desc.GetLength(I1) << ", "
                  << b_k_n0_n1_grid_desc.GetLength(I2) << "}" << std::endl;

        std::cout << "c_m0_m10_m11_n0_n10_n11_grid_desc{ "
                  << c_m0_m10_m11_n0_n10_n11_grid_desc.GetLength(I0) << ", "
                  << c_m0_m10_m11_n0_n10_n11_grid_desc.GetLength(I1) << ", "
                  << c_m0_m10_m11_n0_n10_n11_grid_desc.GetLength(I2) << ", "
                  << c_m0_m10_m11_n0_n10_n11_grid_desc.GetLength(I3) << ", "
                  << c_m0_m10_m11_n0_n10_n11_grid_desc.GetLength(I4) << ", "
                  << c_m0_m10_m11_n0_n10_n11_grid_desc.GetLength(I5) << "}" << std::endl;
    }
    float ave_time = 0;

    if(has_main_k_block_loop && has_double_tail_k_block_loop)
    {
        const auto kernel =
            kernel_gemm_dlops_v1r2<GridwiseGemm,
                                   FloatAB,
                                   FloatC,
                                   remove_reference_t<AKM0M1GridDesc>,
                                   remove_reference_t<BKN0N1GridDesc>,
                                   remove_reference_t<CM0M10M11N0N10N11GridDesc>,
                                   remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
                                   true,
                                   true>;

        ave_time = launch_and_time_kernel(kernel,
                                          nrepeat,
                                          dim3(grid_size),
                                          dim3(BlockSize),
                                          0,
                                          p_a_grid,
                                          p_b_grid,
                                          p_c_grid,
                                          a_k_m0_m1_grid_desc,
                                          b_k_n0_n1_grid_desc,
                                          c_m0_m10_m11_n0_n10_n11_grid_desc,
                                          cblockid_to_m0_n0_block_cluster_adaptor);
    }
    else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
    {
        const auto kernel =
            kernel_gemm_dlops_v1r2<GridwiseGemm,
                                   FloatAB,
                                   FloatC,
                                   remove_reference_t<AKM0M1GridDesc>,
                                   remove_reference_t<BKN0N1GridDesc>,
                                   remove_reference_t<CM0M10M11N0N10N11GridDesc>,
                                   remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
                                   true,
                                   false>;

        ave_time = launch_and_time_kernel(kernel,
                                          nrepeat,
                                          dim3(grid_size),
                                          dim3(BlockSize),
                                          0,
                                          p_a_grid,
                                          p_b_grid,
                                          p_c_grid,
                                          a_k_m0_m1_grid_desc,
                                          b_k_n0_n1_grid_desc,
                                          c_m0_m10_m11_n0_n10_n11_grid_desc,
                                          cblockid_to_m0_n0_block_cluster_adaptor);
    }
    else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
    {
        const auto kernel =
            kernel_gemm_dlops_v1r2<GridwiseGemm,
                                   FloatAB,
                                   FloatC,
                                   remove_reference_t<AKM0M1GridDesc>,
                                   remove_reference_t<BKN0N1GridDesc>,
                                   remove_reference_t<CM0M10M11N0N10N11GridDesc>,
                                   remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
                                   false,
                                   true>;

        ave_time = launch_and_time_kernel(kernel,
                                          nrepeat,
                                          dim3(grid_size),
                                          dim3(BlockSize),
                                          0,
                                          p_a_grid,
                                          p_b_grid,
                                          p_c_grid,
                                          a_k_m0_m1_grid_desc,
                                          b_k_n0_n1_grid_desc,
                                          c_m0_m10_m11_n0_n10_n11_grid_desc,
                                          cblockid_to_m0_n0_block_cluster_adaptor);
    }
    else
    {
        const auto kernel =
            kernel_gemm_dlops_v1r2<GridwiseGemm,
                                   FloatAB,
                                   FloatC,
                                   remove_reference_t<AKM0M1GridDesc>,
                                   remove_reference_t<BKN0N1GridDesc>,
                                   remove_reference_t<CM0M10M11N0N10N11GridDesc>,
                                   remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
                                   false,
                                   false>;

        ave_time = launch_and_time_kernel(kernel,
                                          nrepeat,
                                          dim3(grid_size),
                                          dim3(BlockSize),
                                          0,
                                          p_a_grid,
                                          p_b_grid,
                                          p_c_grid,
                                          a_k_m0_m1_grid_desc,
                                          b_k_n0_n1_grid_desc,
                                          c_m0_m10_m11_n0_n10_n11_grid_desc,
                                          cblockid_to_m0_n0_block_cluster_adaptor);
    }

    return ave_time;
}
#endif
library/include/ck/library/obselete_driver_offline/driver_gemm_dlops_v1r3.hpp deleted 100644 → 0
#ifndef DRIVER_GEMM_DLOPS_V1R3
#define DRIVER_GEMM_DLOPS_V1R3

#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "gridwise_gemm_dlops_v1r3.hpp"

template <ck::index_t BlockSize,
          typename FloatAB,
          typename FloatAcc,
          typename FloatC,
          ck::InMemoryDataOperationEnum CGlobalMemoryDataOperation,
          typename AK0MK1GridDesc,
          typename BK0NK1GridDesc,
          typename CMNGridDesc,
          ck::index_t MPerBlock,
          ck::index_t NPerBlock,
          ck::index_t KPerBlock,
          ck::index_t M1PerThread,
          ck::index_t N1PerThread,
          ck::index_t KPerThread,
          typename M1N1ThreadClusterM1Xs,
          typename M1N1ThreadClusterN1Xs,
          typename ABlockTransferThreadSliceLengths_K0_M0_M1_K1,
          typename ABlockTransferThreadClusterLengths_K0_M0_M1_K1,
          typename ABlockTransferThreadClusterArrangeOrder,
          typename ABlockTransferSrcAccessOrder,
          typename ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1,
          typename ABlockTransferSrcVectorTensorContiguousDimOrder,
          typename ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1,
          typename BBlockTransferThreadSliceLengths_K0_N0_N1_K1,
          typename BBlockTransferThreadClusterLengths_K0_N0_N1_K1,
          typename BBlockTransferThreadClusterArrangeOrder,
          typename BBlockTransferSrcAccessOrder,
          typename BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1,
          typename BBlockTransferSrcVectorTensorContiguousDimOrder,
          typename BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1,
          typename CThreadTransferSrcDstAccessOrder,
          ck::index_t CThreadTransferSrcDstVectorDim,
          ck::index_t CThreadTransferDstScalarPerVector,
          typename AGridStepHacks,
          typename BGridStepHacks,
          typename CGridStepHacks,
          typename AGridMoveSliceWindowStepHacks,
          typename BGridMoveSliceWindowStepHacks>
__host__ float driver_gemm_dlops_v1r3(const FloatAB* p_a_grid,
                                      const FloatAB* p_b_grid,
                                      FloatC* p_c_grid,
                                      const AK0MK1GridDesc& a_k0_m_k1_grid_desc,
                                      const BK0NK1GridDesc& b_k0_n_k1_grid_desc,
                                      const CMNGridDesc& c_m_n_grid_desc,
                                      AGridStepHacks,
                                      BGridStepHacks,
                                      CGridStepHacks,
                                      AGridMoveSliceWindowStepHacks,
                                      BGridMoveSliceWindowStepHacks,
                                      ck::index_t nrepeat)
{
    using namespace ck;

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};
    constexpr auto I4 = Number<4>{};
    constexpr auto I5 = Number<5>{};
    // GEMM
    using GridwiseGemm =
        GridwiseGemmDlops_km_kn_mn_v1r3<BlockSize,
                                        FloatAB,
                                        FloatAcc,
                                        FloatC,
                                        CGlobalMemoryDataOperation,
                                        AK0MK1GridDesc,
                                        BK0NK1GridDesc,
                                        CMNGridDesc,
                                        MPerBlock,
                                        NPerBlock,
                                        KPerBlock,
                                        M1PerThread,
                                        N1PerThread,
                                        KPerThread,
                                        M1N1ThreadClusterM1Xs,
                                        M1N1ThreadClusterN1Xs,
                                        ABlockTransferThreadSliceLengths_K0_M0_M1_K1,
                                        ABlockTransferThreadClusterLengths_K0_M0_M1_K1,
                                        ABlockTransferThreadClusterArrangeOrder,
                                        ABlockTransferSrcAccessOrder,
                                        ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1,
                                        ABlockTransferSrcVectorTensorContiguousDimOrder,
                                        ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1,
                                        BBlockTransferThreadSliceLengths_K0_N0_N1_K1,
                                        BBlockTransferThreadClusterLengths_K0_N0_N1_K1,
                                        BBlockTransferThreadClusterArrangeOrder,
                                        BBlockTransferSrcAccessOrder,
                                        BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1,
                                        BBlockTransferSrcVectorTensorContiguousDimOrder,
                                        BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1,
                                        CThreadTransferSrcDstAccessOrder,
                                        CThreadTransferSrcDstVectorDim,
                                        CThreadTransferDstScalarPerVector,
                                        AGridStepHacks,
                                        BGridStepHacks,
                                        CGridStepHacks,
                                        AGridMoveSliceWindowStepHacks,
                                        BGridMoveSliceWindowStepHacks>;
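Relative to v1r2, this revision describes vectorized access with per-dimension length sequences (the *VectorTensorLengths_K0_M0_M1_K1 parameters) instead of a single (vector dim, scalars-per-vector) pair, letting a copy vectorize along K1 while staying scalar elsewhere. A hedged illustration of the kind of argument such a parameter takes; the 4-wide K1 choice is invented for the example:

    // Illustrative only: vectorize 4-wide along K1, scalar on K0/M0/M1.
    using ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1_Example =
        ck::Sequence<1, 1, 1, 4>;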
    const auto M  = a_k0_m_k1_grid_desc.GetLength(I1);
    const auto N  = b_k0_n_k1_grid_desc.GetLength(I1);
    const auto K0 = a_k0_m_k1_grid_desc.GetLength(I0);

    if(!GridwiseGemm::CheckValidity(a_k0_m_k1_grid_desc, b_k0_n_k1_grid_desc, c_m_n_grid_desc))
    {
        throw std::runtime_error("wrong! GridwiseGemmDlops_km_kn_mn_v1r3 has invalid setting");
    }
    const auto a_k0_m0_m1_k1_grid_desc =
        GridwiseGemm::MakeAK0M0M1K1GridDescriptor(a_k0_m_k1_grid_desc);
    const auto b_k0_n0_n1_k1_grid_desc =
        GridwiseGemm::MakeBK0N0N1K1GridDescriptor(b_k0_n_k1_grid_desc);

    using AK0M0M1K1GridDesc = decltype(a_k0_m0_m1_k1_grid_desc);
    using BK0N0N1K1GridDesc = decltype(b_k0_n0_n1_k1_grid_desc);

    // c_m0_m10_m11_n0_n10_n11_grid_desc
    const auto c_m0_m10_m11_n0_n10_n11_grid_desc =
        GridwiseGemm::MakeCM0M10M11N0N10N11GridDescriptor(c_m_n_grid_desc);

    using CM0M10M11N0N10N11GridDesc = decltype(c_m0_m10_m11_n0_n10_n11_grid_desc);

    // cblockid_to_m0_n0_block_cluster_adaptor
    const auto cblockid_to_m0_n0_block_cluster_adaptor =
        GridwiseGemm::MakeCBlockIdToM0N0BlockClusterAdaptor(c_m_n_grid_desc);

    using CBlockIdToM0N0BlockClusterAdaptor = decltype(cblockid_to_m0_n0_block_cluster_adaptor);
    const index_t grid_size = GridwiseGemm::CalculateGridSize(M, N);

    const bool has_main_k_block_loop = GridwiseGemm::CalculateHasMainKBlockLoop(K0);
    const bool has_double_tail_k_block_loop = GridwiseGemm::CalculateHasDoubleTailKBlockLoop(K0);
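Unlike v1r2, which takes A as [K, M], this driver takes A pre-split as [K0, M, K1] with K = K0 * K1, so the loop predicates above are computed from K0, the number of outer reduction steps. A sketch of constructing such a descriptor with the CK helpers this file includes; all sizes are illustrative:

    // Illustrative only: K = 512 split as K0 = 128 outer steps of K1 = 4.
    const ck::index_t M = 1024, K1 = 4, K0 = 512 / K1;
    const auto a_k0_m_k1_grid_desc =
        make_naive_tensor_descriptor_packed(make_tuple(K0, M, K1));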
    {
        std::cout << "a_k0_m0_m1_k1_grid_desc{" << a_k0_m0_m1_k1_grid_desc.GetLength(I0) << ", "
                  << a_k0_m0_m1_k1_grid_desc.GetLength(I1) << ", "
                  << a_k0_m0_m1_k1_grid_desc.GetLength(I2) << ", "
                  << a_k0_m0_m1_k1_grid_desc.GetLength(I3) << "}" << std::endl;

        std::cout << "b_k0_n0_n1_k1_grid_desc{" << b_k0_n0_n1_k1_grid_desc.GetLength(I0) << ", "
                  << b_k0_n0_n1_k1_grid_desc.GetLength(I1) << ", "
                  << b_k0_n0_n1_k1_grid_desc.GetLength(I2) << ", "
                  << b_k0_n0_n1_k1_grid_desc.GetLength(I3) << "}" << std::endl;

        std::cout << "c_m0_m10_m11_n0_n10_n11_grid_desc{ "
                  << c_m0_m10_m11_n0_n10_n11_grid_desc.GetLength(I0) << ", "
                  << c_m0_m10_m11_n0_n10_n11_grid_desc.GetLength(I1) << ", "
                  << c_m0_m10_m11_n0_n10_n11_grid_desc.GetLength(I2) << ", "
                  << c_m0_m10_m11_n0_n10_n11_grid_desc.GetLength(I3) << ", "
                  << c_m0_m10_m11_n0_n10_n11_grid_desc.GetLength(I4) << ", "
                  << c_m0_m10_m11_n0_n10_n11_grid_desc.GetLength(I5) << "}" << std::endl;
    }
    float ave_time = 0;

    if(has_main_k_block_loop && has_double_tail_k_block_loop)
    {
        const auto kernel =
            kernel_gemm_dlops_v1r3<GridwiseGemm,
                                   FloatAB,
                                   FloatC,
                                   remove_reference_t<AK0M0M1K1GridDesc>,
                                   remove_reference_t<BK0N0N1K1GridDesc>,
                                   remove_reference_t<CM0M10M11N0N10N11GridDesc>,
                                   remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
                                   true,
                                   true>;

        ave_time = launch_and_time_kernel(kernel,
                                          nrepeat,
                                          dim3(grid_size),
                                          dim3(BlockSize),
                                          0,
                                          p_a_grid,
                                          p_b_grid,
                                          p_c_grid,
                                          a_k0_m0_m1_k1_grid_desc,
                                          b_k0_n0_n1_k1_grid_desc,
                                          c_m0_m10_m11_n0_n10_n11_grid_desc,
                                          cblockid_to_m0_n0_block_cluster_adaptor);
    }
    else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
    {
        const auto kernel =
            kernel_gemm_dlops_v1r3<GridwiseGemm,
                                   FloatAB,
                                   FloatC,
                                   remove_reference_t<AK0M0M1K1GridDesc>,
                                   remove_reference_t<BK0N0N1K1GridDesc>,
                                   remove_reference_t<CM0M10M11N0N10N11GridDesc>,
                                   remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
                                   true,
                                   false>;

        ave_time = launch_and_time_kernel(kernel,
                                          nrepeat,
                                          dim3(grid_size),
                                          dim3(BlockSize),
                                          0,
                                          p_a_grid,
                                          p_b_grid,
                                          p_c_grid,
                                          a_k0_m0_m1_k1_grid_desc,
                                          b_k0_n0_n1_k1_grid_desc,
                                          c_m0_m10_m11_n0_n10_n11_grid_desc,
                                          cblockid_to_m0_n0_block_cluster_adaptor);
    }
    else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
    {
        const auto kernel =
            kernel_gemm_dlops_v1r3<GridwiseGemm,
                                   FloatAB,
                                   FloatC,
                                   remove_reference_t<AK0M0M1K1GridDesc>,
                                   remove_reference_t<BK0N0N1K1GridDesc>,
                                   remove_reference_t<CM0M10M11N0N10N11GridDesc>,
                                   remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
                                   false,
                                   true>;

        ave_time = launch_and_time_kernel(kernel,
                                          nrepeat,
                                          dim3(grid_size),
                                          dim3(BlockSize),
                                          0,
                                          p_a_grid,
                                          p_b_grid,
                                          p_c_grid,
                                          a_k0_m0_m1_k1_grid_desc,
                                          b_k0_n0_n1_k1_grid_desc,
                                          c_m0_m10_m11_n0_n10_n11_grid_desc,
                                          cblockid_to_m0_n0_block_cluster_adaptor);
    }
    else
    {
        const auto kernel =
            kernel_gemm_dlops_v1r3<GridwiseGemm,
                                   FloatAB,
                                   FloatC,
                                   remove_reference_t<AK0M0M1K1GridDesc>,
                                   remove_reference_t<BK0N0N1K1GridDesc>,
                                   remove_reference_t<CM0M10M11N0N10N11GridDesc>,
                                   remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
                                   false,
                                   false>;

        ave_time = launch_and_time_kernel(kernel,
                                          nrepeat,
                                          dim3(grid_size),
                                          dim3(BlockSize),
                                          0,
                                          p_a_grid,
                                          p_b_grid,
                                          p_c_grid,
                                          a_k0_m0_m1_k1_grid_desc,
                                          b_k0_n0_n1_k1_grid_desc,
                                          c_m0_m10_m11_n0_n10_n11_grid_desc,
                                          cblockid_to_m0_n0_block_cluster_adaptor);
    }

    return ave_time;
}
#endif
library/include/ck/library/obselete_driver_offline/driver_gemm_xdlops_v2r3.hpp deleted 100644 → 0
#ifndef DRIVER_GEMM_XDLOPS_V2R3_HPP
#define DRIVER_GEMM_XDLOPS_V2R3_HPP

#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "gridwise_gemm_xdlops_v2r3.hpp"
#include "element_wise_operation.hpp"

template <ck::index_t BlockSize,
          typename FloatAB,
          typename FloatAcc,
          typename FloatC,
          ck::InMemoryDataOperationEnum CGlobalMemoryDataOperation,
          typename AGridDesc_K0_M_K1,
          typename BGridDesc_K0_N_K,
          typename CMNGridDesc,
          ck::index_t MPerBlock,
          ck::index_t NPerBlock,
          ck::index_t KPerBlock,
          ck::index_t MPerXDL,
          ck::index_t NPerXDL,
          ck::index_t K1,
          ck::index_t MRepeat,
          ck::index_t NRepeat,
          typename ABlockTransferThreadSliceLengths_K0_M_K1,
          typename ABlockTransferThreadClusterLengths_K0_M_K1,
          typename ABlockTransferThreadClusterArrangeOrder,
          typename ABlockTransferSrcAccessOrder,
          ck::index_t ABlockTransferSrcVectorDim,
          ck::index_t ABlockTransferSrcScalarPerVector,
          ck::index_t ABlockTransferDstScalarPerVector_K1,
          bool AThreadTransferSrcResetCoordinateAfterRun,
          typename BBlockTransferThreadSliceLengths_K0_N_K1,
          typename BBlockTransferThreadClusterLengths_K0_N_K1,
          typename BBlockTransferThreadClusterArrangeOrder,
          typename BBlockTransferSrcAccessOrder,
          ck::index_t BBlockTransferSrcVectorDim,
          ck::index_t BBlockTransferSrcScalarPerVector,
          ck::index_t BBlockTransferDstScalarPerVector_K1,
          bool BThreadTransferSrcResetCoordinateAfterRun,
          typename CThreadTransferSrcDstAccessOrder,
          ck::index_t CThreadTransferSrcDstVectorDim,
          ck::index_t CThreadTransferDstScalarPerVector,
          typename AGridStepHacks,
          typename BGridStepHacks,
          typename CGridStepHacks,
          typename AGridMoveSliceWindowStepHacks,
          typename BGridMoveSliceWindowStepHacks,
          bool CAccessOrderMRepeatNRepeat,
          bool ABlockLdsAddExtraM,
          bool BBlockLdsAddExtraN>
__host__ float driver_gemm_xdlops_v2r3(const FloatAB* p_a_grid,
                                       const FloatAB* p_b_grid,
                                       FloatC* p_c_grid,
                                       const AGridDesc_K0_M_K1& a_grid_desc_k0_m_k1,
                                       const BGridDesc_K0_N_K& b_grid_desc_k0_n_k1,
                                       const CMNGridDesc& c_grid_desc_m_n,
                                       ck::index_t M01,
                                       ck::index_t N01,
                                       AGridStepHacks,
                                       BGridStepHacks,
                                       CGridStepHacks,
                                       AGridMoveSliceWindowStepHacks,
                                       BGridMoveSliceWindowStepHacks,
                                       ck::index_t nrepeat)
{
    using namespace ck;

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};

    using ElementwiseOperation = ck::tensor_operation::element_wise::PassThrough;
    using GridwiseGemm =
        GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3<BlockSize,
                                                FloatAB,
                                                FloatAcc,
                                                FloatC,
                                                CGlobalMemoryDataOperation,
                                                AGridDesc_K0_M_K1,
                                                BGridDesc_K0_N_K,
                                                CMNGridDesc,
                                                ElementwiseOperation,
                                                ElementwiseOperation,
                                                ElementwiseOperation,
                                                MPerBlock,
                                                NPerBlock,
                                                KPerBlock,
                                                MPerXDL,
                                                NPerXDL,
                                                K1,
                                                MRepeat,
                                                NRepeat,
                                                ABlockTransferThreadClusterLengths_K0_M_K1,
                                                ABlockTransferThreadClusterArrangeOrder,
                                                ABlockTransferSrcAccessOrder,
                                                ABlockTransferSrcVectorDim,
                                                ABlockTransferSrcScalarPerVector,
                                                ABlockTransferDstScalarPerVector_K1,
                                                AThreadTransferSrcResetCoordinateAfterRun,
                                                ABlockLdsAddExtraM,
                                                BBlockTransferThreadClusterLengths_K0_N_K1,
                                                BBlockTransferThreadClusterArrangeOrder,
                                                BBlockTransferSrcAccessOrder,
                                                BBlockTransferSrcVectorDim,
                                                BBlockTransferSrcScalarPerVector,
                                                BBlockTransferDstScalarPerVector_K1,
                                                BThreadTransferSrcResetCoordinateAfterRun,
                                                BBlockLdsAddExtraN,
                                                CThreadTransferSrcDstAccessOrder,
                                                CThreadTransferSrcDstVectorDim,
                                                CThreadTransferDstScalarPerVector>;
    {
        std::cout << "a_grid_desc_k0_m_k1{" << a_grid_desc_k0_m_k1.GetLength(I0) << ", "
                  << a_grid_desc_k0_m_k1.GetLength(I1) << ", "
                  << a_grid_desc_k0_m_k1.GetLength(I2) << "}" << std::endl;

        std::cout << "b_grid_desc_k0_n_k1{" << b_grid_desc_k0_n_k1.GetLength(I0) << ", "
                  << b_grid_desc_k0_n_k1.GetLength(I1) << ", "
                  << b_grid_desc_k0_n_k1.GetLength(I2) << "}" << std::endl;

        std::cout << "c_grid_desc_m_n{ " << c_grid_desc_m_n.GetLength(I0) << ", "
                  << c_grid_desc_m_n.GetLength(I1) << "}" << std::endl;
    }
    if(!GridwiseGemm::CheckValidity(
           a_grid_desc_k0_m_k1, b_grid_desc_k0_n_k1, c_grid_desc_m_n, M01, N01))
    {
        throw std::runtime_error(
            "wrong! GridwiseGemm_km_kn_m0m1n0n1_xdlops_v2r3 has invalid setting");
    }
    const auto c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc =
        GridwiseGemm::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(c_grid_desc_m_n);

    using CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2 = decltype(c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc);

    const auto block_2_ctile_map =
        GridwiseGemm::MakeDefaultBlock2CTileMap(c_grid_desc_m_n, M01, N01);

    using Block2CTileMap = decltype(block_2_ctile_map);
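MakeDefaultBlock2CTileMap turns a linear workgroup id into an (m0, n0) tile coordinate of C; the M01/N01 arguments set how many neighbouring tiles are grouped together, a swizzle intended to improve cache reuse across adjacent workgroups. A plain sketch of the simplest such mapping (illustrative, not the exact adaptor this call builds):

    struct TileCoord { int m0, n0; };

    // Row-major linear id -> tile coordinate; grid_n = N / NPerBlock.
    TileCoord block_id_to_tile(int block_id, int grid_n)
    {
        return TileCoord{block_id / grid_n, block_id % grid_n};
    }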
    const index_t grid_size = GridwiseGemm::CalculateGridSize(c_grid_desc_m_n);

    const auto K0 = a_grid_desc_k0_m_k1.GetLength(I0);

    const bool has_main_k0_block_loop = GridwiseGemm::CalculateHasMainK0BlockLoop(K0);

    float ave_time = 0;

    auto element_op_ = ElementwiseOperation{};
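This obsolete driver hard-wires the PassThrough elementwise operation for A, B, and C, i.e. every value flows through unchanged; the newer device-op interfaces expose these three operators as user parameters instead. A sketch of what PassThrough amounts to (the real one lives in ck::tensor_operation::element_wise):

    struct PassThroughSketch
    {
        template <typename T>
        void operator()(T& y, const T& x) const { y = x; }
    };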
    if(has_main_k0_block_loop)
    {
        const auto kernel =
            kernel_gemm_xdlops_v2r3<GridwiseGemm,
                                    FloatAB,
                                    FloatC,
                                    remove_reference_t<AGridDesc_K0_M_K1>,
                                    remove_reference_t<BGridDesc_K0_N_K>,
                                    remove_reference_t<CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2>,
                                    ElementwiseOperation,
                                    ElementwiseOperation,
                                    ElementwiseOperation,
                                    remove_reference_t<Block2CTileMap>,
                                    true>;

        ave_time = launch_and_time_kernel(kernel,
                                          nrepeat,
                                          dim3(grid_size),
                                          dim3(BlockSize),
                                          0,
                                          p_a_grid,
                                          p_b_grid,
                                          p_c_grid,
                                          a_grid_desc_k0_m_k1,
                                          b_grid_desc_k0_n_k1,
                                          c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc,
                                          element_op_,
                                          element_op_,
                                          element_op_,
                                          block_2_ctile_map);
    }
    else
    {
        const auto kernel =
            kernel_gemm_xdlops_v2r3<GridwiseGemm,
                                    FloatAB,
                                    FloatC,
                                    remove_reference_t<AGridDesc_K0_M_K1>,
                                    remove_reference_t<BGridDesc_K0_N_K>,
                                    remove_reference_t<CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2>,
                                    ElementwiseOperation,
                                    ElementwiseOperation,
                                    ElementwiseOperation,
                                    remove_reference_t<Block2CTileMap>,
                                    false>;

        ave_time = launch_and_time_kernel(kernel,
                                          nrepeat,
                                          dim3(grid_size),
                                          dim3(BlockSize),
                                          0,
                                          p_a_grid,
                                          p_b_grid,
                                          p_c_grid,
                                          a_grid_desc_k0_m_k1,
                                          b_grid_desc_k0_n_k1,
                                          c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc,
                                          element_op_,
                                          element_op_,
                                          element_op_,
                                          block_2_ctile_map);
    }

    return ave_time;
}
#endif