Merge remote-tracking branch 'origin/develop' into contraction

7a3b49e5 · Chao Liu · e07b3d8e · d3051d75 · e07b3d8e · e07b3d8e
Commit 7a3b49e5 authored Jun 25, 2022 by Chao Liu
20 changed files
--- a/library/include/ck/library/obselete_driver_offline/driver_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp
+++ b/library/include/ck/library/obselete_driver_offline/driver_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp
-#ifndef DRIVER_CONVOLUTION_FORWARD_IMPLICIT_GEMM_V5R1_DLOPS_NC0HWc1_KC0YXC1_NK0HWK1_HPP
-#define DRIVER_CONVOLUTION_FORWARD_IMPLICIT_GEMM_V5R1_DLOPS_NC0HWc1_KC0YXC1_NK0HWK1_HPP
-#include "common_header.hpp"
-#include "tensor_descriptor.hpp"
-#include "tensor_descriptor_helper.hpp"
-#include "gridwise_gemm_dlops_v3.hpp"
-template <ck::index_t BlockSize,
-          typename FloatAB,
-          typename FloatAcc,
-          typename FloatC,
-          ck::index_t E1_,
-          ck::index_t E2_,
-          ck::index_t K2_,
-          ck::index_t KPerBlock,
-          ck::index_t HoPerBlock,
-          ck::index_t WoPerBlock,
-          ck::index_t E1PerBlock,
-          ck::index_t KPerThread,
-          ck::index_t HoPerThread,
-          ck::index_t WoPerThread,
-          ck::index_t EPerThread,
-          typename ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2,
-          typename ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2,
-          ck::index_t ABlockTransferSrcScalarPerVector_E2,
-          ck::index_t ABlockTransferDstScalarPerVector_E2,
-          ck::index_t BThreadTransferSrcScalarPerVector_E2,
-          ck::index_t CThreadTransferDstScalarPerVector_K,
-          ck::ActivTypeEnum activ_type>
-struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_outpad
-{
-    template <typename... Wei,
-              typename... In,
-              typename... Out,
-              typename ConvStrides,
-              typename ConvDilations,
-              typename InLeftPads,
-              typename InRightPads>
-    __host__ float Run(const ck::TensorDescriptor<Wei...>& wei_k_c0_y_x_c1_global_desc,
-                       const ck::TensorDescriptor<In...>& in_n_c0_hi_wi_c1_global_desc,
-                       const ck::TensorDescriptor<Out...>& out_n_k0_ho_wo_k1_global_desc,
-                       const ConvStrides& conv_strides,
-                       const ConvDilations& conv_dilations,
-                       const InLeftPads& in_left_pads,
-                       const InRightPads& in_right_pads,
-                       const FloatAB* __restrict__ p_a_grid,
-                       const FloatAB* __restrict__ p_b_grid,
-                       const FloatC* __restrict__ p_bias_grid,
-                       FloatC* __restrict__ p_c_grid,
-                       const int nrepeat) const
-    {
-        using namespace ck;
-        constexpr auto I0 = Number<0>{};
-        constexpr auto I1 = Number<1>{};
-        constexpr auto I2 = Number<2>{};
-        constexpr auto I3 = Number<3>{};
-        constexpr auto I4 = Number<4>{};
-        const auto N  = in_n_c0_hi_wi_c1_global_desc.GetLength(I0);
-        const auto C0 = in_n_c0_hi_wi_c1_global_desc.GetLength(I1);
-        const auto Hi = in_n_c0_hi_wi_c1_global_desc.GetLength(I2);
-        const auto Wi = in_n_c0_hi_wi_c1_global_desc.GetLength(I3);
-        // const auto C1 = in_n_c0_hi_wi_c1_global_desc.GetLength(I4);
-        const auto K0 = out_n_k0_ho_wo_k1_global_desc.GetLength(I1);
-        const auto Ho = out_n_k0_ho_wo_k1_global_desc.GetLength(I2);
-        const auto Wo = out_n_k0_ho_wo_k1_global_desc.GetLength(I3);
-        const auto K1 = out_n_k0_ho_wo_k1_global_desc.GetLength(I4);
-        const auto K = wei_k_c0_y_x_c1_global_desc.GetLength(I0);
-        const auto Y = wei_k_c0_y_x_c1_global_desc.GetLength(I2);
-        const auto X = wei_k_c0_y_x_c1_global_desc.GetLength(I3);
-        const auto ConvStrideH = conv_strides[I0];
-        const auto ConvStrideW = conv_strides[I1];
-        const auto ConvDilationH = conv_dilations[I0];
-        const auto ConvDilationW = conv_dilations[I1];
-#if CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR
-        const auto Hop = Number<(Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock>{};
-        const auto Wop = Number<(Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock>{};
-#else
-        const auto Hop = (Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock;
-        const auto Wop = (Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock;
-#endif
-        const auto OutRightPadH = Hop - Ho;
-        const auto OutRightPadW = Wop - Wo;
-        const auto InLeftPadH = in_left_pads[I0];
-        const auto InLeftPadW = in_left_pads[I1];
-        const auto InRightPadH = in_right_pads[I0] + OutRightPadH * ConvStrideH;
-        const auto InRightPadW = in_right_pads[I1] + OutRightPadW * ConvStrideW;
-        const auto E = C0 * Y * X;
-        constexpr auto E1 = Number<E1_>{};
-        constexpr auto E2 = Number<E2_>{};
-        constexpr auto K2 = Number<K2_>{};
-        const auto E0 = E / E1;
-        // weight tensor
-        const auto a_e_k_e2_grid_desc = transform_tensor_descriptor(
-            make_naive_tensor_descriptor_packed(make_tuple(K, C0 * Y * X, E2)),
-            make_tuple(make_pass_through_transform(K),
-                       make_pass_through_transform(C0 * Y * X),
-                       make_pass_through_transform(E2)),
-            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
-            make_tuple(Sequence<1>{}, Sequence<0>{}, Sequence<2>{}));
-        const auto a_e0_e1_k_e2_grid_desc =
-            transform_tensor_descriptor(a_e_k_e2_grid_desc,
-                                        make_tuple(make_unmerge_transform(make_tuple(E0, E1)),
-                                                   make_pass_through_transform(K),
-                                                   make_pass_through_transform(E2)),
-                                        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
-                                        make_tuple(Sequence<0, 1>{}, Sequence<2>{}, Sequence<3>{}));
-        // input tensor
-        const auto in_n_c0_hip_wip_e2_global_desc = transform_tensor_descriptor(
-            make_naive_tensor_descriptor_packed(make_tuple(N, C0, Hi, Wi, E2)),
-            make_tuple(make_pass_through_transform(N),
-                       make_pass_through_transform(C0),
-                       make_pad_transform(Hi, InLeftPadH, InRightPadH),
-                       make_pad_transform(Wi, InLeftPadW, InRightPadW),
-                       make_pass_through_transform(E2)),
-            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
-            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}));
-        const auto in_n_c0_y_ho_x_wo_e2_global_desc = transform_tensor_descriptor(
-            in_n_c0_hip_wip_e2_global_desc,
-            make_tuple(
-                make_pass_through_transform(N),
-                make_pass_through_transform(C0),
-                make_embed_transform(make_tuple(Y, Hop), make_tuple(ConvDilationH, ConvStrideH)),
-                make_embed_transform(make_tuple(X, Wop), make_tuple(ConvDilationW, ConvStrideW)),
-                make_pass_through_transform(E2)),
-            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
-            make_tuple(
-                Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}, Sequence<6>{}));
-        const auto in_e_n_ho_wo_e2_grid_desc = transform_tensor_descriptor(
-            in_n_c0_y_ho_x_wo_e2_global_desc,
-            make_tuple(make_merge_transform(make_tuple(C0, Y, X)),
-                       make_pass_through_transform(N),
-                       make_pass_through_transform(Hop),
-                       make_pass_through_transform(Wop),
-                       make_pass_through_transform(E2)),
-            make_tuple(
-                Sequence<1, 2, 4>{}, Sequence<0>{}, Sequence<3>{}, Sequence<5>{}, Sequence<6>{}),
-            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}));
-        const auto b_e0_e1_n_ho_wo_e2_grid_desc = transform_tensor_descriptor(
-            in_e_n_ho_wo_e2_grid_desc,
-            make_tuple(make_unmerge_transform(make_tuple(E0, E1)),
-                       make_pass_through_transform(N),
-                       make_pass_through_transform(Hop),
-                       make_pass_through_transform(Wop),
-                       make_pass_through_transform(E2)),
-            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
-            make_tuple(
-                Sequence<0, 1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}, Sequence<5>{}));
-        // output tensor
-        const auto c_k_n_hop_wop_grid_desc = transform_tensor_descriptor(
-            make_naive_tensor_descriptor_packed(make_tuple(N, K0, Ho, Wo, K1)),
-            make_tuple(make_merge_transform(make_tuple(K0, K1)),
-                       make_pass_through_transform(N),
-                       make_pad_transform(Ho, I0, OutRightPadH),
-                       make_pad_transform(Wo, I0, OutRightPadW)),
-            make_tuple(Sequence<1, 4>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}),
-            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
-        std::cerr << "Hop = " << Hop << " Wop = " << Wop << std::endl;
-        if(!((K % KPerBlock) == 0 && (Hop % HoPerBlock) == 0 && (Wop % WoPerBlock) == 0 &&
-             (E1 % E1PerBlock) == 0))
-        {
-            throw std::runtime_error("wrong! GEMM size no divisible");
-        }
-        // clang-format off
-        // hack to control index calculation when iterating over a_e0_e1_k_e2_global tensor
-        constexpr auto a_e0_e1_k_e2_global_step_hacks =
-            make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}),
-                       make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}));
-        constexpr auto a_e0_e1_k_e2_global_move_slice_window_step_hack =
-            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{};
-        // hack to control index calculation when iterating over b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global tensor
-        constexpr auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks = 
-            make_tuple(
-                make_tuple(
-                    Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, 
-                    Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, 
-                    Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, 
-                    Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, 
-                    Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, 
-                    Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, 
-                    Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, 
-                    Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, 
-                    Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, 
-                    Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}), 
-                make_tuple(
-                    Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, 
-                    Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, 
-                    Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, 
-                    Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, 
-                    Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, 
-                    Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, 
-                    Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, 
-                    Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, 
-                    Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, 
-                    Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{})
-            ); 
-        constexpr auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_move_slice_window_step_hack =
-            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{};
-        // hack to control index calculation when iterating over c_k0_k1_n_h0_h1_h2_w0_w1_w2_global tensor
-        constexpr auto c_k0_k1_n_h0_h1_h2_w0_w1_w2_global_tensor_step_hacks =
-            make_tuple(make_tuple(Sequence<0, 1, 0, 0, 0, 0, 0, 0, 0>{},
-                                  Sequence<0, 1, 0, 0, 0, 0, 0, 0, 0>{},
-                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}),
-                       make_tuple(Sequence<0, 2, 0, 0, 0, 0, 0, 0, 0>{},
-                                  Sequence<0, 2, 0, 0, 0, 0, 0, 0, 0>{},
-                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}));
-        // clang-format on
-        // GEMM
-        using GridwiseGemm = GridwiseGemmDlops_km_kn_mn_v3<
-            BlockSize,
-            FloatAB,
-            FloatAcc,
-            FloatC,
-            InMemoryDataOperationEnum::Set,
-            decltype(a_e0_e1_k_e2_grid_desc),
-            decltype(b_e0_e1_n_ho_wo_e2_grid_desc),
-            decltype(c_k_n_hop_wop_grid_desc),
-            decltype(c_k_n_hop_wop_grid_desc),
-            E1,
-            E2,
-            K2,
-            KPerBlock,
-            HoPerBlock,
-            WoPerBlock,
-            E1PerBlock,
-            KPerThread,
-            HoPerThread,
-            WoPerThread,
-            EPerThread,
-            ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2,
-            ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2,
-            Sequence<2, 3, 0, 1, 4>,
-            Sequence<0, 1, 2, 3, 4>,
-            4,
-            ABlockTransferSrcScalarPerVector_E2,
-            ABlockTransferDstScalarPerVector_E2,
-            false, // don't move back src coordinate after threadwise copy
-            Sequence<0, 1, 2, 3, 4, 5, 6, 7, 8, 9>, // E0, E1, N, H0, H1, H2, W0, W1, W2, E2
-            9,
-            BThreadTransferSrcScalarPerVector_E2,
-            false, // don't move back src coordinate after threadwise copy, which will be fused with
-                   // MoveSrcSliceWindow() to save addr computation
-            Sequence<0, 1, 2, 3, 4, 5, 6, 7, 8>, // K0, K1, N, H0, H1, H2, W0, W1, W2
-            1,
-            CThreadTransferDstScalarPerVector_K,
-            decltype(a_e0_e1_k_e2_global_step_hacks),
-            decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks),
-            decltype(c_k0_k1_n_h0_h1_h2_w0_w1_w2_global_tensor_step_hacks),
-            decltype(c_k0_k1_n_h0_h1_h2_w0_w1_w2_global_tensor_step_hacks),
-            decltype(a_e0_e1_k_e2_global_move_slice_window_step_hack),
-            decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_move_slice_window_step_hack)>;
-        const auto a_e0_e1_k0_k1_e2_grid_desc =
-            GridwiseGemm::MakeAE0E1K0K1E2GridDescriptor(a_e0_e1_k_e2_grid_desc);
-        const auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc =
-            GridwiseGemm::MakeBE0E1NH0H1H2W0W1W2E2GridDescriptor(b_e0_e1_n_ho_wo_e2_grid_desc);
-        const auto c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc =
-            GridwiseGemm::MakeCK0K1NH0H1H2W0W1W2GridDescriptor(c_k_n_hop_wop_grid_desc);
-        using AGridDesc_E0_E1_K0_K1_E2 = decltype(a_e0_e1_k0_k1_e2_grid_desc);
-        using BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2 =
-            decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc);
-        using CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2 = decltype(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc);
-        const auto grid_size = (K / KPerBlock) * (Hop / HoPerBlock) * (Wop / WoPerBlock) * N;
-        const bool has_main_e0_block_loop = E0 > 1;
-        std::cerr << "has_main_e0_block_loop = " << has_main_e0_block_loop << std::endl;
-        const auto cblockid_to_k_n_h_w_block_cluster_adaptor =
-            GridwiseGemm::MakeCBlockIdToKNHoWoBlockClusterAdaptor(c_k_n_hop_wop_grid_desc);
-        using CBlockIdToBlockClusterAdaptor_K_N_H_W =
-            decltype(cblockid_to_k_n_h_w_block_cluster_adaptor);
-        float ave_time = 0;
-        if(has_main_e0_block_loop)
-        {
-            const auto kernel =
-                kernel_gemm_dlops_v3<GridwiseGemm,
-                                     FloatAB,
-                                     FloatC,
-                                     remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
-                                     remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>,
-                                     remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
-                                     remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
-                                     true,
-                                     activ_type>;
-            ave_time = launch_and_time_kernel(kernel,
-                                              nrepeat,
-                                              dim3(grid_size),
-                                              dim3(BlockSize),
-                                              0,
-                                              p_a_grid,
-                                              p_b_grid,
-                                              p_bias_grid,
-                                              p_c_grid,
-                                              a_e0_e1_k0_k1_e2_grid_desc,
-                                              b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
-                                              c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
-                                              cblockid_to_k_n_h_w_block_cluster_adaptor);
-        }
-        else
-        {
-            const auto kernel =
-                kernel_gemm_dlops_v3<GridwiseGemm,
-                                     FloatAB,
-                                     FloatC,
-                                     remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
-                                     remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>,
-                                     remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
-                                     remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
-                                     false,
-                                     activ_type>;
-            ave_time = launch_and_time_kernel(kernel,
-                                              nrepeat,
-                                              dim3(grid_size),
-                                              dim3(BlockSize),
-                                              0,
-                                              p_a_grid,
-                                              p_b_grid,
-                                              p_bias_grid,
-                                              p_c_grid,
-                                              a_e0_e1_k0_k1_e2_grid_desc,
-                                              b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
-                                              c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
-                                              cblockid_to_k_n_h_w_block_cluster_adaptor);
-        }
-        return ave_time;
-    }
-};
-#endif
--- a/library/include/ck/library/obselete_driver_offline/driver_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp
+++ b/library/include/ck/library/obselete_driver_offline/driver_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp
-#ifndef DRIVER_CONVOLUTION_MAXPOOL_FORWARD_IMPLICIT_GEMM_V5R1_DLOPS_NC0HWc1_KC0YXC1_NK0HWK1_HPP
-#define DRIVER_CONVOLUTION_MAXPOOL_FORWARD_IMPLICIT_GEMM_V5R1_DLOPS_NC0HWc1_KC0YXC1_NK0HWK1_HPP
-#include "common_header.hpp"
-#include "tensor_descriptor.hpp"
-#include "tensor_descriptor_helper.hpp"
-#include "gridwise_gemm_dlops_v3.hpp"
-template <ck::index_t BlockSize,
-          typename FloatAB,
-          typename FloatAcc,
-          typename FloatC,
-          ck::index_t E1_,
-          ck::index_t E2_,
-          ck::index_t K2_,
-          ck::index_t KPerBlock,
-          ck::index_t HoPerBlock,
-          ck::index_t WoPerBlock,
-          ck::index_t E1PerBlock,
-          ck::index_t KPerThread,
-          ck::index_t HoPerThread,
-          ck::index_t WoPerThread,
-          ck::index_t EPerThread,
-          typename ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2,
-          typename ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2,
-          ck::index_t ABlockTransferSrcScalarPerVector_E2,
-          ck::index_t ABlockTransferDstScalarPerVector_E2,
-          ck::index_t BThreadTransferSrcScalarPerVector_E2,
-          ck::index_t CThreadTransferDstScalarPerVector_K,
-          ck::ActivTypeEnum activ_type>
-struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_maxpool
-{
-    template <typename... Wei,
-              typename... In,
-              typename... MaxPool,
-              typename... Out,
-              typename ConvStrides,
-              typename ConvDilations,
-              typename InLeftPads,
-              typename InRightPads>
-    __host__ float Run(const ck::TensorDescriptor<Wei...>& wei_k_c0_y_x_c1_global_desc,
-                       const ck::TensorDescriptor<In...>& in_n_c0_hi_wi_c1_global_desc,
-                       const ck::TensorDescriptor<Out...>& out_n_k0_ho_wo_k1_global_desc,
-                       const ck::TensorDescriptor<MaxPool...>& max_n_k0_hx_wx_k1_global_desc,
-                       const ConvStrides& conv_strides,
-                       const ConvDilations& conv_dilations,
-                       const InLeftPads& in_left_pads,
-                       const InRightPads& in_right_pads,
-                       const FloatAB* __restrict__ p_a_grid,
-                       const FloatAB* __restrict__ p_b_grid,
-                       const FloatC* __restrict__ p_bias_grid,
-                       FloatC* __restrict__ p_c_grid,
-                       FloatC* __restrict__ p_d_grid,
-                       const int nrepeat) const
-    {
-        using namespace ck;
-        constexpr auto I0 = Number<0>{};
-        constexpr auto I1 = Number<1>{};
-        constexpr auto I2 = Number<2>{};
-        constexpr auto I3 = Number<3>{};
-        constexpr auto I4 = Number<4>{};
-        const auto N  = in_n_c0_hi_wi_c1_global_desc.GetLength(I0);
-        const auto C0 = in_n_c0_hi_wi_c1_global_desc.GetLength(I1);
-        const auto Hi = in_n_c0_hi_wi_c1_global_desc.GetLength(I2);
-        const auto Wi = in_n_c0_hi_wi_c1_global_desc.GetLength(I3);
-        // const auto C1 = in_n_c0_hi_wi_c1_global_desc.GetLength(I4);
-        const auto K0 = out_n_k0_ho_wo_k1_global_desc.GetLength(I1);
-        const auto Ho = out_n_k0_ho_wo_k1_global_desc.GetLength(I2);
-        const auto Wo = out_n_k0_ho_wo_k1_global_desc.GetLength(I3);
-        const auto K1 = out_n_k0_ho_wo_k1_global_desc.GetLength(I4);
-        const auto Hx = max_n_k0_hx_wx_k1_global_desc.GetLength(I2);
-        const auto Wx = max_n_k0_hx_wx_k1_global_desc.GetLength(I3);
-        const auto K = wei_k_c0_y_x_c1_global_desc.GetLength(I0);
-        const auto Y = wei_k_c0_y_x_c1_global_desc.GetLength(I2);
-        const auto X = wei_k_c0_y_x_c1_global_desc.GetLength(I3);
-        const auto ConvStrideH = conv_strides[I0];
-        const auto ConvStrideW = conv_strides[I1];
-        const auto ConvDilationH = conv_dilations[I0];
-        const auto ConvDilationW = conv_dilations[I1];
-#if CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR
-        const auto Hop = Number<(Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock>{};
-        const auto Wop = Number<(Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock>{};
-        const auto OutRightPadH = Hop - Ho;
-        const auto OutRightPadW = Wop - Wo;
-        const auto OutRightPadHx = Number<OutRightPadH / 2>{};
-        const auto OutRightPadWx = Number<OutRightPadW / 2>{};
-#else
-        const auto Hop = (Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock;
-        const auto Wop = (Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock;
-        const auto OutRightPadH = Hop - Ho;
-        const auto OutRightPadW = Wop - Wo;
-        const auto OutRightPadHx = OutRightPadH / 2;
-        const auto OutRightPadWx = OutRightPadW / 2;
-#endif
-        const auto InLeftPadH = in_left_pads[I0];
-        const auto InLeftPadW = in_left_pads[I1];
-        const auto InRightPadH = in_right_pads[I0] + OutRightPadH * ConvStrideH;
-        const auto InRightPadW = in_right_pads[I1] + OutRightPadW * ConvStrideW;
-        const auto E = C0 * Y * X;
-        constexpr auto E1 = Number<E1_>{};
-        constexpr auto E2 = Number<E2_>{};
-        constexpr auto K2 = Number<K2_>{};
-        const auto E0 = E / E1;
-        // weight tensor
-        const auto a_e_k_e2_grid_desc = transform_tensor_descriptor(
-            make_naive_tensor_descriptor_packed(make_tuple(K, C0 * Y * X, E2)),
-            make_tuple(make_pass_through_transform(K),
-                       make_pass_through_transform(C0 * Y * X),
-                       make_pass_through_transform(E2)),
-            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
-            make_tuple(Sequence<1>{}, Sequence<0>{}, Sequence<2>{}));
-        const auto a_e0_e1_k_e2_grid_desc =
-            transform_tensor_descriptor(a_e_k_e2_grid_desc,
-                                        make_tuple(make_unmerge_transform(make_tuple(E0, E1)),
-                                                   make_pass_through_transform(K),
-                                                   make_pass_through_transform(E2)),
-                                        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
-                                        make_tuple(Sequence<0, 1>{}, Sequence<2>{}, Sequence<3>{}));
-        // input tensor
-        const auto in_n_c0_hip_wip_e2_global_desc = transform_tensor_descriptor(
-            make_naive_tensor_descriptor_packed(make_tuple(N, C0, Hi, Wi, E2)),
-            make_tuple(make_pass_through_transform(N),
-                       make_pass_through_transform(C0),
-                       make_pad_transform(Hi, InLeftPadH, InRightPadH),
-                       make_pad_transform(Wi, InLeftPadW, InRightPadW),
-                       make_pass_through_transform(E2)),
-            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
-            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}));
-        const auto in_n_c0_y_ho_x_wo_e2_global_desc = transform_tensor_descriptor(
-            in_n_c0_hip_wip_e2_global_desc,
-            make_tuple(
-                make_pass_through_transform(N),
-                make_pass_through_transform(C0),
-                make_embed_transform(make_tuple(Y, Hop), make_tuple(ConvDilationH, ConvStrideH)),
-                make_embed_transform(make_tuple(X, Wop), make_tuple(ConvDilationW, ConvStrideW)),
-                make_pass_through_transform(E2)),
-            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
-            make_tuple(
-                Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}, Sequence<6>{}));
-        const auto in_e_n_ho_wo_e2_grid_desc = transform_tensor_descriptor(
-            in_n_c0_y_ho_x_wo_e2_global_desc,
-            make_tuple(make_merge_transform(make_tuple(C0, Y, X)),
-                       make_pass_through_transform(N),
-                       make_pass_through_transform(Hop),
-                       make_pass_through_transform(Wop),
-                       make_pass_through_transform(E2)),
-            make_tuple(
-                Sequence<1, 2, 4>{}, Sequence<0>{}, Sequence<3>{}, Sequence<5>{}, Sequence<6>{}),
-            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}));
-        const auto b_e0_e1_n_ho_wo_e2_grid_desc = transform_tensor_descriptor(
-            in_e_n_ho_wo_e2_grid_desc,
-            make_tuple(make_unmerge_transform(make_tuple(E0, E1)),
-                       make_pass_through_transform(N),
-                       make_pass_through_transform(Hop),
-                       make_pass_through_transform(Wop),
-                       make_pass_through_transform(E2)),
-            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
-            make_tuple(
-                Sequence<0, 1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}, Sequence<5>{}));
-        // output tensor
-        const auto c_k_n_hop_wop_grid_desc = transform_tensor_descriptor(
-            make_naive_tensor_descriptor_packed(make_tuple(N, K0, Ho, Wo, K1)),
-            make_tuple(make_merge_transform(make_tuple(K0, K1)),
-                       make_pass_through_transform(N),
-                       make_pad_transform(Ho, I0, OutRightPadH),
-                       make_pad_transform(Wo, I0, OutRightPadW)),
-            make_tuple(Sequence<1, 4>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}),
-            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
-        // max tensor
-        const auto d_k_n_hx_wx_grid_desc = transform_tensor_descriptor(
-            make_naive_tensor_descriptor_packed(make_tuple(N, K0, Hx, Wx, K1)),
-            make_tuple(make_merge_transform(make_tuple(K0, K1)),
-                       make_pass_through_transform(N),
-                       make_pad_transform(Hx, I0, OutRightPadHx),
-                       make_pad_transform(Wx, I0, OutRightPadWx)),
-            make_tuple(Sequence<1, 4>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}),
-            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
-        std::cerr << "Hop = " << Hop << " Wop = " << Wop << std::endl;
-        if(!((K % KPerBlock) == 0 && (Hop % HoPerBlock) == 0 && (Wop % WoPerBlock) == 0 &&
-             (E1 % E1PerBlock) == 0))
-        {
-            throw std::runtime_error("wrong! GEMM size no divisible");
-        }
-        // clang-format off
-        // hack to control index calculation when iterating over a_e0_e1_k_e2_global tensor
-        constexpr auto a_e0_e1_k_e2_global_step_hacks =
-            make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}),
-                       make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}));
-        constexpr auto a_e0_e1_k_e2_global_move_slice_window_step_hack =
-            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{};
-        // hack to control index calculation when iterating over b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global tensor
-        constexpr auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks = 
-            make_tuple(
-                make_tuple(
-                    Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, 
-                    Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, 
-                    Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, 
-                    Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, 
-                    Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, 
-                    Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, 
-                    Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, 
-                    Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, 
-                    Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, 
-                    Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}), 
-                make_tuple(
-                    Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, 
-                    Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, 
-                    Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, 
-                    Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, 
-                    Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, 
-                    Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, 
-                    Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, 
-                    Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, 
-                    Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, 
-                    Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{})
-            ); 
-        constexpr auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_move_slice_window_step_hack =
-            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{};
-        constexpr auto c_k0_k1_n_h0_h1_h2_w0_w1_w2_global_tensor_step_hacks =
-            make_tuple(make_tuple(Sequence<0, 1, 0, 0, 0, 0, 0, 0, 0>{},
-                                  Sequence<0, 1, 0, 0, 0, 0, 0, 0, 0>{},
-                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}),
-                       make_tuple(Sequence<0, 2, 0, 0, 0, 0, 0, 0, 0>{},
-                                  Sequence<0, 2, 0, 0, 0, 0, 0, 0, 0>{},
-                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}));
-        constexpr auto d_k0_k1_n_h0_h1_hx_w0_w1_wx_global_tensor_step_hacks =
-            make_tuple(make_tuple(Sequence<0, 1, 0, 0, 0, 0, 0, 0, 0>{},
-                                  Sequence<0, 1, 0, 0, 0, 0, 0, 0, 0>{},
-                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}),
-                       make_tuple(Sequence<0, 2, 0, 0, 0, 0, 0, 0, 0>{},
-                                  Sequence<0, 2, 0, 0, 0, 0, 0, 0, 0>{},
-                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}));
-        // clang-format on
-        // GEMM
-        using GridwiseGemm = GridwiseGemmDlops_km_kn_mn_v3<
-            BlockSize,
-            FloatAB,
-            FloatAcc,
-            FloatC,
-            InMemoryDataOperationEnum::Set,
-            decltype(a_e0_e1_k_e2_grid_desc),
-            decltype(b_e0_e1_n_ho_wo_e2_grid_desc),
-            decltype(c_k_n_hop_wop_grid_desc),
-            decltype(d_k_n_hx_wx_grid_desc),
-            E1,
-            E2,
-            K2,
-            KPerBlock,
-            HoPerBlock,
-            WoPerBlock,
-            E1PerBlock,
-            KPerThread,
-            HoPerThread,
-            WoPerThread,
-            EPerThread,
-            ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2,
-            ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2,
-            Sequence<2, 3, 0, 1, 4>,
-            Sequence<0, 1, 2, 3, 4>,
-            4,
-            ABlockTransferSrcScalarPerVector_E2,
-            ABlockTransferDstScalarPerVector_E2,
-            false, // don't move back src coordinate after threadwise copy
-            Sequence<0, 1, 2, 3, 4, 5, 6, 7, 8, 9>, // E0, E1, N, H0, H1, H2, W0, W1, W2, E2
-            9,
-            BThreadTransferSrcScalarPerVector_E2,
-            false, // don't move back src coordinate after threadwise copy, which will be fused
-                   // with MoveSrcSliceWindow() to save addr computation
-            Sequence<0, 1, 2, 3, 4, 5, 6, 7, 8>, // K0, K1, N, H0, H1, I2, H2, W0, W1, I2, W2
-            1,
-            CThreadTransferDstScalarPerVector_K,
-            decltype(a_e0_e1_k_e2_global_step_hacks),
-            decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks),
-            decltype(c_k0_k1_n_h0_h1_h2_w0_w1_w2_global_tensor_step_hacks),
-            decltype(d_k0_k1_n_h0_h1_hx_w0_w1_wx_global_tensor_step_hacks),
-            decltype(a_e0_e1_k_e2_global_move_slice_window_step_hack),
-            decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_move_slice_window_step_hack)>;
-        const auto a_e0_e1_k0_k1_e2_grid_desc =
-            GridwiseGemm::MakeAE0E1K0K1E2GridDescriptor(a_e0_e1_k_e2_grid_desc);
-        const auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc =
-            GridwiseGemm::MakeBE0E1NH0H1H2W0W1W2E2GridDescriptor(b_e0_e1_n_ho_wo_e2_grid_desc);
-        const auto c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc =
-            GridwiseGemm::MakeCK0K1NH0H1H2W0W1W2GridDescriptor(c_k_n_hop_wop_grid_desc);
-        const auto d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc =
-            GridwiseGemm::MakeDK0K1NH0H1HxW0W1WxGridDescriptorMaxPool(d_k_n_hx_wx_grid_desc);
-        using AGridDesc_E0_E1_K0_K1_E2 = decltype(a_e0_e1_k0_k1_e2_grid_desc);
-        using BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2 =
-            decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc);
-        using CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2 = decltype(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc);
-        using DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx = decltype(d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc);
-        const auto grid_size = (K / KPerBlock) * (Hop / HoPerBlock) * (Wop / WoPerBlock) * N;
-        const bool has_main_e0_block_loop = E0 > 1;
-        std::cerr << "has_main_e0_block_loop = " << has_main_e0_block_loop << std::endl;
-        const auto cblockid_to_k_n_h_w_block_cluster_adaptor =
-            GridwiseGemm::MakeCBlockIdToKNHoWoBlockClusterAdaptor(c_k_n_hop_wop_grid_desc);
-        using CBlockIdToBlockClusterAdaptor_K_N_H_W =
-            decltype(cblockid_to_k_n_h_w_block_cluster_adaptor);
-        float ave_time = 0;
-        if(has_main_e0_block_loop)
-        {
-            const auto kernel = kernel_gemm_dlops_v3_maxpool<
-                GridwiseGemm,
-                FloatAB,
-                FloatC,
-                remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
-                remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>,
-                remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
-                remove_reference_t<DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx>,
-                remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
-                true,
-                activ_type>;
-            ave_time = launch_and_time_kernel(kernel,
-                                              nrepeat,
-                                              dim3(grid_size),
-                                              dim3(BlockSize),
-                                              0,
-                                              p_a_grid,
-                                              p_b_grid,
-                                              p_bias_grid,
-                                              p_c_grid,
-                                              p_d_grid,
-                                              a_e0_e1_k0_k1_e2_grid_desc,
-                                              b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
-                                              c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
-                                              d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc,
-                                              cblockid_to_k_n_h_w_block_cluster_adaptor);
-        }
-        else
-        {
-            const auto kernel = kernel_gemm_dlops_v3_maxpool<
-                GridwiseGemm,
-                FloatAB,
-                FloatC,
-                remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
-                remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>,
-                remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
-                remove_reference_t<DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx>,
-                remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
-                false,
-                activ_type>;
-            ave_time = launch_and_time_kernel(kernel,
-                                              nrepeat,
-                                              dim3(grid_size),
-                                              dim3(BlockSize),
-                                              0,
-                                              p_a_grid,
-                                              p_b_grid,
-                                              p_bias_grid,
-                                              p_c_grid,
-                                              p_d_grid,
-                                              a_e0_e1_k0_k1_e2_grid_desc,
-                                              b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
-                                              c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
-                                              d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc,
-                                              cblockid_to_k_n_h_w_block_cluster_adaptor);
-        }
-        return ave_time;
-    }
-};
-#endif
--- a/library/include/ck/library/obselete_driver_offline/driver_gemm_dlops_v1r2.hpp
+++ b/library/include/ck/library/obselete_driver_offline/driver_gemm_dlops_v1r2.hpp
-#ifndef DRIVER_GEMM_DLOPS_V1R2
-#define DRIVER_GEMM_DLOPS_V1R2
-#include "common_header.hpp"
-#include "tensor_descriptor.hpp"
-#include "tensor_descriptor_helper.hpp"
-#include "gridwise_gemm_dlops_v1r2.hpp"
-template <ck::index_t BlockSize,
-          typename FloatAB,
-          typename FloatAcc,
-          typename FloatC,
-          ck::InMemoryDataOperationEnum CGlobalMemoryDataOperation,
-          typename AKMGridDesc,
-          typename BKNGridDesc,
-          typename CMNGridDesc,
-          ck::index_t MPerBlock,
-          ck::index_t NPerBlock,
-          ck::index_t KPerBlock,
-          ck::index_t M1PerThread,
-          ck::index_t N1PerThread,
-          ck::index_t KPerThread,
-          ck::index_t M1N1ThreadClusterM10,
-          ck::index_t M1N1ThreadClusterN10,
-          ck::index_t M1N1ThreadClusterM11,
-          ck::index_t M1N1ThreadClusterN11,
-          typename ABlockTransferThreadSliceLengths_K_M0_M1,
-          typename ABlockTransferThreadClusterLengths_K_M0_M1,
-          typename ABlockTransferThreadClusterArrangeOrder,
-          typename ABlockTransferSrcAccessOrder,
-          ck::index_t ABlockTransferSrcVectorDim,
-          ck::index_t ABlockTransferSrcScalarPerVector,
-          ck::index_t ABlockTransferDstScalarPerVector_M1,
-          bool AThreadTransferSrcResetCoordinateAfterRun,
-          typename BBlockTransferThreadSliceLengths_K_N0_N1,
-          typename BBlockTransferThreadClusterLengths_K_N0_N1,
-          typename BBlockTransferThreadClusterArrangeOrder,
-          typename BBlockTransferSrcAccessOrder,
-          ck::index_t BBlockTransferSrcVectorDim,
-          ck::index_t BBlockTransferSrcScalarPerVector,
-          ck::index_t BBlockTransferDstScalarPerVector_N1,
-          bool BThreadTransferSrcResetCoordinateAfterRun,
-          typename CThreadTransferSrcDstAccessOrder,
-          ck::index_t CThreadTransferSrcDstVectorDim,
-          ck::index_t CThreadTransferDstScalarPerVector,
-          typename AGridStepHacks,
-          typename BGridStepHacks,
-          typename CGridStepHacks,
-          typename AGridMoveSliceWindowStepHacks,
-          typename BGridMoveSliceWindowStepHacks>
-__host__ float driver_gemm_dlops_v1r2(const FloatAB* p_a_grid,
-                                      const FloatAB* p_b_grid,
-                                      FloatC* p_c_grid,
-                                      const AKMGridDesc& a_k_m_grid_desc,
-                                      const BKNGridDesc& b_k_n_grid_desc,
-                                      const CMNGridDesc& c_m_n_grid_desc,
-                                      AGridStepHacks,
-                                      BGridStepHacks,
-                                      CGridStepHacks,
-                                      AGridMoveSliceWindowStepHacks,
-                                      BGridMoveSliceWindowStepHacks,
-                                      ck::index_t nrepeat)
-{
-    using namespace ck;
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
-    constexpr auto I2 = Number<2>{};
-    constexpr auto I3 = Number<3>{};
-    constexpr auto I4 = Number<4>{};
-    constexpr auto I5 = Number<5>{};
-    // GEMM
-    using GridwiseGemm = GridwiseGemmDlops_km_kn_mn_v1r2<BlockSize,
-                                                         FloatAB,
-                                                         FloatAcc,
-                                                         FloatC,
-                                                         CGlobalMemoryDataOperation,
-                                                         AKMGridDesc,
-                                                         BKNGridDesc,
-                                                         CMNGridDesc,
-                                                         MPerBlock,
-                                                         NPerBlock,
-                                                         KPerBlock,
-                                                         M1PerThread,
-                                                         N1PerThread,
-                                                         KPerThread,
-                                                         M1N1ThreadClusterM10,
-                                                         M1N1ThreadClusterN10,
-                                                         M1N1ThreadClusterM11,
-                                                         M1N1ThreadClusterN11,
-                                                         ABlockTransferThreadSliceLengths_K_M0_M1,
-                                                         ABlockTransferThreadClusterLengths_K_M0_M1,
-                                                         ABlockTransferThreadClusterArrangeOrder,
-                                                         ABlockTransferSrcAccessOrder,
-                                                         ABlockTransferSrcVectorDim,
-                                                         ABlockTransferSrcScalarPerVector,
-                                                         ABlockTransferDstScalarPerVector_M1,
-                                                         AThreadTransferSrcResetCoordinateAfterRun,
-                                                         BBlockTransferThreadSliceLengths_K_N0_N1,
-                                                         BBlockTransferThreadClusterLengths_K_N0_N1,
-                                                         BBlockTransferThreadClusterArrangeOrder,
-                                                         BBlockTransferSrcAccessOrder,
-                                                         BBlockTransferSrcVectorDim,
-                                                         BBlockTransferSrcScalarPerVector,
-                                                         BBlockTransferDstScalarPerVector_N1,
-                                                         BThreadTransferSrcResetCoordinateAfterRun,
-                                                         CThreadTransferSrcDstAccessOrder,
-                                                         CThreadTransferSrcDstVectorDim,
-                                                         CThreadTransferDstScalarPerVector,
-                                                         AGridStepHacks,
-                                                         BGridStepHacks,
-                                                         CGridStepHacks,
-                                                         AGridMoveSliceWindowStepHacks,
-                                                         BGridMoveSliceWindowStepHacks>;
-    const auto M = a_k_m_grid_desc.GetLength(I1);
-    const auto N = b_k_n_grid_desc.GetLength(I1);
-    const auto K = a_k_m_grid_desc.GetLength(I0);
-    if(!GridwiseGemm::CheckValidity(a_k_m_grid_desc, b_k_n_grid_desc, c_m_n_grid_desc))
-    {
-        throw std::runtime_error("wrong! GridwiseGemmDlops_km_kn_mn_v1r2 has invalid setting");
-    }
-    const auto a_k_m0_m1_grid_desc = GridwiseGemm::MakeAKM0M1GridDescriptor(a_k_m_grid_desc);
-    const auto b_k_n0_n1_grid_desc = GridwiseGemm::MakeBKN0N1GridDescriptor(b_k_n_grid_desc);
-    using AKM0M1GridDesc = decltype(a_k_m0_m1_grid_desc);
-    using BKN0N1GridDesc = decltype(b_k_n0_n1_grid_desc);
-    // c_m0_m10_m11_n0_n10_n11_grid_desc
-    const auto c_m0_m10_m11_n0_n10_n11_grid_desc =
-        GridwiseGemm::MakeCM0M10M11N0N10N11GridDescriptor(c_m_n_grid_desc);
-    using CM0M10M11N0N10N11GridDesc = decltype(c_m0_m10_m11_n0_n10_n11_grid_desc);
-    // cblockid_to_m0_n0_block_cluster_adaptor
-    const auto cblockid_to_m0_n0_block_cluster_adaptor =
-        GridwiseGemm::MakeCBlockIdToM0N0BlockClusterAdaptor(c_m_n_grid_desc);
-    using CBlockIdToM0N0BlockClusterAdaptor = decltype(cblockid_to_m0_n0_block_cluster_adaptor);
-    const index_t grid_size = GridwiseGemm::CalculateGridSize(M, N);
-    const bool has_main_k_block_loop = GridwiseGemm::CalculateHasMainKBlockLoop(K);
-    const bool has_double_tail_k_block_loop = GridwiseGemm::CalculateHasDoubleTailKBlockLoop(K);
-    {
-        std::cout << "a_k_m0_m1_grid_desc{" << a_k_m0_m1_grid_desc.GetLength(I0) << ", "
-                  << a_k_m0_m1_grid_desc.GetLength(I1) << ", " << a_k_m0_m1_grid_desc.GetLength(I2)
-                  << "}" << std::endl;
-        std::cout << "b_k_n0_n1_grid_desc{" << b_k_n0_n1_grid_desc.GetLength(I0) << ", "
-                  << b_k_n0_n1_grid_desc.GetLength(I1) << ", " << b_k_n0_n1_grid_desc.GetLength(I2)
-                  << "}" << std::endl;
-        std::cout << "c_m0_m10_m11_n0_n10_n11_grid_desc{ "
-                  << c_m0_m10_m11_n0_n10_n11_grid_desc.GetLength(I0) << ", "
-                  << c_m0_m10_m11_n0_n10_n11_grid_desc.GetLength(I1) << ", "
-                  << c_m0_m10_m11_n0_n10_n11_grid_desc.GetLength(I2) << ", "
-                  << c_m0_m10_m11_n0_n10_n11_grid_desc.GetLength(I3) << ", "
-                  << c_m0_m10_m11_n0_n10_n11_grid_desc.GetLength(I4) << ", "
-                  << c_m0_m10_m11_n0_n10_n11_grid_desc.GetLength(I5) << "}" << std::endl;
-    }
-    float ave_time = 0;
-    if(has_main_k_block_loop && has_double_tail_k_block_loop)
-    {
-        const auto kernel =
-            kernel_gemm_dlops_v1r2<GridwiseGemm,
-                                   FloatAB,
-                                   FloatC,
-                                   remove_reference_t<AKM0M1GridDesc>,
-                                   remove_reference_t<BKN0N1GridDesc>,
-                                   remove_reference_t<CM0M10M11N0N10N11GridDesc>,
-                                   remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
-                                   true,
-                                   true>;
-        ave_time = launch_and_time_kernel(kernel,
-                                          nrepeat,
-                                          dim3(grid_size),
-                                          dim3(BlockSize),
-                                          0,
-                                          p_a_grid,
-                                          p_b_grid,
-                                          p_c_grid,
-                                          a_k_m0_m1_grid_desc,
-                                          b_k_n0_n1_grid_desc,
-                                          c_m0_m10_m11_n0_n10_n11_grid_desc,
-                                          cblockid_to_m0_n0_block_cluster_adaptor);
-    }
-    else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
-    {
-        const auto kernel =
-            kernel_gemm_dlops_v1r2<GridwiseGemm,
-                                   FloatAB,
-                                   FloatC,
-                                   remove_reference_t<AKM0M1GridDesc>,
-                                   remove_reference_t<BKN0N1GridDesc>,
-                                   remove_reference_t<CM0M10M11N0N10N11GridDesc>,
-                                   remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
-                                   true,
-                                   false>;
-        ave_time = launch_and_time_kernel(kernel,
-                                          nrepeat,
-                                          dim3(grid_size),
-                                          dim3(BlockSize),
-                                          0,
-                                          p_a_grid,
-                                          p_b_grid,
-                                          p_c_grid,
-                                          a_k_m0_m1_grid_desc,
-                                          b_k_n0_n1_grid_desc,
-                                          c_m0_m10_m11_n0_n10_n11_grid_desc,
-                                          cblockid_to_m0_n0_block_cluster_adaptor);
-    }
-    else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
-    {
-        const auto kernel =
-            kernel_gemm_dlops_v1r2<GridwiseGemm,
-                                   FloatAB,
-                                   FloatC,
-                                   remove_reference_t<AKM0M1GridDesc>,
-                                   remove_reference_t<BKN0N1GridDesc>,
-                                   remove_reference_t<CM0M10M11N0N10N11GridDesc>,
-                                   remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
-                                   false,
-                                   true>;
-        ave_time = launch_and_time_kernel(kernel,
-                                          nrepeat,
-                                          dim3(grid_size),
-                                          dim3(BlockSize),
-                                          0,
-                                          p_a_grid,
-                                          p_b_grid,
-                                          p_c_grid,
-                                          a_k_m0_m1_grid_desc,
-                                          b_k_n0_n1_grid_desc,
-                                          c_m0_m10_m11_n0_n10_n11_grid_desc,
-                                          cblockid_to_m0_n0_block_cluster_adaptor);
-    }
-    else
-    {
-        const auto kernel =
-            kernel_gemm_dlops_v1r2<GridwiseGemm,
-                                   FloatAB,
-                                   FloatC,
-                                   remove_reference_t<AKM0M1GridDesc>,
-                                   remove_reference_t<BKN0N1GridDesc>,
-                                   remove_reference_t<CM0M10M11N0N10N11GridDesc>,
-                                   remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
-                                   false,
-                                   false>;
-        ave_time = launch_and_time_kernel(kernel,
-                                          nrepeat,
-                                          dim3(grid_size),
-                                          dim3(BlockSize),
-                                          0,
-                                          p_a_grid,
-                                          p_b_grid,
-                                          p_c_grid,
-                                          a_k_m0_m1_grid_desc,
-                                          b_k_n0_n1_grid_desc,
-                                          c_m0_m10_m11_n0_n10_n11_grid_desc,
-                                          cblockid_to_m0_n0_block_cluster_adaptor);
-    }
-    return ave_time;
-}
-#endif
--- a/library/include/ck/library/obselete_driver_offline/driver_gemm_dlops_v1r3.hpp
+++ b/library/include/ck/library/obselete_driver_offline/driver_gemm_dlops_v1r3.hpp
-#ifndef DRIVER_GEMM_DLOPS_V1R3
-#define DRIVER_GEMM_DLOPS_V1R3
-#include "common_header.hpp"
-#include "tensor_descriptor.hpp"
-#include "tensor_descriptor_helper.hpp"
-#include "gridwise_gemm_dlops_v1r3.hpp"
-template <ck::index_t BlockSize,
-          typename FloatAB,
-          typename FloatAcc,
-          typename FloatC,
-          ck::InMemoryDataOperationEnum CGlobalMemoryDataOperation,
-          typename AK0MK1GridDesc,
-          typename BK0NK1GridDesc,
-          typename CMNGridDesc,
-          ck::index_t MPerBlock,
-          ck::index_t NPerBlock,
-          ck::index_t KPerBlock,
-          ck::index_t M1PerThread,
-          ck::index_t N1PerThread,
-          ck::index_t KPerThread,
-          typename M1N1ThreadClusterM1Xs,
-          typename M1N1ThreadClusterN1Xs,
-          typename ABlockTransferThreadSliceLengths_K0_M0_M1_K1,
-          typename ABlockTransferThreadClusterLengths_K0_M0_M1_K1,
-          typename ABlockTransferThreadClusterArrangeOrder,
-          typename ABlockTransferSrcAccessOrder,
-          typename ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1,
-          typename ABlockTransferSrcVectorTensorContiguousDimOrder,
-          typename ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1,
-          typename BBlockTransferThreadSliceLengths_K0_N0_N1_K1,
-          typename BBlockTransferThreadClusterLengths_K0_N0_N1_K1,
-          typename BBlockTransferThreadClusterArrangeOrder,
-          typename BBlockTransferSrcAccessOrder,
-          typename BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1,
-          typename BBlockTransferSrcVectorTensorContiguousDimOrder,
-          typename BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1,
-          typename CThreadTransferSrcDstAccessOrder,
-          ck::index_t CThreadTransferSrcDstVectorDim,
-          ck::index_t CThreadTransferDstScalarPerVector,
-          typename AGridStepHacks,
-          typename BGridStepHacks,
-          typename CGridStepHacks,
-          typename AGridMoveSliceWindowStepHacks,
-          typename BGridMoveSliceWindowStepHacks>
-__host__ float driver_gemm_dlops_v1r3(const FloatAB* p_a_grid,
-                                      const FloatAB* p_b_grid,
-                                      FloatC* p_c_grid,
-                                      const AK0MK1GridDesc& a_k0_m_k1_grid_desc,
-                                      const BK0NK1GridDesc& b_k0_n_k1_grid_desc,
-                                      const CMNGridDesc& c_m_n_grid_desc,
-                                      AGridStepHacks,
-                                      BGridStepHacks,
-                                      CGridStepHacks,
-                                      AGridMoveSliceWindowStepHacks,
-                                      BGridMoveSliceWindowStepHacks,
-                                      ck::index_t nrepeat)
-{
-    using namespace ck;
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
-    constexpr auto I2 = Number<2>{};
-    constexpr auto I3 = Number<3>{};
-    constexpr auto I4 = Number<4>{};
-    constexpr auto I5 = Number<5>{};
-    // GEMM
-    using GridwiseGemm =
-        GridwiseGemmDlops_km_kn_mn_v1r3<BlockSize,
-                                        FloatAB,
-                                        FloatAcc,
-                                        FloatC,
-                                        CGlobalMemoryDataOperation,
-                                        AK0MK1GridDesc,
-                                        BK0NK1GridDesc,
-                                        CMNGridDesc,
-                                        MPerBlock,
-                                        NPerBlock,
-                                        KPerBlock,
-                                        M1PerThread,
-                                        N1PerThread,
-                                        KPerThread,
-                                        M1N1ThreadClusterM1Xs,
-                                        M1N1ThreadClusterN1Xs,
-                                        ABlockTransferThreadSliceLengths_K0_M0_M1_K1,
-                                        ABlockTransferThreadClusterLengths_K0_M0_M1_K1,
-                                        ABlockTransferThreadClusterArrangeOrder,
-                                        ABlockTransferSrcAccessOrder,
-                                        ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1,
-                                        ABlockTransferSrcVectorTensorContiguousDimOrder,
-                                        ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1,
-                                        BBlockTransferThreadSliceLengths_K0_N0_N1_K1,
-                                        BBlockTransferThreadClusterLengths_K0_N0_N1_K1,
-                                        BBlockTransferThreadClusterArrangeOrder,
-                                        BBlockTransferSrcAccessOrder,
-                                        BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1,
-                                        BBlockTransferSrcVectorTensorContiguousDimOrder,
-                                        BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1,
-                                        CThreadTransferSrcDstAccessOrder,
-                                        CThreadTransferSrcDstVectorDim,
-                                        CThreadTransferDstScalarPerVector,
-                                        AGridStepHacks,
-                                        BGridStepHacks,
-                                        CGridStepHacks,
-                                        AGridMoveSliceWindowStepHacks,
-                                        BGridMoveSliceWindowStepHacks>;
-    const auto M  = a_k0_m_k1_grid_desc.GetLength(I1);
-    const auto N  = b_k0_n_k1_grid_desc.GetLength(I1);
-    const auto K0 = a_k0_m_k1_grid_desc.GetLength(I0);
-    if(!GridwiseGemm::CheckValidity(a_k0_m_k1_grid_desc, b_k0_n_k1_grid_desc, c_m_n_grid_desc))
-    {
-        throw std::runtime_error("wrong! GridwiseGemmDlops_km_kn_mn_v1r3 has invalid setting");
-    }
-    const auto a_k0_m0_m1_k1_grid_desc =
-        GridwiseGemm::MakeAK0M0M1K1GridDescriptor(a_k0_m_k1_grid_desc);
-    const auto b_k0_n0_n1_k1_grid_desc =
-        GridwiseGemm::MakeBK0N0N1K1GridDescriptor(b_k0_n_k1_grid_desc);
-    using AK0M0M1K1GridDesc = decltype(a_k0_m0_m1_k1_grid_desc);
-    using BK0N0N1K1GridDesc = decltype(b_k0_n0_n1_k1_grid_desc);
-    // c_m0_m10_m11_n0_n10_n11_grid_desc
-    const auto c_m0_m10_m11_n0_n10_n11_grid_desc =
-        GridwiseGemm::MakeCM0M10M11N0N10N11GridDescriptor(c_m_n_grid_desc);
-    using CM0M10M11N0N10N11GridDesc = decltype(c_m0_m10_m11_n0_n10_n11_grid_desc);
-    // cblockid_to_m0_n0_block_cluster_adaptor
-    const auto cblockid_to_m0_n0_block_cluster_adaptor =
-        GridwiseGemm::MakeCBlockIdToM0N0BlockClusterAdaptor(c_m_n_grid_desc);
-    using CBlockIdToM0N0BlockClusterAdaptor = decltype(cblockid_to_m0_n0_block_cluster_adaptor);
-    const index_t grid_size = GridwiseGemm::CalculateGridSize(M, N);
-    const bool has_main_k_block_loop = GridwiseGemm::CalculateHasMainKBlockLoop(K0);
-    const bool has_double_tail_k_block_loop = GridwiseGemm::CalculateHasDoubleTailKBlockLoop(K0);
-    {
-        std::cout << "a_k0_m0_m1_k1_grid_desc{" << a_k0_m0_m1_k1_grid_desc.GetLength(I0) << ", "
-                  << a_k0_m0_m1_k1_grid_desc.GetLength(I1) << ", "
-                  << a_k0_m0_m1_k1_grid_desc.GetLength(I2) << ", "
-                  << a_k0_m0_m1_k1_grid_desc.GetLength(I3) << "}" << std::endl;
-        std::cout << "b_k0_n0_n1_k1_grid_desc{" << b_k0_n0_n1_k1_grid_desc.GetLength(I0) << ", "
-                  << b_k0_n0_n1_k1_grid_desc.GetLength(I1) << ", "
-                  << b_k0_n0_n1_k1_grid_desc.GetLength(I2) << ", "
-                  << b_k0_n0_n1_k1_grid_desc.GetLength(I3) << "}" << std::endl;
-        std::cout << "c_m0_m10_m11_n0_n10_n11_grid_desc{ "
-                  << c_m0_m10_m11_n0_n10_n11_grid_desc.GetLength(I0) << ", "
-                  << c_m0_m10_m11_n0_n10_n11_grid_desc.GetLength(I1) << ", "
-                  << c_m0_m10_m11_n0_n10_n11_grid_desc.GetLength(I2) << ", "
-                  << c_m0_m10_m11_n0_n10_n11_grid_desc.GetLength(I3) << ", "
-                  << c_m0_m10_m11_n0_n10_n11_grid_desc.GetLength(I4) << ", "
-                  << c_m0_m10_m11_n0_n10_n11_grid_desc.GetLength(I5) << "}" << std::endl;
-    }
-    float ave_time = 0;
-    if(has_main_k_block_loop && has_double_tail_k_block_loop)
-    {
-        const auto kernel =
-            kernel_gemm_dlops_v1r3<GridwiseGemm,
-                                   FloatAB,
-                                   FloatC,
-                                   remove_reference_t<AK0M0M1K1GridDesc>,
-                                   remove_reference_t<BK0N0N1K1GridDesc>,
-                                   remove_reference_t<CM0M10M11N0N10N11GridDesc>,
-                                   remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
-                                   true,
-                                   true>;
-        ave_time = launch_and_time_kernel(kernel,
-                                          nrepeat,
-                                          dim3(grid_size),
-                                          dim3(BlockSize),
-                                          0,
-                                          p_a_grid,
-                                          p_b_grid,
-                                          p_c_grid,
-                                          a_k0_m0_m1_k1_grid_desc,
-                                          b_k0_n0_n1_k1_grid_desc,
-                                          c_m0_m10_m11_n0_n10_n11_grid_desc,
-                                          cblockid_to_m0_n0_block_cluster_adaptor);
-    }
-    else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
-    {
-        const auto kernel =
-            kernel_gemm_dlops_v1r3<GridwiseGemm,
-                                   FloatAB,
-                                   FloatC,
-                                   remove_reference_t<AK0M0M1K1GridDesc>,
-                                   remove_reference_t<BK0N0N1K1GridDesc>,
-                                   remove_reference_t<CM0M10M11N0N10N11GridDesc>,
-                                   remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
-                                   true,
-                                   false>;
-        ave_time = launch_and_time_kernel(kernel,
-                                          nrepeat,
-                                          dim3(grid_size),
-                                          dim3(BlockSize),
-                                          0,
-                                          p_a_grid,
-                                          p_b_grid,
-                                          p_c_grid,
-                                          a_k0_m0_m1_k1_grid_desc,
-                                          b_k0_n0_n1_k1_grid_desc,
-                                          c_m0_m10_m11_n0_n10_n11_grid_desc,
-                                          cblockid_to_m0_n0_block_cluster_adaptor);
-    }
-    else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
-    {
-        const auto kernel =
-            kernel_gemm_dlops_v1r3<GridwiseGemm,
-                                   FloatAB,
-                                   FloatC,
-                                   remove_reference_t<AK0M0M1K1GridDesc>,
-                                   remove_reference_t<BK0N0N1K1GridDesc>,
-                                   remove_reference_t<CM0M10M11N0N10N11GridDesc>,
-                                   remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
-                                   false,
-                                   true>;
-        ave_time = launch_and_time_kernel(kernel,
-                                          nrepeat,
-                                          dim3(grid_size),
-                                          dim3(BlockSize),
-                                          0,
-                                          p_a_grid,
-                                          p_b_grid,
-                                          p_c_grid,
-                                          a_k0_m0_m1_k1_grid_desc,
-                                          b_k0_n0_n1_k1_grid_desc,
-                                          c_m0_m10_m11_n0_n10_n11_grid_desc,
-                                          cblockid_to_m0_n0_block_cluster_adaptor);
-    }
-    else
-    {
-        const auto kernel =
-            kernel_gemm_dlops_v1r3<GridwiseGemm,
-                                   FloatAB,
-                                   FloatC,
-                                   remove_reference_t<AK0M0M1K1GridDesc>,
-                                   remove_reference_t<BK0N0N1K1GridDesc>,
-                                   remove_reference_t<CM0M10M11N0N10N11GridDesc>,
-                                   remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
-                                   false,
-                                   false>;
-        ave_time = launch_and_time_kernel(kernel,
-                                          nrepeat,
-                                          dim3(grid_size),
-                                          dim3(BlockSize),
-                                          0,
-                                          p_a_grid,
-                                          p_b_grid,
-                                          p_c_grid,
-                                          a_k0_m0_m1_k1_grid_desc,
-                                          b_k0_n0_n1_k1_grid_desc,
-                                          c_m0_m10_m11_n0_n10_n11_grid_desc,
-                                          cblockid_to_m0_n0_block_cluster_adaptor);
-    }
-    return ave_time;
-}
-#endif
--- a/library/include/ck/library/obselete_driver_offline/driver_gemm_xdlops_v2r3.hpp
+++ b/library/include/ck/library/obselete_driver_offline/driver_gemm_xdlops_v2r3.hpp
-#ifndef DRIVER_GEMM_XDLOPS_V2R3_HPP
-#define DRIVER_GEMM_XDLOPS_V2R3_HPP
-#include "common_header.hpp"
-#include "tensor_descriptor.hpp"
-#include "tensor_descriptor_helper.hpp"
-#include "gridwise_gemm_xdlops_v2r3.hpp"
-#include "element_wise_operation.hpp"
-template <ck::index_t BlockSize,
-          typename FloatAB,
-          typename FloatAcc,
-          typename FloatC,
-          ck::InMemoryDataOperationEnum CGlobalMemoryDataOperation,
-          typename AGridDesc_K0_M_K1,
-          typename BGridDesc_K0_N_K,
-          typename CMNGridDesc,
-          ck::index_t MPerBlock,
-          ck::index_t NPerBlock,
-          ck::index_t KPerBlock,
-          ck::index_t MPerXDL,
-          ck::index_t NPerXDL,
-          ck::index_t K1,
-          ck::index_t MRepeat,
-          ck::index_t NRepeat,
-          typename ABlockTransferThreadSliceLengths_K0_M_K1,
-          typename ABlockTransferThreadClusterLengths_K0_M_K1,
-          typename ABlockTransferThreadClusterArrangeOrder,
-          typename ABlockTransferSrcAccessOrder,
-          ck::index_t ABlockTransferSrcVectorDim,
-          ck::index_t ABlockTransferSrcScalarPerVector,
-          ck::index_t ABlockTransferDstScalarPerVector_K1,
-          bool AThreadTransferSrcResetCoordinateAfterRun,
-          typename BBlockTransferThreadSliceLengths_K0_N_K1,
-          typename BBlockTransferThreadClusterLengths_K0_N_K1,
-          typename BBlockTransferThreadClusterArrangeOrder,
-          typename BBlockTransferSrcAccessOrder,
-          ck::index_t BBlockTransferSrcVectorDim,
-          ck::index_t BBlockTransferSrcScalarPerVector,
-          ck::index_t BBlockTransferDstScalarPerVector_K1,
-          bool BThreadTransferSrcResetCoordinateAfterRun,
-          typename CThreadTransferSrcDstAccessOrder,
-          ck::index_t CThreadTransferSrcDstVectorDim,
-          ck::index_t CThreadTransferDstScalarPerVector,
-          typename AGridStepHacks,
-          typename BGridStepHacks,
-          typename CGridStepHacks,
-          typename AGridMoveSliceWindowStepHacks,
-          typename BGridMoveSliceWindowStepHacks,
-          bool CAccessOrderMRepeatNRepeat,
-          bool ABlockLdsAddExtraM,
-          bool BBlockLdsAddExtraN>
-__host__ float driver_gemm_xdlops_v2r3(const FloatAB* p_a_grid,
-                                       const FloatAB* p_b_grid,
-                                       FloatC* p_c_grid,
-                                       const AGridDesc_K0_M_K1& a_grid_desc_k0_m_k1,
-                                       const BGridDesc_K0_N_K& b_grid_desc_k0_n_k1,
-                                       const CMNGridDesc& c_grid_desc_m_n,
-                                       ck::index_t M01,
-                                       ck::index_t N01,
-                                       AGridStepHacks,
-                                       BGridStepHacks,
-                                       CGridStepHacks,
-                                       AGridMoveSliceWindowStepHacks,
-                                       BGridMoveSliceWindowStepHacks,
-                                       ck::index_t nrepeat)
-{
-    using namespace ck;
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
-    constexpr auto I2 = Number<2>{};
-    using ElementwiseOperation = ck::tensor_operation::element_wise::PassThrough;
-    using GridwiseGemm =
-        GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3<BlockSize,
-                                                FloatAB,
-                                                FloatAcc,
-                                                FloatC,
-                                                CGlobalMemoryDataOperation,
-                                                AGridDesc_K0_M_K1,
-                                                BGridDesc_K0_N_K,
-                                                CMNGridDesc,
-                                                ElementwiseOperation,
-                                                ElementwiseOperation,
-                                                ElementwiseOperation,
-                                                MPerBlock,
-                                                NPerBlock,
-                                                KPerBlock,
-                                                MPerXDL,
-                                                NPerXDL,
-                                                K1,
-                                                MRepeat,
-                                                NRepeat,
-                                                ABlockTransferThreadClusterLengths_K0_M_K1,
-                                                ABlockTransferThreadClusterArrangeOrder,
-                                                ABlockTransferSrcAccessOrder,
-                                                ABlockTransferSrcVectorDim,
-                                                ABlockTransferSrcScalarPerVector,
-                                                ABlockTransferDstScalarPerVector_K1,
-                                                AThreadTransferSrcResetCoordinateAfterRun,
-                                                ABlockLdsAddExtraM,
-                                                BBlockTransferThreadClusterLengths_K0_N_K1,
-                                                BBlockTransferThreadClusterArrangeOrder,
-                                                BBlockTransferSrcAccessOrder,
-                                                BBlockTransferSrcVectorDim,
-                                                BBlockTransferSrcScalarPerVector,
-                                                BBlockTransferDstScalarPerVector_K1,
-                                                BThreadTransferSrcResetCoordinateAfterRun,
-                                                BBlockLdsAddExtraN,
-                                                CThreadTransferSrcDstAccessOrder,
-                                                CThreadTransferSrcDstVectorDim,
-                                                CThreadTransferDstScalarPerVector>;
-    {
-        std::cout << "a_grid_desc_k0_m_k1{" << a_grid_desc_k0_m_k1.GetLength(I0) << ", "
-                  << a_grid_desc_k0_m_k1.GetLength(I1) << ", " << a_grid_desc_k0_m_k1.GetLength(I2)
-                  << "}" << std::endl;
-        std::cout << "b_grid_desc_k0_n_k1{" << b_grid_desc_k0_n_k1.GetLength(I0) << ", "
-                  << b_grid_desc_k0_n_k1.GetLength(I1) << ", " << b_grid_desc_k0_n_k1.GetLength(I2)
-                  << "}" << std::endl;
-        std::cout << "c_grid_desc_m_n{ " << c_grid_desc_m_n.GetLength(I0) << ", "
-                  << c_grid_desc_m_n.GetLength(I1) << "}" << std::endl;
-    }
-    if(!GridwiseGemm::CheckValidity(
-           a_grid_desc_k0_m_k1, b_grid_desc_k0_n_k1, c_grid_desc_m_n, M01, N01))
-    {
-        throw std::runtime_error(
-            "wrong! GridwiseGemm_km_kn_m0m1n0n1_xdlops_v2r3 has invalid setting");
-    }
-    const auto c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc =
-        GridwiseGemm::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(c_grid_desc_m_n);
-    using CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2 = decltype(c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc);
-    const auto block_2_ctile_map =
-        GridwiseGemm::MakeDefaultBlock2CTileMap(c_grid_desc_m_n, M01, N01);
-    using Block2CTileMap = decltype(block_2_ctile_map);
-    const index_t grid_size = GridwiseGemm::CalculateGridSize(c_grid_desc_m_n);
-    const auto K0 = a_grid_desc_k0_m_k1.GetLength(I0);
-    const bool has_main_k0_block_loop = GridwiseGemm::CalculateHasMainK0BlockLoop(K0);
-    float ave_time = 0;
-    auto element_op_ = ElementwiseOperation{};
-    if(has_main_k0_block_loop)
-    {
-        const auto kernel =
-            kernel_gemm_xdlops_v2r3<GridwiseGemm,
-                                    FloatAB,
-                                    FloatC,
-                                    remove_reference_t<AGridDesc_K0_M_K1>,
-                                    remove_reference_t<BGridDesc_K0_N_K>,
-                                    remove_reference_t<CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2>,
-                                    ElementwiseOperation,
-                                    ElementwiseOperation,
-                                    ElementwiseOperation,
-                                    remove_reference_t<Block2CTileMap>,
-                                    true>;
-        ave_time = launch_and_time_kernel(kernel,
-                                          nrepeat,
-                                          dim3(grid_size),
-                                          dim3(BlockSize),
-                                          0,
-                                          p_a_grid,
-                                          p_b_grid,
-                                          p_c_grid,
-                                          a_grid_desc_k0_m_k1,
-                                          b_grid_desc_k0_n_k1,
-                                          c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc,
-                                          element_op_,
-                                          element_op_,
-                                          element_op_,
-                                          block_2_ctile_map);
-    }
-    else
-    {
-        const auto kernel =
-            kernel_gemm_xdlops_v2r3<GridwiseGemm,
-                                    FloatAB,
-                                    FloatC,
-                                    remove_reference_t<AGridDesc_K0_M_K1>,
-                                    remove_reference_t<BGridDesc_K0_N_K>,
-                                    remove_reference_t<CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2>,
-                                    ElementwiseOperation,
-                                    ElementwiseOperation,
-                                    ElementwiseOperation,
-                                    remove_reference_t<Block2CTileMap>,
-                                    false>;
-        ave_time = launch_and_time_kernel(kernel,
-                                          nrepeat,
-                                          dim3(grid_size),
-                                          dim3(BlockSize),
-                                          0,
-                                          p_a_grid,
-                                          p_b_grid,
-                                          p_c_grid,
-                                          a_grid_desc_k0_m_k1,
-                                          b_grid_desc_k0_n_k1,
-                                          c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc,
-                                          element_op_,
-                                          element_op_,
-                                          element_op_,
-                                          block_2_ctile_map);
-    }
-    return ave_time;
-}
-#endif
--- a/library/include/ck/library/obselete_driver_offline/driver_gemm_xdlops_v2r4.hpp
+++ b/library/include/ck/library/obselete_driver_offline/driver_gemm_xdlops_v2r4.hpp
-#ifndef DRIVER_GEMM_XDLOPS_V2R4
-#define DRIVER_GEMM_XDLOPS_V2R4
-#include "common_header.hpp"
-#include "tensor_descriptor.hpp"
-#include "tensor_descriptor_helper.hpp"
-#include "gridwise_gemm_xdlops_v2r4.hpp"
-template <ck::index_t BlockSize,
-          typename FloatAB,
-          typename FloatAcc,
-          typename FloatC,
-          ck::InMemoryDataOperationEnum CGlobalMemoryDataOperation,
-          typename ABK0MK1GridDesc,
-          typename BBK0NK1GridDesc,
-          typename CMNGridDesc,
-          ck::index_t MPerBlock,
-          ck::index_t NPerBlock,
-          ck::index_t KPerBlock,
-          ck::index_t MPerXDL,
-          ck::index_t NPerXDL,
-          ck::index_t K1,
-          ck::index_t MRepeat,
-          ck::index_t NRepeat,
-          typename ABlockTransferThreadSliceLengths_K0_M_K1,
-          typename ABlockTransferThreadClusterLengths_K0_M_K1,
-          typename ABlockTransferThreadClusterArrangeOrder,
-          typename ABlockTransferSrcAccessOrder,
-          ck::index_t ABlockTransferSrcVectorDim,
-          ck::index_t ABlockTransferSrcScalarPerVector,
-          ck::index_t ABlockTransferDstScalarPerVector_K1,
-          bool AThreadTransferSrcResetCoordinateAfterRun,
-          typename BBlockTransferThreadSliceLengths_K0_N_K1,
-          typename BBlockTransferThreadClusterLengths_K0_N_K1,
-          typename BBlockTransferThreadClusterArrangeOrder,
-          typename BBlockTransferSrcAccessOrder,
-          ck::index_t BBlockTransferSrcVectorDim,
-          ck::index_t BBlockTransferSrcScalarPerVector,
-          ck::index_t BBlockTransferDstScalarPerVector_K1,
-          bool BThreadTransferSrcResetCoordinateAfterRun,
-          typename CThreadTransferSrcDstAccessOrder,
-          ck::index_t CThreadTransferSrcDstVectorDim,
-          ck::index_t CThreadTransferDstScalarPerVector,
-          typename AGridStepHacks,
-          typename BGridStepHacks,
-          typename CGridStepHacks,
-          typename AGridMoveSliceWindowStepHacks,
-          typename BGridMoveSliceWindowStepHacks,
-          bool CAccessOrderMRepeatNRepeat,
-          bool ABlockLdsAddExtraM,
-          bool BBlockLdsAddExtraN>
-__host__ float driver_gemm_xdlops_v2r4(const FloatAB* p_a_grid,
-                                       const FloatAB* p_b_grid,
-                                       FloatC* p_c_grid,
-                                       const ABK0MK1GridDesc& a_b_k0_m_k1_grid_desc,
-                                       const BBK0NK1GridDesc& b_b_k0_n_k1_grid_desc,
-                                       const CMNGridDesc& c_m_n_grid_desc,
-                                       ck::index_t M01,
-                                       ck::index_t N01,
-                                       AGridStepHacks,
-                                       BGridStepHacks,
-                                       CGridStepHacks,
-                                       AGridMoveSliceWindowStepHacks,
-                                       BGridMoveSliceWindowStepHacks,
-                                       ck::index_t nrepeat)
-{
-    using namespace ck;
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
-    constexpr auto I2 = Number<2>{};
-    constexpr auto I3 = Number<3>{};
-    using GridwiseGemm =
-        GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4<BlockSize,
-                                                  FloatAB,
-                                                  FloatAcc,
-                                                  FloatC,
-                                                  CGlobalMemoryDataOperation,
-                                                  ABK0MK1GridDesc,
-                                                  BBK0NK1GridDesc,
-                                                  CMNGridDesc,
-                                                  MPerBlock,
-                                                  NPerBlock,
-                                                  KPerBlock,
-                                                  MPerXDL,
-                                                  NPerXDL,
-                                                  K1,
-                                                  MRepeat,
-                                                  NRepeat,
-                                                  ABlockTransferThreadSliceLengths_K0_M_K1,
-                                                  ABlockTransferThreadClusterLengths_K0_M_K1,
-                                                  ABlockTransferThreadClusterArrangeOrder,
-                                                  ABlockTransferSrcAccessOrder,
-                                                  ABlockTransferSrcVectorDim,
-                                                  ABlockTransferSrcScalarPerVector,
-                                                  ABlockTransferDstScalarPerVector_K1,
-                                                  AThreadTransferSrcResetCoordinateAfterRun,
-                                                  BBlockTransferThreadSliceLengths_K0_N_K1,
-                                                  BBlockTransferThreadClusterLengths_K0_N_K1,
-                                                  BBlockTransferThreadClusterArrangeOrder,
-                                                  BBlockTransferSrcAccessOrder,
-                                                  BBlockTransferSrcVectorDim,
-                                                  BBlockTransferSrcScalarPerVector,
-                                                  BBlockTransferDstScalarPerVector_K1,
-                                                  BThreadTransferSrcResetCoordinateAfterRun,
-                                                  CThreadTransferSrcDstAccessOrder,
-                                                  CThreadTransferSrcDstVectorDim,
-                                                  CThreadTransferDstScalarPerVector,
-                                                  AGridStepHacks,
-                                                  BGridStepHacks,
-                                                  CGridStepHacks,
-                                                  AGridMoveSliceWindowStepHacks,
-                                                  BGridMoveSliceWindowStepHacks,
-                                                  CAccessOrderMRepeatNRepeat,
-                                                  ABlockLdsAddExtraM,
-                                                  BBlockLdsAddExtraN>;
-    {
-        std::cout << "a_b_k0_m_k1_grid_desc{" << a_b_k0_m_k1_grid_desc.GetLength(I0) << ", "
-                  << a_b_k0_m_k1_grid_desc.GetLength(I1) << ", "
-                  << a_b_k0_m_k1_grid_desc.GetLength(I2) << ", "
-                  << a_b_k0_m_k1_grid_desc.GetLength(I3) << "}" << std::endl;
-        std::cout << "b_b_k0_n_k1_grid_desc{" << b_b_k0_n_k1_grid_desc.GetLength(I0) << ", "
-                  << b_b_k0_n_k1_grid_desc.GetLength(I1) << ", "
-                  << b_b_k0_n_k1_grid_desc.GetLength(I2) << ", "
-                  << b_b_k0_n_k1_grid_desc.GetLength(I3) << "}" << std::endl;
-        std::cout << "c_m_n_grid_desc{ " << c_m_n_grid_desc.GetLength(I0) << ", "
-                  << c_m_n_grid_desc.GetLength(I1) << "}" << std::endl;
-    }
-    if(!GridwiseGemm::CheckValidity(
-           a_b_k0_m_k1_grid_desc, b_b_k0_n_k1_grid_desc, c_m_n_grid_desc, M01, N01))
-    {
-        throw std::runtime_error(
-            "wrong! GridwiseGemm_km_kn_m0m1n0n1_xdlops_v2r4 has invalid setting");
-    }
-    const auto c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc =
-        GridwiseGemm::MakeCM0N0M1N1M2M3M4N2GridDescriptor(c_m_n_grid_desc);
-    using CM0N0M1N1M2M3M4N2GridDesc = decltype(c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc);
-    const auto KBatch = a_b_k0_m_k1_grid_desc.GetLength(I0);
-    const auto c_block_cluster_adaptor =
-        GridwiseGemm::MakeCBlockClusterAdaptor(c_m_n_grid_desc, M01, N01, KBatch);
-    using CBlockClusterAdaptor = decltype(c_block_cluster_adaptor);
-    const index_t grid_size = GridwiseGemm::CalculateGridSize(c_m_n_grid_desc, KBatch);
-    {
-        std::cout << "gridSize : " << grid_size << std::endl;
-    }
-    const auto K0 = a_b_k0_m_k1_grid_desc.GetLength(I1);
-    const bool has_main_k0_block_loop = GridwiseGemm::CalculateHasMainK0BlockLoop(K0);
-    float ave_time = 0;
-    if(has_main_k0_block_loop)
-    {
-        const auto kernel = kernel_gemm_xdlops_v2r4<GridwiseGemm,
-                                                    FloatAB,
-                                                    FloatC,
-                                                    remove_reference_t<ABK0MK1GridDesc>,
-                                                    remove_reference_t<BBK0NK1GridDesc>,
-                                                    remove_reference_t<CM0N0M1N1M2M3M4N2GridDesc>,
-                                                    remove_reference_t<CBlockClusterAdaptor>,
-                                                    true>;
-        ave_time          = launch_and_time_kernel(kernel,
-                                          nrepeat,
-                                          dim3(grid_size),
-                                          dim3(BlockSize),
-                                          0,
-                                          p_a_grid,
-                                          p_b_grid,
-                                          p_c_grid,
-                                          a_b_k0_m_k1_grid_desc,
-                                          b_b_k0_n_k1_grid_desc,
-                                          c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc,
-                                          c_block_cluster_adaptor);
-    }
-    else
-    {
-        const auto kernel = kernel_gemm_xdlops_v2r4<GridwiseGemm,
-                                                    FloatAB,
-                                                    FloatC,
-                                                    remove_reference_t<ABK0MK1GridDesc>,
-                                                    remove_reference_t<BBK0NK1GridDesc>,
-                                                    remove_reference_t<CM0N0M1N1M2M3M4N2GridDesc>,
-                                                    remove_reference_t<CBlockClusterAdaptor>,
-                                                    false>;
-        ave_time          = launch_and_time_kernel(kernel,
-                                          nrepeat,
-                                          dim3(grid_size),
-                                          dim3(BlockSize),
-                                          0,
-                                          p_a_grid,
-                                          p_b_grid,
-                                          p_c_grid,
-                                          a_b_k0_m_k1_grid_desc,
-                                          b_b_k0_n_k1_grid_desc,
-                                          c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc,
-                                          c_block_cluster_adaptor);
-    }
-    return ave_time;
-}
-#endif
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp
-#ifndef REFERENCE_BATCHED_GEMM_HPP
+// SPDX-License-Identifier: MIT
-#define REFERENCE_BATCHED_GEMM_HPP
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
 #include <iostream>
 #include <sstream>
-#include "device_base.hpp"
-#include "host_tensor.hpp"
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
+#include "ck/library/host_tensor/host_tensor.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -132,4 +135,3 @@ struct ReferenceBatchedGemm : public device::BaseOperator
 } // namespace host
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_cgemm.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_cgemm.hpp
-/*******************************************************************************
+// SPDX-License-Identifier: MIT
- *
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
- * MIT License
- *
- * Copyright (c) 2022 Advanced Micro Devices, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- *******************************************************************************/
 #pragma once
 #include <iostream>
 #include <sstream>
-#include "device_base.hpp"
-#include "host_tensor.hpp"
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
+#include "ck/library/host_tensor/host_tensor.hpp"
 namespace ck {
 namespace tensor_operation {

--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_backward_weight.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_backward_weight.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
 #include <iostream>
 #include <sstream>
-#include "device_base.hpp"
-#include "host_tensor.hpp"
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
+#include "ck/library/host_tensor/host_tensor.hpp"
 namespace ck {
 namespace tensor_operation {

--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp
-#ifndef REFERENCE_CONV_BWD_DATA_HPP
+// SPDX-License-Identifier: MIT
-#define REFERENCE_CONV_BWD_DATA_HPP
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
 #include <iostream>
 #include <sstream>
-#include "device_base.hpp"
-#include "host_tensor.hpp"
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
+#include "ck/library/host_tensor/host_tensor.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -106,9 +110,8 @@ struct ReferenceConvBwdData : public device::BaseOperator
                        }
                    }
-                    float v_in;
+                    arg.in_element_op_(v_acc, v_acc);
-                    arg.in_element_op_(v_in, v_acc);
+                    arg.input_(n, c, wi) = ck::type_convert<InDataType>(v_acc);
-                    arg.input_(n, c, wi) = ck::type_convert<InDataType>(v_in);
                };
                make_ParallelTensorFunctor(f_ncw,
@@ -352,4 +355,3 @@ struct ReferenceConvBwdData : public device::BaseOperator
 } // namespace host
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
 #include <iostream>
 #include <type_traits>
 #include <sstream>
-#include "stream_config.hpp"
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
-#include "device_base.hpp"
+#include "ck/library/host_tensor/host_tensor.hpp"
-#include "host_tensor.hpp"
 namespace ck {
 namespace tensor_operation {

--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp
-#ifndef REFERENCE_CONV_FWD_BIAS_ACTIVATION_HPP
+// SPDX-License-Identifier: MIT
-#define REFERENCE_CONV_FWD_BIAS_ACTIVATION_HPP
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
 #include <iostream>
 #include <sstream>
-#include "device_base.hpp"
-#include "host_tensor.hpp"
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
+#include "ck/library/host_tensor/host_tensor.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -187,4 +190,3 @@ struct ReferenceConvFwd_Bias_Activation : public device::BaseOperator
 } // namespace host
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp
-#ifndef REFERENCE_CONV2D_FWD_BIAS_ACTIVATION_ADD_HPP
+// SPDX-License-Identifier: MIT
-#define REFERENCE_CONV2D_FWD_BIAS_ACTIVATION_ADD_HPP
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
 #include <iostream>
 #include <sstream>
-#include "device_base.hpp"
-#include "host_tensor.hpp"
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
+#include "ck/library/host_tensor/host_tensor.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -195,4 +198,3 @@ struct ReferenceConvFwd_Bias_Activation_Add : public device::BaseOperator
 } // namespace host
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
 #include <iostream>
 #include <sstream>
-#include "device_base.hpp"
-#include "host_tensor.hpp"
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
+#include "ck/library/host_tensor/host_tensor.hpp"
 namespace ck {
 namespace tensor_operation {

--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_2d.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_2d.hpp
-#ifndef REFERENCE_GEMM_BIAS_BIAS_2D_HPP
+// SPDX-License-Identifier: MIT
-#define REFERENCE_GEMM_BIAS_BIAS_2D_HPP
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
 #include <iostream>
 #include <sstream>
-#include "device_base.hpp"
-#include "host_tensor.hpp"
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
+#include "ck/library/host_tensor/host_tensor.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -66,8 +69,8 @@ struct ReferenceGemmBias2D : public device::BaseOperator
                for(int k = 0; k < K; ++k)
                {
-                    arg.a_element_op_(a, arg.a_m_k_(m, k));
+                    arg.a_element_op_(a, ck::type_convert<AccDataType>(arg.a_m_k_(m, k)));
-                    arg.b_element_op_(b, arg.b_k_n_(k, n));
+                    arg.b_element_op_(b, ck::type_convert<AccDataType>(arg.b_k_n_(k, n)));
                    acc += a * b;
                }
@@ -131,4 +134,3 @@ struct ReferenceGemmBias2D : public device::BaseOperator
 } // namespace host
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_activation.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_activation.hpp
-#ifndef REFERENCE_GEMM_BIAS_ACTIVATION_HPP
+// SPDX-License-Identifier: MIT
-#define REFERENCE_GEMM_BIAS_ACTIVATION_HPP
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
 #include <iostream>
 #include <sstream>
-#include "device_base.hpp"
-#include "host_tensor.hpp"
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
+#include "ck/library/host_tensor/host_tensor.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -134,4 +138,3 @@ struct ReferenceGemmBiasActivation : public device::BaseOperator
 } // namespace host
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_activation_add.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_activation_add.hpp
-#ifndef REFERENCE_GEMM_BIAS_ACTIVATION_ADD_HPP
+// SPDX-License-Identifier: MIT
-#define REFERENCE_GEMM_BIAS_ACTIVATION_ADD_HPP
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
 #include <iostream>
 #include <sstream>
-#include "device_base.hpp"
-#include "host_tensor.hpp"
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
+#include "ck/library/host_tensor/host_tensor.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -142,4 +146,3 @@ struct ReferenceGemmBiasActivationAdd : public device::BaseOperator
 } // namespace host
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_softmax.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_softmax.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include <iostream>
+#include <sstream>
+#include <vector>
+#include <algorithm>
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
+#include "ck/library/host_tensor/host_tensor.hpp"
+#include "ck/library/host_tensor/host_tensor_generator.hpp"
+namespace ck {
+namespace tensor_operation {
+namespace host {
+template <typename InDataType, typename OutDataType, typename AccDataType>
+struct ReferenceSoftmax : public device::BaseOperator
+{
+    // Argument
+    struct Argument : public device::BaseArgument
+    {
+        Argument(const Tensor<InDataType>& in,
+                 Tensor<OutDataType>& out,
+                 AccDataType alpha,
+                 AccDataType beta,
+                 const index_t rank,
+                 const std::vector<index_t> sm_reduce_dims)
+            : in_(in), out_(out), alpha_(alpha), beta_(beta), sm_reduce_dims_(sm_reduce_dims)
+        {
+            // std::cout << "debug: scalar dims: ";
+            for(int i = 0; i < rank; i++)
+            {
+                if(std::find(sm_reduce_dims.begin(), sm_reduce_dims.end(), i) ==
+                   sm_reduce_dims.end())
+                {
+                    sm_scalar_dims_.push_back(i);
+                    // std::cout << i << ", ";
+                }
+            }
+            // std::cout << std::endl;
+        }
+        const Tensor<InDataType>& in_;
+        Tensor<OutDataType>& out_;
+        AccDataType alpha_;
+        AccDataType beta_;
+        index_t rank_;
+        std::vector<index_t> sm_reduce_dims_;
+        std::vector<index_t> sm_scalar_dims_; // dim after internal max/sum reduction
+    };
+    // Invoker
+    struct Invoker : public device::BaseInvoker
+    {
+        float Run(const Argument& arg)
+        {
+            std::vector<size_t> scalar_lengths;
+            for(index_t dim : arg.sm_scalar_dims_)
+            {
+                scalar_lengths.push_back(arg.in_.mDesc.GetLengths()[dim]);
+            }
+            Tensor<AccDataType> reduce_max(scalar_lengths);
+            reduce_max.GenerateTensorValue(
+                GeneratorTensor_1<AccDataType>{std::numeric_limits<AccDataType>::lowest()});
+            Tensor<AccDataType> reduce_sum(scalar_lengths);
+            reduce_sum.GenerateTensorValue(GeneratorTensor_1<AccDataType>{0});
+            auto to_sm_scalar_idx = [&](auto idx) {
+                std::vector<size_t> sm_scalar_idx;
+                for(index_t dim : arg.sm_scalar_dims_)
+                {
+                    sm_scalar_idx.push_back(idx[dim]);
+                }
+                return sm_scalar_idx;
+            };
+            arg.in_.ForEach([&](auto& self, auto idx) {
+                reduce_max(to_sm_scalar_idx(idx)) = std::max(reduce_max(to_sm_scalar_idx(idx)),
+                                                             static_cast<AccDataType>(self(idx)));
+            });
+            // LogRangeAsType<float>(std::cout << "reduce_max: ", reduce_max.mData, ",") <<
+            // std::endl;
+            Tensor<AccDataType> in_stable(arg.in_.mDesc);
+            in_stable.ForEach([&](auto& self, auto idx) {
+                // numerator = exp(x - max(x))
+                self(idx) = std::exp(static_cast<AccDataType>(arg.in_(idx)) -
+                                     reduce_max(to_sm_scalar_idx(idx)));
+            });
+            // LogRangeAsType<float>(std::cout << "in_stable: ", in_stable.mData, ",") << std::endl;
+            in_stable.ForEach([&](auto& self, auto idx) {
+                // denominator = sum(exp(x - max(x)))
+                reduce_sum(to_sm_scalar_idx(idx)) += self(idx);
+            });
+            // LogRangeAsType<float>(std::cout << "reduce_sum: ", reduce_sum.mData, ",") <<
+            // std::endl;
+            arg.out_.ForEach([&](auto& self, auto idx) {
+                self(idx) = arg.alpha_ * in_stable(idx) / reduce_sum(to_sm_scalar_idx(idx)) +
+                            arg.beta_ * self(idx);
+            });
+            // LogRangeAsType<float>(std::cout << "out: ", arg.out_.mData, ",") << std::endl;
+            // reduction along reduce dims
+            // LogRangeAsType<float>(std::cout << "reduce_max: ", reduce_max.mData, ",") <<
+            // std::endl; LogRangeAsType<float>(std::cout << "reduce_sum: ", reduce_sum.mData, ",")
+            // << std::endl;
+            return 0;
+        }
+        float Run(const device::BaseArgument* p_arg,
+                  const StreamConfig& /* stream_config */ = StreamConfig{}) override
+        {
+            return Run(*dynamic_cast<const Argument*>(p_arg));
+        }
+    };
+    static constexpr bool IsValidCompilationParameter()
+    {
+        // TODO: properly implement this check
+        return true;
+    }
+    bool IsSupportedArgument(const device::BaseArgument*) override { return true; }
+    static auto MakeArgument(const Tensor<InDataType>& in,
+                             Tensor<OutDataType>& out,
+                             AccDataType alpha,
+                             AccDataType beta,
+                             const index_t rank,
+                             const std::vector<index_t> sm_reduce_dims)
+    {
+        return Argument{in, out, alpha, beta, rank, sm_reduce_dims};
+    }
+    static auto MakeInvoker() { return Invoker{}; }
+    virtual std::unique_ptr<device::BaseInvoker> MakeInvokerPointer()
+    {
+        return std::make_unique<Invoker>(Invoker{});
+    }
+    std::string GetTypeString() const override
+    {
+        auto str = std::stringstream();
+        // clang-format off
+        str << "ReferenceSoftmax"
+            << std::endl;
+        // clang-format on
+        return str.str();
+    }
+};
+} // namespace host
+} // namespace tensor_operation
+} // namespace ck
--- a/library/include/ck/library/reference_tensor_operation/gpu/naive_conv_fwd.hpp
+++ b/library/include/ck/library/reference_tensor_operation/gpu/naive_conv_fwd.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #ifndef NAIVE_CONV_FWD_HPP
 #define NAIVE_CONV_FWD_HPP

--- a/library/include/ck/library/tensor_operation_instance/device_operation_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/device_operation_instance.hpp
-#ifndef CK_DEVICE_OPERATION_INSTANCE_HPP
+// SPDX-License-Identifier: MIT
-#define CK_DEVICE_OPERATION_INSTANCE_HPP
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-#include <stdlib.h>
+#pragma once
+#include <vector>
 namespace ck {
 namespace tensor_operation {
@@ -23,4 +25,3 @@ void add_device_operation_instances(std::vector<std::unique_ptr<OpInstance>>& op
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif