Commit 6fe3627a authored by Chao Liu, committed by GitHub

Composable kernel init integration v3 (#1097)

* Squashed 'src/composable_kernel/' content from commit f6edda61

git-subtree-dir: src/composable_kernel
git-subtree-split: f6edda61

* add solver ConvIgemmFwdV6r1DlopsNchwKcyxNkhw; rename static ck source files

* Squashed 'src/composable_kernel/' changes from f6edda61..5781adf5

5781adf5 Update develop (#5) (#6)
97e6d514 Merge pull request #4 from ROCmSoftwarePlatform/separate_online_compile
7b1ec41e refactor
49c33aae refactor
54b3e73d rename

git-subtree-dir: src/composable_kernel
git-subtree-split: 5781adf5



* fix

* refactor

* remove online compilation from CK

* refactor

* fix

* add ctest

* add c-style pointer cast

* vector/scalar pointer cast use c-style pointer cast instead of reinterpret_cast

* fix clang warning suppression

* tidy

* suppress cppcheck

* fix enum issue

* revert changes to hip build

* fix kernel filename

* update CK build script

* rename

* rename

* make inner product compatible on gfx900

* Update src/include/miopen/solver/ck_utility_common.hpp
Co-authored-by: JD <Jehandad.Khan@amd.com>

* compiler parameter use stream

* use int instead of index_t in kernel wrapper

* DynamicBuffer, StaticBuffer, amd_buffer_load support customized value for invalid element

* refactor

* refactor

* change cmakelist

* change ck common utility

* fix
Co-authored-by: JD <Jehandad.Khan@amd.com>
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "gridwise_gemm_xdlops_v2r3.hpp"
#include "transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp"
using namespace ck;
constexpr DataTypeEnum_t ABDataTypeEnum = static_cast<DataTypeEnum_t>(CK_PARAM_ABDataTypeEnum);
constexpr DataTypeEnum_t AccDataTypeEnum = static_cast<DataTypeEnum_t>(CK_PARAM_AccDataTypeEnum);
constexpr DataTypeEnum_t CDataTypeEnum = static_cast<DataTypeEnum_t>(CK_PARAM_CDataTypeEnum);
using FloatAB = typename get_datatype_from_enum<ABDataTypeEnum>::type;
using FloatAcc = typename get_datatype_from_enum<AccDataTypeEnum>::type;
using FloatC = typename get_datatype_from_enum<CDataTypeEnum>::type;
constexpr index_t BlockSize = CK_PARAM_BlockSize;
constexpr index_t MPerBlock = CK_PARAM_MPerBlock;
constexpr index_t NPerBlock = CK_PARAM_NPerBlock;
constexpr index_t KPerBlock = CK_PARAM_KPerBlock;
constexpr index_t MPerWave = CK_PARAM_MPerWave;
constexpr index_t NPerWave = CK_PARAM_NPerWave;
constexpr index_t MRepeat = CK_PARAM_MRepeat;
constexpr index_t NRepeat = CK_PARAM_NRepeat;
constexpr index_t K1 = CK_PARAM_K1;
using ABlockTransferThreadSliceLengths_K0_M_K1 =
Sequence<CK_PARAM_ABlockTransferThreadSliceLengths_K0_M_K1>;
using ABlockTransferThreadClusterLengths_K0_M_K1 =
Sequence<CK_PARAM_ABlockTransferThreadClusterLengths_K0_M_K1>;
using ABlockTransferThreadClusterArrangeOrder =
Sequence<CK_PARAM_ABlockTransferThreadClusterArrangeOrder>;
using ABlockTransferSrcAccessOrder = Sequence<CK_PARAM_ABlockTransferSrcAccessOrder>;
constexpr index_t ABlockTransferSrcVectorDim = CK_PARAM_ABlockTransferSrcVectorDim;
constexpr index_t ABlockTransferSrcScalarPerVector = CK_PARAM_ABlockTransferSrcScalarPerVector;
constexpr index_t ABlockTransferDstScalarPerVector_K1 =
CK_PARAM_ABlockTransferDstScalarPerVector_K1;
constexpr bool AThreadTransferSrcResetCoordinateAfterRun =
static_cast<bool>(CK_PARAM_AThreadTransferSrcResetCoordinateAfterRun);
using BBlockTransferThreadSliceLengths_K0_N_K1 =
Sequence<CK_PARAM_BBlockTransferThreadSliceLengths_K0_N_K1>;
using BBlockTransferThreadClusterLengths_K0_N_K1 =
Sequence<CK_PARAM_BBlockTransferThreadClusterLengths_K0_N_K1>;
using BBlockTransferThreadClusterArrangeOrder =
Sequence<CK_PARAM_BBlockTransferThreadClusterArrangeOrder>;
using BBlockTransferSrcAccessOrder = Sequence<CK_PARAM_BBlockTransferSrcAccessOrder>;
constexpr index_t BBlockTransferSrcVectorDim = CK_PARAM_BBlockTransferSrcVectorDim;
constexpr index_t BBlockTransferSrcScalarPerVector = CK_PARAM_BBlockTransferSrcScalarPerVector;
constexpr index_t BBlockTransferDstScalarPerVector_K1 =
CK_PARAM_BBlockTransferDstScalarPerVector_K1;
constexpr bool BThreadTransferSrcResetCoordinateAfterRun =
static_cast<bool>(CK_PARAM_BThreadTransferSrcResetCoordinateAfterRun);
using CThreadTransferSrcDstAccessOrder = Sequence<CK_PARAM_CThreadTransferSrcDstAccessOrder>;
constexpr index_t CThreadTransferSrcDstVectorDim = CK_PARAM_CThreadTransferSrcDstVectorDim;
constexpr index_t CThreadTransferDstScalarPerVector = CK_PARAM_CThreadTransferDstScalarPerVector;
extern "C" __global__ void convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw_prepare(
int n,
int c,
int hi,
int wi,
int k,
int y,
int x,
int convStrideH,
int convStrideW,
int convDilationY,
int convDilationX,
int leftPadH,
int leftPadW,
int rightPadH,
int rightPadW,
void* p_a_k0_m_k1_grid_desc,
void* p_b_k0_n_k1_grid_desc,
void* p_c_m0_m1_m2_n_grid_desc,
void* p_c_blockid_to_m0_n0_block_cluster_adaptor)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
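// Convolution output-size arithmetic:
//   Ho = (Hi + padTop + padBottom - dilationH * (Y - 1) - 1) / strideH + 1
// e.g. hi = 28, pads = 1 + 1, y = 3, dilation = 1, stride = 1 gives
// ho = (28 + 2 - 2 - 1) / 1 + 1 = 28.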
const index_t ho = (hi + leftPadH + rightPadH - convDilationY * (y - 1) - 1) / convStrideH + 1;
const index_t wo = (wi + leftPadW + rightPadW - convDilationX * (x - 1) - 1) / convStrideW + 1;
const auto in_n_c_hi_wi_desc = make_naive_tensor_descriptor_packed(make_tuple(n, c, hi, wi));
const auto wei_k_c_y_x_desc = make_naive_tensor_descriptor_packed(make_tuple(k, c, y, x));
const auto out_n_k_ho_wo_desc = make_naive_tensor_descriptor_packed(make_tuple(n, k, ho, wo));
const auto descs = transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw_pad(
wei_k_c_y_x_desc,
in_n_c_hi_wi_desc,
out_n_k_ho_wo_desc,
make_tuple(convStrideH, convStrideW),
make_tuple(convDilationY, convDilationX),
make_tuple(leftPadH, leftPadW),
make_tuple(rightPadH, rightPadW),
Number<K1>{});
const auto a_k0_m_k1_grid_desc = descs[I0];
const auto b_k0_n_k1_grid_desc = descs[I1];
const auto c_m_n_grid_desc = descs[I2];
using AK0MK1GridDesc = decltype(a_k0_m_k1_grid_desc);
using BK0NK1GridDesc = decltype(b_k0_n_k1_grid_desc);
using CMNGridDesc = decltype(c_m_n_grid_desc);
using AGridStepHacks = decltype(make_tuple(
make_tuple(Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}),
make_tuple(
Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{})));
using BGridStepHacks =
decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{})));
using CGridStepHacks = decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{})));
using AGridMoveSliceWindowStepHacks = Sequence<0, 0, 0, 0, 0>;
using BGridMoveSliceWindowStepHacks = Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0>;
using GridwiseGemm =
GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3<BlockSize,
FloatAB,
FloatAcc,
FloatC,
InMemoryDataOperationEnum_t::Set,
AK0MK1GridDesc,
BK0NK1GridDesc,
CMNGridDesc,
MPerBlock,
NPerBlock,
KPerBlock,
MPerWave,
NPerWave,
K1,
MRepeat,
NRepeat,
ABlockTransferThreadSliceLengths_K0_M_K1,
ABlockTransferThreadClusterLengths_K0_M_K1,
ABlockTransferThreadClusterArrangeOrder,
ABlockTransferSrcAccessOrder,
ABlockTransferSrcVectorDim,
ABlockTransferSrcScalarPerVector,
ABlockTransferDstScalarPerVector_K1,
AThreadTransferSrcResetCoordinateAfterRun,
BBlockTransferThreadSliceLengths_K0_N_K1,
BBlockTransferThreadClusterLengths_K0_N_K1,
BBlockTransferThreadClusterArrangeOrder,
BBlockTransferSrcAccessOrder,
BBlockTransferSrcVectorDim,
BBlockTransferSrcScalarPerVector,
BBlockTransferDstScalarPerVector_K1,
BThreadTransferSrcResetCoordinateAfterRun,
CThreadTransferSrcDstAccessOrder,
CThreadTransferSrcDstVectorDim,
CThreadTransferDstScalarPerVector,
AGridStepHacks,
BGridStepHacks,
CGridStepHacks,
AGridMoveSliceWindowStepHacks,
BGridMoveSliceWindowStepHacks,
false>;
auto c_m0_m1_m2_n_grid_desc = GridwiseGemm::MakeCM0M1M2NGridDescriptor(c_m_n_grid_desc);
auto c_blockid_to_m0_n0_block_cluster_adaptor =
GridwiseGemm::MakeCBlockClusterAdaptor(c_m_n_grid_desc);
if(hipThreadIdx_x == 0)
{
*static_cast<remove_cv_t<decltype(a_k0_m_k1_grid_desc)>*>(p_a_k0_m_k1_grid_desc) =
a_k0_m_k1_grid_desc;
*static_cast<remove_cv_t<decltype(b_k0_n_k1_grid_desc)>*>(p_b_k0_n_k1_grid_desc) =
b_k0_n_k1_grid_desc;
*static_cast<decltype(c_m0_m1_m2_n_grid_desc)*>(p_c_m0_m1_m2_n_grid_desc) =
c_m0_m1_m2_n_grid_desc;
*static_cast<decltype(c_blockid_to_m0_n0_block_cluster_adaptor)*>(
p_c_blockid_to_m0_n0_block_cluster_adaptor) = c_blockid_to_m0_n0_block_cluster_adaptor;
}
}
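// Host-side flow (a hedged sketch; buffer names and sizes are illustrative, not
// part of this file): the *_prepare kernel runs once with a single thread to
// materialize the four grid descriptors in device memory, and the main kernel
// then reads them back through the CONSTANT pointers, e.g.:
//
//   void *a_desc, *b_desc, *c_desc, *adaptor;   // one buffer per descriptor
//   hipMalloc(&a_desc, desc_buf_size);          // desc_buf_size: assumed upper bound
//   ...
//   hipLaunchKernelGGL(convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw_prepare,
//                      dim3(1), dim3(1), 0, stream,
//                      n, c, hi, wi, k, y, x, convStrideH, convStrideW,
//                      convDilationY, convDilationX, leftPadH, leftPadW,
//                      rightPadH, rightPadW, a_desc, b_desc, c_desc, adaptor);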
extern "C" __global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw(
const FloatAB* __restrict__ p_a_grid,
const FloatAB* __restrict__ p_b_grid,
FloatC* __restrict__ p_c_grid,
const void CONSTANT* p_a_k0_m_k1_grid_desc,
const void CONSTANT* p_b_k0_n_k1_grid_desc,
const void CONSTANT* p_c_m0_m1_m2_n_grid_desc,
const void CONSTANT* p_c_blockid_to_m0_n0_block_cluster_adaptor)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto in_n_c_hi_wi_desc =
make_naive_tensor_descriptor_packed(make_tuple(256, 256, 28, 28));
constexpr auto wei_k_c_y_x_desc =
make_naive_tensor_descriptor_packed(make_tuple(256, 256, 3, 3));
constexpr auto out_n_k_ho_wo_desc =
make_naive_tensor_descriptor_packed(make_tuple(256, 256, 28, 28));
constexpr auto descs =
transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw_pad(wei_k_c_y_x_desc,
in_n_c_hi_wi_desc,
out_n_k_ho_wo_desc,
make_tuple(1, 1),
make_tuple(1, 1),
make_tuple(1, 1),
make_tuple(1, 1),
Number<K1>{});
constexpr auto a_k0_m_k1_grid_desc_tmp = descs[I0];
constexpr auto b_k0_n_k1_grid_desc_tmp = descs[I1];
constexpr auto c_m_n_grid_desc = descs[I2];
using AGridStepHacks = decltype(make_tuple(
make_tuple(Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}),
make_tuple(
Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{})));
using BGridStepHacks =
decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{})));
using CGridStepHacks = decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{})));
using AGridMoveSliceWindowStepHacks = Sequence<0, 0, 0, 0, 0>;
using BGridMoveSliceWindowStepHacks = Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0>;
using AK0MK1GridDesc = decltype(a_k0_m_k1_grid_desc_tmp);
using BK0NK1GridDesc = decltype(b_k0_n_k1_grid_desc_tmp);
using CMNGridDesc = decltype(c_m_n_grid_desc);
using GridwiseGemm =
GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3<BlockSize,
FloatAB,
FloatAcc,
FloatC,
InMemoryDataOperationEnum_t::Set,
AK0MK1GridDesc,
BK0NK1GridDesc,
CMNGridDesc,
MPerBlock,
NPerBlock,
KPerBlock,
MPerWave,
NPerWave,
K1,
MRepeat,
NRepeat,
ABlockTransferThreadSliceLengths_K0_M_K1,
ABlockTransferThreadClusterLengths_K0_M_K1,
ABlockTransferThreadClusterArrangeOrder,
ABlockTransferSrcAccessOrder,
ABlockTransferSrcVectorDim,
ABlockTransferSrcScalarPerVector,
ABlockTransferDstScalarPerVector_K1,
AThreadTransferSrcResetCoordinateAfterRun,
BBlockTransferThreadSliceLengths_K0_N_K1,
BBlockTransferThreadClusterLengths_K0_N_K1,
BBlockTransferThreadClusterArrangeOrder,
BBlockTransferSrcAccessOrder,
BBlockTransferSrcVectorDim,
BBlockTransferSrcScalarPerVector,
BBlockTransferDstScalarPerVector_K1,
BThreadTransferSrcResetCoordinateAfterRun,
CThreadTransferSrcDstAccessOrder,
CThreadTransferSrcDstVectorDim,
CThreadTransferDstScalarPerVector,
AGridStepHacks,
BGridStepHacks,
CGridStepHacks,
AGridMoveSliceWindowStepHacks,
BGridMoveSliceWindowStepHacks,
false>;
constexpr auto c_m0_m1_m2_n_grid_desc_tmp =
GridwiseGemm::MakeCM0M1M2NGridDescriptor(c_m_n_grid_desc);
constexpr auto c_blockid_to_m0_n0_block_cluster_adaptor_tmp =
GridwiseGemm::MakeCBlockClusterAdaptor(c_m_n_grid_desc);
using CM0M1M2NGridDesc = decltype(c_m0_m1_m2_n_grid_desc_tmp);
using CBlockIdToM0N0BlockClusterAdaptor =
decltype(c_blockid_to_m0_n0_block_cluster_adaptor_tmp);
const auto a_k0_m_k1_grid_desc =
*reinterpret_cast<const AK0MK1GridDesc*>((const void*)p_a_k0_m_k1_grid_desc);
const auto b_k0_n_k1_grid_desc =
*reinterpret_cast<const BK0NK1GridDesc*>((const void*)p_b_k0_n_k1_grid_desc);
const auto c_m0_m1_m2_n_grid_desc =
*reinterpret_cast<const CM0M1M2NGridDesc*>((const void*)p_c_m0_m1_m2_n_grid_desc);
const auto c_blockid_to_m0_n0_block_cluster_adaptor =
*reinterpret_cast<const CBlockIdToM0N0BlockClusterAdaptor*>(
(const void*)p_c_blockid_to_m0_n0_block_cluster_adaptor);
constexpr index_t shared_block_size =
GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
__shared__ FloatAB p_shared_block[shared_block_size];
GridwiseGemm::Run(p_a_grid,
p_b_grid,
p_c_grid,
p_shared_block,
a_k0_m_k1_grid_desc,
b_k0_n_k1_grid_desc,
c_m0_m1_m2_n_grid_desc,
c_blockid_to_m0_n0_block_cluster_adaptor);
}
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "gridwise_gemm_xdlops_v2r3.hpp"
#include "transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp"
using namespace ck;
constexpr DataTypeEnum_t ABDataTypeEnum = static_cast<DataTypeEnum_t>(CK_PARAM_ABDataTypeEnum);
constexpr DataTypeEnum_t AccDataTypeEnum = static_cast<DataTypeEnum_t>(CK_PARAM_AccDataTypeEnum);
constexpr DataTypeEnum_t CDataTypeEnum = static_cast<DataTypeEnum_t>(CK_PARAM_CDataTypeEnum);
using FloatAB = typename get_datatype_from_enum<ABDataTypeEnum>::type;
using FloatAcc = typename get_datatype_from_enum<AccDataTypeEnum>::type;
using FloatC = typename get_datatype_from_enum<CDataTypeEnum>::type;
constexpr index_t BlockSize = CK_PARAM_BlockSize;
constexpr index_t MPerBlock = CK_PARAM_MPerBlock;
constexpr index_t NPerBlock = CK_PARAM_NPerBlock;
constexpr index_t KPerBlock = CK_PARAM_KPerBlock;
constexpr index_t MPerWave = CK_PARAM_MPerWave;
constexpr index_t NPerWave = CK_PARAM_NPerWave;
constexpr index_t MRepeat = CK_PARAM_MRepeat;
constexpr index_t NRepeat = CK_PARAM_NRepeat;
constexpr index_t K1 = CK_PARAM_K1;
using ABlockTransferThreadSliceLengths_K0_M_K1 =
Sequence<CK_PARAM_ABlockTransferThreadSliceLengths_K0_M_K1>;
using ABlockTransferThreadClusterLengths_K0_M_K1 =
Sequence<CK_PARAM_ABlockTransferThreadClusterLengths_K0_M_K1>;
using ABlockTransferThreadClusterArrangeOrder =
Sequence<CK_PARAM_ABlockTransferThreadClusterArrangeOrder>;
using ABlockTransferSrcAccessOrder = Sequence<CK_PARAM_ABlockTransferSrcAccessOrder>;
constexpr index_t ABlockTransferSrcVectorDim = CK_PARAM_ABlockTransferSrcVectorDim;
constexpr index_t ABlockTransferSrcScalarPerVector = CK_PARAM_ABlockTransferSrcScalarPerVector;
constexpr index_t ABlockTransferDstScalarPerVector_K1 =
CK_PARAM_ABlockTransferDstScalarPerVector_K1;
constexpr bool AThreadTransferSrcResetCoordinateAfterRun =
static_cast<bool>(CK_PARAM_AThreadTransferSrcResetCoordinateAfterRun);
using BBlockTransferThreadSliceLengths_K0_N_K1 =
Sequence<CK_PARAM_BBlockTransferThreadSliceLengths_K0_N_K1>;
using BBlockTransferThreadClusterLengths_K0_N_K1 =
Sequence<CK_PARAM_BBlockTransferThreadClusterLengths_K0_N_K1>;
using BBlockTransferThreadClusterArrangeOrder =
Sequence<CK_PARAM_BBlockTransferThreadClusterArrangeOrder>;
using BBlockTransferSrcAccessOrder = Sequence<CK_PARAM_BBlockTransferSrcAccessOrder>;
constexpr index_t BBlockTransferSrcVectorDim = CK_PARAM_BBlockTransferSrcVectorDim;
constexpr index_t BBlockTransferSrcScalarPerVector = CK_PARAM_BBlockTransferSrcScalarPerVector;
constexpr index_t BBlockTransferDstScalarPerVector_K1 =
CK_PARAM_BBlockTransferDstScalarPerVector_K1;
constexpr bool BThreadTransferSrcResetCoordinateAfterRun =
static_cast<bool>(CK_PARAM_BThreadTransferSrcResetCoordinateAfterRun);
using CThreadTransferSrcDstAccessOrder = Sequence<CK_PARAM_CThreadTransferSrcDstAccessOrder>;
constexpr index_t CThreadTransferSrcDstVectorDim = CK_PARAM_CThreadTransferSrcDstVectorDim;
constexpr index_t CThreadTransferDstScalarPerVector = CK_PARAM_CThreadTransferDstScalarPerVector;
extern "C" __global__ void convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk_prepare(
int n,
int hi,
int wi,
int c,
int k,
int y,
int x,
int convStrideH,
int convStrideW,
int convDilationY,
int convDilationX,
int leftPadH,
int leftPadW,
int rightPadH,
int rightPadW,
void* p_a_k0_m_k1_grid_desc,
void* p_b_k0_n_k1_grid_desc,
void* p_c_m0_m1_m2_n_grid_desc,
void* p_c_blockid_to_m0_n0_block_cluster_adaptor)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
const index_t ho = (hi + leftPadH + rightPadH - convDilationY * (y - 1) - 1) / convStrideH + 1;
const index_t wo = (wi + leftPadW + rightPadW - convDilationX * (x - 1) - 1) / convStrideW + 1;
const auto in_n_hi_wi_c_desc = make_naive_tensor_descriptor_packed(make_tuple(n, hi, wi, c));
const auto wei_k_y_x_c_desc = make_naive_tensor_descriptor_packed(make_tuple(k, y, x, c));
const auto out_n_ho_wo_k_desc = make_naive_tensor_descriptor_packed(make_tuple(n, ho, wo, k));
const auto descs = transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk_pad(
in_n_hi_wi_c_desc,
wei_k_y_x_c_desc,
out_n_ho_wo_k_desc,
make_tuple(convStrideH, convStrideW),
make_tuple(convDilationY, convDilationX),
make_tuple(leftPadH, leftPadW),
make_tuple(rightPadH, rightPadW),
Number<K1>{});
const auto a_k0_m_k1_grid_desc = descs[I0];
const auto b_k0_n_k1_grid_desc = descs[I1];
const auto c_m_n_grid_desc = descs[I2];
using AK0MK1GridDesc = decltype(a_k0_m_k1_grid_desc);
using BK0NK1GridDesc = decltype(b_k0_n_k1_grid_desc);
using CMNGridDesc = decltype(c_m_n_grid_desc);
using BGridStepHacks = decltype(make_tuple(
make_tuple(Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}),
make_tuple(
Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{})));
using AGridStepHacks =
decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{})));
using CGridStepHacks = decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{})));
using AGridMoveSliceWindowStepHacks = Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0>;
using BGridMoveSliceWindowStepHacks = Sequence<0, 0, 0, 0, 0>;
using GridwiseGemm =
GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3<BlockSize,
FloatAB,
FloatAcc,
FloatC,
InMemoryDataOperationEnum_t::Set,
AK0MK1GridDesc,
BK0NK1GridDesc,
CMNGridDesc,
MPerBlock,
NPerBlock,
KPerBlock,
MPerWave,
NPerWave,
K1,
MRepeat,
NRepeat,
ABlockTransferThreadSliceLengths_K0_M_K1,
ABlockTransferThreadClusterLengths_K0_M_K1,
ABlockTransferThreadClusterArrangeOrder,
ABlockTransferSrcAccessOrder,
ABlockTransferSrcVectorDim,
ABlockTransferSrcScalarPerVector,
ABlockTransferDstScalarPerVector_K1,
AThreadTransferSrcResetCoordinateAfterRun,
BBlockTransferThreadSliceLengths_K0_N_K1,
BBlockTransferThreadClusterLengths_K0_N_K1,
BBlockTransferThreadClusterArrangeOrder,
BBlockTransferSrcAccessOrder,
BBlockTransferSrcVectorDim,
BBlockTransferSrcScalarPerVector,
BBlockTransferDstScalarPerVector_K1,
BThreadTransferSrcResetCoordinateAfterRun,
CThreadTransferSrcDstAccessOrder,
CThreadTransferSrcDstVectorDim,
CThreadTransferDstScalarPerVector,
AGridStepHacks,
BGridStepHacks,
CGridStepHacks,
AGridMoveSliceWindowStepHacks,
BGridMoveSliceWindowStepHacks,
false>;
auto c_m0_m1_m2_n_grid_desc = GridwiseGemm::MakeCM0M1M2NGridDescriptor(c_m_n_grid_desc);
auto c_blockid_to_m0_n0_block_cluster_adaptor =
GridwiseGemm::MakeCBlockClusterAdaptor(c_m_n_grid_desc);
if(hipThreadIdx_x == 0)
{
*static_cast<remove_cv_t<decltype(a_k0_m_k1_grid_desc)>*>(p_a_k0_m_k1_grid_desc) =
a_k0_m_k1_grid_desc;
*static_cast<remove_cv_t<decltype(b_k0_n_k1_grid_desc)>*>(p_b_k0_n_k1_grid_desc) =
b_k0_n_k1_grid_desc;
*static_cast<decltype(c_m0_m1_m2_n_grid_desc)*>(p_c_m0_m1_m2_n_grid_desc) =
c_m0_m1_m2_n_grid_desc;
*static_cast<decltype(c_blockid_to_m0_n0_block_cluster_adaptor)*>(
p_c_blockid_to_m0_n0_block_cluster_adaptor) = c_blockid_to_m0_n0_block_cluster_adaptor;
}
}
extern "C" __global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk(
const FloatAB* __restrict__ p_a_grid,
const FloatAB* __restrict__ p_b_grid,
FloatC* __restrict__ p_c_grid,
const void CONSTANT* p_a_k0_m_k1_grid_desc,
const void CONSTANT* p_b_k0_n_k1_grid_desc,
const void CONSTANT* p_c_m0_m1_m2_n_grid_desc,
const void CONSTANT* p_c_blockid_to_m0_n0_block_cluster_adaptor)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto in_n_hi_wi_c_desc =
make_naive_tensor_descriptor_packed(make_tuple(256, 28, 28, 256));
constexpr auto wei_k_y_x_c_desc =
make_naive_tensor_descriptor_packed(make_tuple(256, 3, 3, 256));
constexpr auto out_n_ho_wo_k_desc =
make_naive_tensor_descriptor_packed(make_tuple(256, 28, 28, 256));
constexpr auto descs =
transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk_pad(in_n_hi_wi_c_desc,
wei_k_y_x_c_desc,
out_n_ho_wo_k_desc,
make_tuple(1, 1),
make_tuple(1, 1),
make_tuple(1, 1),
make_tuple(1, 1),
Number<K1>{});
constexpr auto a_k0_m_k1_grid_desc_tmp = descs[I0];
constexpr auto b_k0_n_k1_grid_desc_tmp = descs[I1];
constexpr auto c_m_n_grid_desc = descs[I2];
using AK0MK1GridDesc = decltype(a_k0_m_k1_grid_desc_tmp);
using BK0NK1GridDesc = decltype(b_k0_n_k1_grid_desc_tmp);
using CMNGridDesc = decltype(c_m_n_grid_desc);
using BGridStepHacks = decltype(make_tuple(
make_tuple(Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}),
make_tuple(
Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{})));
using AGridStepHacks =
decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{})));
using CGridStepHacks = decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{})));
using AGridMoveSliceWindowStepHacks = Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0>;
using BGridMoveSliceWindowStepHacks = Sequence<0, 0, 0, 0, 0>;
using GridwiseGemm =
GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3<BlockSize,
FloatAB,
FloatAcc,
FloatC,
InMemoryDataOperationEnum_t::Set,
AK0MK1GridDesc,
BK0NK1GridDesc,
CMNGridDesc,
MPerBlock,
NPerBlock,
KPerBlock,
MPerWave,
NPerWave,
K1,
MRepeat,
NRepeat,
ABlockTransferThreadSliceLengths_K0_M_K1,
ABlockTransferThreadClusterLengths_K0_M_K1,
ABlockTransferThreadClusterArrangeOrder,
ABlockTransferSrcAccessOrder,
ABlockTransferSrcVectorDim,
ABlockTransferSrcScalarPerVector,
ABlockTransferDstScalarPerVector_K1,
AThreadTransferSrcResetCoordinateAfterRun,
BBlockTransferThreadSliceLengths_K0_N_K1,
BBlockTransferThreadClusterLengths_K0_N_K1,
BBlockTransferThreadClusterArrangeOrder,
BBlockTransferSrcAccessOrder,
BBlockTransferSrcVectorDim,
BBlockTransferSrcScalarPerVector,
BBlockTransferDstScalarPerVector_K1,
BThreadTransferSrcResetCoordinateAfterRun,
CThreadTransferSrcDstAccessOrder,
CThreadTransferSrcDstVectorDim,
CThreadTransferDstScalarPerVector,
AGridStepHacks,
BGridStepHacks,
CGridStepHacks,
AGridMoveSliceWindowStepHacks,
BGridMoveSliceWindowStepHacks,
false>;
constexpr auto c_m0_m1_m2_n_grid_desc_tmp =
GridwiseGemm::MakeCM0M1M2NGridDescriptor(c_m_n_grid_desc);
constexpr auto c_blockid_to_m0_n0_block_cluster_adaptor_tmp =
GridwiseGemm::MakeCBlockClusterAdaptor(c_m_n_grid_desc);
using CM0M1M2NGridDesc = decltype(c_m0_m1_m2_n_grid_desc_tmp);
using CBlockIdToM0N0BlockClusterAdaptor =
decltype(c_blockid_to_m0_n0_block_cluster_adaptor_tmp);
const auto a_k0_m_k1_grid_desc =
*reinterpret_cast<const AK0MK1GridDesc*>((const void*)p_a_k0_m_k1_grid_desc);
const auto b_k0_n_k1_grid_desc =
*reinterpret_cast<const BK0NK1GridDesc*>((const void*)p_b_k0_n_k1_grid_desc);
const auto c_m0_m1_m2_n_grid_desc =
*reinterpret_cast<const CM0M1M2NGridDesc*>((const void*)p_c_m0_m1_m2_n_grid_desc);
const auto c_blockid_to_m0_n0_block_cluster_adaptor =
*reinterpret_cast<const CBlockIdToM0N0BlockClusterAdaptor*>(
(const void*)p_c_blockid_to_m0_n0_block_cluster_adaptor);
constexpr index_t shared_block_size =
GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
__shared__ FloatAB p_shared_block[shared_block_size];
GridwiseGemm::Run(p_a_grid,
p_b_grid,
p_c_grid,
p_shared_block,
a_k0_m_k1_grid_desc,
b_k0_n_k1_grid_desc,
c_m0_m1_m2_n_grid_desc,
c_blockid_to_m0_n0_block_cluster_adaptor);
}
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "gridwise_contraction_dlops_v1r2.hpp"
#include "transform_forward_convolution_into_gemm_v6r1_nchw_kcyx_nkhw.hpp"
using namespace ck;
constexpr DataTypeEnum_t ABDataTypeEnum = static_cast<DataTypeEnum_t>(CK_PARAM_ABDataTypeEnum);
constexpr DataTypeEnum_t AccDataTypeEnum = static_cast<DataTypeEnum_t>(CK_PARAM_AccDataTypeEnum);
constexpr DataTypeEnum_t CDataTypeEnum = static_cast<DataTypeEnum_t>(CK_PARAM_CDataTypeEnum);
using FloatAB = typename get_datatype_from_enum<ABDataTypeEnum>::type;
using FloatAcc = typename get_datatype_from_enum<AccDataTypeEnum>::type;
using FloatC = typename get_datatype_from_enum<CDataTypeEnum>::type;
constexpr index_t BlockSize = CK_PARAM_BlockSize;
constexpr auto GN0 = Number<CK_PARAM_GN0>{};
constexpr auto GK1 = Number<CK_PARAM_GK1>{};
constexpr index_t GM1PerBlockGM11 = CK_PARAM_GM1PerBlockGM11;
constexpr index_t GN1PerBlockGN11 = CK_PARAM_GN1PerBlockGN11;
constexpr index_t GK0PerBlock = CK_PARAM_GK0PerBlock;
constexpr index_t BM1PerThreadBM11 = CK_PARAM_BM1PerThreadBM11;
constexpr index_t BN1PerThreadBN11 = CK_PARAM_BN1PerThreadBN11;
constexpr index_t BK0PerThread = CK_PARAM_BK0PerThread;
using BM10BN10ThreadClusterBM10Xs = Sequence<CK_PARAM_BM10BN10ThreadClusterBM10Xs>;
using BM10BN10ThreadClusterBN10Xs = Sequence<CK_PARAM_BM10BN10ThreadClusterBN10Xs>;
using ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1 =
Sequence<CK_PARAM_ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1>;
using ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1 =
Sequence<CK_PARAM_ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1>;
using ABlockTransferThreadClusterArrangeOrder = Sequence<1, 2, 3, 0, 4>;
using ABlockTransferSrcAccessOrder = Sequence<3, 2, 1, 0, 4>;
using ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 =
Sequence<CK_PARAM_ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1>;
using ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 =
Sequence<CK_PARAM_ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1>;
using ABlockTransferSrcVectorTensorContiguousDimOrder = Sequence<0, 1, 2, 3, 4>;
using BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1 =
Sequence<CK_PARAM_BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1>;
using BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1 =
Sequence<CK_PARAM_BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1>;
using BBlockTransferThreadClusterArrangeOrder = Sequence<0, 4, 1, 2, 3>;
using BBlockTransferSrcAccessOrder = Sequence<4, 3, 2, 0, 1>;
using BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 =
Sequence<CK_PARAM_BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1>;
using BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 =
Sequence<CK_PARAM_BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1>;
using BBlockTransferSrcVectorTensorContiguousDimOrder = Sequence<0, 1, 2, 3, 4>;
using CThreadTransferSrcDstAccessOrder = Sequence<3, 4, 5, 0, 1, 2>;
constexpr index_t CThreadTransferSrcDstVectorDim = 5;
constexpr index_t CThreadTransferDstScalarPerVector = CK_PARAM_CThreadTransferDstScalarPerVector;
constexpr bool HasMainKBlockLoop = static_cast<bool>(CK_PARAM_HasMainKBlockLoop);
constexpr bool HasDoubleTailKBlockLoop = static_cast<bool>(CK_PARAM_HasDoubleTailKBlockLoop);
extern "C" __global__ void
convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw_prepare(int N_,
int C_,
int Hi_,
int Wi_,
int K_,
int Y_,
int X_,
int ConvStrideH_,
int ConvStrideW_,
int ConvDilationH_,
int ConvDilationW_,
int InLeftPadH_,
int InLeftPadW_,
int InRightPadH_,
int InRightPadW_,
void* p_desc_tuple)
{
index_t N = static_cast<index_t>(N_);
index_t C = static_cast<index_t>(C_);
index_t Hi = static_cast<index_t>(Hi_);
index_t Wi = static_cast<index_t>(Wi_);
index_t K = static_cast<index_t>(K_);
index_t Y = static_cast<index_t>(Y_);
index_t X = static_cast<index_t>(X_);
index_t ConvStrideH = static_cast<index_t>(ConvStrideH_);
index_t ConvStrideW = static_cast<index_t>(ConvStrideW_);
index_t ConvDilationH = static_cast<index_t>(ConvDilationH_);
index_t ConvDilationW = static_cast<index_t>(ConvDilationW_);
index_t InLeftPadH = static_cast<index_t>(InLeftPadH_);
index_t InLeftPadW = static_cast<index_t>(InLeftPadW_);
index_t InRightPadH = static_cast<index_t>(InRightPadH_);
index_t InRightPadW = static_cast<index_t>(InRightPadW_);
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
const index_t Ho =
(Hi + InLeftPadH + InRightPadH - ConvDilationH * (Y - 1) - 1) / ConvStrideH + 1;
const index_t Wo =
(Wi + InLeftPadW + InRightPadW - ConvDilationW * (X - 1) - 1) / ConvStrideW + 1;
const auto in_n_c_hi_wi_desc = make_naive_tensor_descriptor_packed(make_tuple(N, C, Hi, Wi));
const auto wei_k_c_y_x_desc = make_naive_tensor_descriptor_packed(make_tuple(K, C, Y, X));
const auto out_n_k_ho_wo_desc = make_naive_tensor_descriptor_packed(make_tuple(N, K, Ho, Wo));
const auto descs = transform_forward_convolution_into_contraction_v6r1_nchw_kcyx_nkhw_pad(
wei_k_c_y_x_desc,
in_n_c_hi_wi_desc,
out_n_k_ho_wo_desc,
make_tuple(ConvStrideH, ConvStrideW),
make_tuple(ConvDilationH, ConvDilationW),
make_tuple(InLeftPadH, InLeftPadW),
make_tuple(InRightPadH, InRightPadW),
GN0,
GK1);
const auto a_grid_desc_gk0_gm0_gm1_gk1 = descs[I0];
const auto b_grid_desc_gk0_gn0_gn1_gk1 = descs[I1];
const auto c_grid_desc_gm0_gm1_gn0_gn1 = descs[I2];
using AGridDesc_GK0_GM0_GM1_GK1 = decltype(a_grid_desc_gk0_gm0_gm1_gk1);
using BGridDesc_GK0_GN0_GN1_GK1 = decltype(b_grid_desc_gk0_gn0_gn1_gk1);
using CGridDesc_GM0_GM1_GN0_GN1 = decltype(c_grid_desc_gm0_gm1_gn0_gn1);
using AGridStepHacks =
decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 0+: GK0
Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 1+: GM0
Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 2+: GM10
Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 3+: GM11
Sequence<0, 0, 0, 0, 0, 0, 0>{}), // 4+: GK1
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 0-: GK0
Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 1-: GM0
Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 2-: GM10
Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 3-: GM11
Sequence<0, 0, 0, 0, 0, 0, 0>{}))); // 4-: GK1
using BGridStepHacks = decltype(make_tuple(
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0>{}, // 0+: GK0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0>{}, // 1+: GN0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0>{}, // 2+: GN10
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0>{}, // 3+: GN11
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}), // 4+: GK1
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0>{}, // 0-: GK0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0>{}, // 1-: GN0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0>{}, // 2-: GN10
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0>{}, // 3-: GN11
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}))); // 4-: GK1
using CGridStepHacks = decltype(make_tuple(
make_tuple(
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 0+: GM10
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0>{}, // 1+: BM0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0>{}, // 2+: BM1
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 3+: GN10
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0>{}, // 4+: BN0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0>{}), // 5+: GN1
make_tuple(
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 0-: GM10
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0>{}, // 1-: BM0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0>{}, // 2-: BM1
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 3-: GN10
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0>{}, // 4-: BN0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0>{}))); // 5-: GN1
using AGridMoveSliceWindowStepHacks = Sequence<0, 0, 0, 0, 0, 0, 0>;
using BGridMoveSliceWindowStepHacks =
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 0, 0, 0>;
using GridwiseContraction =
GridwiseContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0_GM1_GN0_GN1<
BlockSize,
FloatAB,
FloatAcc,
FloatC,
InMemoryDataOperationEnum_t::Set,
AGridDesc_GK0_GM0_GM1_GK1,
BGridDesc_GK0_GN0_GN1_GK1,
CGridDesc_GM0_GM1_GN0_GN1,
GM1PerBlockGM11,
GN1PerBlockGN11,
GK0PerBlock,
BM1PerThreadBM11,
BN1PerThreadBN11,
BK0PerThread,
BM10BN10ThreadClusterBM10Xs,
BM10BN10ThreadClusterBN10Xs,
ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1,
ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1,
ABlockTransferThreadClusterArrangeOrder,
ABlockTransferSrcAccessOrder,
ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1,
ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1,
ABlockTransferSrcVectorTensorContiguousDimOrder,
BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1,
BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1,
BBlockTransferThreadClusterArrangeOrder,
BBlockTransferSrcAccessOrder,
BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1,
BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1,
BBlockTransferSrcVectorTensorContiguousDimOrder,
CThreadTransferSrcDstAccessOrder,
CThreadTransferSrcDstVectorDim,
CThreadTransferDstScalarPerVector,
AGridStepHacks,
BGridStepHacks,
CGridStepHacks,
AGridMoveSliceWindowStepHacks,
BGridMoveSliceWindowStepHacks>;
if(get_block_1d_id() == 0 && get_thread_local_1d_id() == 0)
{
auto desc_tuple =
make_tuple(GridwiseContraction::MakeAGridDescriptor_GK0_GM0_GM10_GM11_GK1(
a_grid_desc_gk0_gm0_gm1_gk1),
GridwiseContraction::MakeBGridDescriptor_GK0_GN0_GN10_GN11_GK1(
b_grid_desc_gk0_gn0_gn1_gk1),
GridwiseContraction::MakeCGridDescriptor_GM10_BM0_BM1_GN10_BN0_BN1(
c_grid_desc_gm0_gm1_gn0_gn1),
GridwiseContraction::MakeCGridBlockCluster_BlockId_To_GM10_GN10(
c_grid_desc_gm0_gm1_gn0_gn1));
*static_cast<decltype(desc_tuple)*>(p_desc_tuple) = desc_tuple;
}
}
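// Note: unlike the v4r4 prepare kernels above, this variant packs all four
// descriptors into a single tuple, so the host allocates one buffer
// (p_desc_tuple) and passes the same pointer to the prepare and main kernels.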
extern "C" __global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw(
const FloatAB* __restrict__ p_a_grid,
const FloatAB* __restrict__ p_b_grid,
FloatC* __restrict__ p_c_grid,
const void CONSTANT* p_desc_tuple)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto in_n_c_hi_wi_desc =
make_naive_tensor_descriptor_packed(make_tuple(256, 256, 28, 28));
constexpr auto wei_k_c_y_x_desc =
make_naive_tensor_descriptor_packed(make_tuple(256, 256, 3, 3));
constexpr auto out_n_k_ho_wo_desc =
make_naive_tensor_descriptor_packed(make_tuple(256, 256, 28, 28));
constexpr auto descs =
transform_forward_convolution_into_contraction_v6r1_nchw_kcyx_nkhw_pad(wei_k_c_y_x_desc,
in_n_c_hi_wi_desc,
out_n_k_ho_wo_desc,
make_tuple(1, 1),
make_tuple(1, 1),
make_tuple(1, 1),
make_tuple(1, 1),
GN0,
GK1);
constexpr auto a_grid_desc_gk0_gm0_gm1_gk1 = descs[I0];
constexpr auto b_grid_desc_gk0_gn0_gn1_gk1 = descs[I1];
constexpr auto c_grid_desc_gm0_gm1_gn0_gn1 = descs[I2];
using AGridDesc_GK0_GM0_GM1_GK1 = decltype(a_grid_desc_gk0_gm0_gm1_gk1);
using BGridDesc_GK0_GN0_GN1_GK1 = decltype(b_grid_desc_gk0_gn0_gn1_gk1);
using CGridDesc_GM0_GM1_GN0_GN1 = decltype(c_grid_desc_gm0_gm1_gn0_gn1);
using AGridStepHacks =
decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 0+: GK0
Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 1+: GM0
Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 2+: GM10
Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 3+: GM11
Sequence<0, 0, 0, 0, 0, 0, 0>{}), // 4+: GK1
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 0-: GK0
Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 1-: GM0
Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 2-: GM10
Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 3-: GM11
Sequence<0, 0, 0, 0, 0, 0, 0>{}))); // 4-: GK1
using BGridStepHacks = decltype(make_tuple(
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0>{}, // 0+: GK0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0>{}, // 1+: GN0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0>{}, // 2+: GN10
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0>{}, // 3+: GN11
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}), // 4+: GK1
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0>{}, // 0-: GK0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0>{}, // 1-: GN0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0>{}, // 2-: GN10
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0>{}, // 3-: GN11
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}))); // 4-: GK1
using CGridStepHacks = decltype(make_tuple(
make_tuple(
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 0+: GM10
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0>{}, // 1+: BM0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0>{}, // 2+: BM1
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 3+: GN10
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0>{}, // 4+: BN0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0>{}), // 5+: GN1
make_tuple(
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 0-: GM10
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0>{}, // 1-: BM0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0>{}, // 2-: BM1
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 3-: GN10
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0>{}, // 4-: BN0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0>{}))); // 5-: GN1
using AGridMoveSliceWindowStepHacks = Sequence<0, 0, 0, 0, 0, 0, 0>;
using BGridMoveSliceWindowStepHacks =
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 0, 0, 0>;
using GridwiseContraction =
GridwiseContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0_GM1_GN0_GN1<
BlockSize,
FloatAB,
FloatAcc,
FloatC,
InMemoryDataOperationEnum_t::Set,
AGridDesc_GK0_GM0_GM1_GK1,
BGridDesc_GK0_GN0_GN1_GK1,
CGridDesc_GM0_GM1_GN0_GN1,
GM1PerBlockGM11,
GN1PerBlockGN11,
GK0PerBlock,
BM1PerThreadBM11,
BN1PerThreadBN11,
BK0PerThread,
BM10BN10ThreadClusterBM10Xs,
BM10BN10ThreadClusterBN10Xs,
ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1,
ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1,
ABlockTransferThreadClusterArrangeOrder,
ABlockTransferSrcAccessOrder,
ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1,
ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1,
ABlockTransferSrcVectorTensorContiguousDimOrder,
BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1,
BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1,
BBlockTransferThreadClusterArrangeOrder,
BBlockTransferSrcAccessOrder,
BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1,
BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1,
BBlockTransferSrcVectorTensorContiguousDimOrder,
CThreadTransferSrcDstAccessOrder,
CThreadTransferSrcDstVectorDim,
CThreadTransferDstScalarPerVector,
AGridStepHacks,
BGridStepHacks,
CGridStepHacks,
AGridMoveSliceWindowStepHacks,
BGridMoveSliceWindowStepHacks>;
using AGridDesc_GK0_GM0_GM10_GM11_GK1 =
decltype(GridwiseContraction::MakeAGridDescriptor_GK0_GM0_GM10_GM11_GK1(
a_grid_desc_gk0_gm0_gm1_gk1));
using BGridDesc_GK0_GN0_GN10_GN11_GK1 =
decltype(GridwiseContraction::MakeBGridDescriptor_GK0_GN0_GN10_GN11_GK1(
b_grid_desc_gk0_gn0_gn1_gk1));
using CGridDesc_GM10_BM0_BM1_GN10_BN0_BN1 =
decltype(GridwiseContraction::MakeCGridDescriptor_GM10_BM0_BM1_GN10_BN0_BN1(
c_grid_desc_gm0_gm1_gn0_gn1));
using CGridBlockCluster_BlockId_To_GM10_GN10 =
decltype(GridwiseContraction::MakeCGridBlockCluster_BlockId_To_GM10_GN10(
c_grid_desc_gm0_gm1_gn0_gn1));
using DescTuple = decltype(make_tuple(AGridDesc_GK0_GM0_GM10_GM11_GK1{},
BGridDesc_GK0_GN0_GN10_GN11_GK1{},
CGridDesc_GM10_BM0_BM1_GN10_BN0_BN1{},
CGridBlockCluster_BlockId_To_GM10_GN10{}));
const auto desc_tuple = *reinterpret_cast<const DescTuple*>(
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wold-style-cast"
// TODO: how to cast?
(const void*)p_desc_tuple
#pragma clang diagnostic pop
);
const auto a_grid_desc_gk0_gm0_gm10_gm11_gk1 = desc_tuple[I0];
const auto b_grid_desc_gk0_gn0_gn10_gn11_gk1 = desc_tuple[I1];
const auto c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1 = desc_tuple[I2];
const auto c_grid_block_cluster_blockid_to_gm10_gn10 = desc_tuple[I3];
constexpr index_t shared_block_size =
GridwiseContraction::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
__shared__ FloatAB p_shared_block[shared_block_size];
GridwiseContraction::Run(p_a_grid,
p_b_grid,
p_c_grid,
p_shared_block,
a_grid_desc_gk0_gm0_gm10_gm11_gk1,
b_grid_desc_gk0_gn0_gn10_gn11_gk1,
c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1,
c_grid_block_cluster_blockid_to_gm10_gn10,
integral_constant<bool, HasMainKBlockLoop>{},
integral_constant<bool, HasDoubleTailKBlockLoop>{});
}
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2019 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef BFLOAT16_DEVICE_HPP
#define BFLOAT16_DEVICE_HPP
#ifdef __cplusplus
extern "C" {
#endif
#ifdef __HIP_PLATFORM_HCC__
#define EXECUTION_SPECIFIER __device__
#else
#define EXECUTION_SPECIFIER
#endif // __HIP_PLATFORM_HCC__
typedef union
{
uint u32;
ushort2 ushortx2;
// Composable kernels are written in HIP, which does not support the
// OpenCL-style ushort2.hi / ushort2.lo accessors.
#ifdef __HIP_PLATFORM_HCC__
ushort ushortvec[2];
#endif // __HIP_PLATFORM_HCC__
float f32;
} cvt_bf16_fp32_t;
EXECUTION_SPECIFIER float bfloat16_to_float(ushort src_val)
{
cvt_bf16_fp32_t target_val;
#ifdef __HIP_PLATFORM_HCC__
target_val.ushortx2 = make_ushort2(0, src_val);
#else
target_val.ushortx2 = (ushort2)(0, src_val);
#endif
return target_val.f32;
}
EXECUTION_SPECIFIER ushort float_to_bfloat16(float src_val)
{
cvt_bf16_fp32_t target_val;
target_val.f32 = src_val;
// BF16 round and NaN preservation code matches
// https://github.com/ROCmSoftwarePlatform/rocBLAS/blob/develop/library/include/rocblas_bfloat16.h
if((~target_val.u32 & 0x7f800000) == 0) // Inf or NaN
{
// When all of the exponent bits are 1, the value is Inf or NaN.
// Inf is indicated by a zero mantissa. NaN is indicated by any nonzero
// mantissa bit. Quiet NaN is indicated by the most significant mantissa
// bit being 1. Signaling NaN is indicated by the most significant
// mantissa bit being 0 but some other bit(s) being 1. If any of the
// lower 16 bits of the mantissa are 1, we set the least significant bit
// of the bfloat16 mantissa, in order to preserve signaling NaN in case
// the bfloat16's mantissa bits are all 0.
if((target_val.u32 & 0xffff) != 0)
{
target_val.u32 |= 0x10000; // Preserve signaling NaN
}
}
else
{
#ifdef MIOPEN_USE_RNE_BFLOAT16
// When the exponent bits are not all 1s, then the value is zero, normal,
// or subnormal. We round the bfloat16 mantissa up by adding 0x7FFF, plus
// 1 if the least significant bit of the bfloat16 mantissa is 1 (odd).
// This causes the bfloat16's mantissa to be incremented by 1 if the 16
// least significant bits of the float mantissa are greater than 0x8000,
// or if they are equal to 0x8000 and the least significant bit of the
// bfloat16 mantissa is 1 (odd). This causes it to be rounded to even when
// the lower 16 bits are exactly 0x8000. If the bfloat16 mantissa already
// has the value 0x7f, then incrementing it causes it to become 0x00 and
// the exponent is incremented by one, which is the next higher FP value
// to the unrounded bfloat16 value. When the bfloat16 value is subnormal
// with an exponent of 0x00 and a mantissa of 0x7F, it may be rounded up
// to a normal value with an exponent of 0x01 and a mantissa of 0x00.
// When the bfloat16 value has an exponent of 0xFE and a mantissa of 0x7F,
// incrementing it causes it to become an exponent of 0xFF and a mantissa
// of 0x00, which is Inf, the next higher value to the unrounded value.
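// Worked examples (the high ushort of u32 holds the bfloat16 pattern):
// 0x3F808000 lies exactly halfway between bf16 0x3F80 and 0x3F81; its bf16
// mantissa LSB is 0 (even), so adding 0x7fff yields 0x3F80FFFF and the result
// truncates to 0x3F80 (round to even). 0x3F818000 has an odd LSB, so adding
// 0x7fff + 1 = 0x8000 yields 0x3F820000, rounding up to the even pattern 0x3F82.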
#ifdef __HIP_PLATFORM_HCC__
target_val.u32 += (0x7fff + (target_val.ushortvec[1] & 1));
#else
target_val.u32 +=
(0x7fff + (target_val.ushortx2.hi & 1)); // Round to nearest, round to even
#endif // __HIP_PLATFORM_HCC__
#endif // MIOPEN_USE_RNE_BFLOAT16
}
#ifdef __HIP_PLATFORM_HCC__
return target_val.ushortvec[1];
#else
return target_val.ushortx2.hi;
#endif // __HIP_PLATFORM_HCC__
}
#ifdef __cplusplus
}
#endif
#endif // BFLOAT16_DEVICE_HPP
add_subdirectory(host_tensor)
add_subdirectory(driver_offline)
include_directories(BEFORE
include
${PROJECT_SOURCE_DIR}/host/host_tensor/include
${PROJECT_SOURCE_DIR}/host/solver/include
${PROJECT_SOURCE_DIR}/composable_kernel/include
${PROJECT_SOURCE_DIR}/composable_kernel/include/utility
${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_description
${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_operation
${PROJECT_SOURCE_DIR}/composable_kernel/include/problem_transform
${PROJECT_SOURCE_DIR}/composable_kernel/include/driver
${PROJECT_SOURCE_DIR}/external/rocm/include
)
set(CONV_FWD_DRIVER_OFFLINE_SOURCE src/conv_fwd_driver_offline.cpp)
set(CONV_BWD_DRIVER_OFFLINE_SOURCE src/conv_bwd_driver_offline.cpp)
add_executable(conv_fwd_driver_offline ${CONV_FWD_DRIVER_OFFLINE_SOURCE})
add_executable(conv_bwd_driver_offline ${CONV_BWD_DRIVER_OFFLINE_SOURCE})
target_link_libraries(conv_fwd_driver_offline PRIVATE host_tensor)
target_link_libraries(conv_bwd_driver_offline PRIVATE host_tensor)
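# A typical out-of-source build of the offline drivers might look like this
# (an illustrative sketch; paths and compiler flags are assumptions):
#   mkdir build && cd build
#   cmake -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ..
#   make conv_fwd_driver_offline conv_bwd_driver_offline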
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "transform_backward_data_convolution_into_gemm_v4r1_nhwc_kyxc_nhwk.hpp"
#include "driver_gemm_xdlops_v2r3.hpp"
template <typename TInWei,
typename TAcc,
typename TOut,
typename InLengths,
typename WeiLengths,
typename OutLengths,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void device_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk(
const InLengths& in_n_hi_wi_c_lengths,
const WeiLengths& wei_k_y_x_c_lengths,
const OutLengths& out_n_ho_wo_k_lengths,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads& in_right_pads,
Tensor<TInWei>& in_n_hi_wi_c,
const Tensor<TInWei>& wei_k_y_x_c,
const Tensor<TOut>& out_n_ho_wo_k,
ck::index_t nrepeat)
{
using namespace ck;
std::cout << __func__ << std::endl;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
DeviceMem in_n_hi_wi_c_device_buf(sizeof(TInWei) * in_n_hi_wi_c.mDesc.GetElementSpace());
DeviceMem wei_k_y_x_c_device_buf(sizeof(TInWei) * wei_k_y_x_c.mDesc.GetElementSpace());
DeviceMem out_n_ho_wo_k_device_buf(sizeof(TOut) * out_n_ho_wo_k.mDesc.GetElementSpace());
in_n_hi_wi_c_device_buf.ToDevice(in_n_hi_wi_c.mData.data());
wei_k_y_x_c_device_buf.ToDevice(wei_k_y_x_c.mData.data());
out_n_ho_wo_k_device_buf.ToDevice(out_n_ho_wo_k.mData.data());
const auto in_n_hi_wi_c_desc = make_naive_tensor_descriptor_packed(in_n_hi_wi_c_lengths);
const auto wei_k_y_x_c_desc = make_naive_tensor_descriptor_packed(wei_k_y_x_c_lengths);
const auto out_n_ho_wo_k_desc = make_naive_tensor_descriptor_packed(out_n_ho_wo_k_lengths);
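// Tuning configurations: exactly one of the #if/#elif branches below is
// compiled; toggle the 0/1 literals to select an [M, N, K0, K1] blocking for a
// given data type.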
#if 1
// [M, N, K0, K1] = [128, 128, 4, 4] for fp32
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 128;
constexpr index_t GemmNPerBlock = 128;
constexpr index_t GemmKPerBlock = 4;
constexpr index_t GemmMPerWave = 32;
constexpr index_t GemmNPerWave = 32;
constexpr index_t GemmK1 = 4;
constexpr index_t MRepeat = 2;
constexpr index_t NRepeat = 2;
using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 2, 4>;
using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmM = 2;
constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 4;
using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 2, 4>;
using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK1 = 4;
constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 4;
constexpr index_t GemmCThreadTransferDstScalarPerVector = 4;
#elif 1
// [M, N, K0, K1] = [128, 128, 4, 8] for fp16
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 128;
constexpr index_t GemmNPerBlock = 128;
constexpr index_t GemmKPerBlock = 4;
constexpr index_t GemmMPerWave = 32;
constexpr index_t GemmNPerWave = 32;
constexpr index_t GemmK1 = 8;
constexpr index_t MRepeat = 2;
constexpr index_t NRepeat = 2;
using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 2, 8>;
using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmM = 2;
constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 8;
using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 2, 8>;
using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK1 = 8;
constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8;
constexpr index_t GemmCThreadTransferDstScalarPerVector = 4;
#elif 1
// [M, N, K0, K1] = [256, 128, 4, 8] for fp16
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 256;
constexpr index_t GemmNPerBlock = 128;
constexpr index_t GemmKPerBlock = 4;
constexpr index_t GemmMPerWave = 32;
constexpr index_t GemmNPerWave = 32;
constexpr index_t GemmK1 = 8;
constexpr index_t MRepeat = 4;
constexpr index_t NRepeat = 2;
using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 4, 8>;
using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmM = 4;
constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 8;
using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 2, 8>;
using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK1 = 8;
constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8;
constexpr index_t GemmCThreadTransferDstScalarPerVector = 4;
#elif 1
// [M, N, K0, K1] = [128, 256, 4, 8] for fp16
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 128;
constexpr index_t GemmNPerBlock = 256;
constexpr index_t GemmKPerBlock = 4;
constexpr index_t GemmMPerWave = 32;
constexpr index_t GemmNPerWave = 32;
constexpr index_t GemmK1 = 8;
constexpr index_t MRepeat = 2;
constexpr index_t NRepeat = 4;
using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 2, 8>;
using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmM = 2;
constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 8;
using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 4, 8>;
using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK1 = 8;
constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8;
constexpr index_t GemmCThreadTransferDstScalarPerVector = 4;
#elif 0
// [M, N, K0, K1] = [256, 128, 4, 4]
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 256;
constexpr index_t GemmNPerBlock = 128;
constexpr index_t GemmKPerBlock = 4;
constexpr index_t GemmMPerWave = 32;
constexpr index_t GemmNPerWave = 32;
constexpr index_t GemmK1 = 4;
constexpr index_t MRepeat = 4;
constexpr index_t NRepeat = 2;
using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 4, 4>;
using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmM = 4;
constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 4;
using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 2, 4>;
using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK1 = 4;
constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 4;
constexpr index_t GemmCThreadTransferDstScalarPerVector = 4;
#endif
const auto descs =
transform_backward_data_convolution_into_gemm_v4r1_nhwc_kyxc_nhwk(wei_k_y_x_c_desc,
out_n_ho_wo_k_desc,
in_n_hi_wi_c_desc,
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads,
I0,
I0,
Number<GemmK1>{});
const auto wei_gemmk0_gemmm_gemmk1_grid_desc = descs[I0];
const auto out_gemmk0_gemmn_gemmk1_grid_desc = descs[I1];
const auto in_gemmm_gemmn_grid_desc = descs[I2];
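// Mapping of this backward-data GEMM: A = weight [GemmK0, GemmM, GemmK1],
// B = output [GemmK0, GemmN, GemmK1], C = input [GemmM, GemmN]. GemmK is split
// into GemmK0 * GemmK1 so that GemmK1 contiguous elements can be moved per
// vector access (see the *ScalarPerVector_GemmK1 parameters above).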
// HACK: hacks that control index calculation when iterating over the A, B, and C matrices
constexpr auto wei_gemmk0_gemmm_gemmk1_grid_step_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{}, // 0+: gemmk0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 1+: gemmm
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}), // 2+: gemmk1
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{}, // 0-: gemmk0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 1-: gemmm
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{})); // 2-: gemmk1
constexpr auto out_gemmk0_gemmn_gemmk1_grid_step_hacks = make_tuple(
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{}, // 0+: gemmk0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0>{}, // 1+: gemmn
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}), // 2+: gemmk1
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{}, // 0-: gemmk0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0>{}, // 1-: gemmn
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{})); // 2-: gemmk1
constexpr auto in_m0_m1_m2_n_grid_step_hacks = make_tuple(
make_tuple(
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 0+: MRepeat
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{}, // 1+: NRepeat
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 2+: MWaves
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{}, // 3+: NWaves
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 4+: M0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 5+: M1
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 6+: M2
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{}), // 7+: N1
make_tuple(
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 0-: MRepeat
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{}, // 1-: NRepeat
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 2-: MWaves
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{}, // 3-: NWaves
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 4-: M0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 5-: M1
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 6-: M2
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{})); // 7-: N1
constexpr auto wei_gemmk0_gemmm_gemmk1_grid_move_slice_window_step_hacks =
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{};
constexpr auto out_gemmk0_gemmn_gemmk1_grid_move_slice_window_step_hacks =
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0>{};
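// Reading of the hacks above (a sketch, not a normative description): each pair
// of tuples supplies per-dimension hints for stepping forward ("+") and backward
// ("-") through the transformed tensor, and the move-slice-window variants play
// the same role for advancing the K-loop slice window; all-zero sequences mean
// no special-case index handling is needed for that dimension.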
for(index_t i = 0; i < 5; ++i)
{
float ave_time = driver_gemm_xdlops_v2r3<
BlockSize,
TInWei,
TAcc,
TOut,
InMemoryDataOperationEnum_t::Set,
decltype(wei_gemmk0_gemmm_gemmk1_grid_desc),
decltype(out_gemmk0_gemmn_gemmk1_grid_desc),
decltype(in_gemmm_gemmn_grid_desc),
GemmMPerBlock,
GemmNPerBlock,
GemmKPerBlock,
GemmMPerWave,
GemmNPerWave,
GemmK1,
MRepeat,
NRepeat,
GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1,
GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1,
Sequence<2, 0, 1>, // ABlockTransferThreadClusterArrangeOrder
Sequence<0, 2, 1>, // ABlockTransferSrcAccessOrder
1,                 // ABlockTransferSrcVectorDim (GemmM)
GemmABlockTransferSrcScalarPerVector_GemmM,
GemmABlockTransferDstScalarPerVector_GemmK1,
false, // don't move back src coordinate after threadwise copy
GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1,
GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1,
Sequence<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
Sequence<1, 0, 2>, // BBlockTransferSrcAccessOrder
2,                 // BBlockTransferSrcVectorDim (GemmK1)
GemmBBlockTransferSrcScalarPerVector_GemmK1,
GemmBBlockTransferDstScalarPerVector_GemmK1,
false, // don't move back src coordinate after threadwise copy
Sequence<1, 3, 7, 0, 2, 4, 5, 6>, // CThreadTransferSrcDstAccessOrder
6,                                // CThreadTransferSrcDstVectorDim
GemmCThreadTransferDstScalarPerVector,
decltype(wei_gemmk0_gemmm_gemmk1_grid_step_hacks),
decltype(out_gemmk0_gemmn_gemmk1_grid_step_hacks),
decltype(in_m0_m1_m2_n_grid_step_hacks),
decltype(wei_gemmk0_gemmm_gemmk1_grid_move_slice_window_step_hacks),
decltype(out_gemmk0_gemmn_gemmk1_grid_move_slice_window_step_hacks),
false // CAccessOrderMRepeatNRepeat
>(static_cast<TInWei*>(wei_k_y_x_c_device_buf.GetDeviceBuffer()),
static_cast<TOut*>(out_n_ho_wo_k_device_buf.GetDeviceBuffer()),
static_cast<TInWei*>(in_n_hi_wi_c_device_buf.GetDeviceBuffer()),
wei_gemmk0_gemmm_gemmk1_grid_desc,
out_gemmk0_gemmn_gemmk1_grid_desc,
in_gemmm_gemmn_grid_desc,
wei_gemmk0_gemmm_gemmk1_grid_step_hacks,
out_gemmk0_gemmn_gemmk1_grid_step_hacks,
in_m0_m1_m2_n_grid_step_hacks,
wei_gemmk0_gemmm_gemmk1_grid_move_slice_window_step_hacks,
out_gemmk0_gemmn_gemmk1_grid_move_slice_window_step_hacks,
nrepeat);
{
const auto N = out_n_ho_wo_k_lengths[I0];
const auto K = out_n_ho_wo_k_lengths[I3];
const auto C = wei_k_y_x_c_lengths[I3];
const auto Ho = out_n_ho_wo_k_lengths[I1];
const auto Wo = out_n_ho_wo_k_lengths[I2];
const auto Y = wei_k_y_x_c_lengths[I1];
const auto X = wei_k_y_x_c_lengths[I2];
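// FLOP model: the convolution performs N * K * Ho * Wo * C * Y * X
// multiply-accumulates, i.e. twice that many flops. Dividing by 1e9 and by the
// average time in milliseconds gives Gflop/ms, which equals the TFlop/s printed
// below.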
float perf = static_cast<float>(std::size_t(2) * N * K * Ho * Wo * C * Y * X) /
(std::size_t(1000) * 1000 * 1000) / ave_time;
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
<< std::endl;
}
}
// copy result back to host
in_n_hi_wi_c_device_buf.FromDevice(in_n_hi_wi_c.mData.data());
}
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "transform_backward_data_convolution_into_gemm_v4r1r2_nhwc_kyxc_nhwk.hpp"
#include "driver_gemm_xdlops_v2r3.hpp"
template <typename TInWei,
typename TAcc,
typename TOut,
typename InLengths,
typename WeiLengths,
typename OutLengths,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void device_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk(
const InLengths& in_n_hi_wi_c_lengths,
const WeiLengths& wei_k_y_x_c_lengths,
const OutLengths& out_n_ho_wo_k_lengths,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads& in_right_pads,
Tensor<TInWei>& in_n_hi_wi_c,
const Tensor<TInWei>& wei_k_y_x_c,
const Tensor<TOut>& out_n_ho_wo_k,
ck::index_t nrepeat)
{
using namespace ck;
std::cout << __func__ << std::endl;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
DeviceMem in_n_hi_wi_c_device_buf(sizeof(TInWei) * in_n_hi_wi_c.mDesc.GetElementSpace());
DeviceMem wei_k_y_x_c_device_buf(sizeof(TInWei) * wei_k_y_x_c.mDesc.GetElementSpace());
DeviceMem out_n_ho_wo_k_device_buf(sizeof(TOut) * out_n_ho_wo_k.mDesc.GetElementSpace());
in_n_hi_wi_c_device_buf.ToDevice(in_n_hi_wi_c.mData.data());
wei_k_y_x_c_device_buf.ToDevice(wei_k_y_x_c.mData.data());
out_n_ho_wo_k_device_buf.ToDevice(out_n_ho_wo_k.mData.data());
const auto in_n_hi_wi_c_desc = make_naive_tensor_descriptor_packed(in_n_hi_wi_c_lengths);
const auto wei_k_y_x_c_desc = make_naive_tensor_descriptor_packed(wei_k_y_x_c_lengths);
const auto out_n_ho_wo_k_desc = make_naive_tensor_descriptor_packed(out_n_ho_wo_k_lengths);
#if 0
// [M, N, K0, K1] = [256, 128, 4, 4] for fp32
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 256;
constexpr index_t GemmNPerBlock = 128;
constexpr index_t GemmKPerBlock = 4;
constexpr index_t GemmMPerWave = 32;
constexpr index_t GemmNPerWave = 32;
constexpr index_t GemmK1 = 4;
constexpr index_t MRepeat = 4;
constexpr index_t NRepeat = 2;
using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 4, 4>;
using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 4;
constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 4;
using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 2, 4>;
using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN = 2;
constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 4;
constexpr index_t GemmCThreadTransferDstScalarPerVector = 1;
#elif 0
// [M, N, K0, K1] = [128, 128, 4, 4] for fp32
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 128;
constexpr index_t GemmNPerBlock = 128;
constexpr index_t GemmKPerBlock = 4;
constexpr index_t GemmMPerWave = 32;
constexpr index_t GemmNPerWave = 32;
constexpr index_t GemmK1 = 4;
constexpr index_t MRepeat = 2;
constexpr index_t NRepeat = 2;
using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 2, 4>;
using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 4;
constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 4;
using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 2, 4>;
using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN = 2;
constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 4;
constexpr index_t GemmCThreadTransferDstScalarPerVector = 1;
#elif 1
// [M, N, K0, K1] = [256, 128, 4, 8] for fp16
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 256;
constexpr index_t GemmNPerBlock = 128;
constexpr index_t GemmKPerBlock = 4;
constexpr index_t GemmMPerWave = 32;
constexpr index_t GemmNPerWave = 32;
constexpr index_t GemmK1 = 8;
constexpr index_t MRepeat = 4;
constexpr index_t NRepeat = 2;
using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 4, 8>;
using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 8;
constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 8;
using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 2, 8>;
using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN = 2;
constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8;
constexpr index_t GemmCThreadTransferDstScalarPerVector = 1;
#elif 1
// [M, N, K0, K1] = [128, 256, 4, 8] for fp16
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 128;
constexpr index_t GemmNPerBlock = 256;
constexpr index_t GemmKPerBlock = 4;
constexpr index_t GemmMPerWave = 32;
constexpr index_t GemmNPerWave = 32;
constexpr index_t GemmK1 = 8;
constexpr index_t MRepeat = 2;
constexpr index_t NRepeat = 4;
using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 2, 8>;
using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 8;
constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 8;
using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 4, 8>;
using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN = 4;
constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8;
constexpr index_t GemmCThreadTransferDstScalarPerVector = 1;
#endif
const auto descs =
transform_backward_data_convolution_into_gemm_v4r1r2_nhwc_kyxc_nhwk(out_n_ho_wo_k_desc,
wei_k_y_x_c_desc,
in_n_hi_wi_c_desc,
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads,
I0,
I0,
Number<GemmK1>{});
const auto out_gemmk0_gemmm_gemmk1_grid_desc = descs[I0];
const auto wei_gemmk0_gemmn_gemmk1_grid_desc = descs[I1];
const auto in_gemmm_gemmn_grid_desc = descs[I2];
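// Note the role swap relative to the v4r1 kernel above: here A = output
// [GemmK0, GemmM, GemmK1] and B = weight [GemmK0, GemmN, GemmK1], with
// C = input [GemmM, GemmN]; accordingly the B-side vector dimension below is
// GemmN (SrcVectorDim 1, GemmBBlockTransferSrcScalarPerVector_GemmN).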
// HACK: hacks that control index calculation when iterating over the A, B, and C matrices
constexpr auto out_gemmk0_gemmm_gemmk1_grid_step_hacks = make_tuple(
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{}, // 0+: gemmk0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0>{}, // 1+: gemmm
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}), // 2+: gemmk1
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{}, // 0-: gemmk0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0>{}, // 1-: gemmm
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{})); // 2-: gemmk1
constexpr auto wei_gemmk0_gemmn_gemmk1_grid_step_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{}, // 0+: gemmk0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 1+: gemmn
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}), // 2+: gemmk1
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{}, // 0-: gemmk0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 1-: gemmn
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{})); // 2-: gemmk1
constexpr auto in_m0_m1_m2_n_grid_step_hacks = make_tuple(
make_tuple(
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}, // 0+: MRepeat
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 1+: NRepeat
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}, // 2+: MWaves
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 3+: NWaves
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}, // 4+: M0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}, // 5+: M1
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}, // 6+: M2
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}), // 7+: N1
make_tuple(
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}, // 0-: MRepeat
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 1-: NRepeat
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}, // 2-: MWaves
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 3-: NWaves
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}, // 4-: M0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}, // 5-: M1
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}, // 6-: M2
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{})); // 7-: N1
constexpr auto out_gemmk0_gemmm_gemmk1_grid_move_slice_window_step_hacks =
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0>{};
constexpr auto wei_gemmk0_gemmn_gemmk1_grid_move_slice_window_step_hacks =
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{};
for(index_t i = 0; i < 5; ++i)
{
float ave_time = driver_gemm_xdlops_v2r3<
BlockSize,
TInWei,
TAcc,
TOut,
InMemoryDataOperationEnum_t::Set,
decltype(out_gemmk0_gemmm_gemmk1_grid_desc),
decltype(wei_gemmk0_gemmn_gemmk1_grid_desc),
decltype(in_gemmm_gemmn_grid_desc),
GemmMPerBlock,
GemmNPerBlock,
GemmKPerBlock,
GemmMPerWave,
GemmNPerWave,
GemmK1,
MRepeat,
NRepeat,
GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1,
GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1,
Sequence<1, 0, 2>,
Sequence<1, 0, 2>,
2,
GemmABlockTransferSrcScalarPerVector_GemmK1,
GemmABlockTransferDstScalarPerVector_GemmK1,
false, // don't move back src coordinate after threadwise copy
GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1,
GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1,
Sequence<2, 0, 1>,
Sequence<0, 2, 1>,
1,
GemmBBlockTransferSrcScalarPerVector_GemmN,
GemmBBlockTransferDstScalarPerVector_GemmK1,
false, // don't move back src coordinate after threadwise copy
#if 0
Sequence<0, 2, 4, 5, 6, 1, 3, 7>,
#else
Sequence<0, 1, 2, 3, 4, 5, 6, 7>,
#endif
7,
GemmCThreadTransferDstScalarPerVector,
decltype(out_gemmk0_gemmm_gemmk1_grid_step_hacks),
decltype(wei_gemmk0_gemmn_gemmk1_grid_step_hacks),
decltype(in_m0_m1_m2_n_grid_step_hacks),
decltype(out_gemmk0_gemmm_gemmk1_grid_move_slice_window_step_hacks),
decltype(wei_gemmk0_gemmn_gemmk1_grid_move_slice_window_step_hacks),
true // CAccessOrderMRepeatNRepeat
>(static_cast<TOut*>(out_n_ho_wo_k_device_buf.GetDeviceBuffer()),
static_cast<TInWei*>(wei_k_y_x_c_device_buf.GetDeviceBuffer()),
static_cast<TInWei*>(in_n_hi_wi_c_device_buf.GetDeviceBuffer()),
out_gemmk0_gemmm_gemmk1_grid_desc,
wei_gemmk0_gemmn_gemmk1_grid_desc,
in_gemmm_gemmn_grid_desc,
out_gemmk0_gemmm_gemmk1_grid_step_hacks,
wei_gemmk0_gemmn_gemmk1_grid_step_hacks,
in_m0_m1_m2_n_grid_step_hacks,
out_gemmk0_gemmm_gemmk1_grid_move_slice_window_step_hacks,
wei_gemmk0_gemmn_gemmk1_grid_move_slice_window_step_hacks,
nrepeat);
{
const auto N = out_n_ho_wo_k_lengths[I0];
const auto K = out_n_ho_wo_k_lengths[I3];
const auto C = wei_k_y_x_c_lengths[I3];
const auto Ho = out_n_ho_wo_k_lengths[I1];
const auto Wo = out_n_ho_wo_k_lengths[I2];
const auto Y = wei_k_y_x_c_lengths[I1];
const auto X = wei_k_y_x_c_lengths[I2];
float perf = static_cast<float>(std::size_t(2) * N * K * Ho * Wo * C * Y * X) /
(std::size_t(1000) * 1000 * 1000) / ave_time;
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
<< std::endl;
}
}
// copy result back to host
in_n_hi_wi_c_device_buf.FromDevice(in_n_hi_wi_c.mData.data());
}
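// Hypothetical call shape for the function above (illustration only; the tensor
// objects and the fp16/fp32 type choices are assumptions, not taken from this
// file):
//
//   device_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk<
//       ck::half_t, float, ck::half_t>(in_lengths,    // N, Hi, Wi, C
//                                      wei_lengths,   // K, Y, X, C
//                                      out_lengths,   // N, Ho, Wo, K
//                                      conv_strides,
//                                      conv_dilations,
//                                      in_left_pads,
//                                      in_right_pads,
//                                      in,   // written: back-propagated input gradient
//                                      wei,  // read
//                                      out,  // read
//                                      nrepeat);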
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw.hpp"
#include "driver_gemm_dlops_v1r2.hpp"
template <typename TInWei,
typename TAcc,
typename TOut,
typename InLengths,
typename WeiLengths,
typename OutLengths,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void device_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw(
const InLengths& in_n_c_hi_wi_lengths,
const WeiLengths& wei_k_c_y_x_lengths,
const OutLengths& out_n_k_ho_wo_lengths,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads& in_right_pads,
const Tensor<TInWei>& in_n_c_hi_wi,
const Tensor<TInWei>& wei_k_c_y_x,
Tensor<TOut>& out_n_k_ho_wo,
ck::index_t nrepeat)
{
using namespace ck;
std::cout << __func__ << std::endl;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
DeviceMem in_n_c_hi_wi_device_buf(sizeof(TInWei) * in_n_c_hi_wi.mDesc.GetElementSpace());
DeviceMem wei_k_c_y_x_device_buf(sizeof(TInWei) * wei_k_c_y_x.mDesc.GetElementSpace());
DeviceMem out_n_k_ho_wo_device_buf(sizeof(TOut) * out_n_k_ho_wo.mDesc.GetElementSpace());
in_n_c_hi_wi_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
wei_k_c_y_x_device_buf.ToDevice(wei_k_c_y_x.mData.data());
out_n_k_ho_wo_device_buf.ToDevice(out_n_k_ho_wo.mData.data());
const auto in_n_c_hi_wi_desc = make_naive_tensor_descriptor_packed(in_n_c_hi_wi_lengths);
const auto wei_k_c_y_x_desc = make_naive_tensor_descriptor_packed(wei_k_c_y_x_lengths);
const auto out_n_k_ho_wo_desc = make_naive_tensor_descriptor_packed(out_n_k_ho_wo_lengths);
#if 1
// cdata = 64, BlockSize = 256, 128x128x8
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlockM1 = 128;
constexpr index_t GemmNPerBlockN1 = 128;
constexpr index_t GemmKPerBlock = 8;
constexpr index_t GemmM1PerThreadM111 = 4;
constexpr index_t GemmN1PerThreadN111 = 4;
constexpr index_t GemmKPerThread = 1;
constexpr index_t GemmM11N11ThreadClusterM1100 = 8;
constexpr index_t GemmM11N11ThreadClusterN1100 = 8;
constexpr index_t GemmM11N11ThreadClusterM1101 = 2;
constexpr index_t GemmM11N11ThreadClusterN1101 = 2;
using GemmABlockTransferThreadSliceLengths_K_M0_M1 = Sequence<4, 1, 1>;
using GemmABlockTransferThreadClusterLengths_K_M0_M1 = Sequence<2, 1, 128>;
constexpr index_t GemmABlockTransferSrcScalarPerVector_K = 4;
constexpr index_t GemmABlockTransferDstScalarPerVector_M1 = 1;
using GemmBBlockTransferThreadSliceLengths_K_N0_N1 = Sequence<4, 1, 1>;
using GemmBBlockTransferThreadClusterLengths_K_N0_N1 = Sequence<2, 1, 128>;
constexpr index_t GemmBBlockTransferSrcScalarPerVector_N1 = 1;
constexpr index_t GemmBBlockTransferDstScalarPerVector_N1 = 1;
constexpr index_t GemmCThreadTransferDstScalarPerVector_N11 = 1;
#endif
const auto descs =
transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw_pad(wei_k_c_y_x_desc,
in_n_c_hi_wi_desc,
out_n_k_ho_wo_desc,
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads);
// HACK: hacks that control index calculation when iterating over the A, B, and C matrices
constexpr auto wei_gemmk_gemmm0_gemmm1_grid_step_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}));
constexpr auto in_gemmk_gemmn0_gemmn1_grid_step_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{}));
constexpr auto out_gemmm0_gemmm10_gemmm11_gemmn0_gemmn10_gemmn11_grid_step_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{}));
constexpr auto wei_gemmk_gemmm0_gemmm1_grid_move_slice_window_step_hacks =
Sequence<0, 0, 0, 0, 0>{};
constexpr auto in_gemmk_gemmn0_gemmn1_grid_move_slice_window_step_hacks =
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0>{};
const auto wei_gemmk_gemmm_grid_desc = descs[I0];
const auto in_gemmk_gemmn_grid_desc = descs[I1];
const auto out_gemmm_gemmn_grid_desc = descs[I2];
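// Implicit-GEMM view of this forward NCHW convolution (stated as an aid; the
// authoritative mapping lives in the transform header): A = weight
// [GemmK, GemmM], B = input [GemmK, GemmN], C = output [GemmM, GemmN], with
// GemmM = K, GemmN = N * Ho * Wo, and GemmK = C * Y * X.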
for(index_t i = 0; i < 5; ++i)
{
float ave_time = driver_gemm_dlops_v1r2<
BlockSize,
TInWei,
TAcc,
TOut,
InMemoryDataOperationEnum_t::Set,
decltype(wei_gemmk_gemmm_grid_desc),
decltype(in_gemmk_gemmn_grid_desc),
decltype(out_gemmm_gemmn_grid_desc),
GemmMPerBlockM1,
GemmNPerBlockN1,
GemmKPerBlock,
GemmM1PerThreadM111,
GemmN1PerThreadN111,
GemmKPerThread,
GemmM11N11ThreadClusterM1100,
GemmM11N11ThreadClusterN1100,
GemmM11N11ThreadClusterM1101,
GemmM11N11ThreadClusterN1101,
GemmABlockTransferThreadSliceLengths_K_M0_M1,
GemmABlockTransferThreadClusterLengths_K_M0_M1,
Sequence<2, 1, 0>, // ABlockTransferThreadClusterArrangeOrder
Sequence<2, 1, 0>, // ABlockTransferSrcAccessOrder
0, // ABlockTransferSrcVectorDim
GemmABlockTransferSrcScalarPerVector_K,
GemmABlockTransferDstScalarPerVector_M1,
false, // don't move back src coordinate after threadwise copy
GemmBBlockTransferThreadSliceLengths_K_N0_N1,
GemmBBlockTransferThreadClusterLengths_K_N0_N1,
Sequence<0, 1, 2>, // BBlockTransferThreadClusterArrangeOrder
Sequence<0, 1, 2>, // BBlockTransferSrcAccessOrder
2, // BBlockTransferSrcVectorDim
GemmBBlockTransferSrcScalarPerVector_N1,
GemmBBlockTransferDstScalarPerVector_N1,
false, // don't move back src coordinate after threadwise copy
Sequence<3, 4, 5, 0, 1, 2>, // CThreadTransferSrcDstAccessOrder
5, // CThreadTransferSrcDstVectorDim
GemmCThreadTransferDstScalarPerVector_N11,
decltype(wei_gemmk_gemmm0_gemmm1_grid_step_hacks),
decltype(in_gemmk_gemmn0_gemmn1_grid_step_hacks),
decltype(out_gemmm0_gemmm10_gemmm11_gemmn0_gemmn10_gemmn11_grid_step_hacks),
decltype(wei_gemmk_gemmm0_gemmm1_grid_move_slice_window_step_hacks),
decltype(in_gemmk_gemmn0_gemmn1_grid_move_slice_window_step_hacks)>(
static_cast<TInWei*>(wei_k_c_y_x_device_buf.GetDeviceBuffer()),
static_cast<TInWei*>(in_n_c_hi_wi_device_buf.GetDeviceBuffer()),
static_cast<TOut*>(out_n_k_ho_wo_device_buf.GetDeviceBuffer()),
wei_gemmk_gemmm_grid_desc,
in_gemmk_gemmn_grid_desc,
out_gemmm_gemmn_grid_desc,
wei_gemmk_gemmm0_gemmm1_grid_step_hacks,
in_gemmk_gemmn0_gemmn1_grid_step_hacks,
out_gemmm0_gemmm10_gemmm11_gemmn0_gemmn10_gemmn11_grid_step_hacks,
wei_gemmk_gemmm0_gemmm1_grid_move_slice_window_step_hacks,
in_gemmk_gemmn0_gemmn1_grid_move_slice_window_step_hacks,
nrepeat);
float perf = static_cast<float>(calculate_convolution_flops(
in_n_c_hi_wi_desc, wei_k_c_y_x_desc, out_n_k_ho_wo_desc)) /
(std::size_t(1000) * 1000 * 1000) / ave_time;
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" << std::endl;
}
// copy result back to host
out_n_k_ho_wo_device_buf.FromDevice(out_n_k_ho_wo.mData.data());
}
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "driver_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp"
template <typename TInWei,
typename TAcc,
typename TOut,
typename InLengths,
typename WeiLengths,
typename OutLengths,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void device_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw(
const InLengths& in_n_c_hi_wi_lengths,
const WeiLengths& wei_k_c_y_x_lengths,
const OutLengths& out_n_k_ho_wo_lengths,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads& in_right_pads,
const Tensor<TInWei>& in_n_c_hi_wi,
const Tensor<TInWei>& wei_k_c_y_x,
Tensor<TOut>& out_n_k_ho_wo,
ck::index_t nrepeat)
{
using namespace ck;
std::cout << __func__ << std::endl;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto I4 = Number<4>{};
constexpr auto I5 = Number<5>{};
constexpr auto I6 = Number<6>{};
constexpr auto I7 = Number<7>{};
constexpr auto I8 = Number<8>{};
DeviceMem in_n_c_hi_wi_device_buf(sizeof(TInWei) * in_n_c_hi_wi.mDesc.GetElementSpace());
DeviceMem wei_k_c_y_x_device_buf(sizeof(TInWei) * wei_k_c_y_x.mDesc.GetElementSpace());
DeviceMem out_n_k_ho_wo_device_buf(sizeof(TOut) * out_n_k_ho_wo.mDesc.GetElementSpace());
in_n_c_hi_wi_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
wei_k_c_y_x_device_buf.ToDevice(wei_k_c_y_x.mData.data());
out_n_k_ho_wo_device_buf.ToDevice(out_n_k_ho_wo.mData.data());
const auto in_n_c_hi_wi_desc = make_naive_tensor_descriptor_packed(in_n_c_hi_wi_lengths);
const auto wei_k_c_y_x_desc = make_naive_tensor_descriptor_packed(wei_k_c_y_x_lengths);
const auto out_n_k_ho_wo_desc = make_naive_tensor_descriptor_packed(out_n_k_ho_wo_lengths);
#if 0
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 128;
constexpr index_t GemmNPerBlock = 128;
constexpr index_t GemmKPerBlock = 4;
constexpr index_t GemmMPerWave = 64;
constexpr index_t GemmNPerWave = 64;
constexpr index_t GemmKPack = 8;
constexpr index_t MRepeat = 1;
constexpr index_t NRepeat = 1;
using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 2, 8>;
using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK = 8;
constexpr index_t GemmABlockTransferDstScalarPerVector_KPack = 8;
using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 4, 4>;
using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 32, 2>;
constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN = 4;
constexpr index_t GemmBBlockTransferDstScalarPerVector_KPack = 4;
constexpr index_t GemmCThreadTransferDstScalarPerVector_GemmN1 = 1;
#elif 0
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 256;
constexpr index_t GemmNPerBlock = 128;
constexpr index_t GemmKPerBlock = 4;
constexpr index_t GemmMPerWave = 64;
constexpr index_t GemmNPerWave = 64;
constexpr index_t GemmKPack = 8;
constexpr index_t MRepeat = 2;
constexpr index_t NRepeat = 1;
using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 4, 8>;
using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK = 8;
constexpr index_t GemmABlockTransferDstScalarPerVector_KPack = 8;
using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 4, 4>;
using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 32, 2>;
constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN = 4;
constexpr index_t GemmBBlockTransferDstScalarPerVector_KPack = 4;
constexpr index_t GemmCThreadTransferDstScalarPerVector_GemmN1 = 1;
#elif 0
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 256;
constexpr index_t GemmNPerBlock = 128;
constexpr index_t GemmKPerBlock = 4;
constexpr index_t GemmMPerWave = 64;
constexpr index_t GemmNPerWave = 64;
constexpr index_t GemmKPack = 8;
constexpr index_t MRepeat = 2;
constexpr index_t NRepeat = 1;
using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 4, 8>;
using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK = 8;
constexpr index_t GemmABlockTransferDstScalarPerVector_KPack = 8;
using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 4, 4>;
using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 32, 2>;
constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN = 1;
constexpr index_t GemmBBlockTransferDstScalarPerVector_KPack = 4;
constexpr index_t GemmCThreadTransferDstScalarPerVector_GemmN1 = 1;
#elif 1
// [M, N, K0, K1] = [256, 128, 4, 4]
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 256;
constexpr index_t GemmNPerBlock = 128;
constexpr index_t GemmKPerBlock = 4;
constexpr index_t GemmMPerWave = 64;
constexpr index_t GemmNPerWave = 64;
constexpr index_t GemmKPack = 4;
constexpr index_t MRepeat = 2;
constexpr index_t NRepeat = 1;
using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 4, 4>;
using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK = 4;
constexpr index_t GemmABlockTransferDstScalarPerVector_KPack = 4;
using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 2, 4>;
using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN = 1;
constexpr index_t GemmBBlockTransferDstScalarPerVector_KPack = 4;
constexpr index_t GemmCThreadTransferDstScalarPerVector_GemmN1 = 1;
#elif 1
// [M, N, K0, K1] = [128, 128, 4, 4]
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 128;
constexpr index_t GemmNPerBlock = 128;
constexpr index_t GemmKPerBlock = 4;
constexpr index_t GemmMPerWave = 64;
constexpr index_t GemmNPerWave = 64;
constexpr index_t GemmKPack = 4;
constexpr index_t MRepeat = 1;
constexpr index_t NRepeat = 1;
using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 2, 4>;
using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK = 4;
constexpr index_t GemmABlockTransferDstScalarPerVector_KPack = 4;
using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 2, 4>;
using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN = 1;
constexpr index_t GemmBBlockTransferDstScalarPerVector_KPack = 4;
constexpr index_t GemmCThreadTransferDstScalarPerVector_GemmN1 = 1;
#endif
const auto descs =
#if 1
transform_forward_convolution_into_gemm_v4r4_xdlops_nchw_kcyx_nkhw_pad
#else
transform_forward_convolution_into_gemm_v4r4_xdlops_nchw_kcyx_nkhw_1x1
#endif
<TInWei, GemmMPerBlock, GemmNPerBlock, GemmMPerWave, GemmNPerWave, GemmKPack>(
wei_k_c_y_x_desc,
in_n_c_hi_wi_desc,
out_n_k_ho_wo_desc,
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads);
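// Unlike the other transforms in this file, this one returns a 9-element tuple;
// all entries descs[I0] through descs[I8] are forwarded to the launcher below
// unchanged. Presumably the grid descriptors and their step-hack types travel
// together here, with the exact split defined in the transform header rather
// than in this file.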
for(index_t i = 0; i < 5; ++i)
{
#if 0
float ave_time = launch_kernel_gemm_xdlops_v1
#else
float ave_time = launch_kernel_gemm_xdlops_v2
#endif
<BlockSize,
TInWei,
TAcc,
TOut,
InMemoryDataOperationEnum_t::Set,
decltype(descs[I0]),
decltype(descs[I1]),
decltype(descs[I2]),
decltype(descs[I3]),
GemmMPerBlock,
GemmNPerBlock,
GemmKPerBlock,
GemmMPerWave,
GemmNPerWave,
GemmKPack,
MRepeat,
NRepeat,
GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1,
GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1,
Sequence<1, 0, 2>,
Sequence<1, 0, 2>,
2,
GemmABlockTransferSrcScalarPerVector_GemmK,
GemmABlockTransferDstScalarPerVector_KPack,
false, // don't move back src coordinate after threadwise copy
GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1,
GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1,
Sequence<0, 2, 1>,
Sequence<1, 0, 2>,
1,
GemmBBlockTransferSrcScalarPerVector_GemmN,
GemmBBlockTransferDstScalarPerVector_KPack,
false, // don't move back src coordinate after threadwise copy, which will be fused
// with MoveSrcSliceWindow() to save addr computation
Sequence<2, 3, 0, 1>,
3,
GemmCThreadTransferDstScalarPerVector_GemmN1,
decltype(descs[I4]),
decltype(descs[I5]),
decltype(descs[I6]),
decltype(descs[I7]),
decltype(descs[I8])>(static_cast<TInWei*>(wei_k_c_y_x_device_buf.GetDeviceBuffer()),
static_cast<TInWei*>(in_n_c_hi_wi_device_buf.GetDeviceBuffer()),
static_cast<TOut*>(out_n_k_ho_wo_device_buf.GetDeviceBuffer()),
descs[I0],
descs[I1],
descs[I2],
descs[I3],
descs[I4],
descs[I5],
descs[I6],
descs[I7],
descs[I8],
nrepeat);
float perf = static_cast<float>(calculate_convolution_flops(
in_n_c_hi_wi_desc, wei_k_c_y_x_desc, out_n_k_ho_wo_desc)) /
(std::size_t(1000) * 1000 * 1000) / ave_time;
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" << std::endl;
}
// copy result back to host
out_n_k_ho_wo_device_buf.FromDevice(out_n_k_ho_wo.mData.data());
}
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp"
#include "driver_gemm_dlops_v1r3.hpp"
template <typename TInWei,
typename TAcc,
typename TOut,
typename InLengths,
typename WeiLengths,
typename OutLengths,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void device_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk(
const InLengths& in_n_hi_wi_c_lengths,
const WeiLengths& wei_k_y_x_c_lengths,
const OutLengths& out_n_ho_wo_k_lengths,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads& in_right_pads,
const Tensor<TInWei>& in_n_hi_wi_c,
const Tensor<TInWei>& wei_k_y_x_c,
Tensor<TOut>& out_n_ho_wo_k,
ck::index_t nrepeat)
{
using namespace ck;
std::cout << __func__ << std::endl;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
DeviceMem in_n_hi_wi_c_device_buf(sizeof(TInWei) * in_n_hi_wi_c.mDesc.GetElementSpace());
DeviceMem wei_k_y_x_c_device_buf(sizeof(TInWei) * wei_k_y_x_c.mDesc.GetElementSpace());
DeviceMem out_n_ho_wo_k_device_buf(sizeof(TOut) * out_n_ho_wo_k.mDesc.GetElementSpace());
in_n_hi_wi_c_device_buf.ToDevice(in_n_hi_wi_c.mData.data());
wei_k_y_x_c_device_buf.ToDevice(wei_k_y_x_c.mData.data());
out_n_ho_wo_k_device_buf.ToDevice(out_n_ho_wo_k.mData.data());
const auto in_n_hi_wi_c_desc = make_naive_tensor_descriptor_packed(in_n_hi_wi_c_lengths);
const auto wei_k_y_x_c_desc = make_naive_tensor_descriptor_packed(wei_k_y_x_c_lengths);
const auto out_n_ho_wo_k_desc = make_naive_tensor_descriptor_packed(out_n_ho_wo_k_lengths);
#if 0
// [M, N, K0, K1] = [128, 128, 8, 1] for fp32
// cdata = 64, BlockSize = 256
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlockM1 = 128;
constexpr index_t GemmNPerBlockN1 = 128;
constexpr index_t GemmKPerBlock = 8;
constexpr index_t GemmK1 = 1;
constexpr index_t GemmM1PerThreadM111 = 4;
constexpr index_t GemmN1PerThreadN111 = 4;
constexpr index_t GemmKPerThread = 1;
using GemmM11N11ThreadClusterM110Xs = Sequence<8, 2>;
using GemmM11N11ThreadClusterN110Xs = Sequence<8, 2>;
using GemmABlockTransferThreadSliceLengths_K0_M0_M1_K1 = Sequence<4, 1, 1, 1>;
using GemmABlockTransferThreadClusterLengths_K0_M0_M1_K1 = Sequence<2, 1, 128, 1>;
using GemmABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1 = Sequence<4, 1, 1, 1>;
using GemmABlockTransferDstVectorTensorLengths_K0_M0_M1_K1 = Sequence<1, 1, 1, 1>;
using GemmBBlockTransferThreadSliceLengths_K0_N0_N1_K1 = Sequence<4, 1, 1, 1>;
using GemmBBlockTransferThreadClusterLengths_K0_N0_N1_K1 = Sequence<2, 1, 128, 1>;
using GemmBBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1 = Sequence<4, 1, 1, 1>;
using GemmBBlockTransferDstVectorTensorLengths_K0_N0_N1_K1 = Sequence<1, 1, 1, 1>;
constexpr index_t GemmCThreadTransferDstScalarPerVector_N11 = 4;
#elif 1
// [M, N, K0, K1] = [128, 128, 8, 2] for fp16
// cdata = 64, BlockSize = 256
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlockM1 = 128;
constexpr index_t GemmNPerBlockN1 = 128;
constexpr index_t GemmKPerBlock = 8;
constexpr index_t GemmK1 = 2;
constexpr index_t GemmM1PerThreadM111 = 4;
constexpr index_t GemmN1PerThreadN111 = 4;
constexpr index_t GemmKPerThread = 1;
using GemmM11N11ThreadClusterM110Xs = Sequence<8, 2>;
using GemmM11N11ThreadClusterN110Xs = Sequence<8, 2>;
using GemmABlockTransferThreadSliceLengths_K0_M0_M1_K1 = Sequence<4, 1, 1, 2>;
using GemmABlockTransferThreadClusterLengths_K0_M0_M1_K1 = Sequence<2, 1, 128, 1>;
using GemmABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1 = Sequence<4, 1, 1, 2>;
using GemmABlockTransferDstVectorTensorLengths_K0_M0_M1_K1 = Sequence<1, 1, 1, 2>;
using GemmBBlockTransferThreadSliceLengths_K0_N0_N1_K1 = Sequence<4, 1, 1, 2>;
using GemmBBlockTransferThreadClusterLengths_K0_N0_N1_K1 = Sequence<2, 1, 128, 1>;
using GemmBBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1 = Sequence<4, 1, 1, 2>;
using GemmBBlockTransferDstVectorTensorLengths_K0_N0_N1_K1 = Sequence<1, 1, 1, 2>;
constexpr index_t GemmCThreadTransferDstScalarPerVector_N11 = 4;
#elif 1
// [M, N, K0, K1] = [128, 128, 8, 4] for i8
// cdata = 64, BlockSize = 256
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlockM1 = 128;
constexpr index_t GemmNPerBlockN1 = 128;
constexpr index_t GemmKPerBlock = 8;
constexpr index_t GemmK1 = 4;
constexpr index_t GemmM1PerThreadM111 = 4;
constexpr index_t GemmN1PerThreadN111 = 4;
constexpr index_t GemmKPerThread = 1;
using GemmM11N11ThreadClusterM110Xs = Sequence<8, 2>;
using GemmM11N11ThreadClusterN110Xs = Sequence<8, 2>;
using GemmABlockTransferThreadSliceLengths_K0_M0_M1_K1 = Sequence<4, 1, 1, 4>;
using GemmABlockTransferThreadClusterLengths_K0_M0_M1_K1 = Sequence<2, 1, 128, 1>;
using GemmABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1 = Sequence<4, 1, 1, 4>;
using GemmABlockTransferDstVectorTensorLengths_K0_M0_M1_K1 = Sequence<1, 1, 1, 4>;
using GemmBBlockTransferThreadSliceLengths_K0_N0_N1_K1 = Sequence<4, 1, 1, 4>;
using GemmBBlockTransferThreadClusterLengths_K0_N0_N1_K1 = Sequence<2, 1, 128, 1>;
using GemmBBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1 = Sequence<4, 1, 1, 4>;
using GemmBBlockTransferDstVectorTensorLengths_K0_N0_N1_K1 = Sequence<1, 1, 1, 4>;
constexpr index_t GemmCThreadTransferDstScalarPerVector_N11 = 4;
#endif
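// Across these presets GemmK1 tracks the element type (1 for fp32, 2 for fp16,
// 4 for i8), so a GemmK1-wide access is 4 bytes in every case; the
// *VectorTensorLengths_..._K1 sequences widen their K1 component to match.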
const auto descs =
transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk_pad(in_n_hi_wi_c_desc,
wei_k_y_x_c_desc,
out_n_ho_wo_k_desc,
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads,
Number<GemmK1>{});
const auto in_gemmk0_gemmm_gemmk1_grid_desc = descs[I0];
const auto wei_gemmk0_gemmn_gemmk1_grid_desc = descs[I1];
const auto out_gemmm_gemmn_grid_desc = descs[I2];
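// NHWC forward mapping with the input as the A matrix: GemmM corresponds to
// N * Ho * Wo and GemmN to K, which is why in_ carries gemmm and wei_ carries
// gemmn in the descriptor names above (an interpretation of the names, not a
// statement from this file).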
// HACK: hacks that control index calculation when iterating over the A, B, and C matrices
constexpr auto in_gemmk0_gemmm0_gemmm1_gemmk1_grid_step_hacks = make_tuple(
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0>{}, // 0+: GemmK0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0>{}, // 1+: GemmM0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0>{}, // 2+: GemmM1
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0>{}), // 3+: GemmK1
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0>{}, // 0-: GemmK0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0>{}, // 1-: GemmM0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0>{}, // 2-: GemmM1
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0>{})); // 3-: GemmK1
constexpr auto wei_gemmk0_gemmn0_gemmn1_gemmk1_grid_step_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0>{}, // 0+: GemmK0
Sequence<0, 0, 0, 0, 0, 0, 0, 0>{}, // 1+: GemmN0
Sequence<0, 0, 0, 0, 0, 0, 0, 0>{}, // 2+: GemmN1
Sequence<0, 0, 0, 0, 0, 0, 0, 0>{}), // 3+: GemmK1
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0>{}, // 0-: GemmK0
Sequence<0, 0, 0, 0, 0, 0, 0, 0>{}, // 1-: GemmN0
Sequence<0, 0, 0, 0, 0, 0, 0, 0>{}, // 2-: GemmN1
Sequence<0, 0, 0, 0, 0, 0, 0, 0>{})); // 3-: GemmK1
constexpr auto out_gemmm0_gemmm10_gemmm11_gemmn0_gemmn10_gemmn11_grid_step_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{}, // 0+: GemmM0
Sequence<0, 0, 0, 0, 0>{}, // 1+: GemmM10
Sequence<0, 0, 0, 0, 0>{}, // 2+: GemmM11
Sequence<0, 0, 0, 0, 0>{}, // 3+: GemmN0
Sequence<0, 0, 0, 0, 0>{}, // 4+: GemmN10
Sequence<0, 0, 0, 0, 0>{}), // 5+: GemmN11
make_tuple(Sequence<0, 0, 0, 0, 0>{}, // 0-: GemmM0
Sequence<0, 0, 0, 0, 0>{}, // 1-: GemmM10
Sequence<0, 0, 0, 0, 0>{}, // 2-: GemmM11
Sequence<0, 0, 0, 0, 0>{}, // 3-: GemmN0
Sequence<0, 0, 0, 0, 0>{}, // 4-: GemmN10
Sequence<0, 0, 0, 0, 0>{})); // 5-: GemmN11
constexpr auto in_gemmk0_gemmm0_gemmm1_gemmk1_grid_move_slice_window_step_hacks =
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0>{};
constexpr auto wei_gemmk0_gemmn0_gemmn1_gemmk1_grid_move_slice_window_step_hacks =
Sequence<0, 0, 0, 0, 0, 0, 0, 0>{};
for(index_t i = 0; i < 5; ++i)
{
float ave_time = driver_gemm_dlops_v1r3<
BlockSize,
TInWei,
TAcc,
TOut,
InMemoryDataOperationEnum_t::Set,
decltype(in_gemmk0_gemmm_gemmk1_grid_desc),
decltype(wei_gemmk0_gemmn_gemmk1_grid_desc),
decltype(out_gemmm_gemmn_grid_desc),
GemmMPerBlockM1,
GemmNPerBlockN1,
GemmKPerBlock,
GemmM1PerThreadM111,
GemmN1PerThreadN111,
GemmKPerThread,
GemmM11N11ThreadClusterM110Xs,
GemmM11N11ThreadClusterN110Xs,
GemmABlockTransferThreadSliceLengths_K0_M0_M1_K1,
GemmABlockTransferThreadClusterLengths_K0_M0_M1_K1,
Sequence<1, 2, 0, 3>, // ABlockTransferThreadClusterArrangeOrder
Sequence<1, 2, 0, 3>, // ABlockTransferSrcAccessOrder
GemmABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1,
Sequence<1, 2, 0, 3>, // ABlockTransferSrcVectorTensorContiguousDimOrder
GemmABlockTransferDstVectorTensorLengths_K0_M0_M1_K1,
GemmBBlockTransferThreadSliceLengths_K0_N0_N1_K1,
GemmBBlockTransferThreadClusterLengths_K0_N0_N1_K1,
Sequence<1, 2, 0, 3>, // BBlockTransferThreadClusterArrangeOrder
Sequence<1, 2, 0, 3>, // BBlockTransferSrcAccessOrder
GemmBBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1,
Sequence<1, 2, 0, 3>, // BBlockTransferSrcVectorTensorContiguousDimOrder
GemmBBlockTransferDstVectorTensorLengths_K0_N0_N1_K1,
Sequence<0, 1, 2, 3, 4, 5>, // CThreadTransferSrcDstAccessOrder
5, // CThreadTransferSrcDstVectorDim
GemmCThreadTransferDstScalarPerVector_N11,
decltype(in_gemmk0_gemmm0_gemmm1_gemmk1_grid_step_hacks),
decltype(wei_gemmk0_gemmn0_gemmn1_gemmk1_grid_step_hacks),
decltype(out_gemmm0_gemmm10_gemmm11_gemmn0_gemmn10_gemmn11_grid_step_hacks),
decltype(in_gemmk0_gemmm0_gemmm1_gemmk1_grid_move_slice_window_step_hacks),
decltype(wei_gemmk0_gemmn0_gemmn1_gemmk1_grid_move_slice_window_step_hacks)>(
static_cast<TInWei*>(in_n_hi_wi_c_device_buf.GetDeviceBuffer()),
static_cast<TInWei*>(wei_k_y_x_c_device_buf.GetDeviceBuffer()),
static_cast<TOut*>(out_n_ho_wo_k_device_buf.GetDeviceBuffer()),
in_gemmk0_gemmm_gemmk1_grid_desc,
wei_gemmk0_gemmn_gemmk1_grid_desc,
out_gemmm_gemmn_grid_desc,
in_gemmk0_gemmm0_gemmm1_gemmk1_grid_step_hacks,
wei_gemmk0_gemmn0_gemmn1_gemmk1_grid_step_hacks,
out_gemmm0_gemmm10_gemmm11_gemmn0_gemmn10_gemmn11_grid_step_hacks,
in_gemmk0_gemmm0_gemmm1_gemmk1_grid_move_slice_window_step_hacks,
wei_gemmk0_gemmn0_gemmn1_gemmk1_grid_move_slice_window_step_hacks,
nrepeat);
{
const auto N = out_n_ho_wo_k_lengths[I0];
const auto K = out_n_ho_wo_k_lengths[I3];
const auto C = wei_k_y_x_c_lengths[I3];
const auto Ho = out_n_ho_wo_k_lengths[I1];
const auto Wo = out_n_ho_wo_k_lengths[I2];
const auto Y = wei_k_y_x_c_lengths[I1];
const auto X = wei_k_y_x_c_lengths[I2];
float perf = static_cast<float>(std::size_t(2) * N * K * Ho * Wo * C * Y * X) /
(std::size_t(1000) * 1000 * 1000) / ave_time;
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
<< std::endl;
}
}
// copy result back to host
out_n_ho_wo_k_device_buf.FromDevice(out_n_ho_wo_k.mData.data());
}
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp"
#include "driver_gemm_xdlops_v2r3.hpp"
template <typename TInWei,
typename TAcc,
typename TOut,
typename InLengths,
typename WeiLengths,
typename OutLengths,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw(
const InLengths& in_n_c_hi_wi_lengths,
const WeiLengths& wei_k_c_y_x_lengths,
const OutLengths& out_n_k_ho_wo_lengths,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads& in_right_pads,
const Tensor<TInWei>& in_n_c_hi_wi,
const Tensor<TInWei>& wei_k_c_y_x,
Tensor<TOut>& out_n_k_ho_wo,
ck::index_t nrepeat)
{
using namespace ck;
std::cout << __func__ << std::endl;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
DeviceMem in_n_c_hi_wi_device_buf(sizeof(TInWei) * in_n_c_hi_wi.mDesc.GetElementSpace());
DeviceMem wei_k_c_y_x_device_buf(sizeof(TInWei) * wei_k_c_y_x.mDesc.GetElementSpace());
DeviceMem out_n_k_ho_wo_device_buf(sizeof(TOut) * out_n_k_ho_wo.mDesc.GetElementSpace());
in_n_c_hi_wi_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
wei_k_c_y_x_device_buf.ToDevice(wei_k_c_y_x.mData.data());
out_n_k_ho_wo_device_buf.ToDevice(out_n_k_ho_wo.mData.data());
const auto in_n_c_hi_wi_desc = make_naive_tensor_descriptor_packed(in_n_c_hi_wi_lengths);
const auto wei_k_c_y_x_desc = make_naive_tensor_descriptor_packed(wei_k_c_y_x_lengths);
const auto out_n_k_ho_wo_desc = make_naive_tensor_descriptor_packed(out_n_k_ho_wo_lengths);
#if 1
// [M, N, K0, K1] = [256, 128, 4, 8] for fp16
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 256;
constexpr index_t GemmNPerBlock = 128;
constexpr index_t GemmKPerBlock = 4;
constexpr index_t GemmMPerWave = 32;
constexpr index_t GemmNPerWave = 32;
constexpr index_t GemmK1 = 8;
constexpr index_t MRepeat = 4;
constexpr index_t NRepeat = 2;
using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 4, 8>;
using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 8;
constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 8;
using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 2, 8>;
using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN = 1;
constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8;
constexpr index_t GemmCThreadTransferDstScalarPerVector = 1;
#endif
const auto descs =
transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw_pad(wei_k_c_y_x_desc,
in_n_c_hi_wi_desc,
out_n_k_ho_wo_desc,
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads,
Number<GemmK1>{});
const auto wei_gemmk0_gemmm_gemmk1_grid_desc = descs[I0];
const auto in_gemmk0_gemmn_gemmk1_grid_desc = descs[I1];
const auto out_gemmm_gemmn_grid_desc = descs[I2];
// HACK: hacks that control index calculation when iterating over the A, B, and C matrices
constexpr auto wei_gemmk0_gemmm_gemmk1_grid_step_hacks = make_tuple(
make_tuple(Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}),
make_tuple(
Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}));
constexpr auto in_gemmk0_gemmn_gemmk1_grid_step_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}));
constexpr auto out_m0_m1_m2_n_grid_step_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{}));
constexpr auto wei_gemmk0_gemmm_gemmk1_grid_move_slice_window_step_hacks =
Sequence<0, 0, 0, 0, 0>{};
constexpr auto in_gemmk0_gemmn_gemmk1_grid_move_slice_window_step_hacks =
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0>{};
for(index_t i = 0; i < 5; ++i)
{
float ave_time = driver_gemm_xdlops_v2r3<
BlockSize,
TInWei,
TAcc,
TOut,
InMemoryDataOperationEnum_t::Set,
decltype(wei_gemmk0_gemmm_gemmk1_grid_desc),
decltype(in_gemmk0_gemmn_gemmk1_grid_desc),
decltype(out_gemmm_gemmn_grid_desc),
GemmMPerBlock,
GemmNPerBlock,
GemmKPerBlock,
GemmMPerWave,
GemmNPerWave,
GemmK1,
MRepeat,
NRepeat,
GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1,
GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1,
Sequence<1, 0, 2>,
Sequence<1, 0, 2>,
2,
GemmABlockTransferSrcScalarPerVector_GemmK1,
GemmABlockTransferDstScalarPerVector_GemmK1,
false, // don't move back src coordinate after threadwise copy
GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1,
GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1,
Sequence<0, 2, 1>,
Sequence<1, 0, 2>,
1,
GemmBBlockTransferSrcScalarPerVector_GemmN,
GemmBBlockTransferDstScalarPerVector_GemmK1,
false, // don't move back src coordinate after threadwise copy
Sequence<3, 0, 1, 2, 7, 5, 4, 6>,
7,
GemmCThreadTransferDstScalarPerVector,
decltype(wei_gemmk0_gemmm_gemmk1_grid_step_hacks),
decltype(in_gemmk0_gemmn_gemmk1_grid_step_hacks),
decltype(out_m0_m1_m2_n_grid_step_hacks),
decltype(wei_gemmk0_gemmm_gemmk1_grid_move_slice_window_step_hacks),
decltype(in_gemmk0_gemmn_gemmk1_grid_move_slice_window_step_hacks),
false>(static_cast<TInWei*>(wei_k_c_y_x_device_buf.GetDeviceBuffer()),
static_cast<TInWei*>(in_n_c_hi_wi_device_buf.GetDeviceBuffer()),
static_cast<TOut*>(out_n_k_ho_wo_device_buf.GetDeviceBuffer()),
wei_gemmk0_gemmm_gemmk1_grid_desc,
in_gemmk0_gemmn_gemmk1_grid_desc,
out_gemmm_gemmn_grid_desc,
wei_gemmk0_gemmm_gemmk1_grid_step_hacks,
in_gemmk0_gemmn_gemmk1_grid_step_hacks,
out_m0_m1_m2_n_grid_step_hacks,
wei_gemmk0_gemmm_gemmk1_grid_move_slice_window_step_hacks,
in_gemmk0_gemmn_gemmk1_grid_move_slice_window_step_hacks,
nrepeat);
float perf = static_cast<float>(calculate_convolution_flops(
in_n_c_hi_wi_desc, wei_k_c_y_x_desc, out_n_k_ho_wo_desc)) /
(std::size_t(1000) * 1000 * 1000) / ave_time;
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" << std::endl;
}
// copy result back to host
out_n_k_ho_wo_device_buf.FromDevice(out_n_k_ho_wo.mData.data());
}
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "transform_forward_convolution_into_gemm_v4r4r2_nhwc_kyxc_nhwk.hpp"
#include "driver_gemm_xdlops_v2r2.hpp"
template <typename TInWei,
typename TAcc,
typename TOut,
typename InLengths,
typename WeiLengths,
typename OutLengths,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nhwc_kyxc_nhwk(
const InLengths& in_n_hi_wi_c_lengths,
const WeiLengths& wei_k_y_x_c_lengths,
const OutLengths& out_n_ho_wo_k_lengths,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads& in_right_pads,
const Tensor<TInWei>& in_n_hi_wi_c,
const Tensor<TInWei>& wei_k_y_x_c,
Tensor<TOut>& out_n_ho_wo_k,
ck::index_t nrepeat)
{
using namespace ck;
std::cout << __func__ << std::endl;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
DeviceMem in_n_hi_wi_c_device_buf(sizeof(TInWei) * in_n_hi_wi_c.mDesc.GetElementSpace());
DeviceMem wei_k_y_x_c_device_buf(sizeof(TInWei) * wei_k_y_x_c.mDesc.GetElementSpace());
DeviceMem out_n_ho_wo_k_device_buf(sizeof(TOut) * out_n_ho_wo_k.mDesc.GetElementSpace());
in_n_hi_wi_c_device_buf.ToDevice(in_n_hi_wi_c.mData.data());
wei_k_y_x_c_device_buf.ToDevice(wei_k_y_x_c.mData.data());
out_n_ho_wo_k_device_buf.ToDevice(out_n_ho_wo_k.mData.data());
const auto in_n_hi_wi_c_desc = make_naive_tensor_descriptor_packed(in_n_hi_wi_c_lengths);
const auto wei_k_y_x_c_desc = make_naive_tensor_descriptor_packed(wei_k_y_x_c_lengths);
const auto out_n_ho_wo_k_desc = make_naive_tensor_descriptor_packed(out_n_ho_wo_k_lengths);
#if 1
// [M, N, K0, K1] = [256, 128, 4, 4] for fp32
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 256;
constexpr index_t GemmNPerBlock = 128;
constexpr index_t GemmKPerBlock = 4;
constexpr index_t GemmMPerWave = 64;
constexpr index_t GemmNPerWave = 64;
constexpr index_t GemmK1 = 4;
constexpr index_t MRepeat = 2;
constexpr index_t NRepeat = 1;
using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 4, 4>;
using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 4;
constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 4;
using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 2, 4>;
using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK1 = 4;
constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 4;
constexpr index_t GemmCThreadTransferDstScalarPerVector = 4;
#elif 1
// [M, N, K0, K1] = [256, 128, 4, 8] for fp16
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 256;
constexpr index_t GemmNPerBlock = 128;
constexpr index_t GemmKPerBlock = 4;
constexpr index_t GemmMPerWave = 64;
constexpr index_t GemmNPerWave = 64;
constexpr index_t GemmK1 = 8;
constexpr index_t MRepeat = 2;
constexpr index_t NRepeat = 1;
using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 4, 8>;
using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 8;
constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 8;
using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 2, 8>;
using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK1 = 8;
constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8;
constexpr index_t GemmCThreadTransferDstScalarPerVector = 4;
#endif
const auto descs =
transform_forward_convolution_into_gemm_v4r4r2_nhwc_kyxc_nhwk_pad(wei_k_y_x_c_desc,
in_n_hi_wi_c_desc,
out_n_ho_wo_k_desc,
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads,
Number<GemmK1>{});
const auto wei_gemmk0_gemmm_gemmk1_grid_desc = descs[I0];
const auto in_gemmk0_gemmn_gemmk1_grid_desc = descs[I1];
const auto out_gemmm_gemmn_grid_desc = descs[I2];
// HACK: hacks that control index calculation when iterating over the A, B, and C matrices
constexpr auto wei_gemmk0_gemmm_gemmk1_grid_step_hacks = make_tuple(
make_tuple(Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}),
make_tuple(
Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}));
constexpr auto in_gemmk0_gemmn_gemmk1_grid_step_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}));
constexpr auto out_m0_m1_m2_n_grid_step_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{}));
constexpr auto wei_gemmk0_gemmm_gemmk1_grid_move_slice_window_step_hacks =
Sequence<0, 0, 0, 0, 0>{};
constexpr auto in_gemmk0_gemmn_gemmk1_grid_move_slice_window_step_hacks =
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0>{};
for(index_t i = 0; i < 5; ++i)
{
float ave_time = driver_gemm_xdlops_v2r2<
BlockSize,
TInWei,
TAcc,
TOut,
InMemoryDataOperationEnum_t::Set,
decltype(wei_gemmk0_gemmm_gemmk1_grid_desc),
decltype(in_gemmk0_gemmn_gemmk1_grid_desc),
decltype(out_gemmm_gemmn_grid_desc),
GemmMPerBlock,
GemmNPerBlock,
GemmKPerBlock,
GemmMPerWave,
GemmNPerWave,
MRepeat,
NRepeat,
GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1,
GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1,
Sequence<1, 0, 2>,
Sequence<1, 0, 2>,
2,
GemmABlockTransferSrcScalarPerVector_GemmK1,
GemmABlockTransferDstScalarPerVector_GemmK1,
false, // don't move back src coordinate after threadwise copy
GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1,
GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1,
Sequence<1, 0, 2>,
Sequence<1, 0, 2>,
2,
GemmBBlockTransferSrcScalarPerVector_GemmK1,
GemmBBlockTransferDstScalarPerVector_GemmK1,
false, // don't move back src coordinate after threadwise copy
Sequence<2, 3, 0, 1>,
2,
GemmCThreadTransferDstScalarPerVector,
decltype(wei_gemmk0_gemmm_gemmk1_grid_step_hacks),
decltype(in_gemmk0_gemmn_gemmk1_grid_step_hacks),
decltype(out_m0_m1_m2_n_grid_step_hacks),
decltype(wei_gemmk0_gemmm_gemmk1_grid_move_slice_window_step_hacks),
decltype(in_gemmk0_gemmn_gemmk1_grid_move_slice_window_step_hacks)>(
static_cast<TInWei*>(wei_k_y_x_c_device_buf.GetDeviceBuffer()),
static_cast<TInWei*>(in_n_hi_wi_c_device_buf.GetDeviceBuffer()),
static_cast<TOut*>(out_n_ho_wo_k_device_buf.GetDeviceBuffer()),
wei_gemmk0_gemmm_gemmk1_grid_desc,
in_gemmk0_gemmn_gemmk1_grid_desc,
out_gemmm_gemmn_grid_desc,
wei_gemmk0_gemmm_gemmk1_grid_step_hacks,
in_gemmk0_gemmn_gemmk1_grid_step_hacks,
out_m0_m1_m2_n_grid_step_hacks,
wei_gemmk0_gemmm_gemmk1_grid_move_slice_window_step_hacks,
in_gemmk0_gemmn_gemmk1_grid_move_slice_window_step_hacks,
nrepeat);
{
const auto N = out_n_ho_wo_k_lengths[I0];
const auto K = out_n_ho_wo_k_lengths[I3];
const auto C = wei_k_y_x_c_lengths[I3];
const auto Ho = out_n_ho_wo_k_lengths[I1];
const auto Wo = out_n_ho_wo_k_lengths[I2];
const auto Y = wei_k_y_x_c_lengths[I1];
const auto X = wei_k_y_x_c_lengths[I2];
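// Forward-convolution FLOP count: each output element takes C * Y * X
// multiply-accumulates, i.e. 2 * N * K * Ho * Wo * C * Y * X flops in total;
// GFLOP divided by milliseconds gives TFLOP/s.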
float perf = static_cast<float>(std::size_t(2) * N * K * Ho * Wo * C * Y * X) /
(std::size_t(1000) * 1000 * 1000) / ave_time;
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
<< std::endl;
}
}
// copy result back to host
out_n_ho_wo_k_device_buf.FromDevice(out_n_ho_wo_k.mData.data());
}
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "transform_forward_convolution_into_gemm_v4r4r2_nhwc_kyxc_nhwk.hpp"
#include "driver_gemm_xdlops_v2r3.hpp"
template <typename TInWei,
typename TAcc,
typename TOut,
typename InLengths,
typename WeiLengths,
typename OutLengths,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void device_convolution_forward_implicit_gemm_v4r4r3_xdlops_nhwc_kyxc_nhwk(
const InLengths& in_n_hi_wi_c_lengths,
const WeiLengths& wei_k_y_x_c_lengths,
const OutLengths& out_n_ho_wo_k_lengths,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads& in_right_pads,
const Tensor<TInWei>& in_n_hi_wi_c,
const Tensor<TInWei>& wei_k_y_x_c,
Tensor<TOut>& out_n_ho_wo_k,
ck::index_t nrepeat)
{
using namespace ck;
std::cout << __func__ << std::endl;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
DeviceMem in_n_hi_wi_c_device_buf(sizeof(TInWei) * in_n_hi_wi_c.mDesc.GetElementSpace());
DeviceMem wei_k_y_x_c_device_buf(sizeof(TInWei) * wei_k_y_x_c.mDesc.GetElementSpace());
DeviceMem out_n_ho_wo_k_device_buf(sizeof(TOut) * out_n_ho_wo_k.mDesc.GetElementSpace());
in_n_hi_wi_c_device_buf.ToDevice(in_n_hi_wi_c.mData.data());
wei_k_y_x_c_device_buf.ToDevice(wei_k_y_x_c.mData.data());
out_n_ho_wo_k_device_buf.ToDevice(out_n_ho_wo_k.mData.data());
const auto in_n_hi_wi_c_desc = make_naive_tensor_descriptor_packed(in_n_hi_wi_c_lengths);
const auto wei_k_y_x_c_desc = make_naive_tensor_descriptor_packed(wei_k_y_x_c_lengths);
const auto out_n_ho_wo_k_desc = make_naive_tensor_descriptor_packed(out_n_ho_wo_k_lengths);
#if 1
// [M, N, K0, K1] = [256, 128, 4, 4] for fp32
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 256;
constexpr index_t GemmNPerBlock = 128;
constexpr index_t GemmKPerBlock = 4;
constexpr index_t GemmMPerWave = 32;
constexpr index_t GemmNPerWave = 32;
constexpr index_t GemmK1 = 4;
constexpr index_t MRepeat = 4;
constexpr index_t NRepeat = 2;
using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 4, 4>;
using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 4;
constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 4;
using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 2, 4>;
using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK1 = 4;
constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 4;
constexpr index_t GemmCThreadTransferDstScalarPerVector = 4;
#elif 1
// [M, N, K0, K1] = [128, 128, 4, 4] for fp32
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 128;
constexpr index_t GemmNPerBlock = 128;
constexpr index_t GemmKPerBlock = 4;
constexpr index_t GemmMPerWave = 32;
constexpr index_t GemmNPerWave = 32;
constexpr index_t GemmK1 = 4;
constexpr index_t MRepeat = 2;
constexpr index_t NRepeat = 2;
using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 2, 4>;
using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 4;
constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 4;
using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 2, 4>;
using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK1 = 4;
constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 4;
constexpr index_t GemmCThreadTransferDstScalarPerVector = 4;
#elif 0
// [M, N, K0, K1] = [256, 256, 4, 8] for fp16
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 256;
constexpr index_t GemmNPerBlock = 256;
constexpr index_t GemmKPerBlock = 4;
constexpr index_t GemmMPerWave = 32;
constexpr index_t GemmNPerWave = 32;
constexpr index_t GemmK1 = 8;
constexpr index_t MRepeat = 4;
constexpr index_t NRepeat = 4;
using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 4, 8>;
using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 8;
constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 8;
using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 4, 8>;
using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK1 = 8;
constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8;
constexpr index_t GemmCThreadTransferDstScalarPerVector = 4;
#elif 1
// [M, N, K0, K1] = [256, 128, 4, 8] for fp16
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 256;
constexpr index_t GemmNPerBlock = 128;
constexpr index_t GemmKPerBlock = 4;
constexpr index_t GemmMPerWave = 32;
constexpr index_t GemmNPerWave = 32;
constexpr index_t GemmK1 = 8;
constexpr index_t MRepeat = 4;
constexpr index_t NRepeat = 2;
using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 4, 8>;
using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 8;
constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 8;
using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 2, 8>;
using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK1 = 8;
constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8;
constexpr index_t GemmCThreadTransferDstScalarPerVector = 4;
#endif
const auto descs =
transform_forward_convolution_into_gemm_v4r4r2_nhwc_kyxc_nhwk_pad(wei_k_y_x_c_desc,
in_n_hi_wi_c_desc,
out_n_ho_wo_k_desc,
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads,
Number<GemmK1>{});
const auto wei_gemmk0_gemmm_gemmk1_grid_desc = descs[I0];
const auto in_gemmk0_gemmn_gemmk1_grid_desc = descs[I1];
const auto out_gemmm_gemmn_grid_desc = descs[I2];
// HACK: hacks that control index calculation when iterating over A, B, C matrix
constexpr auto wei_gemmk0_gemmm_gemmk1_grid_step_hacks = make_tuple(
make_tuple(Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}),
make_tuple(
Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}));
constexpr auto in_gemmk0_gemmn_gemmk1_grid_step_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}));
constexpr auto out_m0_m1_m2_n_grid_step_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{}));
constexpr auto wei_gemmk0_gemmm_gemmk1_grid_move_slice_window_step_hacks =
Sequence<0, 0, 0, 0, 0>{};
constexpr auto in_gemmk0_gemmn_gemmk1_grid_move_slice_window_step_hacks =
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0>{};
for(index_t i = 0; i < 5; ++i)
{
float ave_time = driver_gemm_xdlops_v2r3<
BlockSize,
TInWei,
TAcc,
TOut,
InMemoryDataOperationEnum_t::Set,
decltype(wei_gemmk0_gemmm_gemmk1_grid_desc),
decltype(in_gemmk0_gemmn_gemmk1_grid_desc),
decltype(out_gemmm_gemmn_grid_desc),
GemmMPerBlock,
GemmNPerBlock,
GemmKPerBlock,
GemmMPerWave,
GemmNPerWave,
MRepeat,
NRepeat,
GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1,
GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1,
Sequence<1, 0, 2>,
Sequence<1, 0, 2>,
2,
GemmABlockTransferSrcScalarPerVector_GemmK1,
GemmABlockTransferDstScalarPerVector_GemmK1,
false, // don't move back src coordinate after threadwise copy
GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1,
GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1,
Sequence<1, 0, 2>,
Sequence<1, 0, 2>,
2,
GemmBBlockTransferSrcScalarPerVector_GemmK1,
GemmBBlockTransferDstScalarPerVector_GemmK1,
false, // don't move back src coordinate after threadwise copy
Sequence<2, 3, 0, 1, 7, 5, 4, 6>,
6,
GemmCThreadTransferDstScalarPerVector,
decltype(wei_gemmk0_gemmm_gemmk1_grid_step_hacks),
decltype(in_gemmk0_gemmn_gemmk1_grid_step_hacks),
decltype(out_m0_m1_m2_n_grid_step_hacks),
decltype(wei_gemmk0_gemmm_gemmk1_grid_move_slice_window_step_hacks),
decltype(in_gemmk0_gemmn_gemmk1_grid_move_slice_window_step_hacks),
false // CAccessOrderMRepeatNRepeat
>(static_cast<TInWei*>(wei_k_y_x_c_device_buf.GetDeviceBuffer()),
static_cast<TInWei*>(in_n_hi_wi_c_device_buf.GetDeviceBuffer()),
static_cast<TOut*>(out_n_ho_wo_k_device_buf.GetDeviceBuffer()),
wei_gemmk0_gemmm_gemmk1_grid_desc,
in_gemmk0_gemmn_gemmk1_grid_desc,
out_gemmm_gemmn_grid_desc,
wei_gemmk0_gemmm_gemmk1_grid_step_hacks,
in_gemmk0_gemmn_gemmk1_grid_step_hacks,
out_m0_m1_m2_n_grid_step_hacks,
wei_gemmk0_gemmm_gemmk1_grid_move_slice_window_step_hacks,
in_gemmk0_gemmn_gemmk1_grid_move_slice_window_step_hacks,
nrepeat);
{
const auto N = out_n_ho_wo_k_lengths[I0];
const auto K = out_n_ho_wo_k_lengths[I3];
const auto C = wei_k_y_x_c_lengths[I3];
const auto Ho = out_n_ho_wo_k_lengths[I1];
const auto Wo = out_n_ho_wo_k_lengths[I2];
const auto Y = wei_k_y_x_c_lengths[I1];
const auto X = wei_k_y_x_c_lengths[I2];
float perf = static_cast<float>(std::size_t(2) * N * K * Ho * Wo * C * Y * X) /
(std::size_t(1000) * 1000 * 1000) / ave_time;
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
<< std::endl;
}
}
// copy result back to host
out_n_ho_wo_k_device_buf.FromDevice(out_n_ho_wo_k.mData.data());
}
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp"
#include "driver_gemm_xdlops_v2r3.hpp"
template <typename TInWei,
typename TAcc,
typename TOut,
typename InLengths,
typename WeiLengths,
typename OutLengths,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk(
const InLengths& in_n_hi_wi_c_lengths,
const WeiLengths& wei_k_y_x_c_lengths,
const OutLengths& out_n_ho_wo_k_lengths,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads& in_right_pads,
const Tensor<TInWei>& in_n_hi_wi_c,
const Tensor<TInWei>& wei_k_y_x_c,
Tensor<TOut>& out_n_ho_wo_k,
ck::index_t nrepeat)
{
using namespace ck;
std::cout << __func__ << std::endl;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
DeviceMem in_n_hi_wi_c_device_buf(sizeof(TInWei) * in_n_hi_wi_c.mDesc.GetElementSpace());
DeviceMem wei_k_y_x_c_device_buf(sizeof(TInWei) * wei_k_y_x_c.mDesc.GetElementSpace());
DeviceMem out_n_ho_wo_k_device_buf(sizeof(TOut) * out_n_ho_wo_k.mDesc.GetElementSpace());
in_n_hi_wi_c_device_buf.ToDevice(in_n_hi_wi_c.mData.data());
wei_k_y_x_c_device_buf.ToDevice(wei_k_y_x_c.mData.data());
out_n_ho_wo_k_device_buf.ToDevice(out_n_ho_wo_k.mData.data());
const auto in_n_hi_wi_c_desc = make_naive_tensor_descriptor_packed(in_n_hi_wi_c_lengths);
const auto wei_k_y_x_c_desc = make_naive_tensor_descriptor_packed(wei_k_y_x_c_lengths);
const auto out_n_ho_wo_k_desc = make_naive_tensor_descriptor_packed(out_n_ho_wo_k_lengths);
#if 0
// [M, N, K0, K1] = [256, 128, 4, 4] for fp32
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 256;
constexpr index_t GemmNPerBlock = 128;
constexpr index_t GemmKPerBlock = 4;
constexpr index_t GemmMPerWave = 32;
constexpr index_t GemmNPerWave = 32;
constexpr index_t GemmK1 = 4;
constexpr index_t MRepeat = 4;
constexpr index_t NRepeat = 2;
using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 4, 4>;
using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 4;
constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 4;
using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 2, 4>;
using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK1 = 4;
constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 4;
constexpr index_t GemmCThreadTransferDstScalarPerVector = 1;
#elif 0
// [M, N, K0, K1] = [128, 128, 4, 4] for fp32
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 128;
constexpr index_t GemmNPerBlock = 128;
constexpr index_t GemmKPerBlock = 4;
constexpr index_t GemmMPerWave = 32;
constexpr index_t GemmNPerWave = 32;
constexpr index_t GemmK1 = 4;
constexpr index_t MRepeat = 2;
constexpr index_t NRepeat = 2;
using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 2, 4>;
using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 4;
constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 4;
using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 2, 4>;
using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK1 = 4;
constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 4;
constexpr index_t GemmCThreadTransferDstScalarPerVector = 1;
#elif 0
// [M, N, K0, K1] = [256, 256, 4, 8] for fp16
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 256;
constexpr index_t GemmNPerBlock = 256;
constexpr index_t GemmKPerBlock = 4;
constexpr index_t GemmMPerWave = 32;
constexpr index_t GemmNPerWave = 32;
constexpr index_t GemmK1 = 8;
constexpr index_t MRepeat = 4;
constexpr index_t NRepeat = 4;
using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 4, 8>;
using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 8;
constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 8;
using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 4, 8>;
using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK1 = 8;
constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8;
constexpr index_t GemmCThreadTransferDstScalarPerVector = 1;
#elif 0
// [M, N, K0, K1] = [256, 128, 4, 8] for fp16
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 256;
constexpr index_t GemmNPerBlock = 128;
constexpr index_t GemmKPerBlock = 4;
constexpr index_t GemmMPerWave = 32;
constexpr index_t GemmNPerWave = 32;
constexpr index_t GemmK1 = 8;
constexpr index_t MRepeat = 4;
constexpr index_t NRepeat = 2;
using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 4, 8>;
using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 8;
constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 8;
using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 2, 8>;
using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK1 = 8;
constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8;
constexpr index_t GemmCThreadTransferDstScalarPerVector = 1;
#elif 1
// [M, N, K0, K1] = [128, 256, 4, 8] for fp16
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 128;
constexpr index_t GemmNPerBlock = 256;
constexpr index_t GemmKPerBlock = 4;
constexpr index_t GemmMPerWave = 32;
constexpr index_t GemmNPerWave = 32;
constexpr index_t GemmK1 = 8;
constexpr index_t MRepeat = 2;
constexpr index_t NRepeat = 4;
using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 2, 8>;
using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 8;
constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 8;
using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 4, 8>;
using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK1 = 8;
constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8;
constexpr index_t GemmCThreadTransferDstScalarPerVector = 1;
#elif 1
// [M, N, K0, K1] = [128, 128, 4, 8] for fp16
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 128;
constexpr index_t GemmNPerBlock = 128;
constexpr index_t GemmKPerBlock = 4;
constexpr index_t GemmMPerWave = 32;
constexpr index_t GemmNPerWave = 32;
constexpr index_t GemmK1 = 8;
constexpr index_t MRepeat = 2;
constexpr index_t NRepeat = 2;
using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 2, 8>;
using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 8;
constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 8;
using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 2, 8>;
using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK1 = 8;
constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8;
constexpr index_t GemmCThreadTransferDstScalarPerVector = 1;
#endif
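// Note: unlike v4r4r2/v4r4r3 above, the v4r4r4 transform feeds the input
// tensor to GEMM A and the weights to GEMM B (descs[I0] is the input
// descriptor below).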
const auto descs =
transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk_pad(in_n_hi_wi_c_desc,
wei_k_y_x_c_desc,
out_n_ho_wo_k_desc,
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads,
Number<GemmK1>{});
const auto in_gemmk0_gemmm_gemmk1_grid_desc = descs[I0];
const auto wei_gemmk0_gemmn_gemmk1_grid_desc = descs[I1];
const auto out_gemmm_gemmn_grid_desc = descs[I2];
// HACK: hacks that control index calculation when iterating over A, B, C matrix
constexpr auto in_gemmk0_gemmm_gemmk1_grid_step_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}, // 0+: GemmK0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{}, // 1+: GemmM
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}), // 2+: GemmK1
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}, // 0-: GemmK0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{}, // 1-: GemmM
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{})); // 2-: GemmK1
constexpr auto wei_gemmk0_gemmn_gemmk1_grid_step_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{}, // 0+: GemmK0
Sequence<0, 0, 0, 0, 0>{}, // 1+: GemmN
Sequence<0, 0, 0, 0, 0>{}), // 2+: GemmK1
make_tuple(Sequence<0, 0, 0, 0, 0>{}, // 0-: GemmK0
Sequence<0, 0, 0, 0, 0>{}, // 1-: GemmN
Sequence<0, 0, 0, 0, 0>{})); // 2-: GemmK1
constexpr auto out_m0_m1_m2_n_grid_step_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{}, // 0+: MRepeat
Sequence<0, 0, 0, 0, 0>{}, // 1+: NRepeat
Sequence<0, 0, 0, 0, 0>{}, // 2+: MWaves
Sequence<0, 0, 0, 0, 0>{}, // 3+: NWaves
Sequence<0, 0, 0, 0, 0>{}, // 4+: M0
Sequence<0, 0, 0, 0, 0>{}, // 5+: M1
Sequence<0, 0, 0, 0, 0>{}, // 6+: M2
Sequence<0, 0, 0, 0, 0>{}), // 7+: N1
make_tuple(Sequence<0, 0, 0, 0, 0>{}, // 0-: MRepeat
Sequence<0, 0, 0, 0, 0>{}, // 1-: NRepeat
Sequence<0, 0, 0, 0, 0>{}, // 2-: MWaves
Sequence<0, 0, 0, 0, 0>{}, // 3-: NWaves
Sequence<0, 0, 0, 0, 0>{}, // 4-: M0
Sequence<0, 0, 0, 0, 0>{}, // 5-: M1
Sequence<0, 0, 0, 0, 0>{}, // 6-: M2
Sequence<0, 0, 0, 0, 0>{})); // 7-: N1
constexpr auto in_gemmk0_gemmm_gemmk1_grid_move_slice_window_step_hacks =
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0>{};
constexpr auto wei_gemmk0_gemmn_gemmk1_grid_move_slice_window_step_hacks =
Sequence<0, 0, 0, 0, 0>{};
for(index_t i = 0; i < 5; ++i)
{
float ave_time = driver_gemm_xdlops_v2r3<
BlockSize,
TInWei,
TAcc,
TOut,
InMemoryDataOperationEnum_t::Set,
decltype(in_gemmk0_gemmm_gemmk1_grid_desc),
decltype(wei_gemmk0_gemmn_gemmk1_grid_desc),
decltype(out_gemmm_gemmn_grid_desc),
GemmMPerBlock,
GemmNPerBlock,
GemmKPerBlock,
GemmMPerWave,
GemmNPerWave,
GemmK1,
MRepeat,
NRepeat,
GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1,
GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1,
Sequence<1, 0, 2>,
Sequence<1, 0, 2>,
2,
GemmABlockTransferSrcScalarPerVector_GemmK1,
GemmABlockTransferDstScalarPerVector_GemmK1,
false, // don't move back src coordinate after threadwise copy
GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1,
GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1,
Sequence<1, 0, 2>,
Sequence<1, 0, 2>,
2,
GemmBBlockTransferSrcScalarPerVector_GemmK1,
GemmBBlockTransferDstScalarPerVector_GemmK1,
false, // don't move back src coordinate after threadwise copy
Sequence<2, 3, 0, 1, 7, 5, 4, 6>,
7,
GemmCThreadTransferDstScalarPerVector,
decltype(in_gemmk0_gemmm_gemmk1_grid_step_hacks),
decltype(wei_gemmk0_gemmn_gemmk1_grid_step_hacks),
decltype(out_m0_m1_m2_n_grid_step_hacks),
decltype(in_gemmk0_gemmm_gemmk1_grid_move_slice_window_step_hacks),
decltype(wei_gemmk0_gemmn_gemmk1_grid_move_slice_window_step_hacks),
false // CAccessOrderMRepeatNRepeat
>(static_cast<TInWei*>(in_n_hi_wi_c_device_buf.GetDeviceBuffer()),
static_cast<TInWei*>(wei_k_y_x_c_device_buf.GetDeviceBuffer()),
static_cast<TOut*>(out_n_ho_wo_k_device_buf.GetDeviceBuffer()),
in_gemmk0_gemmm_gemmk1_grid_desc,
wei_gemmk0_gemmn_gemmk1_grid_desc,
out_gemmm_gemmn_grid_desc,
in_gemmk0_gemmm_gemmk1_grid_step_hacks,
wei_gemmk0_gemmn_gemmk1_grid_step_hacks,
out_m0_m1_m2_n_grid_step_hacks,
in_gemmk0_gemmm_gemmk1_grid_move_slice_window_step_hacks,
wei_gemmk0_gemmn_gemmk1_grid_move_slice_window_step_hacks,
nrepeat);
{
const auto N = out_n_ho_wo_k_lengths[I0];
const auto K = out_n_ho_wo_k_lengths[I3];
const auto C = wei_k_y_x_c_lengths[I3];
const auto Ho = out_n_ho_wo_k_lengths[I1];
const auto Wo = out_n_ho_wo_k_lengths[I2];
const auto Y = wei_k_y_x_c_lengths[I1];
const auto X = wei_k_y_x_c_lengths[I2];
float perf = static_cast<float>(std::size_t(2) * N * K * Ho * Wo * C * Y * X) /
(std::size_t(1000) * 1000 * 1000) / ave_time;
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
<< std::endl;
}
}
// copy result back to host
out_n_ho_wo_k_device_buf.FromDevice(out_n_ho_wo_k.mData.data());
}
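// Example invocation (a minimal sketch, not compiled here; the fp32 types,
// 3x3 filter, unit strides/dilations, and symmetric pad of 1 are illustrative
// assumptions, not taken from this file):
//
//   const auto in_lengths  = ck::make_tuple(128, 28, 28, 256); // N, Hi, Wi, C
//   const auto wei_lengths = ck::make_tuple(256, 3, 3, 256);   // K, Y, X, C
//   const auto out_lengths = ck::make_tuple(128, 28, 28, 256); // N, Ho, Wo, K
//   const auto ones        = ck::make_tuple(1, 1);
//   // Tensor<float> in, wei, out constructed to match the lengths above ...
//   device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk<float, float, float>(
//       in_lengths, wei_lengths, out_lengths,
//       /*strides*/ ones, /*dilations*/ ones, /*left pads*/ ones, /*right pads*/ ones,
//       in, wei, out, /*nrepeat*/ 100);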
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "driver_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp"
#include "driver_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw_outpad.hpp"
template <typename TInWei,
ck::index_t InWeiVectorSize,
typename TAcc,
typename TOut,
typename InLengths,
typename WeiLengths,
typename OutLengths,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw(
const InLengths& in_n_c_hi_wi_lengths,
const WeiLengths& wei_k_c_y_x_lengths,
const OutLengths& out_n_k_ho_wo_lengths,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads& in_right_pads,
const Tensor<TInWei>& in_n_c_hi_wi,
const Tensor<TInWei>& wei_k_c_y_x,
Tensor<TOut>& out_n_k_ho_wo,
ck::index_t /* nrepeat */)
{
using namespace ck;
std::cout << __func__ << std::endl;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
const auto N = out_n_k_ho_wo_lengths[I0];
const auto K = out_n_k_ho_wo_lengths[I1];
const auto C = wei_k_c_y_x_lengths[I1];
const auto Hi = in_n_c_hi_wi_lengths[I2];
const auto Wi = in_n_c_hi_wi_lengths[I3];
const auto Ho = out_n_k_ho_wo_lengths[I2];
const auto Wo = out_n_k_ho_wo_lengths[I3];
const auto Y = wei_k_c_y_x_lengths[I2];
const auto X = wei_k_c_y_x_lengths[I3];
const auto C0 = C / Number<InWeiVectorSize>{};
const auto C1 = Number<InWeiVectorSize>{};
const auto K0 = K / Number<InWeiVectorSize>{};
const auto K1 = Number<InWeiVectorSize>{};
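// Split the channel dimensions for vectorized access: C = C0 * InWeiVectorSize
// for the input/weights and K = K0 * InWeiVectorSize for the output, so each
// access moves an InWeiVectorSize-wide vector_type.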
Tensor<TInWei> in_n_c0_hi_wi_c1(
HostTensorDescriptor(std::initializer_list<index_t>{N, C0, Hi, Wi, C1}));
Tensor<TInWei> wei_k_c0_y_x_c1(
HostTensorDescriptor(std::initializer_list<index_t>{K, C0, Y, X, C1}));
Tensor<TOut> out_n_k0_ho_wo_k1(
HostTensorDescriptor(std::initializer_list<index_t>{N, K0, Ho, Wo, K1}));
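// Reorder the host tensors from NCHW/KCYX into the vectorized
// N-C0-Hi-Wi-C1 / K-C0-Y-X-C1 layouts consumed by the kernel.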
auto f_nchw2nc0hwc1 = [&](auto n, auto hi, auto wi, auto c) {
in_n_c0_hi_wi_c1(n, c / InWeiVectorSize, hi, wi, c % InWeiVectorSize) =
in_n_c_hi_wi(n, c, hi, wi);
};
auto f_kcyx2kc0yxc1 = [&](auto k, auto y, auto x, auto c) {
wei_k_c0_y_x_c1(k, c / InWeiVectorSize, y, x, c % InWeiVectorSize) =
wei_k_c_y_x(k, c, y, x);
};
make_ParallelTensorFunctor(f_nchw2nc0hwc1, N, Hi, Wi, C)();
make_ParallelTensorFunctor(f_kcyx2kc0yxc1, K, Y, X, C)();
DeviceMem in_n_c0_hi_wi_c1_device_buf(sizeof(TInWei) *
in_n_c0_hi_wi_c1.mDesc.GetElementSpace());
DeviceMem wei_k_c0_y_x_c1_device_buf(sizeof(TInWei) * wei_k_c0_y_x_c1.mDesc.GetElementSpace());
DeviceMem out_n_k0_ho_wo_k1_device_buf(sizeof(TOut) *
out_n_k0_ho_wo_k1.mDesc.GetElementSpace());
in_n_c0_hi_wi_c1_device_buf.ToDevice(in_n_c0_hi_wi_c1.mData.data());
wei_k_c0_y_x_c1_device_buf.ToDevice(wei_k_c0_y_x_c1.mData.data());
const auto in_n_c0_hi_wi_desc = make_naive_tensor_descriptor_packed(make_tuple(N, C0, Hi, Wi));
const auto wei_k_c0_y_x_desc = make_naive_tensor_descriptor_packed(make_tuple(K, C0, Y, X));
const auto out_n_k0_ho_wo_k1_desc =
make_naive_tensor_descriptor_packed(make_tuple(N, K0, Ho, Wo, K1));
#if 1
// cdata = 64, BlockSize = 64, 16x8x32x4
constexpr index_t BlockSize = 64;
constexpr index_t KPerBlock = 16;
constexpr index_t HoPerBlock = 8;
constexpr index_t WoPerBlock = 32;
constexpr index_t EPerBlock = 1;
constexpr index_t KPerThread = KPerBlock;
constexpr index_t HoPerThread = 2;
constexpr index_t WoPerThread = 2;
constexpr index_t EPerThread = EPerBlock;
using ABlockTransferThreadSliceLengths_E_K = Sequence<3, 1>;
using ABlockTransferThreadClusterLengths_E_K = Sequence<3 * EPerBlock, KPerBlock>;
constexpr index_t ABlockTransferSrcScalarPerVector_E = 1;
constexpr index_t ABlockTransferDstScalarPerVector_K = 1;
constexpr index_t BThreadTransferSrcScalarPerVector_W = 1;
constexpr index_t CThreadTransferDstScalarPerVector_W = 16;
static_assert(KPerThread % CThreadTransferDstScalarPerVector_W == 0,
"KPerThread must be divisible by CThreadTransferDstScalarPerVector_W");
#else
constexpr index_t BlockSize = 64;
constexpr index_t KPerBlock = 16;
constexpr index_t HoPerBlock = 8;
constexpr index_t WoPerBlock = 32;
constexpr index_t EPerBlock = 1;
constexpr index_t KPerThread = 16;
constexpr index_t HoPerThread = 2;
constexpr index_t WoPerThread = 2;
constexpr index_t EPerThread = EPerBlock;
using ABlockTransferThreadSliceLengths_E_K = Sequence<9, 1>;
using ABlockTransferThreadClusterLengths_E_K = Sequence<EPerBlock, 16>;
constexpr index_t ABlockTransferSrcScalarPerVector_E = 1;
constexpr index_t ABlockTransferDstScalarPerVector_K = 1;
constexpr index_t BThreadTransferSrcScalarPerVector_W = 1;
constexpr index_t CThreadTransferDstScalarPerVector_W = K1;
static_assert(KPerThread % CThreadTransferDstScalarPerVector_W == 0,
"KPerThread must be divisible by CThreadTransferDstScalarPerVector_W");
#endif
constexpr auto conv_driver =
#if 0
DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_pad
#else
DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outpad
#endif
<BlockSize,
typename vector_type<TInWei, InWeiVectorSize>::type,
TAcc,
TOut,
KPerBlock,
HoPerBlock,
WoPerBlock,
EPerBlock,
KPerThread,
HoPerThread,
WoPerThread,
EPerThread,
ABlockTransferThreadSliceLengths_E_K,
ABlockTransferThreadClusterLengths_E_K,
ABlockTransferSrcScalarPerVector_E,
ABlockTransferDstScalarPerVector_K,
BThreadTransferSrcScalarPerVector_W,
CThreadTransferDstScalarPerVector_W>{};
conv_driver.Run(wei_k_c0_y_x_desc,
in_n_c0_hi_wi_desc,
out_n_k0_ho_wo_k1_desc,
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads,
static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
wei_k_c0_y_x_c1_device_buf.GetDeviceBuffer()),
static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
in_n_c0_hi_wi_c1_device_buf.GetDeviceBuffer()),
static_cast<TOut*>(out_n_k0_ho_wo_k1_device_buf.GetDeviceBuffer()));
out_n_k0_ho_wo_k1_device_buf.FromDevice(out_n_k0_ho_wo_k1.mData.data());
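// Unpack the vectorized N-K0-Ho-Wo-K1 result back into the caller's NKHW layout.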
auto f_nk0hwk1_to_nkhw = [&](auto n, auto k, auto ho, auto wo) {
out_n_k_ho_wo(n, k, ho, wo) =
out_n_k0_ho_wo_k1(n, k / InWeiVectorSize, ho, wo, k % InWeiVectorSize);
};
make_ParallelTensorFunctor(f_nk0hwk1_to_nkhw, N, K, Ho, Wo)();
}
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "transform_forward_convolution_into_gemm_v6r1_nchw_kcyx_nkhw.hpp"
#include "driver_contraction_dlops_v1r2.hpp"
template <typename TInWei,
typename TAcc,
typename TOut,
typename InLengths,
typename WeiLengths,
typename OutLengths,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void device_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw(
const InLengths& in_n_c_hi_wi_lengths,
const WeiLengths& wei_k_c_y_x_lengths,
const OutLengths& out_n_k_ho_wo_lengths,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads& in_right_pads,
const Tensor<TInWei>& in_n_c_hi_wi,
const Tensor<TInWei>& wei_k_c_y_x,
Tensor<TOut>& out_n_k_ho_wo,
ck::index_t nrepeat)
{
using namespace ck;
std::cout << __func__ << std::endl;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
DeviceMem in_n_c_hi_wi_device_buf(sizeof(TInWei) * in_n_c_hi_wi.mDesc.GetElementSpace());
DeviceMem wei_k_c_y_x_device_buf(sizeof(TInWei) * wei_k_c_y_x.mDesc.GetElementSpace());
DeviceMem out_n_k_ho_wo_device_buf(sizeof(TOut) * out_n_k_ho_wo.mDesc.GetElementSpace());
in_n_c_hi_wi_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
wei_k_c_y_x_device_buf.ToDevice(wei_k_c_y_x.mData.data());
out_n_k_ho_wo_device_buf.ToDevice(out_n_k_ho_wo.mData.data());
const auto in_desc_n_c_hi_wi = make_naive_tensor_descriptor_packed(in_n_c_hi_wi_lengths);
const auto wei_desc_k_c_y_x = make_naive_tensor_descriptor_packed(wei_k_c_y_x_lengths);
const auto out_desc_n_k_ho_wo = make_naive_tensor_descriptor_packed(out_n_k_ho_wo_lengths);
#if 1
// [8, 1, 128, 1] * [8, 4, 32, 1] = [1, 128, 4, 32] for fp32
// cdata = 64, BlockSize = 256
constexpr index_t BlockSize = 256;
constexpr index_t GN0 = 4;
constexpr index_t GK1 = 1;
constexpr index_t GM1PerBlockGM11 = 128;
constexpr index_t GN1PerBlockGN11 = 32;
constexpr index_t GK0PerBlock = 8;
constexpr index_t BM1PerThreadBM11 = 4;
constexpr index_t BN1PerThreadBN11 = 4;
constexpr index_t BK0PerThread = 1;
using BM10BN10ThreadClusterBM10Xs = Sequence<8, 2>;
using BM10BN10ThreadClusterBN10Xs = Sequence<8, 2>;
using ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1 = Sequence<4, 1, 1, 1, 1>;
using ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1 = Sequence<2, 1, 1, 128, 1>;
using ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 = Sequence<4, 1, 1, 1, 1>;
using ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 = Sequence<1, 1, 1, 1, 1>;
using BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1 = Sequence<1, 4, 1, 1, 1>;
using BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1 = Sequence<8, 1, 1, 32, 1>;
using BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 = Sequence<1, 1, 1, 1, 1>;
using BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 = Sequence<1, 1, 1, 1, 1>;
constexpr index_t CThreadTransferDstScalarPerVector_BN1 = 1;
#elif 1
// [8, 1, 128, 2] * [8, 4, 32, 2] = [1, 128, 4, 32] for fp16
// cdata = 64, BlockSize = 256
constexpr index_t BlockSize = 256;
constexpr index_t GN0 = 4;
constexpr index_t GK1 = 2;
constexpr index_t GM1PerBlockGM11 = 128;
constexpr index_t GN1PerBlockGN11 = 32;
constexpr index_t GK0PerBlock = 8;
constexpr index_t BM1PerThreadBM11 = 4;
constexpr index_t BN1PerThreadBN11 = 4;
constexpr index_t BK0PerThread = 1;
using BM10BN10ThreadClusterBM10Xs = Sequence<8, 2>;
using BM10BN10ThreadClusterBN10Xs = Sequence<8, 2>;
using ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1 = Sequence<4, 1, 1, 1, 2>;
using ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1 = Sequence<2, 1, 1, 128, 1>;
using ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 = Sequence<4, 1, 1, 1, 1>;
using ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 = Sequence<1, 1, 1, 1, 2>;
using BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1 = Sequence<1, 4, 1, 1, 2>;
using BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1 = Sequence<8, 1, 1, 32, 1>;
using BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 = Sequence<1, 1, 1, 1, 1>;
using BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 = Sequence<1, 1, 1, 1, 2>;
constexpr index_t CThreadTransferDstScalarPerVector_BN1 = 1;
#endif
const auto descs =
transform_forward_convolution_into_contraction_v6r1_nchw_kcyx_nkhw_pad(wei_desc_k_c_y_x,
in_desc_n_c_hi_wi,
out_desc_n_k_ho_wo,
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads,
Number<GN0>{},
Number<GK1>{});
const auto wei_grid_desc_gk0_gm0_gm1_gk1 = descs[I0];
const auto in_grid_desc_gk0_gn0_gn1_gk1 = descs[I1];
const auto out_grid_desc_gm0_gm1_gn0_gn1 = descs[I2];
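// Contraction views produced by the transform: A = weights [GK0, GM0, GM1, GK1],
// B = input [GK0, GN0, GN1, GK1], C = output [GM0, GM1, GN0, GN1].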
// HACK: hacks that control index calculation when iterating over A, B, C matrix
constexpr auto wei_grid_step_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 0+: GK0
Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 1+: GM0
Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 2+: GM10
Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 3+: GM11
Sequence<0, 0, 0, 0, 0, 0, 0>{}), // 4+: GK1
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 0-: GK0
Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 1-: GM0
Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 2-: GM10
Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 3-: GM11
Sequence<0, 0, 0, 0, 0, 0, 0>{})); // 4-: GK1
constexpr auto in_grid_step_hacks = make_tuple(
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0>{}, // 0+: GK0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0>{}, // 1+: GN0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0>{}, // 2+: GN10
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0>{}, // 3+: GN11
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}), // 4+: GK1
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0>{}, // 0-: GK0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0>{}, // 1-: GN0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0>{}, // 2-: GN10
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0>{}, // 3-: GN11
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{})); // 4-: GK1
constexpr auto out_grid_step_hacks = make_tuple(
make_tuple(
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 0+: GM10
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0>{}, // 1+: BM0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0>{}, // 2+: BM1
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 3+: GN10
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0>{}, // 4+: BN0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0>{}), // 5+: GN1
make_tuple(
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 0-: GM10
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0>{}, // 1-: BM0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0>{}, // 2-: BM1
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 3-: GN10
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0>{}, // 4-: BN0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0>{})); // 5-: GN1
constexpr auto wei_grid_move_slice_window_step_hacks = Sequence<0, 0, 0, 0, 0, 0, 0>{};
constexpr auto in_grid_move_slice_window_step_hacks =
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 0, 0, 0>{};
for(index_t i = 0; i < 5; ++i)
{
float ave_time = driver_contraction_dlops_v1r2<
BlockSize,
TInWei,
TAcc,
TOut,
InMemoryDataOperationEnum_t::Set,
decltype(wei_grid_desc_gk0_gm0_gm1_gk1),
decltype(in_grid_desc_gk0_gn0_gn1_gk1),
decltype(out_grid_desc_gm0_gm1_gn0_gn1),
GM1PerBlockGM11,
GN1PerBlockGN11,
GK0PerBlock,
BM1PerThreadBM11,
BN1PerThreadBN11,
BK0PerThread,
BM10BN10ThreadClusterBM10Xs,
BM10BN10ThreadClusterBN10Xs,
ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1,
ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1,
Sequence<1, 2, 3, 0, 4>, // ABlockTransferThreadClusterArrangeOrder
Sequence<3, 2, 1, 0, 4>, // ABlockTransferSrcAccessOrder
ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1,
ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1,
Sequence<0, 1, 2, 3, 4>, // ABlockTransferSrcVectorTensorContiguousDimOrder
BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1,
BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1,
Sequence<0, 4, 1, 2, 3>, // BBlockTransferThreadClusterArrangeOrder
Sequence<4, 3, 2, 0, 1>, // BBlockTransferSrcAccessOrder
BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1,
BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1,
Sequence<0, 1, 2, 3, 4>, // BBlockTransferSrcVectorTensorContiguousDimOrder
Sequence<3, 4, 5, 0, 1, 2>, // CThreadTransferSrcDstAccessOrder
5, // CThreadTransferSrcDstVectorDim
CThreadTransferDstScalarPerVector_BN1,
decltype(wei_grid_step_hacks),
decltype(in_grid_step_hacks),
decltype(out_grid_step_hacks),
decltype(wei_grid_move_slice_window_step_hacks),
decltype(in_grid_move_slice_window_step_hacks)>(
static_cast<TInWei*>(wei_k_c_y_x_device_buf.GetDeviceBuffer()),
static_cast<TInWei*>(in_n_c_hi_wi_device_buf.GetDeviceBuffer()),
static_cast<TOut*>(out_n_k_ho_wo_device_buf.GetDeviceBuffer()),
wei_grid_desc_gk0_gm0_gm1_gk1,
in_grid_desc_gk0_gn0_gn1_gk1,
out_grid_desc_gm0_gm1_gn0_gn1,
wei_grid_step_hacks,
in_grid_step_hacks,
out_grid_step_hacks,
wei_grid_move_slice_window_step_hacks,
in_grid_move_slice_window_step_hacks,
nrepeat);
float perf = static_cast<float>(calculate_convolution_flops(
in_desc_n_c_hi_wi, wei_desc_k_c_y_x, out_desc_n_k_ho_wo)) /
(std::size_t(1000) * 1000 * 1000) / ave_time;
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" << std::endl;
}
// copy result back to host
out_n_k_ho_wo_device_buf.FromDevice(out_n_k_ho_wo.mData.data());
}
#ifndef DRIVER_CONTRACTION_DLOPS_V1R2_HPP
#define DRIVER_CONTRACTION_DLOPS_V1R2_HPP
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "gridwise_contraction_dlops_v1r2.hpp"
template <ck::index_t BlockSize,
typename FloatAB,
typename FloatAcc,
typename FloatC,
ck::InMemoryDataOperationEnum_t CGlobalMemoryDataOperation,
typename AGridDesc_GK0_GM0_GM1_GK1,
typename BGridDesc_GK0_GN0_GN1_GK1,
typename CGridDesc_GM0_GM1_GN0_GN1,
ck::index_t GM1PerBlockGM11,
ck::index_t GN1PerBlockGN11,
ck::index_t GK0PerBlock,
ck::index_t BM1PerThreadBM11,
ck::index_t BN1PerThreadBN11,
ck::index_t BK0PerThread,
typename BM10BN10ThreadClusterBM10Xs,
typename BM10BN10ThreadClusterBN10Xs,
typename ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1,
typename ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1,
typename ABlockTransferThreadClusterArrangeOrder,
typename ABlockTransferSrcAccessOrder,
typename ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1,
typename ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1,
typename ABlockTransferSrcVectorTensorContiguousDimOrder,
typename BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1,
typename BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1,
typename BBlockTransferThreadClusterArrangeOrder,
typename BBlockTransferSrcAccessOrder,
typename BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1,
typename BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1,
typename BBlockTransferSrcVectorTensorContiguousDimOrder,
typename CThreadTransferSrcDstAccessOrder,
ck::index_t CThreadTransferSrcDstVectorDim,
ck::index_t CThreadTransferDstScalarPerVector,
typename AGridStepHacks,
typename BGridStepHacks,
typename CGridStepHacks,
typename AGridMoveSliceWindowStepHacks,
typename BGridMoveSliceWindowStepHacks>
__host__ float
driver_contraction_dlops_v1r2(const FloatAB* p_a_grid,
const FloatAB* p_b_grid,
FloatC* p_c_grid,
const AGridDesc_GK0_GM0_GM1_GK1& a_grid_desc_gk0_gm0_gm1_gk1,
const BGridDesc_GK0_GN0_GN1_GK1& b_grid_desc_gk0_gn0_gn1_gk1,
const CGridDesc_GM0_GM1_GN0_GN1& c_grid_desc_gm0_gm1_gn0_gn1,
AGridStepHacks,
BGridStepHacks,
CGridStepHacks,
AGridMoveSliceWindowStepHacks,
BGridMoveSliceWindowStepHacks,
ck::index_t nrepeat)
{
using namespace ck;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto I4 = Number<4>{};
constexpr auto I5 = Number<5>{};
// GEMM
using GridwiseContraction =
GridwiseContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0_GM1_GN0_GN1<
BlockSize,
FloatAB,
FloatAcc,
FloatC,
CGlobalMemoryDataOperation,
AGridDesc_GK0_GM0_GM1_GK1,
BGridDesc_GK0_GN0_GN1_GK1,
CGridDesc_GM0_GM1_GN0_GN1,
GM1PerBlockGM11,
GN1PerBlockGN11,
GK0PerBlock,
BM1PerThreadBM11,
BN1PerThreadBN11,
BK0PerThread,
BM10BN10ThreadClusterBM10Xs,
BM10BN10ThreadClusterBN10Xs,
ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1,
ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1,
ABlockTransferThreadClusterArrangeOrder,
ABlockTransferSrcAccessOrder,
ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1,
ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1,
ABlockTransferSrcVectorTensorContiguousDimOrder,
BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1,
BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1,
BBlockTransferThreadClusterArrangeOrder,
BBlockTransferSrcAccessOrder,
BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1,
BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1,
BBlockTransferSrcVectorTensorContiguousDimOrder,
CThreadTransferSrcDstAccessOrder,
CThreadTransferSrcDstVectorDim,
CThreadTransferDstScalarPerVector,
AGridStepHacks,
BGridStepHacks,
CGridStepHacks,
AGridMoveSliceWindowStepHacks,
BGridMoveSliceWindowStepHacks>;
const auto GK0 = a_grid_desc_gk0_gm0_gm1_gk1.GetLength(I0);
if(!GridwiseContraction::CheckValidity(
a_grid_desc_gk0_gm0_gm1_gk1, b_grid_desc_gk0_gn0_gn1_gk1, c_grid_desc_gm0_gm1_gn0_gn1))
{
throw std::runtime_error("wrong! "
"GridwiseContraction_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_"
"GM0_GM1_GN0_GN1 has invalid setting");
}
const auto a_grid_desc_gk0_gm0_gm10_gm11_gk1 =
GridwiseContraction::MakeAGridDescriptor_GK0_GM0_GM10_GM11_GK1(a_grid_desc_gk0_gm0_gm1_gk1);
const auto b_grid_desc_gk0_gn0_gn10_gn11_gk1 =
GridwiseContraction::MakeBGridDescriptor_GK0_GN0_GN10_GN11_GK1(b_grid_desc_gk0_gn0_gn1_gk1);
using AGridDesc_GK0_GM0_GM10_GM11_GK1 = decltype(a_grid_desc_gk0_gm0_gm10_gm11_gk1);
using BGridDesc_GK0_GN0_GN10_GN11_GK1 = decltype(b_grid_desc_gk0_gn0_gn10_gn11_gk1);
// c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1
const auto c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1 =
GridwiseContraction::MakeCGridDescriptor_GM10_BM0_BM1_GN10_BN0_BN1(
c_grid_desc_gm0_gm1_gn0_gn1);
using CGridDesc_GM10_BM0_BM1_GN10_BN0_BN1 = decltype(c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1);
// c_grid_block_cluster_blockid_to_gm10_gn10
const auto c_grid_block_cluster_blockid_to_gm10_gn10 =
GridwiseContraction::MakeCGridBlockCluster_BlockId_To_GM10_GN10(
c_grid_desc_gm0_gm1_gn0_gn1);
using CGridBlockCluster_BlockId_To_GM10_GN10 =
decltype(c_grid_block_cluster_blockid_to_gm10_gn10);
const index_t grid_size = GridwiseContraction::CalculateGridSize(c_grid_desc_gm0_gm1_gn0_gn1);
const bool has_main_k_block_loop = GridwiseContraction::CalculateHasMainKBlockLoop(GK0);
const bool has_double_tail_k_block_loop =
GridwiseContraction::CalculateHasDoubleTailKBlockLoop(GK0);
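// The GK0 loop structure is resolved at compile time: a separate kernel is
// instantiated for each (has_main_k_block_loop, has_double_tail_k_block_loop)
// combination below.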
{
std::cout << "a_grid_desc_gk0_gm0_gm10_gm11_gk1{"
<< a_grid_desc_gk0_gm0_gm10_gm11_gk1.GetLength(I0) << ", "
<< a_grid_desc_gk0_gm0_gm10_gm11_gk1.GetLength(I1) << ", "
<< a_grid_desc_gk0_gm0_gm10_gm11_gk1.GetLength(I2) << ", "
<< a_grid_desc_gk0_gm0_gm10_gm11_gk1.GetLength(I3) << ", "
<< a_grid_desc_gk0_gm0_gm10_gm11_gk1.GetLength(I4) << "}" << std::endl;
std::cout << "b_grid_desc_gk0_gn0_gn10_gn11_gk1{"
<< b_grid_desc_gk0_gn0_gn10_gn11_gk1.GetLength(I0) << ", "
<< b_grid_desc_gk0_gn0_gn10_gn11_gk1.GetLength(I1) << ", "
<< b_grid_desc_gk0_gn0_gn10_gn11_gk1.GetLength(I2) << ", "
<< b_grid_desc_gk0_gn0_gn10_gn11_gk1.GetLength(I3) << ", "
<< b_grid_desc_gk0_gn0_gn10_gn11_gk1.GetLength(I4) << "}" << std::endl;
std::cout << "c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1{ "
<< c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1.GetLength(I0) << ", "
<< c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1.GetLength(I1) << ", "
<< c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1.GetLength(I2) << ", "
<< c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1.GetLength(I3) << ", "
<< c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1.GetLength(I4) << ", "
<< c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1.GetLength(I5) << "}" << std::endl;
}
float ave_time = 0;
if(has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel = kernel_contraction_dlops_v1r2<
GridwiseContraction,
FloatAB,
FloatC,
remove_reference_t<AGridDesc_GK0_GM0_GM10_GM11_GK1>,
remove_reference_t<BGridDesc_GK0_GN0_GN10_GN11_GK1>,
remove_reference_t<CGridDesc_GM10_BM0_BM1_GN10_BN0_BN1>,
remove_reference_t<CGridBlockCluster_BlockId_To_GM10_GN10>,
true,
true>;
ave_time = launch_and_time_kernel(kernel,
nrepeat,
dim3(grid_size),
dim3(BlockSize),
0,
p_a_grid,
p_b_grid,
p_c_grid,
a_grid_desc_gk0_gm0_gm10_gm11_gk1,
b_grid_desc_gk0_gn0_gn10_gn11_gk1,
c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1,
c_grid_block_cluster_blockid_to_gm10_gn10);
}
else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
{
const auto kernel = kernel_contraction_dlops_v1r2<
GridwiseContraction,
FloatAB,
FloatC,
remove_reference_t<AGridDesc_GK0_GM0_GM10_GM11_GK1>,
remove_reference_t<BGridDesc_GK0_GN0_GN10_GN11_GK1>,
remove_reference_t<CGridDesc_GM10_BM0_BM1_GN10_BN0_BN1>,
remove_reference_t<CGridBlockCluster_BlockId_To_GM10_GN10>,
true,
false>;
ave_time = launch_and_time_kernel(kernel,
nrepeat,
dim3(grid_size),
dim3(BlockSize),
0,
p_a_grid,
p_b_grid,
p_c_grid,
a_grid_desc_gk0_gm0_gm10_gm11_gk1,
b_grid_desc_gk0_gn0_gn10_gn11_gk1,
c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1,
c_grid_block_cluster_blockid_to_gm10_gn10);
}
else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel = kernel_contraction_dlops_v1r2<
GridwiseContraction,
FloatAB,
FloatC,
remove_reference_t<AGridDesc_GK0_GM0_GM10_GM11_GK1>,
remove_reference_t<BGridDesc_GK0_GN0_GN10_GN11_GK1>,
remove_reference_t<CGridDesc_GM10_BM0_BM1_GN10_BN0_BN1>,
remove_reference_t<CGridBlockCluster_BlockId_To_GM10_GN10>,
false,
true>;
ave_time = launch_and_time_kernel(kernel,
nrepeat,
dim3(grid_size),
dim3(BlockSize),
0,
p_a_grid,
p_b_grid,
p_c_grid,
a_grid_desc_gk0_gm0_gm10_gm11_gk1,
b_grid_desc_gk0_gn0_gn10_gn11_gk1,
c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1,
c_grid_block_cluster_blockid_to_gm10_gn10);
}
else
{
const auto kernel = kernel_contraction_dlops_v1r2<
GridwiseContraction,
FloatAB,
FloatC,
remove_reference_t<AGridDesc_GK0_GM0_GM10_GM11_GK1>,
remove_reference_t<BGridDesc_GK0_GN0_GN10_GN11_GK1>,
remove_reference_t<CGridDesc_GM10_BM0_BM1_GN10_BN0_BN1>,
remove_reference_t<CGridBlockCluster_BlockId_To_GM10_GN10>,
false,
false>;
ave_time = launch_and_time_kernel(kernel,
nrepeat,
dim3(grid_size),
dim3(BlockSize),
0,
p_a_grid,
p_b_grid,
p_c_grid,
a_grid_desc_gk0_gm0_gm10_gm11_gk1,
b_grid_desc_gk0_gn0_gn10_gn11_gk1,
c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1,
c_grid_block_cluster_blockid_to_gm10_gn10);
}
return ave_time;
}
#endif
#ifndef DRIVER_CONVOLUTION_FORWARD_IMPLICIT_GEMM_V5R1_NCHW_KCYX_NKHW_HPP
#define DRIVER_CONVOLUTION_FORWARD_IMPLICIT_GEMM_V5R1_NCHW_KCYX_NKHW_HPP
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "gridwise_gemm_dlops_v2.hpp"
#include "gridwise_operation_wrapper.hpp"
template <ck::index_t BlockSize,
typename FloatAB,
typename FloatAcc,
typename FloatC,
ck::index_t KPerBlock,
ck::index_t HoPerBlock,
ck::index_t WoPerBlock,
ck::index_t EPerBlock,
ck::index_t KPerThread,
ck::index_t HoPerThread,
ck::index_t WoPerThread,
ck::index_t EPerThread,
typename ABlockTransferThreadSliceLengths_E_K,
typename ABlockTransferThreadClusterLengths_E_K,
ck::index_t ABlockTransferSrcScalarPerVector_E,
ck::index_t ABlockTransferDstScalarPerVector_K,
ck::index_t BThreadTransferSrcScalarPerVector_W,
ck::index_t CThreadTransferDstScalarPerVector_W>
struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_pad
{
template <typename... Wei,
typename... In,
typename... Out,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
__host__ void Run(const ck::TensorDescriptor<Wei...>& wei_k_c_y_x_global_desc,
const ck::TensorDescriptor<In...>& in_n_c_hi_wi_global_desc,
const ck::TensorDescriptor<Out...>& out_n_k0_ho_wo_k1_global_desc,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads& in_right_pads,
const FloatAB* __restrict__ p_wei_global,
const FloatAB* __restrict__ p_in_global,
FloatC* __restrict__ p_out_global) const
{
using namespace ck;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto I4 = Number<4>{};
const auto N = in_n_c_hi_wi_global_desc.GetLength(I0);
const auto C = in_n_c_hi_wi_global_desc.GetLength(I1);
const auto K0 = out_n_k0_ho_wo_k1_global_desc.GetLength(I1);
const auto Hi = in_n_c_hi_wi_global_desc.GetLength(I2);
const auto Wi = in_n_c_hi_wi_global_desc.GetLength(I3);
const auto Ho = out_n_k0_ho_wo_k1_global_desc.GetLength(I2);
const auto Wo = out_n_k0_ho_wo_k1_global_desc.GetLength(I3);
const auto K1 = out_n_k0_ho_wo_k1_global_desc.GetLength(I4);
const auto K = wei_k_c_y_x_global_desc.GetLength(I0);
const auto Y = wei_k_c_y_x_global_desc.GetLength(I2);
const auto X = wei_k_c_y_x_global_desc.GetLength(I3);
const auto ConvStrideH = conv_strides[I0];
const auto ConvStrideW = conv_strides[I1];
const auto ConvDilationH = conv_dilations[I0];
const auto ConvDilationW = conv_dilations[I1];
const auto InLeftPadH = in_left_pads[I0];
const auto InLeftPadW = in_left_pads[I1];
const auto InRightPadH = in_right_pads[I0];
const auto InRightPadW = in_right_pads[I1];
// weight tensor
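// View the KCYX weights as a 2-D [E, K] matrix (E = C * Y * X) by flattening
// C, Y, X and swapping the two dimensions.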
const auto wei_e_k_global_desc = transform_tensor_descriptor(
make_naive_tensor_descriptor_packed(make_tuple(K, C * Y * X)),
make_tuple(make_pass_through_transform(K), make_pass_through_transform(C * Y * X)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<1>{}, Sequence<0>{}));
// input tensor
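// Build the implicit-GEMM view of the input in three steps: pad Hi/Wi, embed
// (Y, Ho) and (X, Wo) via the dilations and strides, then merge (C, Y, X)
// into the reduction dimension E, yielding [E, N, Ho, Wo].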
const auto in_n_c_hip_wip_global_desc = transform_tensor_descriptor(
in_n_c_hi_wi_global_desc,
make_tuple(make_pass_through_transform(N),
make_pass_through_transform(C),
make_pad_transform(Hi, InLeftPadH, InRightPadH),
make_pad_transform(Wi, InLeftPadW, InRightPadW)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
const auto in_n_c_y_ho_x_wo_global_desc = transform_tensor_descriptor(
in_n_c_hip_wip_global_desc,
make_tuple(
make_pass_through_transform(N),
make_pass_through_transform(C),
make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)),
make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW))),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));
const auto in_e_n_ho_wo_global_desc = transform_tensor_descriptor(
in_n_c_y_ho_x_wo_global_desc,
make_tuple(make_merge_transform(make_tuple(C, Y, X)),
make_pass_through_transform(N),
make_pass_through_transform(Ho),
make_pass_through_transform(Wo)),
make_tuple(Sequence<1, 2, 4>{}, Sequence<0>{}, Sequence<3>{}, Sequence<5>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
// output tensor
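// Merge (K0, K1) back into K so the output is viewed as [K, N, Ho, Wo].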
const auto out_k_n_ho_wo_global_desc = transform_tensor_descriptor(
make_naive_tensor_descriptor_packed(make_tuple(N, K0, Ho, Wo, K1)),
make_tuple(make_merge_transform(make_tuple(K0, K1)),
make_pass_through_transform(N),
make_pass_through_transform(Ho),
make_pass_through_transform(Wo)),
make_tuple(Sequence<1, 4>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
const auto E = C * Y * X;
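// E = C * Y * X is the GEMM reduction length; each block tile size must divide
// its problem dimension exactly.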
if(!((K % KPerBlock) == 0 && (Ho % HoPerBlock) == 0 && (Wo % WoPerBlock) == 0 &&
(E % EPerBlock) == 0))
{
throw std::runtime_error("wrong! GEMM size no divisible");
}
// hack to control index calculation when iterating over a_k_m_global tensor
constexpr auto a_e_k_global_step_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0>{}, Sequence<0, 0, 0>{}),
make_tuple(Sequence<0, 0, 0>{}, Sequence<0, 0, 0>{}));
constexpr auto a_e_k_global_move_slice_window_step_hack = Sequence<0, 0, 0>{};
constexpr auto b_e_n_ho_wo_global_step_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}));
constexpr auto b_e_n_ho_wo_global_move_slice_window_step_hack =
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{};
// hack to control index calculation when iterating over the c_k_n_ho_wo_global tensor
// (NKHW format)
constexpr auto c_k_n_ho_wo_global_tensor_step_hacks =
make_tuple(make_tuple(Sequence<0, 1, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}),
make_tuple(Sequence<0, 2, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}));
#if 1
// GEMM
using gridwise_gemm = GridwiseGemmDlops_km_kn_mn_v3<
BlockSize,
FloatAB,
FloatAcc,
FloatC,
InMemoryDataOperationEnum_t::Set,
decltype(wei_e_k_global_desc),
decltype(in_e_n_ho_wo_global_desc),
decltype(out_k_n_ho_wo_global_desc),
KPerBlock,
HoPerBlock,
WoPerBlock,
EPerBlock,
KPerThread,
HoPerThread,
WoPerThread,
EPerThread,
ABlockTransferThreadSliceLengths_E_K,
ABlockTransferThreadClusterLengths_E_K,
Sequence<1, 0>, // ABlockTransfer thread cluster arrange order
Sequence<1, 0>, // ABlockTransfer src access order
0,              // ABlockTransfer src vector dim
ABlockTransferSrcScalarPerVector_E,
ABlockTransferDstScalarPerVector_K,
false, // don't move back src coordinate after threadwise copy
Sequence<0, 2, 3, 1>, // BThreadTransfer src access order
3,                    // BThreadTransfer src vector dim (Wo)
BThreadTransferSrcScalarPerVector_W,
false, // don't move back src coordinate after threadwise copy, which will be fused with
// MoveSrcSliceWindow() to save addr computation
Sequence<0, 2, 3, 1>, // CThreadTransfer access order
0,                    // CThreadTransfer vector dim
CThreadTransferDstScalarPerVector_W,
decltype(a_e_k_global_step_hacks),
decltype(b_e_n_ho_wo_global_step_hacks),
decltype(c_k_n_ho_wo_global_tensor_step_hacks),
decltype(a_e_k_global_move_slice_window_step_hack),
decltype(b_e_n_ho_wo_global_move_slice_window_step_hack)>;
const auto GridSize = (K / KPerBlock) * (Ho / HoPerBlock) * (Wo / WoPerBlock) * N;
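// one workgroup per (KPerBlock x HoPerBlock x WoPerBlock) output tile for each of the
// N images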
const bool has_main_k_block_loop = (E + EPerBlock) / (2 * EPerBlock) > 1;
const bool has_double_tail_k_block_loop = (E / EPerBlock) % 2 == 0;
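// These flags pick the specialization of the (presumably double-buffered) E-loop in
// the gridwise GEMM. E.g. assuming E = 8 * EPerBlock: (8 + 1) / 2 = 4 > 1, so a main
// loop exists, and 8 % 2 == 0, so the loop ends with a double tail.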
index_t nrepeat = 100;
for(index_t i = 0; i < 5; ++i)
{
std::cout << "Start running " << nrepeat << " times..." << std::endl;
KernelTimer timer;
timer.Start();
std::cout << "has_main_k_block_loop: " << has_main_k_block_loop
<< " has_double_tail_k_block_loop: " << has_double_tail_k_block_loop
<< std::endl;
for(index_t j = 0; j < nrepeat; ++j)
{
if(has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel = run_gridwise_operation<gridwise_gemm,
decltype(wei_e_k_global_desc),
const FloatAB*,
decltype(in_e_n_ho_wo_global_desc),
const FloatAB*,
decltype(out_k_n_ho_wo_global_desc),
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, true>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
wei_e_k_global_desc,
p_wei_global,
in_e_n_ho_wo_global_desc,
p_in_global,
out_k_n_ho_wo_global_desc,
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, true>{});
}
else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
{
const auto kernel = run_gridwise_operation<gridwise_gemm,
decltype(wei_e_k_global_desc),
const FloatAB*,
decltype(in_e_n_ho_wo_global_desc),
const FloatAB*,
decltype(out_k_n_ho_wo_global_desc),
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, false>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
wei_e_k_global_desc,
p_wei_global,
in_e_n_ho_wo_global_desc,
p_in_global,
out_k_n_ho_wo_global_desc,
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, false>{});
}
else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel = run_gridwise_operation<gridwise_gemm,
decltype(wei_e_k_global_desc),
const FloatAB*,
decltype(in_e_n_ho_wo_global_desc),
const FloatAB*,
decltype(out_k_n_ho_wo_global_desc),
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, true>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
wei_e_k_global_desc,
p_wei_global,
in_e_n_ho_wo_global_desc,
p_in_global,
out_k_n_ho_wo_global_desc,
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, true>{});
}
else
{
const auto kernel = run_gridwise_operation<gridwise_gemm,
decltype(wei_e_k_global_desc),
const FloatAB*,
decltype(in_e_n_ho_wo_global_desc),
const FloatAB*,
decltype(out_k_n_ho_wo_global_desc),
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, false>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
wei_e_k_global_desc,
p_wei_global,
in_e_n_ho_wo_global_desc,
p_in_global,
out_k_n_ho_wo_global_desc,
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, false>{});
}
}
timer.End();
float ave_time = timer.GetElapsedTime() / nrepeat;
float perf =
static_cast<float>(calculate_convolution_flops(in_n_c_hi_wi_global_desc,
wei_k_c_y_x_global_desc,
out_n_k0_ho_wo_k1_global_desc)) /
(std::size_t(1000) * 1000 * 1000) / ave_time;
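// flops / 1e9 gives GFlop, and GFlop per ms equals TFlop/s, matching the printed unit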
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
<< std::endl;
}
#endif
}
};
#endif
#ifndef DRIVER_CONVOLUTION_FORWARD_IMPLICIT_GEMM_V5R1_DLOPS_NCHW_KCYX_NKHW_OUTPAD_HPP
#define DRIVER_CONVOLUTION_FORWARD_IMPLICIT_GEMM_V5R1_DLOPS_NCHW_KCYX_NKHW_OUTPAD_HPP
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "gridwise_gemm_dlops_v2.hpp"
#include "gridwise_operation_wrapper.hpp"
template <ck::index_t BlockSize,
typename FloatAB,
typename FloatAcc,
typename FloatC,
ck::index_t KPerBlock,
ck::index_t HoPerBlock,
ck::index_t WoPerBlock,
ck::index_t EPerBlock,
ck::index_t KPerThread,
ck::index_t HoPerThread,
ck::index_t WoPerThread,
ck::index_t EPerThread,
typename ABlockTransferThreadSliceLengths_E_K,
typename ABlockTransferThreadClusterLengths_E_K,
ck::index_t ABlockTransferSrcScalarPerVector_E,
ck::index_t ABlockTransferDstScalarPerVector_K,
ck::index_t BThreadTransferSrcScalarPerVector_W,
ck::index_t CThreadTransferDstScalarPerVector_W>
struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outpad
{
template <typename... Wei,
typename... In,
typename... Out,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
__host__ void Run(const ck::TensorDescriptor<Wei...>& wei_k_c_y_x_global_desc,
const ck::TensorDescriptor<In...>& in_n_c_hi_wi_global_desc,
const ck::TensorDescriptor<Out...>& out_n_k0_ho_wo_k1_global_desc,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads& in_right_pads,
const FloatAB* __restrict__ p_wei_global,
const FloatAB* __restrict__ p_in_global,
FloatC* __restrict__ p_out_global) const
{
using namespace ck;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto I4 = Number<4>{};
const auto N = in_n_c_hi_wi_global_desc.GetLength(I0);
const auto C = in_n_c_hi_wi_global_desc.GetLength(I1);
const auto K0 = out_n_k0_ho_wo_k1_global_desc.GetLength(I1);
const auto Hi = in_n_c_hi_wi_global_desc.GetLength(I2);
const auto Wi = in_n_c_hi_wi_global_desc.GetLength(I3);
const auto Ho = out_n_k0_ho_wo_k1_global_desc.GetLength(I2);
const auto Wo = out_n_k0_ho_wo_k1_global_desc.GetLength(I3);
const auto K1 = out_n_k0_ho_wo_k1_global_desc.GetLength(I4);
const auto K = wei_k_c_y_x_global_desc.GetLength(I0);
const auto Y = wei_k_c_y_x_global_desc.GetLength(I2);
const auto X = wei_k_c_y_x_global_desc.GetLength(I3);
const auto ConvStrideH = conv_strides[I0];
const auto ConvStrideW = conv_strides[I1];
const auto ConvDilationH = conv_dilations[I0];
const auto ConvDilationW = conv_dilations[I1];
const auto Hop = (Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock;
const auto Wop = (Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock;
const auto OutRightPadH = Hop - Ho;
const auto OutRightPadW = Wop - Wo;
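// Round Ho/Wo up to the next multiple of the block tile (ceil-then-scale), e.g.
// Ho = 30, HoPerBlock = 8 gives Hop = (30 + 7) / 8 * 8 = 32 and OutRightPadH = 2.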
const auto InLeftPadH = in_left_pads[I0];
const auto InLeftPadW = in_left_pads[I1];
const auto InRightPadH = in_right_pads[I0] + OutRightPadH * ConvStrideH;
const auto InRightPadW = in_right_pads[I1] + OutRightPadW * ConvStrideW;
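// Each padded output row/column advances the input window by one stride, so the input
// right padding must grow by OutRightPad * ConvStride for the embed transform over
// (Y, Hop) / (X, Wop) below to stay within the padded descriptor.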
std::cerr << "OutRightPadH = " << OutRightPadH << " OutRightPadW = " << OutRightPadW
<< std::endl;
std::cerr << "InRightPadH = " << InRightPadH << " InRightPadW = " << InRightPadW
<< std::endl;
// weight tensor
const auto wei_e_k_global_desc = transform_tensor_descriptor(
make_naive_tensor_descriptor_packed(make_tuple(K, C * Y * X)),
make_tuple(make_pass_through_transform(K), make_pass_through_transform(C * Y * X)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<1>{}, Sequence<0>{}));
// input tensor
const auto in_n_c_hip_wip_global_desc = transform_tensor_descriptor(
in_n_c_hi_wi_global_desc,
make_tuple(make_pass_through_transform(N),
make_pass_through_transform(C),
make_pad_transform(Hi, InLeftPadH, InRightPadH),
make_pad_transform(Wi, InLeftPadW, InRightPadW)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
const auto in_n_c_y_ho_x_wo_global_desc = transform_tensor_descriptor(
in_n_c_hip_wip_global_desc,
make_tuple(
make_pass_through_transform(N),
make_pass_through_transform(C),
make_embed_transform(make_tuple(Y, Hop), make_tuple(ConvDilationH, ConvStrideH)),
make_embed_transform(make_tuple(X, Wop), make_tuple(ConvDilationW, ConvStrideW))),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));
const auto in_e_n_ho_wo_global_desc = transform_tensor_descriptor(
in_n_c_y_ho_x_wo_global_desc,
make_tuple(make_merge_transform(make_tuple(C, Y, X)),
make_pass_through_transform(N),
make_pass_through_transform(Hop),
make_pass_through_transform(Wop)),
make_tuple(Sequence<1, 2, 4>{}, Sequence<0>{}, Sequence<3>{}, Sequence<5>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
// output tensor
const auto out_k_n_hop_wop_global_desc = transform_tensor_descriptor(
make_naive_tensor_descriptor_packed(make_tuple(N, K0, Ho, Wo, K1)),
make_tuple(make_merge_transform(make_tuple(K0, K1)),
make_pass_through_transform(N),
make_pad_transform(Ho, 0, OutRightPadH),
make_pad_transform(Wo, 0, OutRightPadW)),
make_tuple(Sequence<1, 4>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
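// The output Ho/Wo dimensions are right-padded to Hop/Wop so every block writes a full
// tile; stores landing in the pad region map to out-of-range coordinates, which the
// descriptor is assumed to mark invalid so that the threadwise copy skips them.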
const auto E = C * Y * X;
std::cerr << "Hop = " << Hop << " Wop = " << Wop << std::endl;
if(!((K % KPerBlock) == 0 && (Hop % HoPerBlock) == 0 && (Wop % WoPerBlock) == 0 &&
(E % EPerBlock) == 0))
{
throw std::runtime_error("wrong! GEMM size not divisible");
}
// hack to control index calculation when iterating over the a_e_k_global tensor
constexpr auto a_e_k_global_step_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0>{}, Sequence<0, 0, 0>{}),
make_tuple(Sequence<0, 0, 0>{}, Sequence<0, 0, 0>{}));
constexpr auto a_e_k_global_move_slice_window_step_hack = Sequence<0, 0, 0>{};
constexpr auto b_e_n_ho_wo_global_step_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}));
constexpr auto b_e_n_ho_wo_global_move_slice_window_step_hack =
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{};
// hack to control index calculation when iterating over the c_k_n_ho_wo_global tensor
// (NKHW format)
constexpr auto c_k_n_ho_wo_global_tensor_step_hacks =
make_tuple(make_tuple(Sequence<0, 1, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}),
make_tuple(Sequence<0, 2, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}));
// GEMM
using gridwise_gemm = GridwiseGemmDlops_km_kn_mn_v3<
BlockSize,
FloatAB,
FloatAcc,
FloatC,
InMemoryDataOperationEnum_t::Set,
decltype(wei_e_k_global_desc),
decltype(in_e_n_ho_wo_global_desc),
decltype(out_k_n_hop_wop_global_desc),
KPerBlock,
HoPerBlock,
WoPerBlock,
EPerBlock,
KPerThread,
HoPerThread,
WoPerThread,
EPerThread,
ABlockTransferThreadSliceLengths_E_K,
ABlockTransferThreadClusterLengths_E_K,
Sequence<1, 0>, // ABlockTransfer thread cluster arrange order
Sequence<1, 0>, // ABlockTransfer src access order
0,              // ABlockTransfer src vector dim
ABlockTransferSrcScalarPerVector_E,
ABlockTransferDstScalarPerVector_K,
false, // don't move back src coordinate after threadwise copy
Sequence<0, 2, 3, 1>, // BThreadTransfer src access order
3,                    // BThreadTransfer src vector dim (Wo)
BThreadTransferSrcScalarPerVector_W,
false, // don't move back src coordinate after threadwise copy, which will be fused with
// MoveSrcSliceWindow() to save addr computation
Sequence<0, 2, 3, 1>, // CThreadTransfer access order
0,                    // CThreadTransfer vector dim
CThreadTransferDstScalarPerVector_W,
decltype(a_e_k_global_step_hacks),
decltype(b_e_n_ho_wo_global_step_hacks),
decltype(c_k_n_ho_wo_global_tensor_step_hacks),
decltype(a_e_k_global_move_slice_window_step_hack),
decltype(b_e_n_ho_wo_global_move_slice_window_step_hack)>;
const auto GridSize = (K / KPerBlock) * (Hop / HoPerBlock) * (Wop / WoPerBlock) * N;
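// Because Hop and Wop are rounded up to multiples of HoPerBlock and WoPerBlock, the
// grid tiles the padded extent exactly and the divisibility check above can only fail
// on K or E.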
const bool has_main_k_block_loop = (E + EPerBlock) / (2 * EPerBlock) > 1;
const bool has_double_tail_k_block_loop = (E / EPerBlock) % 2 == 0;
index_t nrepeat = 100;
for(index_t i = 0; i < 5; ++i)
{
std::cout << "Start running " << nrepeat << " times..." << std::endl;
KernelTimer timer;
timer.Start();
std::cout << "has_main_k_block_loop: " << has_main_k_block_loop
<< " has_double_tail_k_block_loop: " << has_double_tail_k_block_loop
<< std::endl;
for(index_t j = 0; j < nrepeat; ++j)
{
if(has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
decltype(wei_e_k_global_desc),
const FloatAB*,
decltype(in_e_n_ho_wo_global_desc),
const FloatAB*,
decltype(out_k_n_hop_wop_global_desc),
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, true>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
wei_e_k_global_desc,
p_wei_global,
in_e_n_ho_wo_global_desc,
p_in_global,
out_k_n_hop_wop_global_desc,
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, true>{});
}
else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
decltype(wei_e_k_global_desc),
const FloatAB*,
decltype(in_e_n_ho_wo_global_desc),
const FloatAB*,
decltype(out_k_n_hop_wop_global_desc),
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, false>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
wei_e_k_global_desc,
p_wei_global,
in_e_n_ho_wo_global_desc,
p_in_global,
out_k_n_hop_wop_global_desc,
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, false>{});
}
else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
decltype(wei_e_k_global_desc),
const FloatAB*,
decltype(in_e_n_ho_wo_global_desc),
const FloatAB*,
decltype(out_k_n_hop_wop_global_desc),
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, true>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
wei_e_k_global_desc,
p_wei_global,
in_e_n_ho_wo_global_desc,
p_in_global,
out_k_n_hop_wop_global_desc,
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, true>{});
}
else
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
decltype(wei_e_k_global_desc),
const FloatAB*,
decltype(in_e_n_ho_wo_global_desc),
const FloatAB*,
decltype(out_k_n_hop_wop_global_desc),
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, false>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
wei_e_k_global_desc,
p_wei_global,
in_e_n_ho_wo_global_desc,
p_in_global,
out_k_n_hop_wop_global_desc,
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, false>{});
}
}
timer.End();
float ave_time = timer.GetElapsedTime() / nrepeat;
float perf =
static_cast<float>(calculate_convolution_flops(in_n_c_hi_wi_global_desc,
wei_k_c_y_x_global_desc,
out_n_k0_ho_wo_k1_global_desc)) /
(std::size_t(1000) * 1000 * 1000) / ave_time;
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
<< std::endl;
}
}
};
#endif