"vscode:/vscode.git/clone" did not exist on "c10e9be2ee9246121d00828d666423ecefb725f5"
Unverified commit 970fa3e9 authored by zjing14, committed by GitHub

v5r1 fusion kernels for inference (#49)



* init

* refactor for 1x1

* rename e0_e1

* add e1 with bugs

* debug

* fixed

* fixed e1

* add timer

* improve threadwise gemm with dot2

* add e2

* tuning

* separate c2

* add nhwc

* restore nchwc

* clean

* opt

* fixed; tuning

* add BGlobalMoveSliceWindowStepHacks{}

* tuning

* repeat running

* adjust

* merge v5r1 nchwc

* add adaptors

* split k0 k1 in c_thread_grid

* split h and w

* remove v5r1 nhwc

* clean for pr

* remove host_conv_add

* clean code

* clean

* add dynamic support

* static mode

* test static

* add conv+add fusion

* fixed validation

* naming fix

* use activ_enum

* make static

* refactor conv_add for InMem::add

* add bias

* add conv_out

* add configurable makeddesc

* add maxpool fusion

* add maxpool host for validation

* enable static desc

* conv-only use v5r1_add

* test

* test

* for binary dumps

* fixed incorrect results due to typo

* clean

* debugging maxpool

* workaround with offset trick

* clean code

* modularize ops of fusion

* add gridwise_gemm_v3

* create separate fusion function

* enable dynamic mode of conv and conv+resize_add

* add dynamic mode of maxpool

* add pass by pointer

* add activ_type as arguments

* merge develop

* clean

* reset config to old default
Co-authored-by: Chao Liu <chao.liu2@amd.com>
parent a651ea4f
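
The kernels in this diff fuse convolution (lowered to an implicit GEMM over the E0/E1/E2 reduction dimensions), bias add, and activation, with an optional fused epilogue: a 2x2 max pool, or a nearest-neighbor 2x resize followed by a global add. A rough host-side sketch of the per-element fused semantics (illustrative only; the activation numbering and alpha = 0.3 follow the device code below):

    float fused_pointwise(float conv_out, float bias, int activ_type)
    {
        float x = conv_out + bias;              // BiasOp
        if(activ_type == 1)                     // leaky ReLU
            x = x >= 0 ? x : 0.3f * x;
        else if(activ_type == 2)                // sigmoid
            x = 1.0f / (1.0f + expf(-x));
        return x;                               // then the MaxPool / ResizeAdd epilogue
    }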
@@ -10,99 +10,99 @@ template <index_t BlockSize,
typename FloatA,
typename FloatB,
typename FloatC,
typename BlockMatrixA,
typename BlockMatrixB,
typename ThreadMatrixC,
index_t KPerThread,
index_t HPerThread,
index_t WPerThread,
typename ABlockDesc_E1_K1_E2,
typename BBlockDesc_E1_N_Ho_Wo_E2,
typename CThreadDesc_K_N_Ho_Wo,
index_t EPerThreadLoop,
index_t ThreadGemmADataPerRead_K,
index_t ThreadGemmBDataPerRead_W>
index_t KPerThreadLoop>
struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v3
{
struct MatrixIndex
{
index_t k;
index_t h;
index_t w;
};
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
static constexpr auto I2 = Number<2>{};
static constexpr auto I3 = Number<3>{};
static constexpr auto I4 = Number<4>{};
using AIndex = MultiIndex<3>;
using BIndex = MultiIndex<3>;
using CIndex = MultiIndex<4>;
static constexpr auto E1 = ABlockDesc_E1_K1_E2{}.GetLength(I0);
static constexpr auto KPerBlock = ABlockDesc_E1_K1_E2{}.GetLength(I1);
static constexpr auto E2 = ABlockDesc_E1_K1_E2{}.GetLength(I2);
// HACK: fix this @Jing Zhang
static constexpr index_t KPerThreadSubC = 4;
static constexpr auto HoPerBlock = BBlockDesc_E1_N_Ho_Wo_E2{}.GetLength(I2);
static constexpr auto WoPerBlock = BBlockDesc_E1_N_Ho_Wo_E2{}.GetLength(I3);
static constexpr auto KPerThread = CThreadDesc_K_N_Ho_Wo{}.GetLength(I0);
static constexpr auto HoPerThread = CThreadDesc_K_N_Ho_Wo{}.GetLength(I2);
static constexpr auto WoPerThread = CThreadDesc_K_N_Ho_Wo{}.GetLength(I3);
static constexpr auto a_thread_mtx_ = make_naive_tensor_descriptor_packed(
make_tuple(Number<EPerThreadLoop>{}, Number<KPerThreadSubC>{}));
make_tuple(Number<EPerThreadLoop>{}, Number<KPerThreadLoop>{}, Number<E2>{}));
static constexpr auto b_thread_mtx_ = make_naive_tensor_descriptor_packed(make_tuple(
Number<EPerThreadLoop>{}, Number<1>{}, Number<HPerThread>{}, Number<WPerThread>{}));
static constexpr auto b_thread_mtx_ =
make_naive_tensor_descriptor_packed(make_tuple(Number<EPerThreadLoop>{},
Number<1>{},
Number<HoPerThread>{},
Number<WoPerThread>{},
Number<E2>{}));
static constexpr auto c_thread_mtx_ = make_naive_tensor_descriptor_packed(make_tuple(
Number<KPerThreadSubC>{}, Number<1>{}, Number<HPerThread>{}, Number<WPerThread>{}));
using AThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatA,
FloatA,
BlockMatrixA,
decltype(a_thread_mtx_),
Sequence<EPerThreadLoop, KPerThreadSubC>,
Sequence<0, 1>,
1,
ThreadGemmADataPerRead_K,
1>;
Number<KPerThreadLoop>{}, Number<1>{}, Number<HoPerThread>{}, Number<WoPerThread>{}));
__device__ BlockwiseGemmDlops_km_kn_m0m1n0n1_v3()
: c_thread_begin_mtx_idx_{GetBeginOfThreadMatrixC(get_thread_local_1d_id())},
a_thread_copy_{make_tuple(0, c_thread_begin_mtx_idx_.k * KPerThread)}
: c_thread_origin_data_idx_{GetBeginOfCThreadDesc_K_N_Ho_Wo(get_thread_local_1d_id())},
a_thread_copy_{make_tuple(0, c_thread_origin_data_idx_[I0] * KPerThread, 0)}
{
static_assert(BlockMatrixA::IsKnownAtCompileTime() &&
BlockMatrixB::IsKnownAtCompileTime() &&
ThreadMatrixC::IsKnownAtCompileTime(),
static_assert(ABlockDesc_E1_K1_E2::IsKnownAtCompileTime() &&
BBlockDesc_E1_N_Ho_Wo_E2::IsKnownAtCompileTime() &&
CThreadDesc_K_N_Ho_Wo::IsKnownAtCompileTime(),
"wrong! Desc should be known at compile-time");
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
static_assert(BlockMatrixA{}.GetLength(I0) == BlockMatrixB{}.GetLength(I0),
"wrong! K dimension not consistent\n");
static_assert(
ABlockDesc_E1_K1_E2{}.GetLength(I0) == BBlockDesc_E1_N_Ho_Wo_E2{}.GetLength(I0) &&
ABlockDesc_E1_K1_E2{}.GetLength(I2) == BBlockDesc_E1_N_Ho_Wo_E2{}.GetLength(I4),
"wrong! E dimension not consistent\n");
constexpr index_t K = BlockMatrixA{}.GetLength(I1); // A is transposed
constexpr index_t H = BlockMatrixB{}.GetLength(I2);
constexpr index_t W = BlockMatrixB{}.GetLength(I3);
static_assert(E1 % EPerThreadLoop == 0, "");
static_assert(KPerThread % KPerThreadLoop == 0, "");
static_assert(K % KPerThread == 0 && H % HPerThread == 0 && W % WPerThread == 0,
static_assert(KPerBlock % KPerThread == 0 && HoPerBlock % HoPerThread == 0 &&
WoPerBlock % WoPerThread == 0,
"wrong! Cannot evenly divide work among\n");
constexpr auto KThreadCluster = K / KPerThread;
constexpr auto HThreadCluster = H / HPerThread;
constexpr auto WThreadCluster = W / WPerThread;
constexpr auto KThreadCluster = KPerBlock / KPerThread;
constexpr auto HThreadCluster = HoPerBlock / HoPerThread;
constexpr auto WThreadCluster = WoPerBlock / WoPerThread;
static_assert(BlockSize == KThreadCluster * HThreadCluster * WThreadCluster,
"wrong! wrong blocksize\n");
}
__device__ static constexpr auto GetThreadMatrixCLengths()
__device__ static constexpr auto GetCThreadDesc_K_N_Ho_WoLengths()
{
return Sequence<KPerThread, 1, HPerThread, WPerThread>{};
return Sequence<KPerThread, I1, HoPerThread, WoPerThread>{};
}
__device__ static MatrixIndex GetBeginOfThreadMatrixC(index_t thread_id)
__device__ static CIndex GetBeginOfCThreadDesc_K_N_Ho_Wo(index_t thread_id)
{
constexpr index_t H = BlockMatrixB{}.GetLength(Number<2>{});
constexpr index_t W = BlockMatrixB{}.GetLength(Number<3>{});
constexpr auto num_w_threads = W / WPerThread;
constexpr auto num_h_threads = H / HPerThread;
constexpr auto num_hw_threads = num_w_threads * num_h_threads;
index_t k_thread_id = thread_id / num_hw_threads;
index_t hw_thread_id = thread_id % num_hw_threads;
index_t h_thread_id = hw_thread_id / num_w_threads;
index_t w_thread_id = hw_thread_id % num_w_threads;
return MatrixIndex{k_thread_id, h_thread_id, w_thread_id};
constexpr auto K0 = KPerBlock / KPerThread;
constexpr auto N0 = I1;
constexpr auto H0 = HoPerBlock / HoPerThread;
constexpr auto W0 = WoPerBlock / WoPerThread;
constexpr auto c_threadid_to_k_n_h_w_thread_cluster_adaptor =
make_single_stage_tensor_adaptor(
make_tuple(make_merge_transform(make_tuple(K0, N0, H0, W0))),
make_tuple(Sequence<0, 1, 2, 3>{}),
make_tuple(Sequence<0>{}));
const auto c_k_n_h_w_thread_cluster_idx =
c_threadid_to_k_n_h_w_thread_cluster_adaptor.CalculateBottomIndex(
make_multi_index(thread_id));
return c_k_n_h_w_thread_cluster_idx;
}
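// Illustration (hypothetical sizes): with K0 = 4, N0 = 1, H0 = 2, W0 = 4, the merge
// adaptor inverts thread_id = ((k0 * 1 + n0) * 2 + h0) * 4 + w0, so thread_id = 13
// decomposes to (k0, n0, h0, w0) = (1, 0, 1, 1), the same div/mod chain that the
// hand-written GetBeginOfThreadMatrixC above performs.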
template <typename ABlockBuffer, typename BThreadBuffer, typename CThreadBuffer>
@@ -116,19 +116,7 @@ struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v3
is_same<remove_cvref_t<typename CThreadBuffer::type>, remove_cvref_t<FloatC>>::value &&
"wrong! inconsistent type");
constexpr auto I0 = Number<0>{};
constexpr auto a_block_mtx = BlockMatrixA{};
constexpr auto EPerBlock = a_block_mtx.GetLength(I0);
// HACK: fix this @Jing Zhang
constexpr auto HoPerThreadSubC = 2;
constexpr auto WoPerThreadSubC = 2;
static_assert(KPerThread % KPerThreadSubC == 0, "");
static_assert(HPerThread % HoPerThreadSubC == 0, "");
static_assert(WPerThread % WoPerThreadSubC == 0, "");
constexpr auto a_block_mtx = ABlockDesc_E1_K1_E2{};
// thread A buffer for GEMM
StaticBuffer<AddressSpaceEnum_t::Vgpr, FloatA, a_thread_mtx_.GetElementSpaceSize(), true>
@@ -139,42 +127,46 @@ struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v3
FloatC,
decltype(a_thread_mtx_),
decltype(b_thread_mtx_),
decltype(c_thread_mtx_),
HoPerThreadSubC,
WoPerThreadSubC>{};
decltype(c_thread_mtx_)>{};
static_for<0, EPerBlock, EPerThreadLoop>{}([&](auto e_begin) {
static_for<0, KPerThread, KPerThreadSubC>{}([&](auto k_begin) {
static_for<0, E1, EPerThreadLoop>{}([&](auto e_begin) {
static_for<0, KPerThread, KPerThreadLoop>{}([&](auto k_begin) {
a_thread_copy_.Run(a_block_mtx,
make_tuple(e_begin, k_begin),
make_tuple(e_begin, k_begin, I0),
a_block_buf,
a_thread_mtx_,
make_tuple(I0, I0),
make_tuple(I0, I0, I0),
a_thread_buf);
static_for<0, HPerThread, HoPerThreadSubC>{}([&](auto h_begin) {
static_for<0, WPerThread, WoPerThreadSubC>{}([&](auto w_begin) {
threadwise_gemm.Run(a_thread_buf,
make_tuple(I0, I0),
make_tuple(I0, I0, I0),
b_thread_buf,
make_tuple(e_begin, I0, h_begin, w_begin),
make_tuple(e_begin, I0, I0, I0, I0),
c_thread_buf,
make_tuple(k_begin, I0, h_begin, w_begin));
});
});
make_tuple(k_begin, I0, I0, I0));
});
});
}
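// Per thread, the loop nest above computes the contraction
//   c[k][n][ho][wo] += sum over (e1, e2) of a[e1][k][e2] * b[e1][n][ho][wo][e2]
// in EPerThreadLoop x KPerThreadLoop tiles: each A tile is first staged from LDS
// into registers by a_thread_copy_, then the threadwise GEMM accumulates into
// c_thread_buf.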
template <typename ABlockSliceMoveStepIdx>
__device__ void MoveASliceWindow(const BlockMatrixA&,
const ABlockSliceMoveStepIdx& a_block_slice_move_step_idx)
__device__ void MoveABlockSliceWindow(const ABlockSliceMoveStepIdx& a_block_slice_move_step_idx)
{
a_thread_copy_.MoveSrcSliceWindow(BlockMatrixA{}, a_block_slice_move_step_idx);
a_thread_copy_.MoveSrcSliceWindow(ABlockDesc_E1_K1_E2{}, a_block_slice_move_step_idx);
}
private:
MatrixIndex c_thread_begin_mtx_idx_;
using AThreadCopy =
ThreadwiseTensorSliceTransfer_v4<FloatA,
FloatA,
ABlockDesc_E1_K1_E2,
decltype(a_thread_mtx_),
Sequence<EPerThreadLoop, KPerThreadLoop, E2>,
Sequence<0, 1, 2>,
2,
E2,
E2>;
CIndex c_thread_origin_data_idx_;
AThreadCopy a_thread_copy_;
};
......
#ifndef CK_GRIDWISE_GEMM_V3_HPP
#define CK_GRIDWISE_GEMM_V3_HPP
#include "common_header.hpp"
#include "multi_index_transform_helper.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "blockwise_tensor_slice_transfer.hpp"
#include "threadwise_tensor_slice_transfer.hpp"
#include "threadwise_tensor_slice_set.hpp"
#include "blockwise_gemm_dlops_v3.hpp"
namespace ck {
#if CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE
template <typename GridwiseGemm,
typename FloatAB,
typename FloatC,
typename AGridDesc_E0_E1_K0_K1_E2,
typename BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2,
typename CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2,
typename CBlockIdToBlockClusterAdaptor_K_N_H_W,
bool HasMainE0BlockLoop,
ActivTypeEnum_t ActivType>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
kernel_gemm_dlops_v3(
const FloatAB* __restrict__ p_a_grid,
const FloatAB* __restrict__ p_b_grid,
const FloatC* __restrict__ p_bias_grid,
FloatC* __restrict__ p_c_grid,
const AGridDesc_E0_E1_K0_K1_E2 a_e0_e1_k0_k1_e2_grid_desc,
const BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2 b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
const CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2 c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
const CBlockIdToBlockClusterAdaptor_K_N_H_W c_blockid_to_k_n_h_w_block_cluster_adaptor)
{
constexpr index_t shared_block_size =
GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
__shared__ FloatAB p_shared_block[shared_block_size];
GridwiseGemm::ConvBiasActiv(p_a_grid,
p_b_grid,
p_bias_grid,
p_c_grid,
p_shared_block,
a_e0_e1_k0_k1_e2_grid_desc,
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
c_blockid_to_k_n_h_w_block_cluster_adaptor,
integral_constant<bool, HasMainE0BlockLoop>{},
integral_constant<ActivTypeEnum_t, ActivType>{});
}
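// A minimal host-side launch sketch for this by-value descriptor mode (illustrative;
// grid_size, BlockSize, stream, and the template arguments are assumptions, and the
// library's own launch path may differ):
//
//   const index_t grid_size = GridwiseGemm::CalculateGridSize(c_k_n_ho_wo_grid_desc);
//   hipLaunchKernelGGL((kernel_gemm_dlops_v3<GridwiseGemm, FloatAB, FloatC, /*...*/>),
//                      dim3(grid_size), dim3(BlockSize), 0, stream,
//                      p_a_grid, p_b_grid, p_bias_grid, p_c_grid,
//                      a_e0_e1_k0_k1_e2_grid_desc,
//                      b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
//                      c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
//                      c_blockid_to_k_n_h_w_block_cluster_adaptor);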
template <typename GridwiseGemm,
typename FloatAB,
typename FloatC,
typename AGridDesc_E0_E1_K0_K1_E2,
typename BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2,
typename CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2,
typename DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx,
typename CBlockIdToBlockClusterAdaptor_K_N_H_W,
bool HasMainE0BlockLoop,
ActivTypeEnum_t ActivType>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
kernel_gemm_dlops_v3_resize_add(
const FloatAB* __restrict__ p_a_grid,
const FloatAB* __restrict__ p_b_grid,
const FloatC* __restrict__ p_bias_grid,
FloatC* __restrict__ p_d_grid,
const AGridDesc_E0_E1_K0_K1_E2 a_e0_e1_k0_k1_e2_grid_desc,
const BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2 b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
const CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2 c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
const DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc,
const CBlockIdToBlockClusterAdaptor_K_N_H_W c_blockid_to_k_n_h_w_block_cluster_adaptor)
{
constexpr index_t shared_block_size =
GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
__shared__ FloatAB p_shared_block[shared_block_size];
GridwiseGemm::ConvBiasActivResizeAdd(p_a_grid,
p_b_grid,
p_bias_grid,
p_d_grid,
p_shared_block,
a_e0_e1_k0_k1_e2_grid_desc,
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc,
c_blockid_to_k_n_h_w_block_cluster_adaptor,
integral_constant<bool, HasMainE0BlockLoop>{},
integral_constant<ActivTypeEnum_t, ActivType>{});
}
template <typename GridwiseGemm,
typename FloatAB,
typename FloatC,
typename AGridDesc_E0_E1_K0_K1_E2,
typename BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2,
typename CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2,
typename DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx,
typename CBlockIdToBlockClusterAdaptor_K_N_H_W,
bool HasMainE0BlockLoop,
ActivTypeEnum_t ActivType>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
kernel_gemm_dlops_v3_maxpool(
const FloatAB* __restrict__ p_a_grid,
const FloatAB* __restrict__ p_b_grid,
const FloatC* __restrict__ p_bias_grid,
FloatC* __restrict__ p_c_grid,
FloatC* __restrict__ p_d_grid,
const AGridDesc_E0_E1_K0_K1_E2 a_e0_e1_k0_k1_e2_grid_desc,
const BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2 b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
const CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2 c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
const DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc,
const CBlockIdToBlockClusterAdaptor_K_N_H_W c_blockid_to_k_n_h_w_block_cluster_adaptor)
{
constexpr index_t shared_block_size =
GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
__shared__ FloatAB p_shared_block[shared_block_size];
GridwiseGemm::ConvBiasActivMaxpool(p_a_grid,
p_b_grid,
p_bias_grid,
p_c_grid,
p_d_grid,
p_shared_block,
a_e0_e1_k0_k1_e2_grid_desc,
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc,
c_blockid_to_k_n_h_w_block_cluster_adaptor,
integral_constant<bool, HasMainE0BlockLoop>{},
integral_constant<ActivTypeEnum_t, ActivType>{});
}
#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER
// pass tensor descriptor by CONSTANT void pointer
// CONSTANT is needed to inform the compiler that the void pointers in the kernel
// signature point to the non-modifiable kernel-argument address space, so the
// compiler can enable the corresponding optimizations
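// (CONSTANT is assumed here to expand to an address_space(4) qualifier, e.g.
//   #define CONSTANT __attribute__((address_space(4)))
// so that the descriptor pointers live in the read-only kernel-argument space;
// see the actual definition in the configuration headers.)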
template <typename GridwiseGemm,
typename FloatAB,
typename FloatC,
typename AGridDesc_E0_E1_K0_K1_E2,
typename BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2,
typename CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2,
typename CBlockIdToBlockClusterAdaptor_K_N_H_W,
bool HasMainE0BlockLoop,
ActivTypeEnum_t ActivType>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
kernel_gemm_dlops_v3(const FloatAB* __restrict__ p_a_grid,
const FloatAB* __restrict__ p_b_grid,
const FloatC* __restrict__ p_bias_grid,
FloatC* __restrict__ p_c_grid,
const void CONSTANT* p_a_e0_e1_k0_k1_e2_grid_desc,
const void CONSTANT* p_b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
const void CONSTANT* p_c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
const void CONSTANT* p_c_blockid_to_k_n_h_w_block_cluster_adaptor)
{
// first cast the 'const void CONSTANT*' to 'const void*'
// then cast the 'const void*' to the concrete 'const Desc*'
// (the tensor descriptor's copy constructor doesn't take address_space(4) pointers)
const auto a_e0_e1_k0_k1_e2_grid_desc = *reinterpret_cast<const AGridDesc_E0_E1_K0_K1_E2*>(
cast_pointer_to_generic_address_space(p_a_e0_e1_k0_k1_e2_grid_desc));
const auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc =
*reinterpret_cast<const BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2*>(
cast_pointer_to_generic_address_space(p_b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc));
const auto c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc =
*reinterpret_cast<const CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2*>(
cast_pointer_to_generic_address_space(p_c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc));
const auto c_blockid_to_k_n_h_w_block_cluster_adaptor =
*reinterpret_cast<const CBlockIdToBlockClusterAdaptor_K_N_H_W*>(
cast_pointer_to_generic_address_space(p_c_blockid_to_k_n_h_w_block_cluster_adaptor));
constexpr index_t shared_block_size =
GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
__shared__ FloatAB p_shared_block[shared_block_size];
GridwiseGemm::ConvBiasActiv(p_a_grid,
p_b_grid,
p_bias_grid,
p_c_grid,
p_shared_block,
a_e0_e1_k0_k1_e2_grid_desc,
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
c_blockid_to_k_n_h_w_block_cluster_adaptor,
integral_constant<bool, HasMainE0BlockLoop>{},
integral_constant<ActivTypeEnum_t, ActivType>{});
}
// pass tensor descriptor by CONSTANT void pointer
// CONSTANT is needed to inform the compiler that the void pointers in the kernel
// signature point to the non-modifiable kernel-argument address space, so the
// compiler can enable the corresponding optimizations
template <typename GridwiseGemm,
typename FloatAB,
typename FloatC,
typename AGridDesc_E0_E1_K0_K1_E2,
typename BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2,
typename CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2,
typename DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx,
typename CBlockIdToBlockClusterAdaptor_K_N_H_W,
bool HasMainE0BlockLoop,
ActivTypeEnum_t ActivType>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
kernel_gemm_dlops_v3_resize_add(
const FloatAB* __restrict__ p_a_grid,
const FloatAB* __restrict__ p_b_grid,
const FloatC* __restrict__ p_bias_grid,
FloatC* __restrict__ p_d_grid,
const void CONSTANT* p_a_e0_e1_k0_k1_e2_grid_desc,
const void CONSTANT* p_b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
const void CONSTANT* p_c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
const void CONSTANT* p_d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc,
const void CONSTANT* p_c_blockid_to_k_n_h_w_block_cluster_adaptor)
{
// first cast the 'const void CONSTANT*' to 'const void*'
// then cast the 'const void*' to the concrete 'const Desc*'
// (the tensor descriptor's copy constructor doesn't take address_space(4) pointers)
const auto a_e0_e1_k0_k1_e2_grid_desc = *reinterpret_cast<const AGridDesc_E0_E1_K0_K1_E2*>(
cast_pointer_to_generic_address_space(p_a_e0_e1_k0_k1_e2_grid_desc));
const auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc =
*reinterpret_cast<const BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2*>(
cast_pointer_to_generic_address_space(p_b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc));
const auto c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc =
*reinterpret_cast<const CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2*>(
cast_pointer_to_generic_address_space(p_c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc));
const auto d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc =
*reinterpret_cast<const DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx*>(
cast_pointer_to_generic_address_space(p_d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc));
const auto c_blockid_to_k_n_h_w_block_cluster_adaptor =
*reinterpret_cast<const CBlockIdToBlockClusterAdaptor_K_N_H_W*>(
cast_pointer_to_generic_address_space(p_c_blockid_to_k_n_h_w_block_cluster_adaptor));
constexpr index_t shared_block_size =
GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
__shared__ FloatAB p_shared_block[shared_block_size];
GridwiseGemm::ConvBiasActivResizeAdd(p_a_grid,
p_b_grid,
p_bias_grid,
p_d_grid,
p_shared_block,
a_e0_e1_k0_k1_e2_grid_desc,
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc,
c_blockid_to_k_n_h_w_block_cluster_adaptor,
integral_constant<bool, HasMainE0BlockLoop>{},
integral_constant<ActivTypeEnum_t, ActivType>{});
}
template <typename GridwiseGemm,
typename FloatAB,
typename FloatC,
typename AGridDesc_E0_E1_K0_K1_E2,
typename BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2,
typename CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2,
typename DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx,
typename CBlockIdToBlockClusterAdaptor_K_N_H_W,
bool HasMainE0BlockLoop,
ActivTypeEnum_t ActivType>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
kernel_gemm_dlops_v3_maxpool(
const FloatAB* __restrict__ p_a_grid,
const FloatAB* __restrict__ p_b_grid,
const FloatC* __restrict__ p_bias_grid,
FloatC* __restrict__ p_c_grid,
FloatC* __restrict__ p_d_grid,
const void CONSTANT* p_a_e0_e1_k0_k1_e2_grid_desc,
const void CONSTANT* p_b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
const void CONSTANT* p_c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
const void CONSTANT* p_d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc,
const void CONSTANT* p_c_blockid_to_k_n_h_w_block_cluster_adaptor)
{
// first cast the 'const void CONSTANT*' to 'const void*'
// then cast the 'const void*' to the concrete 'const Desc*'
// (the tensor descriptor's copy constructor doesn't take address_space(4) pointers)
const auto a_e0_e1_k0_k1_e2_grid_desc = *reinterpret_cast<const AGridDesc_E0_E1_K0_K1_E2*>(
cast_pointer_to_generic_address_space(p_a_e0_e1_k0_k1_e2_grid_desc));
const auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc =
*reinterpret_cast<const BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2*>(
cast_pointer_to_generic_address_space(p_b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc));
const auto c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc =
*reinterpret_cast<const CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2*>(
cast_pointer_to_generic_address_space(p_c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc));
const auto d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc =
*reinterpret_cast<const DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx*>(
cast_pointer_to_generic_address_space(p_d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc));
const auto c_blockid_to_k_n_h_w_block_cluster_adaptor =
*reinterpret_cast<const CBlockIdToBlockClusterAdaptor_K_N_H_W*>(
cast_pointer_to_generic_address_space(p_c_blockid_to_k_n_h_w_block_cluster_adaptor));
constexpr index_t shared_block_size =
GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
__shared__ FloatAB p_shared_block[shared_block_size];
GridwiseGemm::ConvBiasActivMaxpool(p_a_grid,
p_b_grid,
p_bias_grid,
p_c_grid,
p_d_grid,
p_shared_block,
a_e0_e1_k0_k1_e2_grid_desc,
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc,
c_blockid_to_k_n_h_w_block_cluster_adaptor,
integral_constant<bool, HasMainE0BlockLoop>{},
integral_constant<ActivTypeEnum_t, ActivType>{});
}
#elif CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR
template <typename GridwiseGemm,
typename FloatAB,
typename FloatC,
typename AGridDesc_E0_E1_K0_K1_E2,
typename BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2,
typename CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2,
typename DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx,
typename CBlockIdToBlockClusterAdaptor_K_N_H_W,
bool HasMainE0BlockLoop,
ActivTypeEnum_t ActivType>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
kernel_gemm_dlops_v3_resize_add(const FloatAB* __restrict__ p_a_grid,
const FloatAB* __restrict__ p_b_grid,
const FloatC* __restrict__ p_bias_grid,
FloatC* __restrict__ p_d_grid)
{
constexpr index_t shared_block_size =
GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
__shared__ FloatAB p_shared_block[shared_block_size];
constexpr auto a_e0_e1_k0_k1_e2_grid_desc = AGridDesc_E0_E1_K0_K1_E2{};
constexpr auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc =
BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2{};
constexpr auto c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc = CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2{};
constexpr auto d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc = DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx{};
constexpr auto c_blockid_to_k_n_h_w_block_cluster_adaptor =
CBlockIdToBlockClusterAdaptor_K_N_H_W{};
GridwiseGemm::ConvBiasActivResizeAdd(p_a_grid,
p_b_grid,
p_bias_grid,
p_d_grid,
p_shared_block,
a_e0_e1_k0_k1_e2_grid_desc,
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc,
c_blockid_to_k_n_h_w_block_cluster_adaptor,
integral_constant<bool, HasMainE0BlockLoop>{},
integral_constant<ActivTypeEnum_t, ActivType>{});
}
template <typename GridwiseGemm,
typename FloatAB,
typename FloatC,
typename AGridDesc_E0_E1_K0_K1_E2,
typename BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2,
typename CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2,
typename DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx,
typename CBlockIdToBlockClusterAdaptor_K_N_H_W,
bool HasMainE0BlockLoop,
ActivTypeEnum_t ActivType>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
kernel_gemm_dlops_v3_maxpool(const FloatAB* __restrict__ p_a_grid,
const FloatAB* __restrict__ p_b_grid,
const FloatC* __restrict__ p_bias_grid,
FloatC* __restrict__ p_c_grid,
FloatC* __restrict__ p_d_grid)
{
constexpr index_t shared_block_size =
GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
__shared__ FloatAB p_shared_block[shared_block_size];
constexpr auto a_e0_e1_k0_k1_e2_grid_desc = AGridDesc_E0_E1_K0_K1_E2{};
constexpr auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc =
BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2{};
constexpr auto c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc = CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2{};
constexpr auto d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc = DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx{};
constexpr auto c_blockid_to_k_n_h_w_block_cluster_adaptor =
CBlockIdToBlockClusterAdaptor_K_N_H_W{};
GridwiseGemm::ConvBiasActivMaxpool(p_a_grid,
p_b_grid,
p_bias_grid,
p_c_grid,
p_d_grid,
p_shared_block,
a_e0_e1_k0_k1_e2_grid_desc,
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc,
c_blockid_to_k_n_h_w_block_cluster_adaptor,
integral_constant<bool, HasMainE0BlockLoop>{},
integral_constant<ActivTypeEnum_t, ActivType>{});
}
template <typename GridwiseGemm,
typename FloatAB,
typename FloatC,
typename AGridDesc_E0_E1_K0_K1_E2,
typename BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2,
typename CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2,
typename CBlockIdToBlockClusterAdaptor_K_N_H_W,
bool HasMainE0BlockLoop,
ActivTypeEnum_t ActivType>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
kernel_gemm_dlops_v3(const FloatAB* __restrict__ p_a_grid,
const FloatAB* __restrict__ p_b_grid,
const FloatC* __restrict__ p_bias_grid,
FloatC* __restrict__ p_c_grid)
{
constexpr index_t shared_block_size =
GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
__shared__ FloatAB p_shared_block[shared_block_size];
constexpr auto a_e0_e1_k0_k1_e2_grid_desc = AGridDesc_E0_E1_K0_K1_E2{};
constexpr auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc =
BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2{};
constexpr auto c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc = CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2{};
constexpr auto c_blockid_to_k_n_h_w_block_cluster_adaptor =
CBlockIdToBlockClusterAdaptor_K_N_H_W{};
GridwiseGemm::ConvBiasActiv(p_a_grid,
p_b_grid,
p_bias_grid,
p_c_grid,
p_shared_block,
a_e0_e1_k0_k1_e2_grid_desc,
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
c_blockid_to_k_n_h_w_block_cluster_adaptor,
integral_constant<bool, HasMainE0BlockLoop>{},
integral_constant<ActivTypeEnum_t, ActivType>{});
}
#endif
template <index_t BlockSize,
typename FloatAB,
typename FloatAcc,
typename FloatC,
InMemoryDataOperationEnum_t CGlobalMemoryDataOperation,
typename AGridDesc_E0_E1_K_E2,
typename BGridDesc_E0_E1_N_Ho_Wo_E2,
typename CGridDesc_K_N_Ho_Wo,
typename DGridDesc_K_N_Hx_Wx,
index_t E1_,
index_t E2_,
index_t K2_,
index_t KPerBlock,
index_t HoPerBlock,
index_t WoPerBlock,
index_t E1PerBlock,
index_t KPerThread,
index_t HoPerThread,
index_t WoPerThread,
index_t EPerThread,
typename ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2,
typename ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2,
typename ABlockTransferThreadClusterArrangeOrder,
typename ABlockTransferSrcAccessOrder,
index_t ABlockTransferSrcVectorDim,
index_t ABlockTransferSrcScalarPerVector,
index_t ABlockTransferDstScalarPerVector_E2,
bool AThreadTransferSrcResetCoordinateAfterRun,
typename BBlockTransferSrcAccessOrder,
index_t BBlockTransferSrcVectorDim,
index_t BBlockTransferSrcScalarPerVector,
bool BThreadTransferSrcResetCoordinateAfterRun,
typename CThreadTransferSrcDstAccessOrder,
index_t CThreadTransferSrcDstVectorDim,
index_t CThreadTransferDstScalarPerVector,
typename AGlobalStepHacks,
typename BGlobalStepHacks,
typename CGlobalStepHacks,
typename DGlobalStepHacks,
typename AGlobalMoveSliceWindowStepHacks,
typename BGlobalMoveSliceWindowStepHacks>
struct GridwiseGemmDlops_km_kn_mn_v3
{
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
static constexpr auto I2 = Number<2>{};
static constexpr auto I3 = Number<3>{};
static constexpr auto I4 = Number<4>{};
static constexpr auto I5 = Number<5>{};
static constexpr auto E1 = Number<E1_>{};
static constexpr auto E2 = Number<E2_>{};
static constexpr auto K2 = Number<K2_>{};
static constexpr auto NPerBlock = I1;
static constexpr FloatAcc alpha = 0.3;
__host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
{
constexpr auto max_lds_align = Number<ABlockTransferDstScalarPerVector_E2>{};
// A matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
constexpr auto a_e0_e1_k1_e2_block_desc = make_naive_tensor_descriptor_aligned(
make_tuple(I1, Number<E1>{}, Number<KPerBlock>{}, Number<E2>{}), max_lds_align);
// LDS allocation for A (B is streamed through registers): be careful of alignment
constexpr auto a_block_space_size = math::integer_least_multiple(
a_e0_e1_k1_e2_block_desc.GetElementSpaceSize(), max_lds_align);
return a_block_space_size * sizeof(FloatAB);
}
__host__ __device__ static constexpr index_t
CalculateGridSize(const CGridDesc_K_N_Ho_Wo& c_k_n_ho_wo_grid_desc)
{
const auto K = c_k_n_ho_wo_grid_desc.GetLength(I0);
const auto N = c_k_n_ho_wo_grid_desc.GetLength(I1);
const auto Ho = c_k_n_ho_wo_grid_desc.GetLength(I2);
const auto Wo = c_k_n_ho_wo_grid_desc.GetLength(I3);
const auto K0 = K / KPerBlock;
const auto N0 = N / NPerBlock;
const auto H0 = Ho / HoPerBlock;
const auto W0 = Wo / WoPerBlock;
const index_t grid_size = K0 * N0 * H0 * W0;
return grid_size;
}
__host__ __device__ static constexpr bool CalculateHasMainE0BlockLoop(const index_t E0)
{
const bool has_main_e0_block_loop = E0 > 1;
return has_main_e0_block_loop;
}
__host__ __device__ static constexpr bool CalculateHasMainE1BlockLoop()
{
const bool has_main_e1_block_loop = ((E1 + E1PerBlock) / (2 * E1PerBlock)) > 1;
return has_main_e1_block_loop;
}
__host__ __device__ static constexpr bool CalculateHasDoubleTailE1BlockLoop()
{
const bool has_double_tail_e1_block_loop = (E1 / E1PerBlock) % 2 == 0;
return has_double_tail_e1_block_loop;
}
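// Worked example (hypothetical sizes): with E1 = 8 and E1PerBlock = 2,
// (8 + 2) / (2 * 2) = 2 > 1, so HasMainE1BlockLoop is true, and (8 / 2) % 2 == 0,
// so the last two E1PerBlock chunks are consumed by the double-tail path of the
// LDS double-buffered loop in GemmOp.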
__host__ __device__ static constexpr auto
MakeAE0E1K0K1E2GridDescriptor(const AGridDesc_E0_E1_K_E2& a_e0_e1_k_e2_grid_desc)
{
const auto E0 = a_e0_e1_k_e2_grid_desc.GetLength(I0);
const auto K = a_e0_e1_k_e2_grid_desc.GetLength(I2);
const auto K1 = Number<KPerBlock>{};
const auto K0 = K / K1;
const auto a_e0_e1_k0_k1_e2_grid_desc = transform_tensor_descriptor(
a_e0_e1_k_e2_grid_desc,
make_tuple(make_pass_through_transform(E0),
make_pass_through_transform(E1),
make_unmerge_transform(make_tuple(K0, K1)),
make_pass_through_transform(E2)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4>{}));
return a_e0_e1_k0_k1_e2_grid_desc;
}
__host__ __device__ static constexpr auto MakeBE0E1NH0H1H2W0W1W2E2GridDescriptor(
const BGridDesc_E0_E1_N_Ho_Wo_E2& b_e0_e1_n_ho_wo_e2_grid_desc)
{
const auto E0 = b_e0_e1_n_ho_wo_e2_grid_desc.GetLength(I0);
// const auto E1 = b_e0_e1_n_ho_wo_e2_grid_desc.GetLength(I1);
const auto N = b_e0_e1_n_ho_wo_e2_grid_desc.GetLength(I2);
const auto Ho = b_e0_e1_n_ho_wo_e2_grid_desc.GetLength(I3);
const auto Wo = b_e0_e1_n_ho_wo_e2_grid_desc.GetLength(I4);
// const auto E2 = b_e0_e1_n_ho_wo_e2_grid_desc.GetLength(I5);
const auto H2 = Number<HoPerThread>{};
const auto H1 = Number<HoPerBlock / HoPerThread>{};
const auto H0 = Ho / (H1 * H2);
const auto W2 = Number<WoPerThread>{};
const auto W1 = Number<WoPerBlock / WoPerThread>{};
const auto W0 = Wo / (W1 * W2);
const auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc =
transform_tensor_descriptor(b_e0_e1_n_ho_wo_e2_grid_desc,
make_tuple(make_pass_through_transform(E0),
make_pass_through_transform(E1),
make_pass_through_transform(N),
make_unmerge_transform(make_tuple(H0, H1, H2)),
make_unmerge_transform(make_tuple(W0, W1, W2)),
make_pass_through_transform(E2)),
make_tuple(Sequence<0>{},
Sequence<1>{},
Sequence<2>{},
Sequence<3>{},
Sequence<4>{},
Sequence<5>{}),
make_tuple(Sequence<0>{},
Sequence<1>{},
Sequence<2>{},
Sequence<3, 4, 5>{},
Sequence<6, 7, 8>{},
Sequence<9>{}));
return b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc;
}
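// Example (hypothetical sizes): with Ho = 32, HoPerBlock = 8, HoPerThread = 2, the
// unmerge gives H2 = 2 (rows per thread), H1 = 4 (thread rows per block), and
// H0 = 4 (blocks along H); W splits identically, so an (h0, h1, h2) triple addresses
// block / thread / intra-thread offsets without runtime div or mod.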
__host__ __device__ static constexpr auto
MakeCK0K1NH0H1H2W0W1W2GridDescriptor(const CGridDesc_K_N_Ho_Wo& c_k_n_ho_wo_grid_desc)
{
const auto K = c_k_n_ho_wo_grid_desc.GetLength(I0);
const auto N = c_k_n_ho_wo_grid_desc.GetLength(I1);
const auto Ho = c_k_n_ho_wo_grid_desc.GetLength(I2);
const auto Wo = c_k_n_ho_wo_grid_desc.GetLength(I3);
const auto K1 = Number<KPerBlock>{};
const auto K0 = K / K1;
const auto H2 = Number<HoPerThread>{};
const auto H1 = Number<HoPerBlock / HoPerThread>{};
const auto H0 = Ho / (H1 * H2);
const auto W2 = Number<WoPerThread>{};
const auto W1 = Number<WoPerBlock / WoPerThread>{};
const auto W0 = Wo / (W1 * W2);
const auto c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc = transform_tensor_descriptor(
c_k_n_ho_wo_grid_desc,
make_tuple(make_unmerge_transform(make_tuple(K0, K1)),
make_pass_through_transform(N),
make_unmerge_transform(make_tuple(H0, H1, H2)),
make_unmerge_transform(make_tuple(W0, W1, W2))),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0, 1>{}, Sequence<2>{}, Sequence<3, 4, 5>{}, Sequence<6, 7, 8>{}));
return c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc;
}
__host__ __device__ static constexpr auto
MakeDK0K1NH0H1HxW0W1WxGridDescriptorMaxPool(const DGridDesc_K_N_Hx_Wx& d_k_n_hx_wx_grid_desc)
{
const auto K = d_k_n_hx_wx_grid_desc.GetLength(I0);
const auto N = d_k_n_hx_wx_grid_desc.GetLength(I1);
const auto Hx = d_k_n_hx_wx_grid_desc.GetLength(I2);
const auto Wx = d_k_n_hx_wx_grid_desc.GetLength(I3);
const auto K1 = Number<KPerBlock>{};
const auto K0 = K / K1;
#if CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR
const auto H2 = Number<HoPerThread / 2>{};
const auto H1 = Number<HoPerBlock / HoPerThread>{};
const auto H0 = Number<Hx / (H1 * H2)>{};
const auto W2 = Number<WoPerThread / 2>{};
const auto W1 = Number<WoPerBlock / WoPerThread>{};
const auto W0 = Number<Wx / (W1 * W2)>{};
#else
const auto H2 = HoPerThread / 2;
const auto H1 = HoPerBlock / HoPerThread;
const auto H0 = Hx / (H1 * H2);
const auto W2 = WoPerThread / 2;
const auto W1 = WoPerBlock / WoPerThread;
const auto W0 = Wx / (W1 * W2);
#endif
const auto d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc = transform_tensor_descriptor(
d_k_n_hx_wx_grid_desc,
make_tuple(make_unmerge_transform(make_tuple(K0, K1)),
make_pass_through_transform(N),
make_unmerge_transform(make_tuple(H0, H1, H2)),
make_unmerge_transform(make_tuple(W0, W1, W2))),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0, 1>{}, Sequence<2>{}, Sequence<3, 4, 5>{}, Sequence<6, 7, 8>{}));
return d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc;
}
__host__ __device__ static constexpr auto
MakeDK0K1NH0H1HxW0W1WxGridDescriptorResizeAdd(const DGridDesc_K_N_Hx_Wx& d_k_n_hx_wx_grid_desc)
{
const auto K = d_k_n_hx_wx_grid_desc.GetLength(I0);
const auto N = d_k_n_hx_wx_grid_desc.GetLength(I1);
const auto Hx = d_k_n_hx_wx_grid_desc.GetLength(I2);
const auto Wx = d_k_n_hx_wx_grid_desc.GetLength(I3);
const auto K1 = Number<KPerBlock>{};
const auto K0 = K / K1;
const auto H2 = Number<HoPerThread * 2>{};
const auto H1 = Number<HoPerBlock / HoPerThread>{};
const auto W2 = Number<WoPerThread * 2>{};
const auto W1 = Number<WoPerBlock / WoPerThread>{};
#if CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR
const auto H0 = Number<Hx / (H1 * H2)>{};
const auto W0 = Number<Wx / (W1 * W2)>{};
#else
const auto H0 = Hx / (H1 * H2);
const auto W0 = Wx / (W1 * W2);
#endif
const auto d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc = transform_tensor_descriptor(
d_k_n_hx_wx_grid_desc,
make_tuple(make_unmerge_transform(make_tuple(K0, K1)),
make_pass_through_transform(N),
make_unmerge_transform(make_tuple(H0, H1, H2)),
make_unmerge_transform(make_tuple(W0, W1, W2))),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0, 1>{}, Sequence<2>{}, Sequence<3, 4, 5>{}, Sequence<6, 7, 8>{}));
return d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc;
}
__host__ __device__ static constexpr auto
MakeCBlockIdToKNHoWoBlockClusterAdaptor(const CGridDesc_K_N_Ho_Wo& c_k_n_ho_wo_grid_desc)
{
const auto K = c_k_n_ho_wo_grid_desc.GetLength(I0);
const auto N = c_k_n_ho_wo_grid_desc.GetLength(I1);
const auto Ho = c_k_n_ho_wo_grid_desc.GetLength(I2);
const auto Wo = c_k_n_ho_wo_grid_desc.GetLength(I3);
#if CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR
const auto K0 = Number<K / KPerBlock>{};
const auto N0 = Number<N / NPerBlock>{};
const auto H0 = Number<Ho / HoPerBlock>{};
const auto W0 = Number<Wo / WoPerBlock>{};
#else
const auto K0 = K / KPerBlock;
const auto N0 = N / NPerBlock;
const auto H0 = Ho / HoPerBlock;
const auto W0 = Wo / WoPerBlock;
#endif
const auto c_blockid_to_k_n_ho_wo_block_cluster_adaptor = make_single_stage_tensor_adaptor(
make_tuple(make_merge_transform(make_tuple(K0, N0, H0, W0))),
make_tuple(Sequence<0, 1, 2, 3>{}),
make_tuple(Sequence<0>{}));
return c_blockid_to_k_n_ho_wo_block_cluster_adaptor;
}
// using AGridDesc_E0_E1_K0_K1_E2 =
// decltype(MakeAE0E1K0K1E2GridDescriptor(AGridDesc_E0_E1_K_E2{}));
// using BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2 =
// decltype(MakeBE0E1NH0H1H2W0W1W2E2GridDescriptor(BGridDesc_E0_E1_N_Ho_Wo_E2{}));
// using CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2 =
// decltype(MakeCK0K1NH0H1H2W0W1W2GridDescriptor(CGridDesc_K_N_Ho_Wo{}));
// using DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx =
// decltype(MakeDK0K1NH0H1HxW0W1WxGridDescriptor(DGridDesc_K_N_Hx_Wx{}));
using CBlockIdToBlockClusterAdaptor_K_N_H_W =
decltype(MakeCBlockIdToKNHoWoBlockClusterAdaptor(CGridDesc_K_N_Ho_Wo{}));
template <typename CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>
__host__ __device__ static constexpr auto MakeBiasK0K1GridDescriptor(
const CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2& c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc)
{
const auto K0 = c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc.GetLength(I0);
const auto K1 = c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc.GetLength(I1);
return make_naive_tensor_descriptor_packed(make_tuple(K0, K1));
}
__host__ __device__ static constexpr auto MakeCK1NH2W2ThreadDescriptor()
{
constexpr auto c_k1_n_h2_w2_thread_gemm_desc = make_naive_tensor_descriptor_packed(
make_tuple(Number<KPerThread>{}, I1, Number<HoPerThread>{}, Number<WoPerThread>{}));
return c_k1_n_h2_w2_thread_gemm_desc;
}
// using CThreadDesc_K1_N_H2_W2 = decltype(MakeCK1NH2W2ThreadDescriptor());
__host__ __device__ static constexpr auto GetBlockWiseGemm()
{
constexpr auto max_lds_align = Number<ABlockTransferDstScalarPerVector_E2>{};
constexpr auto a_e1_k1_e2_block_gemm_desc = make_naive_tensor_descriptor_aligned(
make_tuple(Number<E1PerBlock>{}, Number<KPerBlock>{}, Number<E2>{}), max_lds_align);
constexpr auto b_e1_n_h_w_e2_block_gemm_desc =
make_naive_tensor_descriptor_packed(make_tuple(Number<E1PerBlock>{},
I1,
Number<HoPerBlock>{},
Number<WoPerBlock>{},
Number<E2>{}));
constexpr auto c_k1_n_h2_w2_thread_gemm_desc = MakeCK1NH2W2ThreadDescriptor();
auto blockwise_gemm =
BlockwiseGemmDlops_km_kn_m0m1n0n1_v3<BlockSize,
FloatAB,
FloatAB,
FloatAcc,
decltype(a_e1_k1_e2_block_gemm_desc),
decltype(b_e1_n_h_w_e2_block_gemm_desc),
decltype(c_k1_n_h2_w2_thread_gemm_desc),
EPerThread,
K2>{};
return blockwise_gemm;
}
__device__ static constexpr auto GetCThreadIndex()
{
auto blockwise_gemm = GetBlockWiseGemm();
auto c_thread_mtx_index =
blockwise_gemm.GetBeginOfCThreadDesc_K_N_Ho_Wo(get_thread_local_1d_id());
return c_thread_mtx_index;
};
__device__ static constexpr auto GetCBlockIndex(
const CBlockIdToBlockClusterAdaptor_K_N_H_W& c_blockid_to_k_n_h_w_block_cluster_adaptor)
{
const auto c_k_n_h_w_block_cluster_idx =
c_blockid_to_k_n_h_w_block_cluster_adaptor.CalculateBottomIndex(
make_multi_index(get_block_1d_id()));
return c_k_n_h_w_block_cluster_idx;
}
template <typename BiasGlobalBuff,
typename CThreadBuff,
typename CBlockIndex,
typename CThreadIndex,
typename BiasGridDesc_K0_K1,
typename CThreadDesc_K1_N_H2_W2>
__device__ static void BiasOp(BiasGlobalBuff& bias_global_buf,
CThreadBuff& c_thread_buf,
const CBlockIndex& c_block_idx,
const CThreadIndex& c_thread_idx,
const BiasGridDesc_K0_K1& bias_k0_k1_grid_desc,
const CThreadDesc_K1_N_H2_W2&)
{
const index_t k_block_work_id = __builtin_amdgcn_readfirstlane(c_block_idx[I0]);
const auto k_thread_id = c_thread_idx[I0];
constexpr auto c_k1_n_h2_w2_thread_gemm_desc = CThreadDesc_K1_N_H2_W2{};
constexpr auto bias_k0_k1_thread_desc =
make_naive_tensor_descriptor_packed(make_tuple(I1, Number<KPerThread>{}));
StaticBuffer<AddressSpaceEnum_t::Vgpr,
FloatC,
bias_k0_k1_thread_desc.GetElementSpaceSize(),
true>
bias_thread_buf;
const index_t k_thread_data_on_global = k_thread_id * KPerThread;
auto bias_threadwise_transfer =
ThreadwiseTensorSliceTransfer_v2<FloatC,
FloatC,
decltype(bias_k0_k1_grid_desc),
decltype(bias_k0_k1_thread_desc),
Sequence<I1, Number<KPerThread>{}>,
Sequence<0, 1>,
1,
CThreadTransferDstScalarPerVector,
false,
true>(
bias_k0_k1_grid_desc, make_multi_index(k_block_work_id, k_thread_data_on_global));
constexpr auto bias_k0_k1_global_tensor_step_hacks = make_tuple(
make_tuple(Sequence<0>{}, Sequence<0>{}), make_tuple(Sequence<0>{}, Sequence<0>{}));
bias_threadwise_transfer.Run(bias_k0_k1_grid_desc,
bias_global_buf,
bias_k0_k1_thread_desc,
make_tuple(I0, I0),
bias_thread_buf,
bias_k0_k1_global_tensor_step_hacks);
static_for<0, KPerThread, 1>{}([&](auto ki) {
static_for<0, HoPerThread, 1>{}([&](auto hi) {
static_for<0, WoPerThread, 1>{}([&](auto wi) {
constexpr index_t c_offset =
c_k1_n_h2_w2_thread_gemm_desc.CalculateOffset(make_tuple(ki, 0, hi, wi));
c_thread_buf(Number<c_offset>{}) =
c_thread_buf[Number<c_offset>{}] + bias_thread_buf[ki];
});
});
});
}
template <typename CThreadBuff, typename CThreadDesc_K1_N_H2_W2, ActivTypeEnum_t activ_type_>
__device__ static void Activation(CThreadBuff& c_thread_buf,
const CThreadDesc_K1_N_H2_W2&,
integral_constant<ActivTypeEnum_t, activ_type_>)
{
constexpr auto c_k1_n_h2_w2_thread_gemm_desc = CThreadDesc_K1_N_H2_W2{};
static_for<0, c_k1_n_h2_w2_thread_gemm_desc.GetElementSpaceSize(), 1>{}([&](auto i) {
if constexpr(activ_type_ == 1)
{
c_thread_buf(i) = c_thread_buf[i] >= 0 ? c_thread_buf[i] : alpha * c_thread_buf[i];
}
else if constexpr(activ_type_ == 2)
{
FloatAcc x = 1.0 + exp(-c_thread_buf[i]);
asm volatile("\n \
v_rcp_f32 %0, %1 \n"
: "=v"(x)
: "0"(x));
c_thread_buf(i) = x;
}
});
}
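// activ_type_ == 1 applies leaky ReLU with the class-level alpha (0.3);
// activ_type_ == 2 evaluates sigmoid(x) = 1 / (1 + exp(-x)), replacing the final
// divide with the v_rcp_f32 hardware reciprocal (faster, at slightly reduced
// accuracy compared to a full-precision division).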
template <typename CThreadBuff,
typename CGlobalBuff,
typename CBlockIndex,
typename CThreadIndex,
typename CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>
__device__ static void
WriteOut(const CThreadBuff& c_thread_buf,
CGlobalBuff& c_global_buf,
const CBlockIndex& c_block_idx,
const CThreadIndex& c_thread_idx,
const CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2& c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc)
{
const index_t k_block_work_id = __builtin_amdgcn_readfirstlane(c_block_idx[I0]);
const index_t n_block_work_id = __builtin_amdgcn_readfirstlane(c_block_idx[I1]);
const index_t ho_block_work_id = __builtin_amdgcn_readfirstlane(c_block_idx[I2]);
const index_t wo_block_work_id = __builtin_amdgcn_readfirstlane(c_block_idx[I3]);
const auto k_thread_id = c_thread_idx[I0];
const auto ho_thread_id = c_thread_idx[I2];
const auto wo_thread_id = c_thread_idx[I3];
// hack to control index calculation when iterating over c_k_n_h0_h1_h2_w0_w1_w2_global
// tensor
constexpr auto c_k_n_h0_h1_h2_w0_w1_w2_global_tensor_step_hacks = CGlobalStepHacks{};
constexpr auto c_k0_k1_n_h0_h1_h2_w0_w1_w2_thread_copy_desc =
make_naive_tensor_descriptor_packed(make_tuple(I1,
Number<KPerThread>{},
I1,
I1,
I1,
Number<HoPerThread>{},
I1,
I1,
Number<WoPerThread>{}));
const index_t k_thread_data_on_global = k_thread_id * KPerThread;
ThreadwiseTensorSliceTransfer_v1r3<
FloatAcc,
FloatC,
decltype(c_k0_k1_n_h0_h1_h2_w0_w1_w2_thread_copy_desc),
decltype(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc),
Sequence<I1, KPerThread, I1, I1, I1, HoPerThread, I1, I1, WoPerThread>,
CThreadTransferSrcDstAccessOrder,
CThreadTransferSrcDstVectorDim,
CThreadTransferDstScalarPerVector,
CGlobalMemoryDataOperation,
1,
true>(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
make_multi_index(k_block_work_id,
k_thread_data_on_global,
n_block_work_id,
ho_block_work_id,
ho_thread_id,
0,
wo_block_work_id,
wo_thread_id,
0))
.Run(c_k0_k1_n_h0_h1_h2_w0_w1_w2_thread_copy_desc,
make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0),
c_thread_buf,
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
c_global_buf,
c_k_n_h0_h1_h2_w0_w1_w2_global_tensor_step_hacks);
}
template <typename CThreadBuff,
typename DGlobalBuff,
typename CBlockIndex,
typename CThreadIndex,
typename CThreadDesc_K1_N_H2_W2,
typename DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx>
__device__ static void
MaxPool(const CThreadBuff& c_thread_buf,
DGlobalBuff& d_global_buf,
const CBlockIndex& c_block_idx,
const CThreadIndex& c_thread_idx,
const CThreadDesc_K1_N_H2_W2&,
const DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx& d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc)
{
const index_t k_block_work_id = __builtin_amdgcn_readfirstlane(c_block_idx[I0]);
const index_t n_block_work_id = __builtin_amdgcn_readfirstlane(c_block_idx[I1]);
const index_t ho_block_work_id = __builtin_amdgcn_readfirstlane(c_block_idx[I2]);
const index_t wo_block_work_id = __builtin_amdgcn_readfirstlane(c_block_idx[I3]);
const auto k_thread_id = c_thread_idx[I0];
const auto ho_thread_id = c_thread_idx[I2];
const auto wo_thread_id = c_thread_idx[I3];
constexpr auto c_k1_n_h2_w2_thread_gemm_desc = CThreadDesc_K1_N_H2_W2{};
static_assert(HoPerThread % 2 == 0 && WoPerThread % 2 == 0, "");
constexpr auto HoPerThread_2 = HoPerThread / 2;
constexpr auto WoPerThread_2 = WoPerThread / 2;
constexpr auto d_k0_k1_n_h0_h1_hx_w0_w1_wx_thread_desc =
make_naive_tensor_descriptor_packed(make_tuple(I1,
Number<KPerThread>{},
I1,
I1,
I1,
Number<HoPerThread_2>{},
I1,
I1,
Number<WoPerThread_2>{}));
StaticBuffer<AddressSpaceEnum_t::Vgpr,
FloatC,
d_k0_k1_n_h0_h1_hx_w0_w1_wx_thread_desc.GetElementSpaceSize(),
true>
d_thread_buf;
static_for<0, KPerThread, 1>{}([&](auto ki) {
static_for<0, HoPerThread_2, 1>{}([&](auto hi) {
static_for<0, WoPerThread_2, 1>{}([&](auto wi) {
constexpr index_t d_offset =
d_k0_k1_n_h0_h1_hx_w0_w1_wx_thread_desc.CalculateOffset(
make_tuple(0, ki, 0, 0, 0, hi, 0, 0, wi));
constexpr index_t c_offset_0 = c_k1_n_h2_w2_thread_gemm_desc.CalculateOffset(
make_tuple(ki, 0, hi * 2, wi * 2));
constexpr index_t c_offset_1 = c_k1_n_h2_w2_thread_gemm_desc.CalculateOffset(
make_tuple(ki, 0, hi * 2, wi * 2 + 1));
constexpr index_t c_offset_2 = c_k1_n_h2_w2_thread_gemm_desc.CalculateOffset(
make_tuple(ki, 0, hi * 2 + 1, wi * 2));
constexpr index_t c_offset_3 = c_k1_n_h2_w2_thread_gemm_desc.CalculateOffset(
make_tuple(ki, 0, hi * 2 + 1, wi * 2 + 1));
d_thread_buf(Number<d_offset>{}) = c_thread_buf[Number<c_offset_0>{}];
d_thread_buf(Number<d_offset>{}) =
fmaxf(c_thread_buf[Number<c_offset_1>{}], d_thread_buf(Number<d_offset>{}));
d_thread_buf(Number<d_offset>{}) =
fmaxf(c_thread_buf[Number<c_offset_2>{}], d_thread_buf(Number<d_offset>{}));
d_thread_buf(Number<d_offset>{}) =
fmaxf(c_thread_buf[Number<c_offset_3>{}], d_thread_buf(Number<d_offset>{}));
});
});
});
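// Each d element above is the max over a 2x2 window of c (offsets (2h, 2w),
// (2h, 2w+1), (2h+1, 2w), and (2h+1, 2w+1)), i.e. a stride-2 2x2 max pool
// computed entirely in registers before the global write-out below.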
const index_t k_thread_data_on_global = k_thread_id * KPerThread;
constexpr auto d_k_n_h0_h1_hx_w0_w1_wx_global_tensor_step_hacks = DGlobalStepHacks{};
ThreadwiseTensorSliceTransfer_v1r3<
FloatC,
FloatC,
decltype(d_k0_k1_n_h0_h1_hx_w0_w1_wx_thread_desc),
decltype(d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc),
Sequence<I1, KPerThread, I1, I1, I1, HoPerThread_2, I1, I1, WoPerThread_2>,
CThreadTransferSrcDstAccessOrder,
CThreadTransferSrcDstVectorDim,
CThreadTransferDstScalarPerVector,
InMemoryDataOperationEnum_t::Set,
1,
true>(d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc,
make_multi_index(k_block_work_id,
k_thread_data_on_global,
n_block_work_id,
ho_block_work_id,
ho_thread_id,
0,
wo_block_work_id,
wo_thread_id,
0))
.Run(d_k0_k1_n_h0_h1_hx_w0_w1_wx_thread_desc,
make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0),
d_thread_buf,
d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc,
d_global_buf,
d_k_n_h0_h1_hx_w0_w1_wx_global_tensor_step_hacks);
}
template <typename CThreadBuff,
typename DGlobalBuff,
typename CBlockIndex,
typename CThreadIndex,
typename CThreadDesc_K1_N_H2_W2,
typename DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx>
__device__ static void
ResizeAdd(const CThreadBuff& c_thread_buf,
DGlobalBuff& d_global_buf,
const CBlockIndex& c_block_idx,
const CThreadIndex& c_thread_idx,
const CThreadDesc_K1_N_H2_W2&,
const DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx& d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc)
{
const index_t k_block_work_id = __builtin_amdgcn_readfirstlane(c_block_idx[I0]);
const index_t n_block_work_id = __builtin_amdgcn_readfirstlane(c_block_idx[I1]);
const index_t ho_block_work_id = __builtin_amdgcn_readfirstlane(c_block_idx[I2]);
const index_t wo_block_work_id = __builtin_amdgcn_readfirstlane(c_block_idx[I3]);
const auto k_thread_id = c_thread_idx[I0];
const auto ho_thread_id = c_thread_idx[I2];
const auto wo_thread_id = c_thread_idx[I3];
constexpr auto c_k1_n_h2_w2_thread_gemm_desc = CThreadDesc_K1_N_H2_W2{};
constexpr auto HoPerThreadx2 = HoPerThread * 2;
constexpr auto WoPerThreadx2 = WoPerThread * 2;
constexpr auto d_k0_k1_n_h0_h1_hx_w0_w1_wx_thread_desc =
make_naive_tensor_descriptor_packed(make_tuple(I1,
Number<KPerThread>{},
I1,
I1,
I1,
Number<HoPerThreadx2>{},
I1,
I1,
Number<WoPerThreadx2>{}));
StaticBuffer<AddressSpaceEnum_t::Vgpr,
FloatC,
d_k0_k1_n_h0_h1_hx_w0_w1_wx_thread_desc.GetElementSpaceSize(),
true>
d_thread_buf;
static_for<0, KPerThread, 1>{}([&](auto k_i) {
static_for<0, HoPerThreadx2, 1>{}([&](auto h_i) {
static_for<0, WoPerThreadx2, 1>{}([&](auto w_i) {
d_thread_buf(Number<d_k0_k1_n_h0_h1_hx_w0_w1_wx_thread_desc.CalculateOffset(
make_tuple(0, k_i, 0, 0, 0, h_i, 0, 0, w_i))>{}) =
c_thread_buf[Number<c_k1_n_h2_w2_thread_gemm_desc.CalculateOffset(
make_tuple(k_i, 0, h_i / 2, w_i / 2))>{}];
});
});
});
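// Nearest-neighbor 2x upsample in registers: each c element at (h, w) is replicated
// to the four d positions whose h_i / 2 == h and w_i / 2 == w; the transfer below
// then uses InMemoryDataOperationEnum_t::Add to accumulate the upsampled tile into
// the residual tensor in global memory.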
// hack to control index calculation when iterating over d_k_n_ho_wo_global tensor
constexpr auto d_k_n_h0_h1_hx_w0_w1_wx_global_tensor_step_hacks = DGlobalStepHacks{};
const index_t k_thread_data_on_global = k_thread_id * KPerThread;
ThreadwiseTensorSliceTransfer_v1r3<
FloatC,
FloatC,
decltype(d_k0_k1_n_h0_h1_hx_w0_w1_wx_thread_desc),
decltype(d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc),
Sequence<I1, KPerThread, I1, I1, I1, HoPerThreadx2, I1, I1, WoPerThreadx2>,
CThreadTransferSrcDstAccessOrder,
CThreadTransferSrcDstVectorDim,
CThreadTransferDstScalarPerVector,
InMemoryDataOperationEnum_t::Add,
1,
true>(d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc,
make_multi_index(k_block_work_id,
k_thread_data_on_global,
n_block_work_id,
ho_block_work_id,
ho_thread_id,
0,
wo_block_work_id,
wo_thread_id,
0))
.Run(d_k0_k1_n_h0_h1_hx_w0_w1_wx_thread_desc,
make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0),
d_thread_buf,
d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc,
d_global_buf,
d_k_n_h0_h1_hx_w0_w1_wx_global_tensor_step_hacks);
}
template <typename AGlobalBuff,
typename BGlobalBuff,
typename CThreadBuff,
typename CBlockIndex,
typename CThreadIndex,
typename AGridDesc_E0_E1_K0_K1_E2,
typename BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2,
typename CThreadDesc_K1_N_H2_W2,
bool HasMainE0BlockLoop>
__device__ static void
GemmOp(const AGlobalBuff& a_global_buf,
const BGlobalBuff& b_global_buf,
CThreadBuff& c_thread_buf,
FloatAB* __restrict__ p_shared_block,
const CBlockIndex& c_block_idx,
const CThreadIndex& c_thread_idx,
const AGridDesc_E0_E1_K0_K1_E2& a_e0_e1_k0_k1_e2_grid_desc,
const BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2& b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
const CThreadDesc_K1_N_H2_W2&,
integral_constant<bool, HasMainE0BlockLoop>)
{
constexpr auto HasMainE1BlockLoop = CalculateHasMainE1BlockLoop();
constexpr auto HasDoubleTailE1BlockLoop = CalculateHasDoubleTailE1BlockLoop();
// const auto c_k_n_h_w_block_cluster_idx =
// GetCBlockIndex(c_blockid_to_k_n_h_w_block_cluster_adaptor);
// c_blockid_to_k_n_h_w_block_cluster_adaptor.CalculateBottomIndex(
// make_multi_index(get_block_1d_id()));
const index_t k_block_work_id = __builtin_amdgcn_readfirstlane(c_block_idx[I0]);
const index_t n_block_work_id = __builtin_amdgcn_readfirstlane(c_block_idx[I1]);
const index_t ho_block_work_id = __builtin_amdgcn_readfirstlane(c_block_idx[I2]);
const index_t wo_block_work_id = __builtin_amdgcn_readfirstlane(c_block_idx[I3]);
constexpr auto max_lds_align = Number<ABlockTransferDstScalarPerVector_E2>{};
constexpr auto a_e1_k1_e2_block_gemm_desc = make_naive_tensor_descriptor_aligned(
make_tuple(Number<E1PerBlock>{}, Number<KPerBlock>{}, Number<E2>{}), max_lds_align);
constexpr auto b_e1_n_h_w_e2_block_gemm_desc =
make_naive_tensor_descriptor_packed(make_tuple(Number<E1PerBlock>{},
I1,
Number<HoPerBlock>{},
Number<WoPerBlock>{},
Number<E2>{}));
constexpr auto c_k1_n_h2_w2_thread_gemm_desc = CThreadDesc_K1_N_H2_W2{};
auto blockwise_gemm =
BlockwiseGemmDlops_km_kn_m0m1n0n1_v3<BlockSize,
FloatAB,
FloatAB,
FloatAcc,
decltype(a_e1_k1_e2_block_gemm_desc),
decltype(b_e1_n_h_w_e2_block_gemm_desc),
decltype(c_k1_n_h2_w2_thread_gemm_desc),
EPerThread,
K2>{};
// blockwise_gemm.GetBeginOfCThreadDesc_K_N_Ho_Wo(get_thread_local_1d_id());
const auto ho_thread_id = c_thread_idx[I2];
const auto wo_thread_id = c_thread_idx[I3];
constexpr auto a_e0_e1_k0_k1_e2_block_copy_desc = make_naive_tensor_descriptor_aligned(
make_tuple(Number<I1>{}, Number<E1>{}, I1, Number<KPerBlock>{}, Number<E2>{}),
max_lds_align);
// A matrix blockwise copy
auto a_blockwise_copy =
BlockwiseTensorSliceTransfer_v4<BlockSize,
InMemoryDataOperationEnum_t::Set,
Sequence<I1, E1, I1, KPerBlock, E2>,
ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2,
ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2,
ABlockTransferThreadClusterArrangeOrder,
FloatAB,
FloatAB,
decltype(a_e0_e1_k0_k1_e2_grid_desc),
decltype(a_e0_e1_k0_k1_e2_block_copy_desc),
ABlockTransferSrcAccessOrder,
Sequence<0, 1, 2, 3, 4>,
ABlockTransferSrcVectorDim,
4,
ABlockTransferSrcScalarPerVector,
ABlockTransferDstScalarPerVector_E2,
1,
1,
AThreadTransferSrcResetCoordinateAfterRun,
false>(a_e0_e1_k0_k1_e2_grid_desc,
make_multi_index(0, 0, k_block_work_id, 0, 0),
a_e0_e1_k0_k1_e2_block_copy_desc,
make_multi_index(0, 0, 0, 0, 0));
constexpr auto a_block_slice_copy_step = make_multi_index(I1, 0, 0, 0, 0);
constexpr auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_thread_copy_desc =
make_naive_tensor_descriptor_packed(make_tuple(I1,
Number<E1PerBlock>{},
I1,
I1,
I1,
Number<HoPerThread>{},
I1,
I1,
Number<WoPerThread>{},
Number<E2>{}));
auto b_threadwise_transfer = ThreadwiseTensorSliceTransfer_v2<
FloatAB,
FloatAB,
decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc),
decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_thread_copy_desc),
Sequence<I1, E1PerBlock, I1, I1, I1, HoPerThread, I1, I1, WoPerThread, E2>,
BBlockTransferSrcAccessOrder,
BBlockTransferSrcVectorDim,
BBlockTransferSrcScalarPerVector,
BThreadTransferSrcResetCoordinateAfterRun,
true>(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
make_multi_index(0,
0,
n_block_work_id,
ho_block_work_id,
ho_thread_id,
0,
wo_block_work_id,
wo_thread_id,
0,
0));
auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
p_shared_block, a_e0_e1_k0_k1_e2_block_copy_desc.GetElementSpaceSize());
//// register allocation for output
// StaticBuffer<AddressSpaceEnum_t::Vgpr,
// FloatAcc,
// c_k1_n_h2_w2_thread_gemm_desc.GetElementSpaceSize(),
// true>
// c_thread_buf;
// initialize output thread tensor
ThreadwiseTensorSliceSet_v1<FloatAcc,
decltype(c_k1_n_h2_w2_thread_gemm_desc),
Sequence<KPerThread, I1, HoPerThread, WoPerThread>>{}
.Run(c_k1_n_h2_w2_thread_gemm_desc,
make_tuple(I0, I0, I0, I0),
c_thread_buf,
FloatAcc{0});
constexpr auto b_thread_slice_copy_step =
make_multi_index(0, E1PerBlock, 0, 0, 0, 0, 0, 0, 0, 0);
// hack to control index calculation when iterating over A and B matrices for threadwise copy
constexpr auto a_e0_e1_k_e2_global_step_hacks = AGlobalStepHacks{};
constexpr auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks = BGlobalStepHacks{};
// double register buffer for B
StaticBuffer<AddressSpaceEnum_t::Vgpr,
FloatAB,
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_thread_copy_desc.GetElementSpaceSize(),
true>
b_thread_even_buf, b_thread_odd_buf;
if constexpr(HasMainE0BlockLoop)
{
const auto E0 = b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc.GetLength(I0);
index_t e0_block_data_begin = 0;
do
{
// LDS double buffer: preload data
{
a_blockwise_copy.RunRead(
a_e0_e1_k0_k1_e2_grid_desc, a_global_buf, a_e0_e1_k_e2_global_step_hacks);
b_threadwise_transfer.Run(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
b_global_buf,
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_thread_copy_desc,
make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0),
b_thread_even_buf,
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks);
a_blockwise_copy.RunWrite(a_e0_e1_k0_k1_e2_block_copy_desc, a_block_buf);
}
__syncthreads();
if constexpr(HasMainE1BlockLoop)
{
index_t e1_block_data_begin = 0;
// LDS double buffer: main body
// use Do-While loop instead of For loop to simplify control flow
do
{
// even iteration
b_threadwise_transfer.MoveSrcSliceWindow(
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
b_thread_slice_copy_step,
BGlobalMoveSliceWindowStepHacks{});
b_threadwise_transfer.Run(
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
b_global_buf,
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_thread_copy_desc,
make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0),
b_thread_odd_buf,
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks);
// LDS double buffer: GEMM on current data
blockwise_gemm.Run(a_block_buf, b_thread_even_buf, c_thread_buf);
blockwise_gemm.MoveABlockSliceWindow(make_tuple(E1PerBlock, 0, 0));
b_threadwise_transfer.MoveSrcSliceWindow(
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
b_thread_slice_copy_step,
BGlobalMoveSliceWindowStepHacks{});
b_threadwise_transfer.Run(
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
b_global_buf,
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_thread_copy_desc,
make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0),
b_thread_even_buf,
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks);
// LDS double buffer: GEMM on current data
blockwise_gemm.Run(a_block_buf, b_thread_odd_buf, c_thread_buf);
blockwise_gemm.MoveABlockSliceWindow(make_tuple(E1PerBlock, 0, 0));
e1_block_data_begin += 2 * E1PerBlock;
} while(e1_block_data_begin < E1 - 2 * E1PerBlock);
}
// LDS double buffer: tail
if constexpr(HasDoubleTailE1BlockLoop) // two iterations left
{
b_threadwise_transfer.MoveSrcSliceWindow(
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
b_thread_slice_copy_step,
BGlobalMoveSliceWindowStepHacks{});
b_threadwise_transfer.Run(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
b_global_buf,
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_thread_copy_desc,
make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0),
b_thread_odd_buf,
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks);
// LDS double buffer: GEMM on 2nd-last data
blockwise_gemm.Run(a_block_buf, b_thread_even_buf, c_thread_buf);
blockwise_gemm.MoveABlockSliceWindow(make_tuple(E1PerBlock, 0, 0));
// LDS double buffer: GEMM on last data
blockwise_gemm.Run(a_block_buf, b_thread_odd_buf, c_thread_buf);
}
else // one iteration left
{
// LDS double buffer: GEMM on last data
blockwise_gemm.Run(a_block_buf, b_thread_even_buf, c_thread_buf);
}
a_blockwise_copy.MoveSrcSliceWindow(a_e0_e1_k0_k1_e2_grid_desc,
a_block_slice_copy_step,
AGlobalMoveSliceWindowStepHacks{});
blockwise_gemm.MoveABlockSliceWindow(make_tuple(-(E1 - E1PerBlock), 0, 0));
b_threadwise_transfer.MoveSrcSliceWindow(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
b_thread_slice_copy_step,
BGlobalMoveSliceWindowStepHacks{});
e0_block_data_begin += 1;
} while(e0_block_data_begin < E0);
}
else
{
// LDS double buffer: preload data
{
a_blockwise_copy.RunRead(
a_e0_e1_k0_k1_e2_grid_desc, a_global_buf, a_e0_e1_k_e2_global_step_hacks);
b_threadwise_transfer.Run(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
b_global_buf,
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_thread_copy_desc,
make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0),
b_thread_even_buf,
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks);
a_blockwise_copy.RunWrite(a_e0_e1_k0_k1_e2_block_copy_desc, a_block_buf);
}
__syncthreads();
if constexpr(HasMainE1BlockLoop)
{
index_t e1_block_data_begin = 0;
// LDS double buffer: main body
// use Do-While loop instead of For loop to simplify control flow
do
{
// even iteration
b_threadwise_transfer.MoveSrcSliceWindow(
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
b_thread_slice_copy_step,
BGlobalMoveSliceWindowStepHacks{});
b_threadwise_transfer.Run(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
b_global_buf,
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_thread_copy_desc,
make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0),
b_thread_odd_buf,
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks);
// LDS double buffer: GEMM on current data
blockwise_gemm.Run(a_block_buf, b_thread_even_buf, c_thread_buf);
blockwise_gemm.MoveABlockSliceWindow(make_tuple(E1PerBlock, 0, 0));
b_threadwise_transfer.MoveSrcSliceWindow(
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
b_thread_slice_copy_step,
BGlobalMoveSliceWindowStepHacks{});
b_threadwise_transfer.Run(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
b_global_buf,
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_thread_copy_desc,
make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0),
b_thread_even_buf,
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks);
// LDS double buffer: GEMM on current data
blockwise_gemm.Run(a_block_buf, b_thread_odd_buf, c_thread_buf);
blockwise_gemm.MoveABlockSliceWindow(make_tuple(E1PerBlock, 0, 0));
e1_block_data_begin += 2 * E1PerBlock;
} while(e1_block_data_begin < E1 - 2 * E1PerBlock);
}
// LDS double buffer: tail
if constexpr(HasDoubleTailE1BlockLoop) // two iterations left
{
b_threadwise_transfer.MoveSrcSliceWindow(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
b_thread_slice_copy_step,
BGlobalMoveSliceWindowStepHacks{});
b_threadwise_transfer.Run(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
b_global_buf,
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_thread_copy_desc,
make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0),
b_thread_odd_buf,
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks);
// LDS double buffer: GEMM on 2nd-last data
blockwise_gemm.Run(a_block_buf, b_thread_even_buf, c_thread_buf);
blockwise_gemm.MoveABlockSliceWindow(make_tuple(E1PerBlock, 0, 0));
// LDS double buffer: GEMM on last data
blockwise_gemm.Run(a_block_buf, b_thread_odd_buf, c_thread_buf);
}
else // one iteration left
{
// LDS double buffer: GEMM on last data
blockwise_gemm.Run(a_block_buf, b_thread_even_buf, c_thread_buf);
}
}
}
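The E1 loop above is software-pipelined with two register buffers for B: while blockwise_gemm consumes one buffer, the threadwise transfer prefetches the next E1 slice into the other, and the tail handles the final one or two iterations. Below is a minimal host-side sketch of that ping-pong schedule, assuming hypothetical load()/gemm() callables in place of the threadwise transfer and the blockwise GEMM, and num_iters >= 1.

// Minimal sketch of the double-buffered E1 pipeline used above. The two
// std::array buffers play the role of b_thread_even_buf / b_thread_odd_buf.
#include <array>
#include <cstddef>

template <typename LoadFn, typename GemmFn>
void pipelined_loop(std::size_t num_iters, LoadFn load, GemmFn gemm)
{
    std::array<float, 16> even{}, odd{};

    load(0, even); // preload iteration 0 into the even buffer

    std::size_t i = 0;
    for(; i + 2 < num_iters; i += 2) // main body: consume one buffer, fill the other
    {
        load(i + 1, odd);  // prefetch the next tile
        gemm(even);        // compute on the current tile
        load(i + 2, even); // prefetch the tile after that
        gemm(odd);
    }

    if(i + 2 == num_iters) // double tail: two iterations remain
    {
        load(i + 1, odd);
        gemm(even);
        gemm(odd);
    }
    else // single tail: one iteration remains
    {
        gemm(even);
    }
}

The HasMainE1BlockLoop / HasDoubleTailE1BlockLoop flags in the kernel are the compile-time versions of the loop and the two tail branches in this sketch.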
template <typename AGridDesc_E0_E1_K0_K1_E2,
typename BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2,
typename CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2,
typename DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx,
typename CBlockIdToBlockClusterAdaptor_K_N_H_W,
bool HasMainE0BlockLoop>
__device__ static void
Conv(const FloatAB* __restrict__ p_a_global,
const FloatAB* __restrict__ p_b_global,
const FloatC* __restrict__ p_bias_global,
FloatC* __restrict__ p_c_global,
FloatC* __restrict__ p_d_global,
FloatAB* __restrict__ p_shared_block,
const AGridDesc_E0_E1_K0_K1_E2& a_e0_e1_k0_k1_e2_grid_desc,
const BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2& b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
const CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2& c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
const DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx& d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc,
const CBlockIdToBlockClusterAdaptor_K_N_H_W& c_blockid_to_k_n_h_w_block_cluster_adaptor,
integral_constant<bool, HasMainE0BlockLoop>)
{
const auto bias_k0_k1_grid_desc =
MakeBiasK0K1GridDescriptor(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc);
const auto a_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_a_global, a_e0_e1_k0_k1_e2_grid_desc.GetElementSpaceSize());
const auto b_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_b_global, b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc.GetElementSpaceSize());
auto c_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_c_global, c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc.GetElementSpaceSize());
auto d_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_d_global, d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc.GetElementSpaceSize());
auto bias_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_bias_global, bias_k0_k1_grid_desc.GetElementSpaceSize());
constexpr auto c_k1_n_h2_w2_thread_gemm_desc = MakeCK1NH2W2ThreadDescriptor();
// register allocation for output
StaticBuffer<AddressSpaceEnum_t::Vgpr,
FloatAcc,
c_k1_n_h2_w2_thread_gemm_desc.GetElementSpaceSize(),
true>
c_thread_buf;
const auto c_k_n_h_w_block_cluster_idx =
GetCBlockIndex(c_blockid_to_k_n_h_w_block_cluster_adaptor);
const auto c_thread_mtx_index = GetCThreadIndex();
// GemmOp
GemmOp(a_global_buf,
b_global_buf,
c_thread_buf,
p_shared_block,
c_k_n_h_w_block_cluster_idx,
c_thread_mtx_index,
a_e0_e1_k0_k1_e2_grid_desc,
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
c_k1_n_h2_w2_thread_gemm_desc,
integral_constant<bool, HasMainE0BlockLoop>{});
// Output
WriteOut(c_thread_buf,
c_global_buf,
c_k_n_h_w_block_cluster_idx,
c_thread_mtx_index,
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc);
}
template <typename AGridDesc_E0_E1_K0_K1_E2,
typename BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2,
typename CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2,
typename CBlockIdToBlockClusterAdaptor_K_N_H_W,
bool HasMainE0BlockLoop,
ActivTypeEnum_t ActivType>
__device__ static void ConvBiasActiv(
const FloatAB* __restrict__ p_a_global,
const FloatAB* __restrict__ p_b_global,
const FloatC* __restrict__ p_bias_global,
FloatC* __restrict__ p_c_global,
FloatAB* __restrict__ p_shared_block,
const AGridDesc_E0_E1_K0_K1_E2& a_e0_e1_k0_k1_e2_grid_desc,
const BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2& b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
const CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2& c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
const CBlockIdToBlockClusterAdaptor_K_N_H_W& c_blockid_to_k_n_h_w_block_cluster_adaptor,
integral_constant<bool, HasMainE0BlockLoop>,
integral_constant<ActivTypeEnum_t, ActivType>)
{
static constexpr auto activ_type = integral_constant<ActivTypeEnum_t, ActivType>{};
const auto bias_k0_k1_grid_desc =
MakeBiasK0K1GridDescriptor(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc);
const auto a_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_a_global, a_e0_e1_k0_k1_e2_grid_desc.GetElementSpaceSize());
const auto b_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_b_global, b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc.GetElementSpaceSize());
auto c_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_c_global, c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc.GetElementSpaceSize());
auto bias_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_bias_global, bias_k0_k1_grid_desc.GetElementSpaceSize());
constexpr auto c_k1_n_h2_w2_thread_gemm_desc = MakeCK1NH2W2ThreadDescriptor();
// register allocation for output
StaticBuffer<AddressSpaceEnum_t::Vgpr,
FloatAcc,
c_k1_n_h2_w2_thread_gemm_desc.GetElementSpaceSize(),
true>
c_thread_buf;
const auto c_k_n_h_w_block_cluster_idx =
GetCBlockIndex(c_blockid_to_k_n_h_w_block_cluster_adaptor);
const auto c_thread_mtx_index = GetCThreadIndex();
// GemmOp
GemmOp(a_global_buf,
b_global_buf,
c_thread_buf,
p_shared_block,
c_k_n_h_w_block_cluster_idx,
c_thread_mtx_index,
a_e0_e1_k0_k1_e2_grid_desc,
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
c_k1_n_h2_w2_thread_gemm_desc,
integral_constant<bool, HasMainE0BlockLoop>{});
// Bias
BiasOp(bias_global_buf,
c_thread_buf,
c_k_n_h_w_block_cluster_idx,
c_thread_mtx_index,
bias_k0_k1_grid_desc,
c_k1_n_h2_w2_thread_gemm_desc);
// Activ
Activation(c_thread_buf, c_k1_n_h2_w2_thread_gemm_desc, activ_type);
// Output
WriteOut(c_thread_buf,
c_global_buf,
c_k_n_h_w_block_cluster_idx,
c_thread_mtx_index,
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc);
}
template <typename AGridDesc_E0_E1_K0_K1_E2,
typename BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2,
typename CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2,
typename DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx,
typename CBlockIdToBlockClusterAdaptor_K_N_H_W,
bool HasMainE0BlockLoop,
ActivTypeEnum_t ActivType>
__device__ static void ConvBiasActivMaxpool(
const FloatAB* __restrict__ p_a_global,
const FloatAB* __restrict__ p_b_global,
const FloatC* __restrict__ p_bias_global,
FloatC* __restrict__ p_c_global,
FloatC* __restrict__ p_d_global,
FloatAB* __restrict__ p_shared_block,
const AGridDesc_E0_E1_K0_K1_E2& a_e0_e1_k0_k1_e2_grid_desc,
const BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2& b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
const CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2& c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
const DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx& d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc,
const CBlockIdToBlockClusterAdaptor_K_N_H_W& c_blockid_to_k_n_h_w_block_cluster_adaptor,
integral_constant<bool, HasMainE0BlockLoop>,
integral_constant<ActivTypeEnum_t, ActivType>)
{
static constexpr auto activ_type = integral_constant<ActivTypeEnum_t, ActivType>{};
const auto bias_k0_k1_grid_desc =
MakeBiasK0K1GridDescriptor(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc);
const auto a_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_a_global, a_e0_e1_k0_k1_e2_grid_desc.GetElementSpaceSize());
const auto b_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_b_global, b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc.GetElementSpaceSize());
auto c_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_c_global, c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc.GetElementSpaceSize());
auto d_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_d_global, d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc.GetElementSpaceSize());
auto bias_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_bias_global, bias_k0_k1_grid_desc.GetElementSpaceSize());
constexpr auto c_k1_n_h2_w2_thread_gemm_desc = MakeCK1NH2W2ThreadDescriptor();
// register allocation for output
StaticBuffer<AddressSpaceEnum_t::Vgpr,
FloatAcc,
c_k1_n_h2_w2_thread_gemm_desc.GetElementSpaceSize(),
true>
c_thread_buf;
const auto c_k_n_h_w_block_cluster_idx =
GetCBlockIndex(c_blockid_to_k_n_h_w_block_cluster_adaptor);
const auto c_thread_mtx_index = GetCThreadIndex();
// GemmOp
GemmOp(a_global_buf,
b_global_buf,
c_thread_buf,
p_shared_block,
c_k_n_h_w_block_cluster_idx,
c_thread_mtx_index,
a_e0_e1_k0_k1_e2_grid_desc,
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
c_k1_n_h2_w2_thread_gemm_desc,
integral_constant<bool, HasMainE0BlockLoop>{});
// Bias
BiasOp(bias_global_buf,
c_thread_buf,
c_k_n_h_w_block_cluster_idx,
c_thread_mtx_index,
bias_k0_k1_grid_desc,
c_k1_n_h2_w2_thread_gemm_desc);
// Activ
Activation(c_thread_buf, c_k1_n_h2_w2_thread_gemm_desc, activ_type);
// Output
WriteOut(c_thread_buf,
c_global_buf,
c_k_n_h_w_block_cluster_idx,
c_thread_mtx_index,
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc);
// MaxPool
MaxPool(c_thread_buf,
d_global_buf,
c_k_n_h_w_block_cluster_idx,
c_thread_mtx_index,
c_k1_n_h2_w2_thread_gemm_desc,
d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc);
}
template <typename AGridDesc_E0_E1_K0_K1_E2,
typename BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2,
typename CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2,
typename DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx,
typename CBlockIdToBlockClusterAdaptor_K_N_H_W,
bool HasMainE0BlockLoop,
ActivTypeEnum_t ActivType>
__device__ static void ConvBiasActivResizeAdd(
const FloatAB* __restrict__ p_a_global,
const FloatAB* __restrict__ p_b_global,
const FloatC* __restrict__ p_bias_global,
FloatC* __restrict__ p_d_global,
FloatAB* __restrict__ p_shared_block,
const AGridDesc_E0_E1_K0_K1_E2& a_e0_e1_k0_k1_e2_grid_desc,
const BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2& b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
const CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2& c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
const DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx& d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc,
const CBlockIdToBlockClusterAdaptor_K_N_H_W& c_blockid_to_k_n_h_w_block_cluster_adaptor,
integral_constant<bool, HasMainE0BlockLoop>,
integral_constant<ActivTypeEnum_t, ActivType>)
{
static constexpr auto activ_type = integral_constant<ActivTypeEnum_t, ActivType>{};
const auto bias_k0_k1_grid_desc =
MakeBiasK0K1GridDescriptor(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc);
const auto a_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_a_global, a_e0_e1_k0_k1_e2_grid_desc.GetElementSpaceSize());
const auto b_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_b_global, b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc.GetElementSpaceSize());
auto d_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_d_global, d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc.GetElementSpaceSize());
auto bias_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_bias_global, bias_k0_k1_grid_desc.GetElementSpaceSize());
constexpr auto c_k1_n_h2_w2_thread_gemm_desc = MakeCK1NH2W2ThreadDescriptor();
// register allocation for output
StaticBuffer<AddressSpaceEnum_t::Vgpr,
FloatAcc,
c_k1_n_h2_w2_thread_gemm_desc.GetElementSpaceSize(),
true>
c_thread_buf;
const auto c_k_n_h_w_block_cluster_idx =
GetCBlockIndex(c_blockid_to_k_n_h_w_block_cluster_adaptor);
const auto c_thread_mtx_index = GetCThreadIndex();
// GemmOp
GemmOp(a_global_buf,
b_global_buf,
c_thread_buf,
p_shared_block,
c_k_n_h_w_block_cluster_idx,
c_thread_mtx_index,
a_e0_e1_k0_k1_e2_grid_desc,
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
c_k1_n_h2_w2_thread_gemm_desc,
integral_constant<bool, HasMainE0BlockLoop>{});
// Bias
BiasOp(bias_global_buf,
c_thread_buf,
c_k_n_h_w_block_cluster_idx,
c_thread_mtx_index,
bias_k0_k1_grid_desc,
c_k1_n_h2_w2_thread_gemm_desc);
// Activ
Activation(c_thread_buf, c_k1_n_h2_w2_thread_gemm_desc, activ_type);
// Resize_Add
ResizeAdd(c_thread_buf,
d_global_buf,
c_k_n_h_w_block_cluster_idx,
c_thread_mtx_index,
c_k1_n_h2_w2_thread_gemm_desc,
d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc);
}
};
} // namespace ck
#endif
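The four entry points above (Conv, ConvBiasActiv, ConvBiasActivMaxpool, ConvBiasActivResizeAdd) share one backbone: GemmOp accumulates into the register-resident c_thread_buf, and the epilogue stages then run on those registers in a fixed order before anything leaves for global memory. A scalar sketch of that ordering follows; the stage bodies are placeholders (the bias value and leaky-ReLU slope are illustrative), and only the sequencing mirrors the kernel.

// Scalar sketch of the shared stage ordering behind the fusion entry points.
#include <vector>

using Acc = std::vector<float>; // stands in for the register-resident c_thread_buf

void gemm_op(Acc&)          {} // accumulate implicit-GEMM partial sums
void bias_op(Acc& c)        { for(auto& v : c) v += 1.0f; }               // per-channel bias, placeholder value
void activation(Acc& c)     { for(auto& v : c) v = v > 0.f ? v : 0.3f * v; } // slope is illustrative
void write_out(const Acc&)  {} // store to the conv output tensor
void max_pool(const Acc&)   {} // 2x2 window max into the pooled tensor
void resize_add(const Acc&) {} // 2x upsample and accumulate into the D tensor

void conv(Acc& c)                    { gemm_op(c); write_out(c); }
void conv_bias_activ(Acc& c)         { gemm_op(c); bias_op(c); activation(c); write_out(c); }
void conv_bias_activ_maxpool(Acc& c) { conv_bias_activ(c); max_pool(c); }
void conv_bias_activ_resize_add(Acc& c)
{
    gemm_op(c); bias_op(c); activation(c); resize_add(c); // no separate write-out
}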
@@ -9,21 +9,22 @@ namespace ck {
// C[M, N] += transpose(A[K, M]) * B[K, N]
// Element of matrix can be vectorized data
// Assume:
// 1. ADesc, BDesc, CDesc are known at compile-time
// 1. AThreadDesc_E1_K_E2, BThreadDesc_E1_N_Ho_Wo_E2, CThreadDesc_K_N_Ho_Wo are known at
// compile-time
// 2. AOriginIdx, BOriginIdx, COriginIdx are known at compile-time
template <typename FloatA,
typename FloatB,
typename FloatC,
typename ADesc,
typename BDesc,
typename CDesc,
index_t H,
index_t W,
typename enable_if<ADesc::IsKnownAtCompileTime() && BDesc::IsKnownAtCompileTime() &&
CDesc::IsKnownAtCompileTime(),
typename AThreadDesc_E1_K_E2,
typename BThreadDesc_E1_N_Ho_Wo_E2,
typename CThreadDesc_K_N_Ho_Wo,
typename enable_if<AThreadDesc_E1_K_E2::IsKnownAtCompileTime() &&
BThreadDesc_E1_N_Ho_Wo_E2::IsKnownAtCompileTime() &&
CThreadDesc_K_N_Ho_Wo::IsKnownAtCompileTime(),
bool>::type = false>
struct ThreadwiseGemmDlops_km_kn_mn_v3
{
template <typename ABuffer,
typename AOriginIdx,
typename BBuffer,
@@ -37,8 +38,10 @@ struct ThreadwiseGemmDlops_km_kn_mn_v3
CBuffer& c_buf,
COriginIdx)
{
static_assert(ADesc::IsKnownAtCompileTime() && BDesc::IsKnownAtCompileTime() &&
CDesc::IsKnownAtCompileTime(),
static_assert(AThreadDesc_E1_K_E2::IsKnownAtCompileTime() &&
BThreadDesc_E1_N_Ho_Wo_E2::IsKnownAtCompileTime() &&
CThreadDesc_K_N_Ho_Wo::IsKnownAtCompileTime(),
"wrong! Desc should be known at compile-time");
static_assert(is_known_at_compile_time<remove_cvref_t<AOriginIdx>>::value &&
@@ -54,102 +57,107 @@ struct ThreadwiseGemmDlops_km_kn_mn_v3
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto E = ADesc{}.GetLength(I0);
constexpr auto K = ADesc{}.GetLength(I1);
constexpr auto E1 = AThreadDesc_E1_K_E2{}.GetLength(I0);
constexpr auto K = AThreadDesc_E1_K_E2{}.GetLength(I1);
constexpr auto E2 = AThreadDesc_E1_K_E2{}.GetLength(I2);
constexpr auto Ho = BThreadDesc_E1_N_Ho_Wo_E2{}.GetLength(I2);
constexpr auto Wo = BThreadDesc_E1_N_Ho_Wo_E2{}.GetLength(I3);
constexpr auto a_origin_idx = to_multi_index(AOriginIdx{});
constexpr auto b_origin_idx = to_multi_index(BOriginIdx{});
constexpr auto c_origin_idx = to_multi_index(COriginIdx{});
static_for<0, E, 1>{}([&](auto e) {
if constexpr((Ho % 2 == 0) && (Wo % 2 == 0))
{
constexpr auto SubHW = 2;
static_for<0, K, 1>{}([&](auto k) {
constexpr index_t a_offset =
ADesc{}.CalculateOffset(a_origin_idx + make_tuple(e, k));
static_for<0, Ho, SubHW>{}([&](auto h) {
static_for<0, Wo, SubHW>{}([&](auto w) {
static_for<0, E1, 1>{}([&](auto e1) {
static_for<0, E2, 1>{}([&](auto e2) {
constexpr index_t a_offset = AThreadDesc_E1_K_E2{}.CalculateOffset(
a_origin_idx + make_tuple(e1, k, e2));
if constexpr(H == 2 && W == 2)
{
constexpr index_t b_offset_0 =
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 0, 0));
constexpr index_t b_offset_1 =
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 0, 1));
constexpr index_t b_offset_2 =
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 1, 0));
constexpr index_t b_offset_3 =
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 1, 1));
constexpr index_t c_offset_0 =
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 0, 0));
constexpr index_t c_offset_1 =
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 0, 1));
constexpr index_t c_offset_2 =
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 1, 0));
constexpr index_t c_offset_3 =
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 1, 1));
constexpr index_t b0_offset =
BThreadDesc_E1_N_Ho_Wo_E2{}.CalculateOffset(
b_origin_idx + make_tuple(e1, 0, h, w, e2));
amd_assembly_outer_product_1x4(a_buf[Number<a_offset>{}],
b_buf[Number<b_offset_0>{}],
b_buf[Number<b_offset_1>{}],
b_buf[Number<b_offset_2>{}],
b_buf[Number<b_offset_3>{}],
c_buf(Number<c_offset_0>{}),
c_buf(Number<c_offset_1>{}),
c_buf(Number<c_offset_2>{}),
c_buf(Number<c_offset_3>{}));
}
else if constexpr(H == 4 && W == 1)
{
constexpr index_t b_offset_0 =
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 0, 0));
constexpr index_t b_offset_1 =
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 1, 0));
constexpr index_t b_offset_2 =
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 2, 0));
constexpr index_t b_offset_3 =
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 3, 0));
constexpr index_t c_offset_0 =
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 0, 0));
constexpr index_t c_offset_1 =
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 1, 0));
constexpr index_t c_offset_2 =
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 2, 0));
constexpr index_t c_offset_3 =
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 3, 0));
constexpr index_t b1_offset =
BThreadDesc_E1_N_Ho_Wo_E2{}.CalculateOffset(
b_origin_idx + make_tuple(e1, 0, h, w + 1, e2));
constexpr index_t b2_offset =
BThreadDesc_E1_N_Ho_Wo_E2{}.CalculateOffset(
b_origin_idx + make_tuple(e1, 0, h + 1, w, e2));
constexpr index_t b3_offset =
BThreadDesc_E1_N_Ho_Wo_E2{}.CalculateOffset(
b_origin_idx + make_tuple(e1, 0, h + 1, w + 1, e2));
constexpr index_t c0_offset =
CThreadDesc_K_N_Ho_Wo{}.CalculateOffset(c_origin_idx +
make_tuple(k, 0, h, w));
constexpr index_t c1_offset =
CThreadDesc_K_N_Ho_Wo{}.CalculateOffset(
c_origin_idx + make_tuple(k, 0, h, w + 1));
constexpr index_t c2_offset =
CThreadDesc_K_N_Ho_Wo{}.CalculateOffset(
c_origin_idx + make_tuple(k, 0, h + 1, w));
constexpr index_t c3_offset =
CThreadDesc_K_N_Ho_Wo{}.CalculateOffset(
c_origin_idx + make_tuple(k, 0, h + 1, w + 1));
amd_assembly_outer_product_1x4(a_buf[Number<a_offset>{}],
b_buf[Number<b_offset_0>{}],
b_buf[Number<b_offset_1>{}],
b_buf[Number<b_offset_2>{}],
b_buf[Number<b_offset_3>{}],
c_buf(Number<c_offset_0>{}),
c_buf(Number<c_offset_1>{}),
c_buf(Number<c_offset_2>{}),
c_buf(Number<c_offset_3>{}));
b_buf[Number<b0_offset>{}],
b_buf[Number<b1_offset>{}],
b_buf[Number<b2_offset>{}],
b_buf[Number<b3_offset>{}],
c_buf(Number<c0_offset>{}),
c_buf(Number<c1_offset>{}),
c_buf(Number<c2_offset>{}),
c_buf(Number<c3_offset>{}));
});
});
});
});
});
}
else
{
static_for<0, H, 1>{}([&](auto h) {
static_for<0, W, 1>{}([&](auto w) {
static_for<0, K, 1>{}([&](auto k) {
static_for<0, Ho, 1>{}([&](auto h) {
static_for<0, Wo, 1>{}([&](auto w) {
static_for<0, E1, 1>{}([&](auto e1) {
static_for<0, E2, 1>{}([&](auto e2) {
constexpr index_t a_offset = AThreadDesc_E1_K_E2{}.CalculateOffset(
a_origin_idx + make_tuple(e1, k, e2));
constexpr index_t b_offset =
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, h, w));
BThreadDesc_E1_N_Ho_Wo_E2{}.CalculateOffset(
b_origin_idx + make_tuple(e1, 0, h, w, e2));
constexpr index_t c_offset =
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, h, w));
CThreadDesc_K_N_Ho_Wo{}.CalculateOffset(c_origin_idx +
make_tuple(k, 0, h, w));
#if 0
c_buf(Number<c_offset>{}) += inner_product_with_conversion<FloatC>{}(
a_buf[Number<a_offset>{}], b_buf[Number<b_offset>{}]);
#else
amd_assembly_inner_product(a_buf[Number<a_offset>{}],
inner_product<FloatA, FloatB, FloatC>(a_buf[Number<a_offset>{}],
b_buf[Number<b_offset>{}],
c_buf(Number<c_offset>{}));
#endif
});
});
}
});
});
});
}
}
};
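In the fast path above, amd_assembly_outer_product_1x4 multiplies a single A element against four neighbouring B elements (the 2x2 output tile addressed by the b0..b3 / c0..c3 offsets) and accumulates into four C registers. A scalar reference of what one such call computes, using fp32 stand-ins for the packed dot-product path:

// Scalar reference for amd_assembly_outer_product_1x4 as used above.
inline void outer_product_1x4_ref(float a,
                                  float b0, float b1, float b2, float b3,
                                  float& c0, float& c1, float& c2, float& c3)
{
    c0 += a * b0; // C[k, h,     w    ]
    c1 += a * b1; // C[k, h,     w + 1]
    c2 += a * b2; // C[k, h + 1, w    ]
    c3 += a * b3; // C[k, h + 1, w + 1]
}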
@@ -217,6 +217,22 @@ struct ThreadwiseTensorSliceTransfer_v1r3
is_dst_valid,
dst_vector.template AsType<dst_vector_t>()[Number<0>{}]);
}
else if constexpr(DstInMemOp == InMemoryDataOperationEnum_t::Add)
{
typename vector_type_maker<DstData, DstScalarPerVector>::type tmp;
tmp.template AsType<dst_vector_t>()(Number<0>{}) =
dst_buf.template Get<dst_vector_t>(dst_coord_.GetOffset(), is_dst_valid);
static_for<0, DstScalarPerVector, 1>{}([&](auto t) {
dst_vector.template AsType<DstData>()(t) += tmp.template AsType<DstData>()[t];
});
dst_buf.template Set<dst_vector_t>(
dst_coord_.GetOffset(),
is_dst_valid,
dst_vector.template AsType<dst_vector_t>()[Number<0>{}]);
}
constexpr auto move_on_dim = [&]() constexpr
{
@@ -666,6 +682,25 @@ struct ThreadwiseTensorSliceTransfer_v2
move_tensor_coordinate(src_desc, src_coord_, adjusted_step);
}
// src_slice_origin_step_idx needs to be known at compile-time, for performance reasons
template <typename SrcMoveSliceWindowStepHack>
__device__ void
MoveSrcSliceWindow(const SrcDesc& src_desc,
const Index& src_slice_origin_step_idx,
const SrcMoveSliceWindowStepHack& src_move_slice_window_step_hack)
{
// if the src coordinate was not reset by RunRead(), adjust the step here to compensate
const auto adjusted_step_idx =
SrcResetCoordinateAfterRun ? src_slice_origin_step_idx
: src_slice_origin_step_idx + GetSrcCoordinateResetStep();
// is it OK to construct a new step every time?
const auto adjusted_step = make_tensor_coordinate_step(
src_desc, adjusted_step_idx, src_move_slice_window_step_hack);
move_tensor_coordinate(src_desc, src_coord_, adjusted_step);
}
private:
SrcCoord src_coord_;
};
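The new InMemoryDataOperationEnum_t::Add branch earlier in this file implements write-out as a plain read-modify-write: fetch the current destination vector, add it to the vector being stored, and write the sum back. Unlike AtomicAdd it is not atomic, so it assumes each destination element is owned by exactly one thread. A minimal sketch over a flat array in place of the coordinate-addressed buffer:

// Minimal sketch of the Add write-out semantics: load, accumulate, store.
#include <cstddef>

template <typename T>
void store_with_add(T* dst, const T* src, std::size_t n)
{
    for(std::size_t i = 0; i < n; ++i)
    {
        const T prev = dst[i];  // Get(): read the existing destination value
        dst[i] = src[i] + prev; // Set(): write back the accumulated sum
    }
}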
@@ -591,6 +591,7 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
}
else if constexpr(N == 8)
{
#if 0
vector_type<half_t, 8> tmp{src_thread_data};
llvm_amdgcn_raw_buffer_store_fp16x4(tmp.AsType<half4_t>()[Number<0>{}],
@@ -604,6 +605,13 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
dst_thread_addr_offset,
dst_wave_addr_offset + 4 * sizeof(half_t),
0);
#else
llvm_amdgcn_raw_buffer_store_fp32x4(as_type<float4_t>(src_thread_data),
dst_wave_buffer_resource,
dst_thread_addr_offset,
dst_wave_addr_offset,
0);
#endif
}
}
else if constexpr(is_same<T, ushort>::value)
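The #else branch above collapses two fp16x4 buffer stores into a single fp32x4 store by reinterpreting eight half values as four floats: the payload is the same 16 bytes either way, but it goes out in one 128-bit instruction. A host-side sketch of that reinterpretation, using memcpy as the well-defined bit cast (Float4 is illustrative, not CK's float4_t):

// Host-side sketch of the 8 x fp16 -> 4 x fp32 bit cast used above.
#include <cstdint>
#include <cstring>

struct Float4 { float x, y, z, w; };

inline Float4 as_float4(const std::uint16_t (&h)[8])
{
    static_assert(sizeof(Float4) == sizeof(h), "both views must span 16 bytes");
    Float4 f;
    std::memcpy(&f, h, sizeof(f)); // well-defined reinterpretation of the bytes
    return f;
}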
@@ -96,6 +96,7 @@
// pass tensor descriptor by value or void*
#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE 1
#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER 0
#define CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR 0
// merge transformation use magic number division
#define CK_EXPERIMENTAL_MERGE_USE_MAGIC_DIVISION 1
@@ -128,7 +129,15 @@ namespace ck {
enum InMemoryDataOperationEnum_t
{
Set,
AtomicAdd
AtomicAdd,
Add
};
enum ActivTypeEnum_t
{
None = 0,
LeakyRelu,
Sigmoid
};
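ActivTypeEnum_t travels through the kernels as a non-type template parameter wrapped in an integral_constant, so the activation is selected at compile time rather than branched on per element. A sketch of that dispatch pattern (the leaky-ReLU slope is an assumption, not a value taken from this change):

// Sketch of compile-time dispatch on ActivTypeEnum_t via if constexpr,
// mirroring how activ_type is threaded through the fused kernels.
#include <cmath>

enum ActivTypeEnum_t { None = 0, LeakyRelu, Sigmoid };

template <ActivTypeEnum_t ActivType>
float apply_activ(float x)
{
    if constexpr(ActivType == LeakyRelu)
        return x > 0.f ? x : 0.3f * x; // slope chosen for illustration
    else if constexpr(ActivType == Sigmoid)
        return 1.f / (1.f + std::exp(-x));
    else
        return x; // None: pass through
}

A call such as apply_activ<LeakyRelu>(x) resolves to the chosen branch with no runtime switch.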
// index type
@@ -13,16 +13,25 @@ include_directories(BEFORE
)
set(CONV_FWD_DRIVER_OFFLINE_SOURCE src/conv_fwd_driver_offline.cpp)
set(CONV_FWD_DRIVER_OFFLINE_NCHWC_SOURCE src/conv_fwd_driver_offline_nchwc.cpp)
set(CONV_ADD_FWD_DRIVER_OFFLINE_NCHWC_SOURCE src/conv_add_fwd_driver_offline_nchwc.cpp)
set(CONV_MAXPOOL_FWD_DRIVER_OFFLINE_NCHWC_SOURCE src/conv_maxpool_fwd_driver_offline_nchwc.cpp)
set(CONV_BWD_DRIVER_OFFLINE_SOURCE src/conv_bwd_driver_offline.cpp)
set(CONV_WRW_DRIVER_OFFLINE_SOURCE src/conv_wrw_driver_offline.cpp)
set(GEMM_DRIVER_OFFLINE_SOURCE src/gemm_driver_offline.cpp)
add_executable(conv_fwd_driver_offline ${CONV_FWD_DRIVER_OFFLINE_SOURCE})
add_executable(conv_fwd_driver_offline_nchwc ${CONV_FWD_DRIVER_OFFLINE_NCHWC_SOURCE})
add_executable(conv_add_fwd_driver_offline_nchwc ${CONV_ADD_FWD_DRIVER_OFFLINE_NCHWC_SOURCE})
add_executable(conv_maxpool_fwd_driver_offline_nchwc ${CONV_MAXPOOL_FWD_DRIVER_OFFLINE_NCHWC_SOURCE})
add_executable(conv_bwd_driver_offline ${CONV_BWD_DRIVER_OFFLINE_SOURCE})
add_executable(conv_wrw_driver_offline ${CONV_WRW_DRIVER_OFFLINE_SOURCE})
add_executable(gemm_driver_offline ${GEMM_DRIVER_OFFLINE_SOURCE})
target_link_libraries(conv_fwd_driver_offline PRIVATE host_tensor)
target_link_libraries(conv_fwd_driver_offline_nchwc PRIVATE host_tensor)
target_link_libraries(conv_add_fwd_driver_offline_nchwc PRIVATE host_tensor)
target_link_libraries(conv_maxpool_fwd_driver_offline_nchwc PRIVATE host_tensor)
target_link_libraries(conv_bwd_driver_offline PRIVATE host_tensor)
target_link_libraries(conv_wrw_driver_offline PRIVATE host_tensor)
target_link_libraries(gemm_driver_offline PRIVATE host_tensor)
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "driver_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp"
template <typename TInWei,
typename TAcc,
typename TOut,
ck::ActivTypeEnum_t activ_type,
typename InLengths,
typename WeiLengths,
typename AddLengths,
typename OutLengths,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1(
const InLengths& in_n_c0_hi_wi_c1_lengths,
const WeiLengths& wei_k_c0_y_x_c1_lengths,
const AddLengths& add_n_k0_hox2_wox2_k1_lengths,
const OutLengths& out_n_k0_ho_wo_k1_lengths,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads& in_right_pads,
const Tensor<TInWei>& in_n_c0_hi_wi_c1,
const Tensor<TInWei>& wei_k_c0_y_x_c1,
const Tensor<TOut>& bias_k0_k1,
const Tensor<TOut>& add_n_k0_hox2_wox2_k1,
Tensor<TOut>& add_n_k0_hox2_wox2_k1_out,
ck::index_t nrepeat)
{
using namespace ck;
std::cout << __func__ << std::endl;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto I4 = Number<4>{};
const auto N = out_n_k0_ho_wo_k1_lengths[I0];
const auto K0 = out_n_k0_ho_wo_k1_lengths[I1];
const auto Ho = out_n_k0_ho_wo_k1_lengths[I2];
const auto Wo = out_n_k0_ho_wo_k1_lengths[I3];
const auto K1 = out_n_k0_ho_wo_k1_lengths[I4];
const auto C0 = in_n_c0_hi_wi_c1_lengths[I1];
const auto Hi = in_n_c0_hi_wi_c1_lengths[I2];
const auto Wi = in_n_c0_hi_wi_c1_lengths[I3];
const auto C1 = in_n_c0_hi_wi_c1_lengths[I4];
const auto K = wei_k_c0_y_x_c1_lengths[I0];
const auto Y = wei_k_c0_y_x_c1_lengths[I2];
const auto X = wei_k_c0_y_x_c1_lengths[I3];
const auto Hox2 = add_n_k0_hox2_wox2_k1_lengths[I2];
const auto Wox2 = add_n_k0_hox2_wox2_k1_lengths[I3];
DeviceMem in_n_c0_hi_wi_c1_device_buf(sizeof(TInWei) *
in_n_c0_hi_wi_c1.mDesc.GetElementSpace());
DeviceMem wei_k_c0_y_x_c1_device_buf(sizeof(TInWei) * wei_k_c0_y_x_c1.mDesc.GetElementSpace());
DeviceMem bias_k0_k1_device_buf(sizeof(TOut) * bias_k0_k1.mDesc.GetElementSpace());
DeviceMem add_n_k0_hox2_wox2_k1_device_buf(sizeof(TOut) *
add_n_k0_hox2_wox2_k1.mDesc.GetElementSpace());
in_n_c0_hi_wi_c1_device_buf.ToDevice(in_n_c0_hi_wi_c1.mData.data());
wei_k_c0_y_x_c1_device_buf.ToDevice(wei_k_c0_y_x_c1.mData.data());
bias_k0_k1_device_buf.ToDevice(bias_k0_k1.mData.data());
add_n_k0_hox2_wox2_k1_device_buf.ToDevice(add_n_k0_hox2_wox2_k1.mData.data());
constexpr index_t InWeiVectorSize = 8;
if(C1 % InWeiVectorSize != 0)
{
throw std::runtime_error("wrong! C1 is not divisible by InWeiVectorSize");
}
#if 0
constexpr index_t BlockSize = 256;
constexpr index_t KPerBlock = 32;
constexpr index_t HoPerBlock = 8;
constexpr index_t WoPerBlock = 64;
constexpr index_t E1 = C0 * 9;
constexpr index_t E2 = 1;
constexpr index_t E1PerBlock = C0;
constexpr index_t KPerThread = 16;
constexpr index_t HoPerThread = 2;
constexpr index_t WoPerThread = 2;
constexpr index_t EPerThread = 1;
using ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2 = Sequence<1, 9, 1, E2>;
using ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2 = Sequence<1, E1PerBlock, KPerBlock, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_E2 = E2;
constexpr index_t ABlockTransferDstScalarPerVector_E2 = E2;
constexpr index_t BThreadTransferSrcScalarPerVector_E2 = E2;
constexpr index_t CThreadTransferDstScalarPerVector_K = K1;
#elif 1
constexpr auto BlockSize = 64;
constexpr auto KPerBlock = 8;
constexpr auto HoPerBlock = 8;
constexpr auto WoPerBlock = 32;
constexpr auto E1 = 2 * 9;
constexpr auto E2 = 1;
constexpr auto K2 = 2;
constexpr auto E1PerBlock = 2;
constexpr auto KPerThread = KPerBlock;
constexpr auto HoPerThread = 2;
constexpr auto WoPerThread = 2;
constexpr auto EPerThread = 1;
using ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2 = Sequence<1, 9, 1, 1, E2>;
using ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2 =
Sequence<1, E1PerBlock, 1, KPerBlock, 1>;
constexpr auto ABlockTransferSrcScalarPerVector_E2 = E2;
constexpr auto ABlockTransferDstScalarPerVector_E2 = E2;
constexpr auto BThreadTransferSrcScalarPerVector_E2 = E2;
constexpr auto CThreadTransferDstScalarPerVector_K = InWeiVectorSize;
#endif
const auto in_n_c0_hi_wi_c1_desc =
make_naive_tensor_descriptor_packed(make_tuple(N, C0, Hi, Wi, E2));
const auto wei_k_c0_y_x_c1_desc =
make_naive_tensor_descriptor_packed(make_tuple(K, C0, Y, X, E2));
const auto add_n_k0_hox2_wox2_k1_desc =
make_naive_tensor_descriptor_packed(make_tuple(N, K0, Hox2, Wox2, K1));
const auto out_n_k0_ho_wo_k1_desc =
make_naive_tensor_descriptor_packed(make_tuple(N, K0, Ho, Wo, K1));
constexpr auto conv_driver =
DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_add<
BlockSize,
typename vector_type<TInWei, InWeiVectorSize>::type,
TAcc,
TOut,
E1,
E2,
K2,
KPerBlock,
HoPerBlock,
WoPerBlock,
E1PerBlock,
KPerThread,
HoPerThread,
WoPerThread,
EPerThread,
ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2,
ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2,
ABlockTransferSrcScalarPerVector_E2,
ABlockTransferDstScalarPerVector_E2,
BThreadTransferSrcScalarPerVector_E2,
CThreadTransferDstScalarPerVector_K,
activ_type>{};
std::cerr << "conv_bias_activ_resize_add_input_"
<< "n" << N << "c" << C0 << "h" << Hi << "w" << Wi << "c" << C1 << "_filter_k" << K
<< "c" << C0 << "y" << Y << "x" << X << "c" << C1 << "_addout_n" << N << "k" << K0
<< "h" << Ho * 2 << "w" << Wo * 2 << "k" << K1 << std::endl;
for(int i = 0; i < 5; i++)
{
const auto ave_time =
conv_driver.Run(wei_k_c0_y_x_c1_desc,
in_n_c0_hi_wi_c1_desc,
out_n_k0_ho_wo_k1_desc,
add_n_k0_hox2_wox2_k1_desc,
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads,
static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
wei_k_c0_y_x_c1_device_buf.GetDeviceBuffer()),
static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
in_n_c0_hi_wi_c1_device_buf.GetDeviceBuffer()),
static_cast<TOut*>(bias_k0_k1_device_buf.GetDeviceBuffer()),
static_cast<TOut*>(add_n_k0_hox2_wox2_k1_device_buf.GetDeviceBuffer()),
nrepeat);
{
float perf = static_cast<float>(std::size_t(2) * N * K * Ho * Wo * C0 * C1 * Y * X) /
(std::size_t(1000) * 1000 * 1000) / ave_time;
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
<< std::endl;
}
}
add_n_k0_hox2_wox2_k1_device_buf.ToDevice(add_n_k0_hox2_wox2_k1.mData.data());
conv_driver.Run(wei_k_c0_y_x_c1_desc,
in_n_c0_hi_wi_c1_desc,
out_n_k0_ho_wo_k1_desc,
add_n_k0_hox2_wox2_k1_desc,
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads,
static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
wei_k_c0_y_x_c1_device_buf.GetDeviceBuffer()),
static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
in_n_c0_hi_wi_c1_device_buf.GetDeviceBuffer()),
static_cast<TOut*>(bias_k0_k1_device_buf.GetDeviceBuffer()),
static_cast<TOut*>(add_n_k0_hox2_wox2_k1_device_buf.GetDeviceBuffer()),
0);
add_n_k0_hox2_wox2_k1_device_buf.FromDevice(add_n_k0_hox2_wox2_k1_out.mData.data());
}
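The TFlop/s figure printed above counts two FLOPs per multiply-accumulate over the whole convolution, 2 * N * K * Ho * Wo * (C0 * C1) * Y * X, and divides by 1e9 and by the average time in milliseconds (FLOP / 1e9 / ms equals TFlop/s). A standalone sketch of that arithmetic, with C standing for the combined channel count C0 * C1:

// Standalone sketch of the performance formula used by these drivers.
#include <cstddef>

inline float conv_tflops(std::size_t N, std::size_t K, std::size_t Ho,
                         std::size_t Wo, std::size_t C, std::size_t Y,
                         std::size_t X, float ave_time_ms)
{
    const double flop = 2.0 * N * K * Ho * Wo * C * Y * X; // one mul + one add per MAC
    return static_cast<float>(flop / 1.0e9 / ave_time_ms); // FLOP / 1e9 / ms == TFlop/s
}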
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "driver_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp"
template <typename TInWei,
typename TAcc,
typename TOut,
ck::ActivTypeEnum_t activ_type,
typename InLengths,
typename WeiLengths,
typename OutLengths,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1(
const InLengths& in_n_c0_hi_wi_c1_lengths,
const WeiLengths& wei_k_c0_y_x_c1_lengths,
const OutLengths& out_n_k0_ho_wo_k1_lengths,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads& in_right_pads,
const Tensor<TInWei>& in_n_c0_hi_wi_c1,
const Tensor<TInWei>& wei_k_c0_y_x_c1,
const Tensor<TOut>& bias_k0_k1,
Tensor<TOut>& out_n_k0_ho_wo_k1,
ck::index_t nrepeat)
{
using namespace ck;
std::cout << __func__ << std::endl;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto I4 = Number<4>{};
const auto N = out_n_k0_ho_wo_k1_lengths[I0];
const auto K0 = out_n_k0_ho_wo_k1_lengths[I1];
const auto Ho = out_n_k0_ho_wo_k1_lengths[I2];
const auto Wo = out_n_k0_ho_wo_k1_lengths[I3];
const auto K1 = out_n_k0_ho_wo_k1_lengths[I4];
const auto C0 = in_n_c0_hi_wi_c1_lengths[I1];
const auto Hi = in_n_c0_hi_wi_c1_lengths[I2];
const auto Wi = in_n_c0_hi_wi_c1_lengths[I3];
const auto C1 = in_n_c0_hi_wi_c1_lengths[I4];
const auto K = wei_k_c0_y_x_c1_lengths[I0];
const auto Y = wei_k_c0_y_x_c1_lengths[I2];
const auto X = wei_k_c0_y_x_c1_lengths[I3];
DeviceMem in_n_c0_hi_wi_c1_device_buf(sizeof(TInWei) *
in_n_c0_hi_wi_c1.mDesc.GetElementSpace());
DeviceMem wei_k_c0_y_x_c1_device_buf(sizeof(TInWei) * wei_k_c0_y_x_c1.mDesc.GetElementSpace());
DeviceMem bias_k0_k1_device_buf(sizeof(TOut) * bias_k0_k1.mDesc.GetElementSpace());
DeviceMem out_n_k0_ho_wo_k1_device_buf(sizeof(TOut) *
out_n_k0_ho_wo_k1.mDesc.GetElementSpace());
in_n_c0_hi_wi_c1_device_buf.ToDevice(in_n_c0_hi_wi_c1.mData.data());
wei_k_c0_y_x_c1_device_buf.ToDevice(wei_k_c0_y_x_c1.mData.data());
bias_k0_k1_device_buf.ToDevice(bias_k0_k1.mData.data());
constexpr index_t InWeiVectorSize = 8;
if(C1 % InWeiVectorSize != 0)
{
throw std::runtime_error("wrong! C1 is not divisible by InWeiVectorSize");
}
#if 0
constexpr index_t BlockSize = 256;
constexpr index_t KPerBlock = 32;
constexpr index_t HoPerBlock = 8;
constexpr index_t WoPerBlock = 64;
constexpr index_t E1 = C0 * 9;
constexpr index_t E2 = 1;
constexpr index_t E1PerBlock = C0;
constexpr index_t KPerThread = 16;
constexpr index_t HoPerThread = 2;
constexpr index_t WoPerThread = 2;
constexpr index_t EPerThread = 1;
using ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2 = Sequence<1, 9, 1, E2>;
using ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2 = Sequence<1, E1PerBlock, KPerBlock, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_E2 = E2;
constexpr index_t ABlockTransferDstScalarPerVector_E2 = E2;
constexpr index_t BThreadTransferSrcScalarPerVector_E2 = E2;
constexpr index_t CThreadTransferDstScalarPerVector_K = K1;
#elif 1
constexpr index_t BlockSize = 64;
constexpr index_t KPerBlock = 8;
constexpr index_t HoPerBlock = 8;
constexpr index_t WoPerBlock = 32;
constexpr index_t E1 = 2 * 9;
constexpr index_t E2 = 1;
constexpr index_t K2 = 2;
constexpr index_t E1PerBlock = 2;
constexpr index_t KPerThread = KPerBlock;
constexpr index_t HoPerThread = 2;
constexpr index_t WoPerThread = 2;
constexpr index_t EPerThread = 1;
using ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2 = Sequence<1, 9, 1, 1, E2>;
using ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2 =
Sequence<1, E1PerBlock, 1, KPerBlock, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_E2 = E2;
constexpr index_t ABlockTransferDstScalarPerVector_E2 = E2;
constexpr index_t BThreadTransferSrcScalarPerVector_E2 = E2;
constexpr index_t CThreadTransferDstScalarPerVector_K = InWeiVectorSize;
#endif
if(KPerThread % InWeiVectorSize != 0)
{
throw std::runtime_error("wrong! KPerThread is not divisible by InWeiVectorSize");
}
const auto in_n_c0_hi_wi_c1_desc =
make_naive_tensor_descriptor_packed(make_tuple(N, C0, Hi, Wi, E2));
const auto wei_k_c0_y_x_c1_desc =
make_naive_tensor_descriptor_packed(make_tuple(K, C0, Y, X, E2));
const auto out_n_k0_ho_wo_k1_desc =
make_naive_tensor_descriptor_packed(make_tuple(N, K0, Ho, Wo, K1));
constexpr auto conv_driver =
DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_outpad<
BlockSize,
typename vector_type<TInWei, InWeiVectorSize>::type,
TAcc,
TOut,
E1,
E2,
K2,
KPerBlock,
HoPerBlock,
WoPerBlock,
E1PerBlock,
KPerThread,
HoPerThread,
WoPerThread,
EPerThread,
ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2,
ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2,
ABlockTransferSrcScalarPerVector_E2,
ABlockTransferDstScalarPerVector_E2,
BThreadTransferSrcScalarPerVector_E2,
CThreadTransferDstScalarPerVector_K,
activ_type>{};
std::cerr << "conv_bias_activ_input_"
<< "n" << N << "c" << C0 << "h" << Hi << "w" << Wi << "c" << C1 << "_filter_k" << K
<< "c" << C0 << "y" << Y << "x" << X << "c" << C1 << "_convout_n" << N << "k" << K0
<< "h" << Ho << "w" << Wo << "k" << K1 << std::endl;
for(int i = 0; i < 5; i++)
{
const auto ave_time =
conv_driver.Run(wei_k_c0_y_x_c1_desc,
in_n_c0_hi_wi_c1_desc,
out_n_k0_ho_wo_k1_desc,
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads,
static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
wei_k_c0_y_x_c1_device_buf.GetDeviceBuffer()),
static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
in_n_c0_hi_wi_c1_device_buf.GetDeviceBuffer()),
static_cast<TOut*>(bias_k0_k1_device_buf.GetDeviceBuffer()),
static_cast<TOut*>(out_n_k0_ho_wo_k1_device_buf.GetDeviceBuffer()),
nrepeat);
{
float perf = static_cast<float>(std::size_t(2) * N * K * Ho * Wo * C0 * C1 * Y * X) /
(std::size_t(1000) * 1000 * 1000) / ave_time;
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
<< std::endl;
}
}
out_n_k0_ho_wo_k1_device_buf.FromDevice(out_n_k0_ho_wo_k1.mData.data());
}
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "driver_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp"
#include "driver_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw_outpad.hpp"
template <typename TInWei,
ck::index_t InWeiVectorSize,
typename TAcc,
typename TOut,
typename InLengths,
typename WeiLengths,
typename OutLengths,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw(
const InLengths& in_n_c_hi_wi_lengths,
const WeiLengths& wei_k_c_y_x_lengths,
const OutLengths& out_n_k_ho_wo_lengths,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads& in_right_pads,
const Tensor<TInWei>& in_n_c_hi_wi,
const Tensor<TInWei>& wei_k_c_y_x,
Tensor<TOut>& out_n_k_ho_wo,
ck::index_t /* nrepeat */)
{
using namespace ck;
std::cout << __func__ << std::endl;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
const auto N = out_n_k_ho_wo_lengths[I0];
const auto K = out_n_k_ho_wo_lengths[I1];
const auto C = wei_k_c_y_x_lengths[I1];
const auto Hi = in_n_c_hi_wi_lengths[I2];
const auto Wi = in_n_c_hi_wi_lengths[I3];
const auto Ho = out_n_k_ho_wo_lengths[I2];
const auto Wo = out_n_k_ho_wo_lengths[I3];
const auto Y = wei_k_c_y_x_lengths[I2];
const auto X = wei_k_c_y_x_lengths[I3];
const auto C0 = C / Number<InWeiVectorSize>{};
const auto C1 = Number<InWeiVectorSize>{};
const auto K0 = K / Number<InWeiVectorSize>{};
const auto K1 = Number<InWeiVectorSize>{};
Tensor<TInWei> in_n_c0_hi_wi_c1(
HostTensorDescriptor(std::initializer_list<index_t>{N, C0, Hi, Wi, C1}));
Tensor<TInWei> wei_k_c0_y_x_c1(
HostTensorDescriptor(std::initializer_list<index_t>{K, C0, Y, X, C1}));
Tensor<TOut> out_n_k0_ho_wo_k1(
HostTensorDescriptor(std::initializer_list<index_t>{N, K0, Ho, Wo, K1}));
auto f_nchw2nc0hwc1 = [&](auto n, auto hi, auto wi, auto c) {
in_n_c0_hi_wi_c1(n, c / InWeiVectorSize, hi, wi, c % InWeiVectorSize) =
in_n_c_hi_wi(n, c, hi, wi);
};
auto f_kcyx2kc0yxc1 = [&](auto k, auto y, auto x, auto c) {
wei_k_c0_y_x_c1(k, c / InWeiVectorSize, y, x, c % InWeiVectorSize) =
wei_k_c_y_x(k, c, y, x);
};
make_ParallelTensorFunctor(f_nchw2nc0hwc1, N, Hi, Wi, C)();
make_ParallelTensorFunctor(f_kcyx2kc0yxc1, K, Y, X, C)();
DeviceMem in_n_c0_hi_wi_c1_device_buf(sizeof(TInWei) *
in_n_c0_hi_wi_c1.mDesc.GetElementSpace());
DeviceMem wei_k_c0_y_x_c1_device_buf(sizeof(TInWei) * wei_k_c0_y_x_c1.mDesc.GetElementSpace());
DeviceMem out_n_k0_ho_wo_k1_device_buf(sizeof(TOut) *
out_n_k0_ho_wo_k1.mDesc.GetElementSpace());
in_n_c0_hi_wi_c1_device_buf.ToDevice(in_n_c0_hi_wi_c1.mData.data());
wei_k_c0_y_x_c1_device_buf.ToDevice(wei_k_c0_y_x_c1.mData.data());
const auto in_n_c0_hi_wi_desc = make_naive_tensor_descriptor_packed(make_tuple(N, C0, Hi, Wi));
const auto wei_k_c0_y_x_desc = make_naive_tensor_descriptor_packed(make_tuple(K, C0, Y, X));
const auto out_n_k0_ho_wo_k1_desc =
make_naive_tensor_descriptor_packed(make_tuple(N, K0, Ho, Wo, K1));
#if 1
// cdata = 64, BlockSize = 64, 16x8x32x4
constexpr index_t BlockSize = 64;
constexpr index_t KPerBlock = 16;
constexpr index_t HoPerBlock = 8;
constexpr index_t WoPerBlock = 32;
constexpr index_t EPerBlock = 1;
constexpr index_t KPerThread = KPerBlock;
constexpr index_t HoPerThread = 2;
constexpr index_t WoPerThread = 2;
constexpr index_t EPerThread = EPerBlock;
using ABlockTransferThreadSliceLengths_E_K = Sequence<3, 1>;
using ABlockTransferThreadClusterLengths_E_K = Sequence<3 * EPerBlock, KPerBlock>;
constexpr index_t ABlockTransferSrcScalarPerVector_E = 1;
constexpr index_t ABlockTransferDstScalarPerVector_K = 1;
constexpr index_t BThreadTransferSrcScalarPerVector_W = 1;
constexpr index_t CThreadTransferDstScalarPerVector_W = 16;
static_assert(KPerThread % CThreadTransferDstScalarPerVector_W == 0, "");
#else
constexpr index_t BlockSize = 64;
constexpr index_t KPerBlock = 16;
constexpr index_t HoPerBlock = 8;
constexpr index_t WoPerBlock = 32;
constexpr index_t EPerBlock = 1;
constexpr index_t KPerThread = 16;
constexpr index_t HoPerThread = 2;
constexpr index_t WoPerThread = 2;
constexpr index_t EPerThread = EPerBlock;
using ABlockTransferThreadSliceLengths_E_K = Sequence<9, 1>;
using ABlockTransferThreadClusterLengths_E_K = Sequence<EPerBlock, 16>;
constexpr index_t ABlockTransferSrcScalarPerVector_E = 1;
constexpr index_t ABlockTransferDstScalarPerVector_K = 1;
constexpr index_t BThreadTransferSrcScalarPerVector_W = 1;
constexpr index_t CThreadTransferDstScalarPerVector_W = K1;
static_assert(KPerThread % CThreadTransferDstScalarPerVector_W == 0, "");
#endif
constexpr auto conv_driver =
#if 0
DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_pad
#else
DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outpad
#endif
<BlockSize,
typename vector_type<TInWei, InWeiVectorSize>::type,
TAcc,
TOut,
KPerBlock,
HoPerBlock,
WoPerBlock,
EPerBlock,
KPerThread,
HoPerThread,
WoPerThread,
EPerThread,
ABlockTransferThreadSliceLengths_E_K,
ABlockTransferThreadClusterLengths_E_K,
ABlockTransferSrcScalarPerVector_E,
ABlockTransferDstScalarPerVector_K,
BThreadTransferSrcScalarPerVector_W,
CThreadTransferDstScalarPerVector_W>{};
conv_driver.Run(wei_k_c0_y_x_desc,
in_n_c0_hi_wi_desc,
out_n_k0_ho_wo_k1_desc,
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads,
static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
wei_k_c0_y_x_c1_device_buf.GetDeviceBuffer()),
static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
in_n_c0_hi_wi_c1_device_buf.GetDeviceBuffer()),
static_cast<TOut*>(out_n_k0_ho_wo_k1_device_buf.GetDeviceBuffer()));
out_n_k0_ho_wo_k1_device_buf.FromDevice(out_n_k0_ho_wo_k1.mData.data());
auto f_nk0hwk1_to_nkhw = [&](auto n, auto k, auto ho, auto wo) {
out_n_k_ho_wo(n, k, ho, wo) =
out_n_k0_ho_wo_k1(n, k / InWeiVectorSize, ho, wo, k % InWeiVectorSize);
};
make_ParallelTensorFunctor(f_nk0hwk1_to_nkhw, N, K, Ho, Wo)();
}
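The repacking above splits C into C0 = C / InWeiVectorSize outer channels and C1 = InWeiVectorSize inner channels, so the C1 fastest-varying values sit contiguously for vectorized loads; the inverse map is c = c0 * C1 + c1. A standalone sketch of the same transform over flat buffers:

// Sketch of the NCHW -> NC0HWC1 split used by the host-side repacking.
#include <cstddef>
#include <vector>

void nchw_to_nc0hwc1(const std::vector<float>& src, std::vector<float>& dst,
                     std::size_t N, std::size_t C, std::size_t H,
                     std::size_t W, std::size_t C1)
{
    const std::size_t C0 = C / C1; // assumes C % C1 == 0, as checked above
    for(std::size_t n = 0; n < N; ++n)
        for(std::size_t c = 0; c < C; ++c)
            for(std::size_t h = 0; h < H; ++h)
                for(std::size_t w = 0; w < W; ++w)
                {
                    const std::size_t c0 = c / C1, c1 = c % C1;
                    dst[(((n * C0 + c0) * H + h) * W + w) * C1 + c1] =
                        src[((n * C + c) * H + h) * W + w];
                }
}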
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "driver_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp"
template <typename TInWei,
typename TAcc,
typename TOut,
ck::ActivTypeEnum_t activ_type,
typename InLengths,
typename WeiLengths,
typename MaxLengths,
typename OutLengths,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void device_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1(
const InLengths& in_n_c0_hi_wi_c1_lengths,
const WeiLengths& wei_k_c0_y_x_c1_lengths,
const MaxLengths& max_n_k0_hx_wx_k1_lengths,
const OutLengths& out_n_k0_ho_wo_k1_lengths,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads& in_right_pads,
const Tensor<TInWei>& in_n_c0_hi_wi_c1,
const Tensor<TInWei>& wei_k_c0_y_x_c1,
const Tensor<TOut>& bias_k0_k1,
Tensor<TOut>& out_n_k0_ho_wo_k1,
Tensor<TOut>& max_n_k0_hx_wx_k1,
ck::index_t nrepeat)
{
using namespace ck;
std::cout << __func__ << std::endl;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto I4 = Number<4>{};
const auto N = out_n_k0_ho_wo_k1_lengths[I0];
const auto K0 = out_n_k0_ho_wo_k1_lengths[I1];
const auto Ho = out_n_k0_ho_wo_k1_lengths[I2];
const auto Wo = out_n_k0_ho_wo_k1_lengths[I3];
const auto K1 = out_n_k0_ho_wo_k1_lengths[I4];
const auto C0 = in_n_c0_hi_wi_c1_lengths[I1];
const auto Hi = in_n_c0_hi_wi_c1_lengths[I2];
const auto Wi = in_n_c0_hi_wi_c1_lengths[I3];
const auto C1 = in_n_c0_hi_wi_c1_lengths[I4];
const auto K = wei_k_c0_y_x_c1_lengths[I0];
const auto Y = wei_k_c0_y_x_c1_lengths[I2];
const auto X = wei_k_c0_y_x_c1_lengths[I3];
const auto Hx = max_n_k0_hx_wx_k1_lengths[I2];
const auto Wx = max_n_k0_hx_wx_k1_lengths[I3];
DeviceMem in_n_c0_hi_wi_c1_device_buf(sizeof(TInWei) *
in_n_c0_hi_wi_c1.mDesc.GetElementSpace());
DeviceMem wei_k_c0_y_x_c1_device_buf(sizeof(TInWei) * wei_k_c0_y_x_c1.mDesc.GetElementSpace());
DeviceMem bias_k0_k1_device_buf(sizeof(TOut) * bias_k0_k1.mDesc.GetElementSpace());
DeviceMem out_n_k0_ho_wo_k1_device_buf(sizeof(TOut) *
out_n_k0_ho_wo_k1.mDesc.GetElementSpace());
DeviceMem max_n_k0_hx_wx_k1_device_buf(sizeof(TOut) *
max_n_k0_hx_wx_k1.mDesc.GetElementSpace());
in_n_c0_hi_wi_c1_device_buf.ToDevice(in_n_c0_hi_wi_c1.mData.data());
wei_k_c0_y_x_c1_device_buf.ToDevice(wei_k_c0_y_x_c1.mData.data());
bias_k0_k1_device_buf.ToDevice(bias_k0_k1.mData.data());
max_n_k0_hx_wx_k1_device_buf.ToDevice(max_n_k0_hx_wx_k1.mData.data());
constexpr index_t InWeiVectorSize = 8;
if(C1 % InWeiVectorSize != 0)
{
throw std::runtime_error("wrong! C1 is not divisible by InWeiVectorSize");
}
#if 0
constexpr index_t BlockSize = 256;
constexpr index_t KPerBlock = 32;
constexpr index_t HoPerBlock = 8;
constexpr index_t WoPerBlock = 64;
constexpr index_t E1 = C0 * 9;
constexpr index_t E2 = 1;
constexpr index_t E1PerBlock = C0;
constexpr index_t KPerThread = 16;
constexpr index_t HoPerThread = 2;
constexpr index_t WoPerThread = 2;
constexpr index_t EPerThread = 1;
using ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2 = Sequence<1, 9, 1, E2>;
using ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2 = Sequence<1, E1PerBlock, KPerBlock, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_E2 = E2;
constexpr index_t ABlockTransferDstScalarPerVector_E2 = E2;
constexpr index_t BThreadTransferSrcScalarPerVector_E2 = E2;
constexpr index_t CThreadTransferDstScalarPerVector_K = K1;
#elif 1
constexpr index_t BlockSize = 64;
constexpr index_t KPerBlock = 8;
constexpr index_t HoPerBlock = 8;
constexpr index_t WoPerBlock = 32;
constexpr index_t E1 = 2 * 9;
constexpr index_t E2 = 1;
constexpr index_t K2 = 2;
constexpr index_t E1PerBlock = 2;
constexpr index_t KPerThread = KPerBlock;
constexpr index_t HoPerThread = 2;
constexpr index_t WoPerThread = 2;
constexpr index_t EPerThread = 1;
using ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2 = Sequence<1, 9, 1, 1, E2>;
using ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2 =
Sequence<1, E1PerBlock, 1, KPerBlock, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_E2 = E2;
constexpr index_t ABlockTransferDstScalarPerVector_E2 = E2;
constexpr index_t BThreadTransferSrcScalarPerVector_E2 = E2;
constexpr index_t CThreadTransferDstScalarPerVector_K = InWeiVectorSize;
#endif
if(KPerThread % InWeiVectorSize != 0)
{
throw std::runtime_error("wrong! KPerThread is not divisible by InWeiVectorSize");
}
const auto in_n_c0_hi_wi_c1_desc =
make_naive_tensor_descriptor_packed(make_tuple(N, C0, Hi, Wi, E2));
const auto wei_k_c0_y_x_c1_desc =
make_naive_tensor_descriptor_packed(make_tuple(K, C0, Y, X, E2));
const auto max_n_k0_hx_wx_k1_desc =
make_naive_tensor_descriptor_packed(make_tuple(N, K0, Hx, Wx, K1));
const auto out_n_k0_ho_wo_k1_desc =
make_naive_tensor_descriptor_packed(make_tuple(N, K0, Ho, Wo, K1));
constexpr auto conv_driver =
DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_maxpool<
BlockSize,
typename vector_type<TInWei, InWeiVectorSize>::type,
TAcc,
TOut,
E1,
E2,
K2,
KPerBlock,
HoPerBlock,
WoPerBlock,
E1PerBlock,
KPerThread,
HoPerThread,
WoPerThread,
EPerThread,
ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2,
ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2,
ABlockTransferSrcScalarPerVector_E2,
ABlockTransferDstScalarPerVector_E2,
BThreadTransferSrcScalarPerVector_E2,
CThreadTransferDstScalarPerVector_K,
activ_type>{};
std::cerr << "conv_bias_activ_maxpool_input_"
<< "n" << N << "c" << C0 << "h" << Hi << "w" << Wi << "c" << C1 << "_filter_k" << K
<< "c" << C0 << "y" << Y << "x" << X << "c" << C1 << "_convout_n" << N << "k" << K0
<< "h" << Ho << "w" << Wo << "k" << K1 << "_maxpoolout_n" << N << "k" << K0 << "h"
<< Ho / 2 << "w" << Wo / 2 << "k" << K1 << std::endl;
for(int i = 0; i < 5; i++)
{
const auto ave_time =
conv_driver.Run(wei_k_c0_y_x_c1_desc,
in_n_c0_hi_wi_c1_desc,
out_n_k0_ho_wo_k1_desc,
max_n_k0_hx_wx_k1_desc,
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads,
static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
wei_k_c0_y_x_c1_device_buf.GetDeviceBuffer()),
static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
in_n_c0_hi_wi_c1_device_buf.GetDeviceBuffer()),
static_cast<TOut*>(bias_k0_k1_device_buf.GetDeviceBuffer()),
static_cast<TOut*>(out_n_k0_ho_wo_k1_device_buf.GetDeviceBuffer()),
static_cast<TOut*>(max_n_k0_hx_wx_k1_device_buf.GetDeviceBuffer()),
nrepeat);
{
float perf = static_cast<float>(std::size_t(2) * N * K * Ho * Wo * C0 * C1 * Y * X) /
(std::size_t(1000) * 1000 * 1000) / ave_time;
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
<< std::endl;
}
}
out_n_k0_ho_wo_k1_device_buf.FromDevice(out_n_k0_ho_wo_k1.mData.data());
max_n_k0_hx_wx_k1_device_buf.FromDevice(max_n_k0_hx_wx_k1.mData.data());
}
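The fused max pooling above produces the Ho/2 x Wo/2 "maxpoolout" tensor from non-overlapping 2x2 windows of the conv output. A scalar sketch of that reduction for a single (n, k0, k1) slice; the window shape and stride are inferred from the printed shapes, so treat them as assumptions:

// Scalar sketch of 2x2 max pooling with stride 2 and no padding.
#include <algorithm>
#include <cstddef>
#include <vector>

void maxpool2x2(const std::vector<float>& in, std::vector<float>& out,
                std::size_t Ho, std::size_t Wo)
{
    for(std::size_t h = 0; h + 1 < Ho; h += 2)
        for(std::size_t w = 0; w + 1 < Wo; w += 2)
            out[(h / 2) * (Wo / 2) + w / 2] =
                std::max(std::max(in[h * Wo + w], in[h * Wo + w + 1]),
                         std::max(in[(h + 1) * Wo + w], in[(h + 1) * Wo + w + 1]));
}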
#ifndef DRIVER_CONVOLUTION_ADD_FORWARD_IMPLICIT_GEMM_V5R1_DLOPS_NC0HWC1_KC0YXC1_NK0HWK1_HPP
#define DRIVER_CONVOLUTION_ADD_FORWARD_IMPLICIT_GEMM_V5R1_DLOPS_NC0HWC1_KC0YXC1_NK0HWK1_HPP
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "gridwise_gemm_dlops_v3.hpp"
template <ck::index_t BlockSize,
typename FloatAB,
typename FloatAcc,
typename FloatC,
ck::index_t E1_,
ck::index_t E2_,
ck::index_t K2_,
ck::index_t KPerBlock,
ck::index_t HoPerBlock,
ck::index_t WoPerBlock,
ck::index_t E1PerBlock,
ck::index_t KPerThread,
ck::index_t HoPerThread,
ck::index_t WoPerThread,
ck::index_t EPerThread,
typename ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2,
typename ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2,
ck::index_t ABlockTransferSrcScalarPerVector_E2,
ck::index_t ABlockTransferDstScalarPerVector_E2,
ck::index_t BThreadTransferSrcScalarPerVector_E2,
ck::index_t CThreadTransferDstScalarPerVector_K,
ck::ActivTypeEnum_t activ_type>
struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_add
{
template <typename... Wei,
typename... In,
typename... Add,
typename... Out,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
__host__ float Run(const ck::TensorDescriptor<Wei...>& wei_k_c0_y_x_c1_global_desc,
const ck::TensorDescriptor<In...>& in_n_c0_hi_wi_c1_global_desc,
const ck::TensorDescriptor<Out...>& out_n_k0_ho_wo_k1_global_desc,
const ck::TensorDescriptor<Add...>& add_n_k0_hox2_wox2_k1_global_desc,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads& in_right_pads,
const FloatAB* __restrict__ p_a_grid,
const FloatAB* __restrict__ p_b_grid,
const FloatC* __restrict__ p_bias_grid,
FloatC* __restrict__ p_d_grid,
const int nrepeat) const
{
using namespace ck;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto I4 = Number<4>{};
const auto N = in_n_c0_hi_wi_c1_global_desc.GetLength(I0);
const auto C0 = in_n_c0_hi_wi_c1_global_desc.GetLength(I1);
const auto Hi = in_n_c0_hi_wi_c1_global_desc.GetLength(I2);
const auto Wi = in_n_c0_hi_wi_c1_global_desc.GetLength(I3);
// const auto C1 = in_n_c0_hi_wi_c1_global_desc.GetLength(I4);
const auto K0 = out_n_k0_ho_wo_k1_global_desc.GetLength(I1);
const auto Ho = out_n_k0_ho_wo_k1_global_desc.GetLength(I2);
const auto Wo = out_n_k0_ho_wo_k1_global_desc.GetLength(I3);
const auto K1 = out_n_k0_ho_wo_k1_global_desc.GetLength(I4);
const auto Hox2 = add_n_k0_hox2_wox2_k1_global_desc.GetLength(I2);
const auto Wox2 = add_n_k0_hox2_wox2_k1_global_desc.GetLength(I3);
const auto K = wei_k_c0_y_x_c1_global_desc.GetLength(I0);
const auto Y = wei_k_c0_y_x_c1_global_desc.GetLength(I2);
const auto X = wei_k_c0_y_x_c1_global_desc.GetLength(I3);
const auto ConvStrideH = conv_strides[I0];
const auto ConvStrideW = conv_strides[I1];
const auto ConvDilationH = conv_dilations[I0];
const auto ConvDilationW = conv_dilations[I1];
#if CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR
const auto Hop = Number<(Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock>{};
const auto Wop = Number<(Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock>{};
const auto OutRightPadH = Hop - Ho;
const auto OutRightPadW = Wop - Wo;
const auto OutRightPadHx = Number<OutRightPadH * 2>{};
const auto OutRightPadWx = Number<OutRightPadW * 2>{};
#else
const auto Hop = (Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock;
const auto Wop = (Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock;
const auto OutRightPadH = Hop - Ho;
const auto OutRightPadW = Wop - Wo;
const auto OutRightPadHx = OutRightPadH * 2;
const auto OutRightPadWx = OutRightPadW * 2;
#endif
const auto InLeftPadH = in_left_pads[I0];
const auto InLeftPadW = in_left_pads[I1];
const auto InRightPadH = in_right_pads[I0] + OutRightPadH * ConvStrideH;
const auto InRightPadW = in_right_pads[I1] + OutRightPadW * ConvStrideW;
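// Ho/Wo are rounded up to multiples of the block tile (Hop/Wop); e.g. Ho = 14
// with HoPerBlock = 8 gives Hop = 16 and OutRightPadH = 2 (illustrative
// numbers, not this config). The extra rows/columns become output
// right-padding, and the input right-padding grows by OutRightPad * ConvStride
// so the padded output region still reads valid (zero-padded) input.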
const auto E = C0 * Y * X;
constexpr auto E1 = Number<E1_>{};
constexpr auto E2 = Number<E2_>{};
constexpr auto K2 = Number<K2_>{};
const auto E0 = E / E1;
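// The reduction (GEMM-K) dimension E = C0 * Y * X is split as E = E0 * E1,
// with E2 the innermost vector width (the packed C1 values per element).
// E0 drives the gridwise main loop and E1 the per-block slice; E is assumed
// divisible by E1 here.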
// weight tensor
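// The packed (K, C0*Y*X, E2) weight layout is first transposed to
// (E, K, E2), then E is unmerged into (E0, E1), yielding the A-matrix view
// a_e0_e1_k_e2 consumed by the gridwise GEMM.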
const auto a_e_k_e2_grid_desc = transform_tensor_descriptor(
make_naive_tensor_descriptor_packed(make_tuple(K, C0 * Y * X, E2)),
make_tuple(make_pass_through_transform(K),
make_pass_through_transform(C0 * Y * X),
make_pass_through_transform(E2)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
make_tuple(Sequence<1>{}, Sequence<0>{}, Sequence<2>{}));
const auto a_e0_e1_k_e2_grid_desc =
transform_tensor_descriptor(a_e_k_e2_grid_desc,
make_tuple(make_unmerge_transform(make_tuple(E0, E1)),
make_pass_through_transform(K),
make_pass_through_transform(E2)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
make_tuple(Sequence<0, 1>{}, Sequence<2>{}, Sequence<3>{}));
// input tensor
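// Implicit-GEMM "im2col" done purely with descriptor transforms:
// 1) pad Hi/Wi, 2) embed (Y, Hop) and (X, Wop) over the padded spatial dims
// using dilation/stride as step sizes, 3) merge (C0, Y, X) into the reduction
// dim E, 4) unmerge E into (E0, E1). No data is moved; only index math changes.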
const auto in_n_c0_hip_wip_e2_global_desc = transform_tensor_descriptor(
make_naive_tensor_descriptor_packed(make_tuple(N, C0, Hi, Wi, E2)),
make_tuple(make_pass_through_transform(N),
make_pass_through_transform(C0),
make_pad_transform(Hi, InLeftPadH, InRightPadH),
make_pad_transform(Wi, InLeftPadW, InRightPadW),
make_pass_through_transform(E2)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}));
const auto in_n_c0_y_ho_x_wo_e2_global_desc = transform_tensor_descriptor(
in_n_c0_hip_wip_e2_global_desc,
make_tuple(
make_pass_through_transform(N),
make_pass_through_transform(C0),
make_embed_transform(make_tuple(Y, Hop), make_tuple(ConvDilationH, ConvStrideH)),
make_embed_transform(make_tuple(X, Wop), make_tuple(ConvDilationW, ConvStrideW)),
make_pass_through_transform(E2)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
make_tuple(
Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}, Sequence<6>{}));
const auto in_e_n_ho_wo_e2_grid_desc = transform_tensor_descriptor(
in_n_c0_y_ho_x_wo_e2_global_desc,
make_tuple(make_merge_transform(make_tuple(C0, Y, X)),
make_pass_through_transform(N),
make_pass_through_transform(Hop),
make_pass_through_transform(Wop),
make_pass_through_transform(E2)),
make_tuple(
Sequence<1, 2, 4>{}, Sequence<0>{}, Sequence<3>{}, Sequence<5>{}, Sequence<6>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}));
const auto b_e0_e1_n_ho_wo_e2_grid_desc = transform_tensor_descriptor(
in_e_n_ho_wo_e2_grid_desc,
make_tuple(make_unmerge_transform(make_tuple(E0, E1)),
make_pass_through_transform(N),
make_pass_through_transform(Hop),
make_pass_through_transform(Wop),
make_pass_through_transform(E2)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
make_tuple(
Sequence<0, 1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}, Sequence<5>{}));
// output tensor
const auto c_k_n_hop_wop_grid_desc = transform_tensor_descriptor(
make_naive_tensor_descriptor_packed(make_tuple(N, K0, Ho, Wo, K1)),
make_tuple(make_merge_transform(make_tuple(K0, K1)),
make_pass_through_transform(N),
make_pad_transform(Ho, I0, OutRightPadH),
make_pad_transform(Wo, I0, OutRightPadW)),
make_tuple(Sequence<1, 4>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
// add tensor
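// D is the residual ("resize-add") tensor, nominally at 2x the conv output
// resolution (Hox2, Wox2); its right-padding is doubled accordingly so that
// each output tile maps onto a 2x2 patch of D.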
const auto d_k_n_hopx2_wopx2_grid_desc = transform_tensor_descriptor(
make_naive_tensor_descriptor_packed(make_tuple(N, K0, Hox2, Wox2, K1)),
make_tuple(make_merge_transform(make_tuple(K0, K1)),
make_pass_through_transform(N),
make_pad_transform(Hox2, I0, OutRightPadHx),
make_pad_transform(Wox2, I0, OutRightPadWx)),
make_tuple(Sequence<1, 4>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
std::cerr << "Hop = " << Hop << " Wop = " << Wop << std::endl;
if(!((K % KPerBlock) == 0 && (Hop % HoPerBlock) == 0 && (Wop % WoPerBlock) == 0 &&
(E1 % E1PerBlock) == 0))
{
throw std::runtime_error("wrong! GEMM size no divisible");
}
// clang-format off
// hack to control index calculation when iterating over a_e0_e1_k_e2_global tensor
constexpr auto a_e0_e1_k_e2_global_step_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}));
constexpr auto a_e0_e1_k_e2_global_move_slice_window_step_hack =
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{};
// hack to control index calculation when iterating over b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global tensor
constexpr auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks =
make_tuple(
make_tuple(
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}),
make_tuple(
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{})
);
constexpr auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_move_slice_window_step_hack =
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{};
// hack to control index calculation when iterating over c_k0_k1_n_h0_h1_h2_w0_w1_w2_global tensor
constexpr auto c_k0_k1_n_h0_h1_h2_w0_w1_w2_global_tensor_step_hacks =
make_tuple(make_tuple(Sequence<0, 1, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 1, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}),
make_tuple(Sequence<0, 2, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 2, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}));
constexpr auto d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_global_tensor_step_hacks =
make_tuple(make_tuple(Sequence<0, 1, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 1, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}),
make_tuple(Sequence<0, 2, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 2, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}));
// clang-format on
// GEMM
using GridwiseGemm = GridwiseGemmDlops_km_kn_mn_v3<
BlockSize,
FloatAB,
FloatAcc,
FloatC,
InMemoryDataOperationEnum_t::Set,
decltype(a_e0_e1_k_e2_grid_desc),
decltype(b_e0_e1_n_ho_wo_e2_grid_desc),
decltype(c_k_n_hop_wop_grid_desc),
decltype(d_k_n_hopx2_wopx2_grid_desc),
E1,
E2,
K2,
KPerBlock,
HoPerBlock,
WoPerBlock,
E1PerBlock,
KPerThread,
HoPerThread,
WoPerThread,
EPerThread,
ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2,
ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2,
Sequence<2, 3, 0, 1, 4>,
Sequence<0, 1, 2, 3, 4>,
4,
ABlockTransferSrcScalarPerVector_E2,
ABlockTransferDstScalarPerVector_E2,
false, // don't move back src coordinate after threadwise copy
Sequence<0, 1, 2, 3, 4, 5, 6, 7, 8, 9>, // E0, E1, N, H0, H1, H2, W0, W1, W2, E2
9,
BThreadTransferSrcScalarPerVector_E2,
false, // don't move back src coordinate after threadwise copy, which will be fused with
// MoveSrcSliceWindow() to save addr computation
Sequence<0, 1, 2, 3, 4, 5, 6, 7, 8>, // K0, K1, N, H0, H1, H2, W0, W1, W2
1,
CThreadTransferDstScalarPerVector_K,
decltype(a_e0_e1_k_e2_global_step_hacks),
decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks),
decltype(c_k0_k1_n_h0_h1_h2_w0_w1_w2_global_tensor_step_hacks),
decltype(d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_global_tensor_step_hacks),
decltype(a_e0_e1_k_e2_global_move_slice_window_step_hack),
decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_move_slice_window_step_hack)>;
const auto a_e0_e1_k0_k1_e2_grid_desc =
GridwiseGemm::MakeAE0E1K0K1E2GridDescriptor(a_e0_e1_k_e2_grid_desc);
const auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc =
GridwiseGemm::MakeBE0E1NH0H1H2W0W1W2E2GridDescriptor(b_e0_e1_n_ho_wo_e2_grid_desc);
const auto c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc =
GridwiseGemm::MakeCK0K1NH0H1H2W0W1W2GridDescriptor(c_k_n_hop_wop_grid_desc);
const auto d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc =
GridwiseGemm::MakeDK0K1NH0H1HxW0W1WxGridDescriptorResizeAdd(
d_k_n_hopx2_wopx2_grid_desc);
using AGridDesc_E0_E1_K0_K1_E2 = decltype(a_e0_e1_k0_k1_e2_grid_desc);
using BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2 =
decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc);
using CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2 = decltype(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc);
using DGridDesc_K0_K1_N_H0_H1_H2x2_W0_W1_W2x2 =
decltype(d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc);
const auto grid_size = (K / KPerBlock) * (Hop / HoPerBlock) * (Wop / WoPerBlock) * N;
const bool has_main_e0_block_loop = E0 > 1;
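// One workgroup is launched per (KPerBlock x HoPerBlock x WoPerBlock) output
// tile per image; the E0 main loop is only instantiated when the reduction
// dimension spans more than one E1 chunk.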
std::cerr << "has_main_e0_block_loop = " << has_main_e0_block_loop << std::endl;
const auto c_blockid_to_k_n_h_w_block_cluster_adaptor =
GridwiseGemm::MakeCBlockIdToKNHoWoBlockClusterAdaptor(c_k_n_hop_wop_grid_desc);
using CBlockIdToBlockClusterAdaptor_K_N_H_W =
decltype(c_blockid_to_k_n_h_w_block_cluster_adaptor);
float ave_time = 0;
#if CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE
if(has_main_e0_block_loop)
{
const auto kernel = kernel_gemm_dlops_v3_resize_add<
GridwiseGemm,
FloatAB,
FloatC,
remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>,
remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
remove_reference_t<DGridDesc_K0_K1_N_H0_H1_H2x2_W0_W1_W2x2>,
remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
true,
activ_type>;
ave_time = launch_and_time_kernel(kernel,
nrepeat,
dim3(grid_size),
dim3(BlockSize),
0,
p_a_grid,
p_b_grid,
p_bias_grid,
p_d_grid,
a_e0_e1_k0_k1_e2_grid_desc,
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc,
c_blockid_to_k_n_h_w_block_cluster_adaptor);
}
else
{
const auto kernel = kernel_gemm_dlops_v3_resize_add<
GridwiseGemm,
FloatAB,
FloatC,
remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>,
remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
remove_reference_t<DGridDesc_K0_K1_N_H0_H1_H2x2_W0_W1_W2x2>,
remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
false,
activ_type>;
ave_time = launch_and_time_kernel(kernel,
nrepeat,
dim3(grid_size),
dim3(BlockSize),
0,
p_a_grid,
p_b_grid,
p_bias_grid,
p_d_grid,
a_e0_e1_k0_k1_e2_grid_desc,
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc,
c_blockid_to_k_n_h_w_block_cluster_adaptor);
}
#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER
DeviceMem a_e0_e1_k0_k1_e2_grid_desc_dev_buf(sizeof(AGridDesc_E0_E1_K0_K1_E2));
DeviceMem b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc_dev_buf(
sizeof(BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2));
DeviceMem c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc_dev_buf(
sizeof(CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2));
DeviceMem d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc_dev_buf(
sizeof(DGridDesc_K0_K1_N_H0_H1_H2x2_W0_W1_W2x2));
DeviceMem c_blockid_to_k_n_h_w_block_cluster_adaptor_dev_buf(
sizeof(CBlockIdToBlockClusterAdaptor_K_N_H_W));
a_e0_e1_k0_k1_e2_grid_desc_dev_buf.ToDevice(&a_e0_e1_k0_k1_e2_grid_desc);
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc_dev_buf.ToDevice(
&b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc);
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc_dev_buf.ToDevice(
&c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc);
d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc_dev_buf.ToDevice(
&d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc);
c_blockid_to_k_n_h_w_block_cluster_adaptor_dev_buf.ToDevice(
&c_blockid_to_k_n_h_w_block_cluster_adaptor);
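// Void-pointer mode: descriptors are computed on the host, copied to device
// buffers once, and the kernel reads them back from the constant address
// space instead of receiving them by value as kernel arguments.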
if(has_main_e0_block_loop)
{
const auto kernel = kernel_gemm_dlops_v3_resize_add<
GridwiseGemm,
FloatAB,
FloatC,
remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>,
remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
remove_reference_t<DGridDesc_K0_K1_N_H0_H1_H2x2_W0_W1_W2x2>,
remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
true,
activ_type>;
ave_time = launch_and_time_kernel(
kernel,
nrepeat,
dim3(grid_size),
dim3(BlockSize),
0,
p_a_grid,
p_b_grid,
p_bias_grid,
p_d_grid,
cast_pointer_to_constant_address_space(
a_e0_e1_k0_k1_e2_grid_desc_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(
d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(
c_blockid_to_k_n_h_w_block_cluster_adaptor_dev_buf.GetDeviceBuffer()));
}
else
{
const auto kernel = kernel_gemm_dlops_v3_resize_add<
GridwiseGemm,
FloatAB,
FloatC,
remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>,
remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
remove_reference_t<DGridDesc_K0_K1_N_H0_H1_H2x2_W0_W1_W2x2>,
remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
false,
activ_type>;
ave_time = launch_and_time_kernel(
kernel,
nrepeat,
dim3(grid_size),
dim3(BlockSize),
0,
p_a_grid,
p_b_grid,
p_bias_grid,
p_d_grid,
cast_pointer_to_constant_address_space(
a_e0_e1_k0_k1_e2_grid_desc_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(
d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(
c_blockid_to_k_n_h_w_block_cluster_adaptor_dev_buf.GetDeviceBuffer()));
}
#elif CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR
{
static_assert(a_e0_e1_k_e2_grid_desc.IsKnownAtCompileTime(), "");
static_assert(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc.IsKnownAtCompileTime(), "");
static_assert(d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc.IsKnownAtCompileTime(), "");
static_assert(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc.IsKnownAtCompileTime(), "");
static_assert(c_blockid_to_k_n_h_w_block_cluster_adaptor.IsKnownAtCompileTime(), "");
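// Static mode: all descriptors are compile-time constants baked into the
// kernel type, so only the data pointers are passed at launch.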
const auto kernel = kernel_gemm_dlops_v3_resize_add<
GridwiseGemm,
FloatAB,
FloatC,
remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>,
remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
remove_reference_t<DGridDesc_K0_K1_N_H0_H1_H2x2_W0_W1_W2x2>,
remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
has_main_e0_block_loop,
activ_type>;
ave_time = launch_and_time_kernel(kernel,
nrepeat,
dim3(grid_size),
dim3(BlockSize),
0,
p_a_grid,
p_b_grid,
p_bias_grid,
p_d_grid);
}
#endif
return ave_time;
}
};
#endif
#ifndef DRIVER_CONVOLUTION_FORWARD_IMPLICIT_GEMM_V5R1_DLOPS_NC0HWC1_KC0YXC1_NK0HWK1_HPP
#define DRIVER_CONVOLUTION_FORWARD_IMPLICIT_GEMM_V5R1_DLOPS_NC0HWC1_KC0YXC1_NK0HWK1_HPP
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "gridwise_gemm_dlops_v3.hpp"
template <ck::index_t BlockSize,
typename FloatAB,
typename FloatAcc,
typename FloatC,
ck::index_t E1_,
ck::index_t E2_,
ck::index_t K2_,
ck::index_t KPerBlock,
ck::index_t HoPerBlock,
ck::index_t WoPerBlock,
ck::index_t E1PerBlock,
ck::index_t KPerThread,
ck::index_t HoPerThread,
ck::index_t WoPerThread,
ck::index_t EPerThread,
typename ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2,
typename ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2,
ck::index_t ABlockTransferSrcScalarPerVector_E2,
ck::index_t ABlockTransferDstScalarPerVector_E2,
ck::index_t BThreadTransferSrcScalarPerVector_E2,
ck::index_t CThreadTransferDstScalarPerVector_K,
ck::ActivTypeEnum_t activ_type>
struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_outpad
{
template <typename... Wei,
typename... In,
typename... Out,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
__host__ float Run(const ck::TensorDescriptor<Wei...>& wei_k_c0_y_x_c1_global_desc,
const ck::TensorDescriptor<In...>& in_n_c0_hi_wi_c1_global_desc,
const ck::TensorDescriptor<Out...>& out_n_k0_ho_wo_k1_global_desc,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads& in_right_pads,
const FloatAB* __restrict__ p_a_grid,
const FloatAB* __restrict__ p_b_grid,
const FloatC* __restrict__ p_bias_grid,
FloatC* __restrict__ p_c_grid,
const int nrepeat) const
{
using namespace ck;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto I4 = Number<4>{};
const auto N = in_n_c0_hi_wi_c1_global_desc.GetLength(I0);
const auto C0 = in_n_c0_hi_wi_c1_global_desc.GetLength(I1);
const auto Hi = in_n_c0_hi_wi_c1_global_desc.GetLength(I2);
const auto Wi = in_n_c0_hi_wi_c1_global_desc.GetLength(I3);
// const auto C1 = in_n_c0_hi_wi_c1_global_desc.GetLength(I4);
const auto K0 = out_n_k0_ho_wo_k1_global_desc.GetLength(I1);
const auto Ho = out_n_k0_ho_wo_k1_global_desc.GetLength(I2);
const auto Wo = out_n_k0_ho_wo_k1_global_desc.GetLength(I3);
const auto K1 = out_n_k0_ho_wo_k1_global_desc.GetLength(I4);
const auto K = wei_k_c0_y_x_c1_global_desc.GetLength(I0);
const auto Y = wei_k_c0_y_x_c1_global_desc.GetLength(I2);
const auto X = wei_k_c0_y_x_c1_global_desc.GetLength(I3);
const auto ConvStrideH = conv_strides[I0];
const auto ConvStrideW = conv_strides[I1];
const auto ConvDilationH = conv_dilations[I0];
const auto ConvDilationW = conv_dilations[I1];
#if CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR
const auto Hop = Number<(Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock>{};
const auto Wop = Number<(Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock>{};
#else
const auto Hop = (Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock;
const auto Wop = (Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock;
#endif
const auto OutRightPadH = Hop - Ho;
const auto OutRightPadW = Wop - Wo;
const auto InLeftPadH = in_left_pads[I0];
const auto InLeftPadW = in_left_pads[I1];
const auto InRightPadH = in_right_pads[I0] + OutRightPadH * ConvStrideH;
const auto InRightPadW = in_right_pads[I1] + OutRightPadW * ConvStrideW;
const auto E = C0 * Y * X;
constexpr auto E1 = Number<E1_>{};
constexpr auto E2 = Number<E2_>{};
constexpr auto K2 = Number<K2_>{};
const auto E0 = E / E1;
// weight tensor
const auto a_e_k_e2_grid_desc = transform_tensor_descriptor(
make_naive_tensor_descriptor_packed(make_tuple(K, C0 * Y * X, E2)),
make_tuple(make_pass_through_transform(K),
make_pass_through_transform(C0 * Y * X),
make_pass_through_transform(E2)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
make_tuple(Sequence<1>{}, Sequence<0>{}, Sequence<2>{}));
const auto a_e0_e1_k_e2_grid_desc =
transform_tensor_descriptor(a_e_k_e2_grid_desc,
make_tuple(make_unmerge_transform(make_tuple(E0, E1)),
make_pass_through_transform(K),
make_pass_through_transform(E2)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
make_tuple(Sequence<0, 1>{}, Sequence<2>{}, Sequence<3>{}));
// input tensor
const auto in_n_c0_hip_wip_e2_global_desc = transform_tensor_descriptor(
make_naive_tensor_descriptor_packed(make_tuple(N, C0, Hi, Wi, E2)),
make_tuple(make_pass_through_transform(N),
make_pass_through_transform(C0),
make_pad_transform(Hi, InLeftPadH, InRightPadH),
make_pad_transform(Wi, InLeftPadW, InRightPadW),
make_pass_through_transform(E2)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}));
const auto in_n_c0_y_ho_x_wo_e2_global_desc = transform_tensor_descriptor(
in_n_c0_hip_wip_e2_global_desc,
make_tuple(
make_pass_through_transform(N),
make_pass_through_transform(C0),
make_embed_transform(make_tuple(Y, Hop), make_tuple(ConvDilationH, ConvStrideH)),
make_embed_transform(make_tuple(X, Wop), make_tuple(ConvDilationW, ConvStrideW)),
make_pass_through_transform(E2)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
make_tuple(
Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}, Sequence<6>{}));
const auto in_e_n_ho_wo_e2_grid_desc = transform_tensor_descriptor(
in_n_c0_y_ho_x_wo_e2_global_desc,
make_tuple(make_merge_transform(make_tuple(C0, Y, X)),
make_pass_through_transform(N),
make_pass_through_transform(Hop),
make_pass_through_transform(Wop),
make_pass_through_transform(E2)),
make_tuple(
Sequence<1, 2, 4>{}, Sequence<0>{}, Sequence<3>{}, Sequence<5>{}, Sequence<6>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}));
const auto b_e0_e1_n_ho_wo_e2_grid_desc = transform_tensor_descriptor(
in_e_n_ho_wo_e2_grid_desc,
make_tuple(make_unmerge_transform(make_tuple(E0, E1)),
make_pass_through_transform(N),
make_pass_through_transform(Hop),
make_pass_through_transform(Wop),
make_pass_through_transform(E2)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
make_tuple(
Sequence<0, 1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}, Sequence<5>{}));
// output tensor
const auto c_k_n_hop_wop_grid_desc = transform_tensor_descriptor(
make_naive_tensor_descriptor_packed(make_tuple(N, K0, Ho, Wo, K1)),
make_tuple(make_merge_transform(make_tuple(K0, K1)),
make_pass_through_transform(N),
make_pad_transform(Ho, I0, OutRightPadH),
make_pad_transform(Wo, I0, OutRightPadW)),
make_tuple(Sequence<1, 4>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
std::cerr << "Hop = " << Hop << " Wop = " << Wop << std::endl;
if(!((K % KPerBlock) == 0 && (Hop % HoPerBlock) == 0 && (Wop % WoPerBlock) == 0 &&
(E1 % E1PerBlock) == 0))
{
throw std::runtime_error("wrong! GEMM size not divisible");
}
// clang-format off
// hack to control index calculation when iterating over a_e0_e1_k_e2_global tensor
constexpr auto a_e0_e1_k_e2_global_step_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}));
constexpr auto a_e0_e1_k_e2_global_move_slice_window_step_hack =
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{};
// hack to control index calculation when iterating over b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global tensor
constexpr auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks =
make_tuple(
make_tuple(
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}),
make_tuple(
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{})
);
constexpr auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_move_slice_window_step_hack =
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{};
// hack to control index calculation when iterating over c_k0_k1_n_h0_h1_h2_w0_w1_w2_global tensor
constexpr auto c_k0_k1_n_h0_h1_h2_w0_w1_w2_global_tensor_step_hacks =
make_tuple(make_tuple(Sequence<0, 1, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 1, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}),
make_tuple(Sequence<0, 2, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 2, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}));
// clang-format on
// GEMM
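// Conv-only driver: there is no separate D (resize-add) tensor, so the C
// descriptor type and its step hacks are passed twice to fill the gridwise
// GEMM's D-tensor template slots.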
using GridwiseGemm = GridwiseGemmDlops_km_kn_mn_v3<
BlockSize,
FloatAB,
FloatAcc,
FloatC,
InMemoryDataOperationEnum_t::Set,
decltype(a_e0_e1_k_e2_grid_desc),
decltype(b_e0_e1_n_ho_wo_e2_grid_desc),
decltype(c_k_n_hop_wop_grid_desc),
decltype(c_k_n_hop_wop_grid_desc),
E1,
E2,
K2,
KPerBlock,
HoPerBlock,
WoPerBlock,
E1PerBlock,
KPerThread,
HoPerThread,
WoPerThread,
EPerThread,
ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2,
ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2,
Sequence<2, 3, 0, 1, 4>,
Sequence<0, 1, 2, 3, 4>,
4,
ABlockTransferSrcScalarPerVector_E2,
ABlockTransferDstScalarPerVector_E2,
false, // don't move back src coordinate after threadwise copy
Sequence<0, 1, 2, 3, 4, 5, 6, 7, 8, 9>, // E0, E1, N, H0, H1, H2, W0, W1, W2, E2
9,
BThreadTransferSrcScalarPerVector_E2,
false, // don't move back src coordinate after threadwise copy, which will be fused with
// MoveSrcSliceWindow() to save addr computation
Sequence<0, 1, 2, 3, 4, 5, 6, 7, 8>, // K0, K1, N, H0, H1, H2, W0, W1, W2
1,
CThreadTransferDstScalarPerVector_K,
decltype(a_e0_e1_k_e2_global_step_hacks),
decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks),
decltype(c_k0_k1_n_h0_h1_h2_w0_w1_w2_global_tensor_step_hacks),
decltype(c_k0_k1_n_h0_h1_h2_w0_w1_w2_global_tensor_step_hacks),
decltype(a_e0_e1_k_e2_global_move_slice_window_step_hack),
decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_move_slice_window_step_hack)>;
const auto a_e0_e1_k0_k1_e2_grid_desc =
GridwiseGemm::MakeAE0E1K0K1E2GridDescriptor(a_e0_e1_k_e2_grid_desc);
const auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc =
GridwiseGemm::MakeBE0E1NH0H1H2W0W1W2E2GridDescriptor(b_e0_e1_n_ho_wo_e2_grid_desc);
const auto c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc =
GridwiseGemm::MakeCK0K1NH0H1H2W0W1W2GridDescriptor(c_k_n_hop_wop_grid_desc);
using AGridDesc_E0_E1_K0_K1_E2 = decltype(a_e0_e1_k0_k1_e2_grid_desc);
using BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2 =
decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc);
using CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2 = decltype(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc);
const auto grid_size = (K / KPerBlock) * (Hop / HoPerBlock) * (Wop / WoPerBlock) * N;
const bool has_main_e0_block_loop = E0 > 1;
std::cerr << "has_main_e0_block_loop = " << has_main_e0_block_loop << std::endl;
const auto c_blockid_to_k_n_h_w_block_cluster_adaptor =
GridwiseGemm::MakeCBlockIdToKNHoWoBlockClusterAdaptor(c_k_n_hop_wop_grid_desc);
using CBlockIdToBlockClusterAdaptor_K_N_H_W =
decltype(c_blockid_to_k_n_h_w_block_cluster_adaptor);
float ave_time = 0;
#if CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE
if(has_main_e0_block_loop)
{
const auto kernel =
kernel_gemm_dlops_v3<GridwiseGemm,
FloatAB,
FloatC,
remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>,
remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
true,
activ_type>;
ave_time = launch_and_time_kernel(kernel,
nrepeat,
dim3(grid_size),
dim3(BlockSize),
0,
p_a_grid,
p_b_grid,
p_bias_grid,
p_c_grid,
a_e0_e1_k0_k1_e2_grid_desc,
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
c_blockid_to_k_n_h_w_block_cluster_adaptor);
}
else
{
const auto kernel =
kernel_gemm_dlops_v3<GridwiseGemm,
FloatAB,
FloatC,
remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>,
remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
false,
activ_type>;
ave_time = launch_and_time_kernel(kernel,
nrepeat,
dim3(grid_size),
dim3(BlockSize),
0,
p_a_grid,
p_b_grid,
p_bias_grid,
p_c_grid,
a_e0_e1_k0_k1_e2_grid_desc,
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
c_blockid_to_k_n_h_w_block_cluster_adaptor);
}
#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER
DeviceMem a_e0_e1_k0_k1_e2_grid_desc_dev_buf(sizeof(AGridDesc_E0_E1_K0_K1_E2));
DeviceMem b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc_dev_buf(
sizeof(BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2));
DeviceMem c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc_dev_buf(
sizeof(CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2));
DeviceMem c_blockid_to_k_n_h_w_block_cluster_adaptor_dev_buf(
sizeof(CBlockIdToBlockClusterAdaptor_K_N_H_W));
a_e0_e1_k0_k1_e2_grid_desc_dev_buf.ToDevice(&a_e0_e1_k0_k1_e2_grid_desc);
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc_dev_buf.ToDevice(
&b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc);
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc_dev_buf.ToDevice(
&c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc);
c_blockid_to_k_n_h_w_block_cluster_adaptor_dev_buf.ToDevice(
&c_blockid_to_k_n_h_w_block_cluster_adaptor);
if(has_main_e0_block_loop)
{
const auto kernel =
kernel_gemm_dlops_v3<GridwiseGemm,
FloatAB,
FloatC,
remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>,
remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
true,
activ_type>;
ave_time = launch_and_time_kernel(
kernel,
nrepeat,
dim3(grid_size),
dim3(BlockSize),
0,
p_a_grid,
p_b_grid,
p_bias_grid,
p_c_grid,
cast_pointer_to_constant_address_space(
a_e0_e1_k0_k1_e2_grid_desc_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(
c_blockid_to_k_n_h_w_block_cluster_adaptor_dev_buf.GetDeviceBuffer()));
}
else
{
const auto kernel =
kernel_gemm_dlops_v3<GridwiseGemm,
FloatAB,
FloatC,
remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>,
remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
false,
activ_type>;
ave_time = launch_and_time_kernel(
kernel,
nrepeat,
dim3(grid_size),
dim3(BlockSize),
0,
p_a_grid,
p_b_grid,
p_bias_grid,
p_c_grid,
cast_pointer_to_constant_address_space(
a_e0_e1_k0_k1_e2_grid_desc_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(
c_blockid_to_k_n_h_w_block_cluster_adaptor_dev_buf.GetDeviceBuffer()));
}
#elif CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR
{
static_assert(a_e0_e1_k_e2_grid_desc.IsKnownAtCompileTime(), "");
static_assert(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc.IsKnownAtCompileTime(), "");
static_assert(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc.IsKnownAtCompileTime(), "");
static_assert(c_blockid_to_k_n_h_w_block_cluster_adaptor.IsKnownAtCompileTime(), "");
const auto kernel =
kernel_gemm_dlops_v3<GridwiseGemm,
FloatAB,
FloatC,
remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>,
remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
has_main_e0_block_loop,
activ_type>;
ave_time = launch_and_time_kernel(kernel,
nrepeat,
dim3(grid_size),
dim3(BlockSize),
0,
p_a_grid,
p_b_grid,
p_bias_grid,
p_c_grid);
}
#endif
return ave_time;
}
};
#endif
#ifndef DRIVER_CONVOLUTION_FORWARD_IMPLICIT_GEMM_V5R1_NCHW_KCYX_NKHW_HPP
#define DRIVER_CONVOLUTION_FORWARD_IMPLICIT_GEMM_V5R1_NCHW_KCYX_NKHW_HPP
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "gridwise_gemm_dlops_v2.hpp"
#include "gridwise_operation_wrapper.hpp"
template <ck::index_t BlockSize,
typename FloatAB,
typename FloatAcc,
typename FloatC,
ck::index_t KPerBlock,
ck::index_t HoPerBlock,
ck::index_t WoPerBlock,
ck::index_t EPerBlock,
ck::index_t KPerThread,
ck::index_t HoPerThread,
ck::index_t WoPerThread,
ck::index_t EPerThread,
typename ABlockTransferThreadSliceLengths_E_K,
typename ABlockTransferThreadClusterLengths_E_K,
ck::index_t ABlockTransferSrcScalarPerVector_E,
ck::index_t ABlockTransferDstScalarPerVector_K,
ck::index_t BThreadTransferSrcScalarPerVector_W,
ck::index_t CThreadTransferDstScalarPerVector_W>
struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_pad
{
template <typename... Wei,
typename... In,
typename... Out,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
__host__ void Run(const ck::TensorDescriptor<Wei...>& wei_k_c_y_x_global_desc,
const ck::TensorDescriptor<In...>& in_n_c_hi_wi_global_desc,
const ck::TensorDescriptor<Out...>& out_n_k0_ho_wo_k1_global_desc,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads& in_right_pads,
const FloatAB* __restrict__ p_wei_global,
const FloatAB* __restrict__ p_in_global,
FloatC* __restrict__ p_out_global) const
{
using namespace ck;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto I4 = Number<4>{};
const auto N = in_n_c_hi_wi_global_desc.GetLength(I0);
const auto C = in_n_c_hi_wi_global_desc.GetLength(I1);
const auto K0 = out_n_k0_ho_wo_k1_global_desc.GetLength(I1);
const auto Hi = in_n_c_hi_wi_global_desc.GetLength(I2);
const auto Wi = in_n_c_hi_wi_global_desc.GetLength(I3);
const auto Ho = out_n_k0_ho_wo_k1_global_desc.GetLength(I2);
const auto Wo = out_n_k0_ho_wo_k1_global_desc.GetLength(I3);
const auto K1 = out_n_k0_ho_wo_k1_global_desc.GetLength(I4);
const auto K = wei_k_c_y_x_global_desc.GetLength(I0);
const auto Y = wei_k_c_y_x_global_desc.GetLength(I2);
const auto X = wei_k_c_y_x_global_desc.GetLength(I3);
const auto ConvStrideH = conv_strides[I0];
const auto ConvStrideW = conv_strides[I1];
const auto ConvDilationH = conv_dilations[I0];
const auto ConvDilationW = conv_dilations[I1];
const auto InLeftPadH = in_left_pads[I0];
const auto InLeftPadW = in_left_pads[I1];
const auto InRightPadH = in_right_pads[I0];
const auto InRightPadW = in_right_pads[I1];
// weight tensor
const auto wei_e_k_global_desc = transform_tensor_descriptor(
make_naive_tensor_descriptor_packed(make_tuple(K, C * Y * X)),
make_tuple(make_pass_through_transform(K), make_pass_through_transform(C * Y * X)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<1>{}, Sequence<0>{}));
// input tensor
const auto in_n_c_hip_wip_global_desc = transform_tensor_descriptor(
in_n_c_hi_wi_global_desc,
make_tuple(make_pass_through_transform(N),
make_pass_through_transform(C),
make_pad_transform(Hi, InLeftPadH, InRightPadH),
make_pad_transform(Wi, InLeftPadW, InRightPadW)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
const auto in_n_c_y_ho_x_wo_global_desc = transform_tensor_descriptor(
in_n_c_hip_wip_global_desc,
make_tuple(
make_pass_through_transform(N),
make_pass_through_transform(C),
make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)),
make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW))),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));
const auto in_e_n_ho_wo_global_desc = transform_tensor_descriptor(
in_n_c_y_ho_x_wo_global_desc,
make_tuple(make_merge_transform(make_tuple(C, Y, X)),
make_pass_through_transform(N),
make_pass_through_transform(Ho),
make_pass_through_transform(Wo)),
make_tuple(Sequence<1, 2, 4>{}, Sequence<0>{}, Sequence<3>{}, Sequence<5>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
// output tensor
const auto out_k_n_ho_wo_global_desc = transform_tensor_descriptor(
make_naive_tensor_descriptor_packed(make_tuple(N, K0, Ho, Wo, K1)),
make_tuple(make_merge_transform(make_tuple(K0, K1)),
make_pass_through_transform(N),
make_pass_through_transform(Ho),
make_pass_through_transform(Wo)),
make_tuple(Sequence<1, 4>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
const auto E = C * Y * X;
if(!((K % KPerBlock) == 0 && (Ho % HoPerBlock) == 0 && (Wo % WoPerBlock) == 0 &&
(E % EPerBlock) == 0))
{
throw std::runtime_error("wrong! GEMM size not divisible");
}
// hack to control index calculation when iterating over a_k_m_global tensor
constexpr auto a_e_k_global_step_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0>{}, Sequence<0, 0, 0>{}),
make_tuple(Sequence<0, 0, 0>{}, Sequence<0, 0, 0>{}));
constexpr auto a_e_k_global_move_slice_window_step_hack = Sequence<0, 0, 0>{};
constexpr auto b_e_n_ho_wo_global_step_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}));
constexpr auto b_e_n_ho_wo_global_move_slice_window_step_hack =
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{};
// hack to control index calculation when iterating over c_m0_m1_n0_n1_global tensor
// hack for NKHW format
constexpr auto c_k_n_ho_wo_global_tensor_step_hacks =
make_tuple(make_tuple(Sequence<0, 1, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}),
make_tuple(Sequence<0, 2, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}));
#if 1
// GEMM
using gridwise_gemm = GridwiseGemmDlops_km_kn_mn_v3<
BlockSize,
FloatAB,
FloatAcc,
FloatC,
InMemoryDataOperationEnum_t::Set,
decltype(wei_e_k_global_desc),
decltype(in_e_n_ho_wo_global_desc),
decltype(out_k_n_ho_wo_global_desc),
KPerBlock,
HoPerBlock,
WoPerBlock,
EPerBlock,
KPerThread,
HoPerThread,
WoPerThread,
EPerThread,
ABlockTransferThreadSliceLengths_E_K,
ABlockTransferThreadClusterLengths_E_K,
Sequence<1, 0>,
Sequence<1, 0>,
0,
ABlockTransferSrcScalarPerVector_E,
ABlockTransferDstScalarPerVector_K,
false, // don't move back src coordinate after threadwise copy
Sequence<0, 2, 3, 1>,
3,
BThreadTransferSrcScalarPerVector_W,
false, // don't move back src coordinate after threadwise copy, which will be fused with
// MoveSrcSliceWindow() to save addr computation
Sequence<0, 2, 3, 1>,
0,
CThreadTransferDstScalarPerVector_W,
decltype(a_e_k_global_step_hacks),
decltype(b_e_n_ho_wo_global_step_hacks),
decltype(c_k_n_ho_wo_global_tensor_step_hacks),
decltype(a_e_k_global_move_slice_window_step_hack),
decltype(b_e_n_ho_wo_global_move_slice_window_step_hack)>;
const auto GridSize = (K / KPerBlock) * (Ho / HoPerBlock) * (Wo / WoPerBlock) * N;
const bool has_main_k_block_loop = (E + EPerBlock) / (2 * EPerBlock) > 1;
const bool has_double_tail_k_block_loop = (E / EPerBlock) % 2 == 0;
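// Double-buffered E-loop bookkeeping: with NumLoops = E / EPerBlock, the main
// loop runs while more than one double-buffered iteration remains
// ((E + EPerBlock) / (2 * EPerBlock) > 1), and an even NumLoops leaves a
// two-stage tail. E.g. E = 8 * EPerBlock (illustrative) gives a main loop
// plus the double tail.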
index_t nrepeat = 100;
for(index_t i = 0; i < 5; ++i)
{
std::cout << "Start running " << nrepeat << " times..." << std::endl;
KernelTimer timer;
timer.Start();
std::cout << "has_main_k_block_loop: " << has_main_k_block_loop
<< " has_double_tail_k_block_loop: " << has_double_tail_k_block_loop
<< std::endl;
for(index_t j = 0; j < nrepeat; ++j)
{
if(has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel = run_gridwise_operation<gridwise_gemm,
decltype(wei_e_k_global_desc),
const FloatAB*,
decltype(in_e_n_ho_wo_global_desc),
const FloatAB*,
decltype(out_k_n_ho_wo_global_desc),
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, true>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
wei_e_k_global_desc,
p_wei_global,
in_e_n_ho_wo_global_desc,
p_in_global,
out_k_n_ho_wo_global_desc,
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, true>{});
}
else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
{
const auto kernel = run_gridwise_operation<gridwise_gemm,
decltype(wei_e_k_global_desc),
const FloatAB*,
decltype(in_e_n_ho_wo_global_desc),
const FloatAB*,
decltype(out_k_n_ho_wo_global_desc),
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, false>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
wei_e_k_global_desc,
p_wei_global,
in_e_n_ho_wo_global_desc,
p_in_global,
out_k_n_ho_wo_global_desc,
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, false>{});
}
else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel = run_gridwise_operation<gridwise_gemm,
decltype(wei_e_k_global_desc),
const FloatAB*,
decltype(in_e_n_ho_wo_global_desc),
const FloatAB*,
decltype(out_k_n_ho_wo_global_desc),
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, true>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
wei_e_k_global_desc,
p_wei_global,
in_e_n_ho_wo_global_desc,
p_in_global,
out_k_n_ho_wo_global_desc,
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, true>{});
}
else
{
const auto kernel = run_gridwise_operation<gridwise_gemm,
decltype(wei_e_k_global_desc),
const FloatAB*,
decltype(in_e_n_ho_wo_global_desc),
const FloatAB*,
decltype(out_k_n_ho_wo_global_desc),
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, false>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
wei_e_k_global_desc,
p_wei_global,
in_e_n_ho_wo_global_desc,
p_in_global,
out_k_n_ho_wo_global_desc,
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, false>{});
}
}
timer.End();
float ave_time = timer.GetElapsedTime() / nrepeat;
float perf =
static_cast<float>(calculate_convolution_flops(in_n_c_hi_wi_global_desc,
wei_k_c_y_x_global_desc,
out_n_k0_ho_wo_k1_global_desc)) /
(std::size_t(1000) * 1000 * 1000) / ave_time;
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
<< std::endl;
}
#endif
}
};
#endif
#ifndef DRIVER_CONVOLUTION_FORWARD_IMPLICIT_GEMM_V5R1_DLOPS_NCHW_KCYX_NKHW_OUTPAD_HPP
#define DRIVER_CONVOLUTION_FORWARD_IMPLICIT_GEMM_V5R1_DLOPS_NCHW_KCYX_NKHW_OUTPAD_HPP
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "gridwise_gemm_dlops_v2.hpp"
#include "gridwise_operation_wrapper.hpp"
template <ck::index_t BlockSize,
typename FloatAB,
typename FloatAcc,
typename FloatC,
ck::index_t KPerBlock,
ck::index_t HoPerBlock,
ck::index_t WoPerBlock,
ck::index_t EPerBlock,
ck::index_t KPerThread,
ck::index_t HoPerThread,
ck::index_t WoPerThread,
ck::index_t EPerThread,
typename ABlockTransferThreadSliceLengths_E_K,
typename ABlockTransferThreadClusterLengths_E_K,
ck::index_t ABlockTransferSrcScalarPerVector_E,
ck::index_t ABlockTransferDstScalarPerVector_K,
ck::index_t BThreadTransferSrcScalarPerVector_W,
ck::index_t CThreadTransferDstScalarPerVector_W>
struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outpad
{
template <typename... Wei,
typename... In,
typename... Out,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
__host__ void Run(const ck::TensorDescriptor<Wei...>& wei_k_c_y_x_global_desc,
const ck::TensorDescriptor<In...>& in_n_c_hi_wi_global_desc,
const ck::TensorDescriptor<Out...>& out_n_k0_ho_wo_k1_global_desc,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads& in_right_pads,
const FloatAB* __restrict__ p_wei_global,
const FloatAB* __restrict__ p_in_global,
FloatC* __restrict__ p_out_global) const
{
using namespace ck;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto I4 = Number<4>{};
const auto N = in_n_c_hi_wi_global_desc.GetLength(I0);
const auto C = in_n_c_hi_wi_global_desc.GetLength(I1);
const auto K0 = out_n_k0_ho_wo_k1_global_desc.GetLength(I1);
const auto Hi = in_n_c_hi_wi_global_desc.GetLength(I2);
const auto Wi = in_n_c_hi_wi_global_desc.GetLength(I3);
const auto Ho = out_n_k0_ho_wo_k1_global_desc.GetLength(I2);
const auto Wo = out_n_k0_ho_wo_k1_global_desc.GetLength(I3);
const auto K1 = out_n_k0_ho_wo_k1_global_desc.GetLength(I4);
const auto K = wei_k_c_y_x_global_desc.GetLength(I0);
const auto Y = wei_k_c_y_x_global_desc.GetLength(I2);
const auto X = wei_k_c_y_x_global_desc.GetLength(I3);
const auto ConvStrideH = conv_strides[I0];
const auto ConvStrideW = conv_strides[I1];
const auto ConvDilationH = conv_dilations[I0];
const auto ConvDilationW = conv_dilations[I1];
const auto Hop = (Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock;
const auto Wop = (Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock;
const auto OutRightPadH = Hop - Ho;
const auto OutRightPadW = Wop - Wo;
const auto InLeftPadH = in_left_pads[I0];
const auto InLeftPadW = in_left_pads[I1];
const auto InRightPadH = in_right_pads[I0] + OutRightPadH * ConvStrideH;
const auto InRightPadW = in_right_pads[I1] + OutRightPadW * ConvStrideW;
std::cerr << "OutRightPadH = " << OutRightPadH << " OutRightPadW = " << OutRightPadW
<< std::endl;
std::cerr << "InRightPadH = " << InRightPadH << " InRightPadW = " << InRightPadW
<< std::endl;
// weight tensor
const auto wei_e_k_global_desc = transform_tensor_descriptor(
make_naive_tensor_descriptor_packed(make_tuple(K, C * Y * X)),
make_tuple(make_pass_through_transform(K), make_pass_through_transform(C * Y * X)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<1>{}, Sequence<0>{}));
// input tensor
const auto in_n_c_hip_wip_global_desc = transform_tensor_descriptor(
in_n_c_hi_wi_global_desc,
make_tuple(make_pass_through_transform(N),
make_pass_through_transform(C),
make_pad_transform(Hi, InLeftPadH, InRightPadH),
make_pad_transform(Wi, InLeftPadW, InRightPadW)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
const auto in_n_c_y_ho_x_wo_global_desc = transform_tensor_descriptor(
in_n_c_hip_wip_global_desc,
make_tuple(
make_pass_through_transform(N),
make_pass_through_transform(C),
make_embed_transform(make_tuple(Y, Hop), make_tuple(ConvDilationH, ConvStrideH)),
make_embed_transform(make_tuple(X, Wop), make_tuple(ConvDilationW, ConvStrideW))),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));
const auto in_e_n_ho_wo_global_desc = transform_tensor_descriptor(
in_n_c_y_ho_x_wo_global_desc,
make_tuple(make_merge_transform(make_tuple(C, Y, X)),
make_pass_through_transform(N),
make_pass_through_transform(Hop),
make_pass_through_transform(Wop)),
make_tuple(Sequence<1, 2, 4>{}, Sequence<0>{}, Sequence<3>{}, Sequence<5>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
// output tensor
const auto out_k_n_hop_wop_global_desc = transform_tensor_descriptor(
make_naive_tensor_descriptor_packed(make_tuple(N, K0, Ho, Wo, K1)),
make_tuple(make_merge_transform(make_tuple(K0, K1)),
make_pass_through_transform(N),
make_pad_transform(Ho, 0, OutRightPadH),
make_pad_transform(Wo, 0, OutRightPadW)),
make_tuple(Sequence<1, 4>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
const auto E = C * Y * X;
std::cerr << "Hop = " << Hop << " Wop = " << Wop << std::endl;
if(!((K % KPerBlock) == 0 && (Hop % HoPerBlock) == 0 && (Wop % WoPerBlock) == 0 &&
(E % EPerBlock) == 0))
{
throw std::runtime_error("wrong! GEMM size not divisible");
}
// hack to control index calculation when iterating over a_k_m_global tensor
constexpr auto a_e_k_global_step_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0>{}, Sequence<0, 0, 0>{}),
make_tuple(Sequence<0, 0, 0>{}, Sequence<0, 0, 0>{}));
constexpr auto a_e_k_global_move_slice_window_step_hack = Sequence<0, 0, 0>{};
constexpr auto b_e_n_ho_wo_global_step_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}));
constexpr auto b_e_n_ho_wo_global_move_slice_window_step_hack =
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{};
// hack to control index calculation when iterating over c_m0_m1_n0_n1_global tensor
// hack for NKHW format
constexpr auto c_k_n_ho_wo_global_tensor_step_hacks =
make_tuple(make_tuple(Sequence<0, 1, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}),
make_tuple(Sequence<0, 2, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}));
// GEMM
using gridwise_gemm = GridwiseGemmDlops_km_kn_mn_v3<
BlockSize,
FloatAB,
FloatAcc,
FloatC,
InMemoryDataOperationEnum_t::Set,
decltype(wei_e_k_global_desc),
decltype(in_e_n_ho_wo_global_desc),
decltype(out_k_n_hop_wop_global_desc),
KPerBlock,
HoPerBlock,
WoPerBlock,
EPerBlock,
KPerThread,
HoPerThread,
WoPerThread,
EPerThread,
ABlockTransferThreadSliceLengths_E_K,
ABlockTransferThreadClusterLengths_E_K,
Sequence<1, 0>,
Sequence<1, 0>,
0,
ABlockTransferSrcScalarPerVector_E,
ABlockTransferDstScalarPerVector_K,
false, // don't move back src coordinate after threadwise copy
Sequence<0, 2, 3, 1>,
3,
BThreadTransferSrcScalarPerVector_W,
false, // don't move back src coordinate after threadwise copy, which will be fused with
// MoveSrcSliceWindow() to save addr computation
Sequence<0, 2, 3, 1>,
0,
CThreadTransferDstScalarPerVector_W,
decltype(a_e_k_global_step_hacks),
decltype(b_e_n_ho_wo_global_step_hacks),
decltype(c_k_n_ho_wo_global_tensor_step_hacks),
decltype(a_e_k_global_move_slice_window_step_hack),
decltype(b_e_n_ho_wo_global_move_slice_window_step_hack)>;
const auto GridSize = (K / KPerBlock) * (Hop / HoPerBlock) * (Wop / WoPerBlock) * N;
const bool has_main_k_block_loop = (E + EPerBlock) / (2 * EPerBlock) > 1;
const bool has_double_tail_k_block_loop = (E / EPerBlock) % 2 == 0;
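// E is consumed in EPerBlock chunks by a software-pipelined loop that
// retires two chunks per "main" iteration; the epilogue is either one
// chunk or a two-chunk "double tail", hence the two flags above.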
index_t nrepeat = 100;
for(index_t i = 0; i < 5; ++i)
{
std::cout << "Start running " << nrepeat << " times..." << std::endl;
KernelTimer timer;
timer.Start();
std::cout << "has_main_k_block_loop: " << has_main_k_block_loop
<< " has_double_tail_k_block_loop: " << has_double_tail_k_block_loop
<< std::endl;
for(index_t j = 0; j < nrepeat; ++j)
{
if(has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
decltype(wei_e_k_global_desc),
const FloatAB*,
decltype(in_e_n_ho_wo_global_desc),
const FloatAB*,
decltype(out_k_n_hop_wop_global_desc),
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, true>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
wei_e_k_global_desc,
p_wei_global,
in_e_n_ho_wo_global_desc,
p_in_global,
out_k_n_hop_wop_global_desc,
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, true>{});
}
else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
decltype(wei_e_k_global_desc),
const FloatAB*,
decltype(in_e_n_ho_wo_global_desc),
const FloatAB*,
decltype(out_k_n_hop_wop_global_desc),
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, false>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
wei_e_k_global_desc,
p_wei_global,
in_e_n_ho_wo_global_desc,
p_in_global,
out_k_n_hop_wop_global_desc,
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, false>{});
}
else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
decltype(wei_e_k_global_desc),
const FloatAB*,
decltype(in_e_n_ho_wo_global_desc),
const FloatAB*,
decltype(out_k_n_hop_wop_global_desc),
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, true>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
wei_e_k_global_desc,
p_wei_global,
in_e_n_ho_wo_global_desc,
p_in_global,
out_k_n_hop_wop_global_desc,
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, true>{});
}
else
{
const auto kernel =
run_gridwise_operation<gridwise_gemm,
decltype(wei_e_k_global_desc),
const FloatAB*,
decltype(in_e_n_ho_wo_global_desc),
const FloatAB*,
decltype(out_k_n_hop_wop_global_desc),
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, false>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
wei_e_k_global_desc,
p_wei_global,
in_e_n_ho_wo_global_desc,
p_in_global,
out_k_n_hop_wop_global_desc,
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, false>{});
}
}
timer.End();
float ave_time = timer.GetElapsedTime() / nrepeat;
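// ave_time is in ms, so flops / 1e9 / ave_time is TFlop/s
// (1e9 flops per ms == 1e12 flops per s).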
float perf =
static_cast<float>(calculate_convolution_flops(in_n_c_hi_wi_global_desc,
wei_k_c_y_x_global_desc,
out_n_k0_ho_wo_k1_global_desc)) /
(std::size_t(1000) * 1000 * 1000) / ave_time;
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
<< std::endl;
}
}
};
#endif
#ifndef DRIVER_CONVOLUTION_MAXPOOL_FORWARD_IMPLICIT_GEMM_V5R1_DLOPS_NC0HWc1_KC0YXC1_NK0HWK1_HPP
#define DRIVER_CONVOLUTION_MAXPOOL_FORWARD_IMPLICIT_GEMM_V5R1_DLOPS_NC0HWc1_KC0YXC1_NK0HWK1_HPP
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "gridwise_gemm_dlops_v3.hpp"
template <ck::index_t BlockSize,
typename FloatAB,
typename FloatAcc,
typename FloatC,
ck::index_t E1_,
ck::index_t E2_,
ck::index_t K2_,
ck::index_t KPerBlock,
ck::index_t HoPerBlock,
ck::index_t WoPerBlock,
ck::index_t E1PerBlock,
ck::index_t KPerThread,
ck::index_t HoPerThread,
ck::index_t WoPerThread,
ck::index_t EPerThread,
typename ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2,
typename ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2,
ck::index_t ABlockTransferSrcScalarPerVector_E2,
ck::index_t ABlockTransferDstScalarPerVector_E2,
ck::index_t BThreadTransferSrcScalarPerVector_E2,
ck::index_t CThreadTransferDstScalarPerVector_K,
ck::ActivTypeEnum_t activ_type>
struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_maxpool
{
template <typename... Wei,
typename... In,
typename... MaxPool,
typename... Out,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
__host__ float Run(const ck::TensorDescriptor<Wei...>& wei_k_c0_y_x_c1_global_desc,
const ck::TensorDescriptor<In...>& in_n_c0_hi_wi_c1_global_desc,
const ck::TensorDescriptor<Out...>& out_n_k0_ho_wo_k1_global_desc,
const ck::TensorDescriptor<MaxPool...>& max_n_k0_hx_wx_k1_global_desc,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads& in_right_pads,
const FloatAB* __restrict__ p_a_grid,
const FloatAB* __restrict__ p_b_grid,
const FloatC* __restrict__ p_bias_grid,
FloatC* __restrict__ p_c_grid,
FloatC* __restrict__ p_d_grid,
const int nrepeat) const
{
using namespace ck;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto I4 = Number<4>{};
const auto N = in_n_c0_hi_wi_c1_global_desc.GetLength(I0);
const auto C0 = in_n_c0_hi_wi_c1_global_desc.GetLength(I1);
const auto Hi = in_n_c0_hi_wi_c1_global_desc.GetLength(I2);
const auto Wi = in_n_c0_hi_wi_c1_global_desc.GetLength(I3);
// const auto C1 = in_n_c0_hi_wi_c1_global_desc.GetLength(I4);
const auto K0 = out_n_k0_ho_wo_k1_global_desc.GetLength(I1);
const auto Ho = out_n_k0_ho_wo_k1_global_desc.GetLength(I2);
const auto Wo = out_n_k0_ho_wo_k1_global_desc.GetLength(I3);
const auto K1 = out_n_k0_ho_wo_k1_global_desc.GetLength(I4);
const auto Hx = max_n_k0_hx_wx_k1_global_desc.GetLength(I2);
const auto Wx = max_n_k0_hx_wx_k1_global_desc.GetLength(I3);
const auto K = wei_k_c0_y_x_c1_global_desc.GetLength(I0);
const auto Y = wei_k_c0_y_x_c1_global_desc.GetLength(I2);
const auto X = wei_k_c0_y_x_c1_global_desc.GetLength(I3);
const auto ConvStrideH = conv_strides[I0];
const auto ConvStrideW = conv_strides[I1];
const auto ConvDilationH = conv_dilations[I0];
const auto ConvDilationW = conv_dilations[I1];
#if CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR
const auto Hop = Number<(Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock>{};
const auto Wop = Number<(Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock>{};
const auto OutRightPadH = Hop - Ho;
const auto OutRightPadW = Wop - Wo;
const auto OutRightPadHx = Number<OutRightPadH / 2>{};
const auto OutRightPadWx = Number<OutRightPadW / 2>{};
#else
const auto Hop = (Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock;
const auto Wop = (Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock;
const auto OutRightPadH = Hop - Ho;
const auto OutRightPadW = Wop - Wo;
const auto OutRightPadHx = OutRightPadH / 2;
const auto OutRightPadWx = OutRightPadW / 2;
#endif
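// Ho/Wo are rounded up to multiples of the block tile, and the difference
// becomes right-padding on the output. The maxpool output is half the conv
// output in each spatial dimension (2x2 window, stride 2), so its pad is
// half as well.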
const auto InLeftPadH = in_left_pads[I0];
const auto InLeftPadW = in_left_pads[I1];
const auto InRightPadH = in_right_pads[I0] + OutRightPadH * ConvStrideH;
const auto InRightPadW = in_right_pads[I1] + OutRightPadW * ConvStrideW;
const auto E = C0 * Y * X;
constexpr auto E1 = Number<E1_>{};
constexpr auto E2 = Number<E2_>{};
constexpr auto K2 = Number<K2_>{};
const auto E0 = E / E1;
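// The reduction is factored as E0 * E1 * E2: E2 (= C1) is the vector
// component kept innermost in the layout, E1 is traversed E1PerBlock at a
// time inside a block, and E0 indexes the outer loop over the remaining
// C0 * Y * X extent (see has_main_e0_block_loop below).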
// weight tensor
const auto a_e_k_e2_grid_desc = transform_tensor_descriptor(
make_naive_tensor_descriptor_packed(make_tuple(K, C0 * Y * X, E2)),
make_tuple(make_pass_through_transform(K),
make_pass_through_transform(C0 * Y * X),
make_pass_through_transform(E2)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
make_tuple(Sequence<1>{}, Sequence<0>{}, Sequence<2>{}));
const auto a_e0_e1_k_e2_grid_desc =
transform_tensor_descriptor(a_e_k_e2_grid_desc,
make_tuple(make_unmerge_transform(make_tuple(E0, E1)),
make_pass_through_transform(K),
make_pass_through_transform(E2)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
make_tuple(Sequence<0, 1>{}, Sequence<2>{}, Sequence<3>{}));
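// Weight side of the GEMM: the (K, C0*Y*X, E2) tensor is transposed to
// (E, K, E2) by the output-order remap above, then E is unmerged into
// (E0, E1) to match the blocked reduction loop.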
// input tensor
const auto in_n_c0_hip_wip_e2_global_desc = transform_tensor_descriptor(
make_naive_tensor_descriptor_packed(make_tuple(N, C0, Hi, Wi, E2)),
make_tuple(make_pass_through_transform(N),
make_pass_through_transform(C0),
make_pad_transform(Hi, InLeftPadH, InRightPadH),
make_pad_transform(Wi, InLeftPadW, InRightPadW),
make_pass_through_transform(E2)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}));
const auto in_n_c0_y_ho_x_wo_e2_global_desc = transform_tensor_descriptor(
in_n_c0_hip_wip_e2_global_desc,
make_tuple(
make_pass_through_transform(N),
make_pass_through_transform(C0),
make_embed_transform(make_tuple(Y, Hop), make_tuple(ConvDilationH, ConvStrideH)),
make_embed_transform(make_tuple(X, Wop), make_tuple(ConvDilationW, ConvStrideW)),
make_pass_through_transform(E2)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
make_tuple(
Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}, Sequence<6>{}));
const auto in_e_n_ho_wo_e2_grid_desc = transform_tensor_descriptor(
in_n_c0_y_ho_x_wo_e2_global_desc,
make_tuple(make_merge_transform(make_tuple(C0, Y, X)),
make_pass_through_transform(N),
make_pass_through_transform(Hop),
make_pass_through_transform(Wop),
make_pass_through_transform(E2)),
make_tuple(
Sequence<1, 2, 4>{}, Sequence<0>{}, Sequence<3>{}, Sequence<5>{}, Sequence<6>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}));
const auto b_e0_e1_n_ho_wo_e2_grid_desc = transform_tensor_descriptor(
in_e_n_ho_wo_e2_grid_desc,
make_tuple(make_unmerge_transform(make_tuple(E0, E1)),
make_pass_through_transform(N),
make_pass_through_transform(Hop),
make_pass_through_transform(Wop),
make_pass_through_transform(E2)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
make_tuple(
Sequence<0, 1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}, Sequence<5>{}));
// output tensor
const auto c_k_n_hop_wop_grid_desc = transform_tensor_descriptor(
make_naive_tensor_descriptor_packed(make_tuple(N, K0, Ho, Wo, K1)),
make_tuple(make_merge_transform(make_tuple(K0, K1)),
make_pass_through_transform(N),
make_pad_transform(Ho, I0, OutRightPadH),
make_pad_transform(Wo, I0, OutRightPadW)),
make_tuple(Sequence<1, 4>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
// max tensor
const auto d_k_n_hx_wx_grid_desc = transform_tensor_descriptor(
make_naive_tensor_descriptor_packed(make_tuple(N, K0, Hx, Wx, K1)),
make_tuple(make_merge_transform(make_tuple(K0, K1)),
make_pass_through_transform(N),
make_pad_transform(Hx, I0, OutRightPadHx),
make_pad_transform(Wx, I0, OutRightPadWx)),
make_tuple(Sequence<1, 4>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
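// The maxpool result d lives at (Hx, Wx) = (Ho/2, Wo/2) and gets the same
// (K0, K1) merge and right-padding treatment as c, with half-sized pads.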
std::cerr << "Hop = " << Hop << " Wop = " << Wop << std::endl;
if(!((K % KPerBlock) == 0 && (Hop % HoPerBlock) == 0 && (Wop % WoPerBlock) == 0 &&
(E1 % E1PerBlock) == 0))
{
throw std::runtime_error("wrong! GEMM size not divisible");
}
// clang-format off
// hack to control index calculation when iterating over a_e0_e1_k_e2_global tensor
constexpr auto a_e0_e1_k_e2_global_step_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}));
constexpr auto a_e0_e1_k_e2_global_move_slice_window_step_hack =
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{};
// hack to control index calculation when iterating over b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global tensor
constexpr auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks =
make_tuple(
make_tuple(
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}),
make_tuple(
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{})
);
constexpr auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_move_slice_window_step_hack =
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{};
constexpr auto c_k0_k1_n_h0_h1_h2_w0_w1_w2_global_tensor_step_hacks =
make_tuple(make_tuple(Sequence<0, 1, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 1, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}),
make_tuple(Sequence<0, 2, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 2, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}));
constexpr auto d_k0_k1_n_h0_h1_hx_w0_w1_wx_global_tensor_step_hacks =
make_tuple(make_tuple(Sequence<0, 1, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 1, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}),
make_tuple(Sequence<0, 2, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 2, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}));
// clang-format on
// GEMM
using GridwiseGemm = GridwiseGemmDlops_km_kn_mn_v3<
BlockSize,
FloatAB,
FloatAcc,
FloatC,
InMemoryDataOperationEnum_t::Set,
decltype(a_e0_e1_k_e2_grid_desc),
decltype(b_e0_e1_n_ho_wo_e2_grid_desc),
decltype(c_k_n_hop_wop_grid_desc),
decltype(d_k_n_hx_wx_grid_desc),
E1,
E2,
K2,
KPerBlock,
HoPerBlock,
WoPerBlock,
E1PerBlock,
KPerThread,
HoPerThread,
WoPerThread,
EPerThread,
ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2,
ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2,
Sequence<2, 3, 0, 1, 4>,
Sequence<0, 1, 2, 3, 4>,
4,
ABlockTransferSrcScalarPerVector_E2,
ABlockTransferDstScalarPerVector_E2,
false, // don't move back src coordinate after threadwise copy
Sequence<0, 1, 2, 3, 4, 5, 6, 7, 8, 9>, // E0, E1, N, H0, H1, H2, W0, W1, W2, E2
9,
BThreadTransferSrcScalarPerVector_E2,
false, // don't move back src coordinate after threadwise copy, which will be fused
// with MoveSrcSliceWindow() to save addr computation
Sequence<0, 1, 2, 3, 4, 5, 6, 7, 8>, // K0, K1, N, H0, H1, H2, W0, W1, W2
1,
CThreadTransferDstScalarPerVector_K,
decltype(a_e0_e1_k_e2_global_step_hacks),
decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks),
decltype(c_k0_k1_n_h0_h1_h2_w0_w1_w2_global_tensor_step_hacks),
decltype(d_k0_k1_n_h0_h1_hx_w0_w1_wx_global_tensor_step_hacks),
decltype(a_e0_e1_k_e2_global_move_slice_window_step_hack),
decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_move_slice_window_step_hack)>;
const auto a_e0_e1_k0_k1_e2_grid_desc =
GridwiseGemm::MakeAE0E1K0K1E2GridDescriptor(a_e0_e1_k_e2_grid_desc);
const auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc =
GridwiseGemm::MakeBE0E1NH0H1H2W0W1W2E2GridDescriptor(b_e0_e1_n_ho_wo_e2_grid_desc);
const auto c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc =
GridwiseGemm::MakeCK0K1NH0H1H2W0W1W2GridDescriptor(c_k_n_hop_wop_grid_desc);
const auto d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc =
GridwiseGemm::MakeDK0K1NH0H1HxW0W1WxGridDescriptorMaxPool(d_k_n_hx_wx_grid_desc);
using AGridDesc_E0_E1_K0_K1_E2 = decltype(a_e0_e1_k0_k1_e2_grid_desc);
using BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2 =
decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc);
using CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2 = decltype(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc);
using DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx = decltype(d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc);
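// The gridwise GEMM re-tiles each descriptor for the block/thread
// hierarchy: K splits into (K0, K1) and each spatial dimension into three
// levels (H0, H1, H2 / W0, W1, W2) matching the block, thread-cluster and
// per-thread decomposition.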
const auto grid_size = (K / KPerBlock) * (Hop / HoPerBlock) * (Wop / WoPerBlock) * N;
const bool has_main_e0_block_loop = E0 > 1;
std::cerr << "has_main_e0_block_loop = " << has_main_e0_block_loop << std::endl;
const auto c_blockid_to_k_n_h_w_block_cluster_adaptor =
GridwiseGemm::MakeCBlockIdToKNHoWoBlockClusterAdaptor(c_k_n_hop_wop_grid_desc);
using CBlockIdToBlockClusterAdaptor_K_N_H_W =
decltype(c_blockid_to_k_n_h_w_block_cluster_adaptor);
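// Maps a flat blockIdx.x back to its (k, n, ho, wo) block-tile coordinate;
// grid_size above is exactly the number of such tiles.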
float ave_time = 0;
#if CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE
if(has_main_e0_block_loop)
{
const auto kernel = kernel_gemm_dlops_v3_maxpool<
GridwiseGemm,
FloatAB,
FloatC,
remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>,
remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
remove_reference_t<DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx>,
remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
true,
activ_type>;
ave_time = launch_and_time_kernel(kernel,
nrepeat,
dim3(grid_size),
dim3(BlockSize),
0,
p_a_grid,
p_b_grid,
p_bias_grid,
p_c_grid,
p_d_grid,
a_e0_e1_k0_k1_e2_grid_desc,
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc,
c_blockid_to_k_n_h_w_block_cluster_adaptor);
}
else
{
const auto kernel = kernel_gemm_dlops_v3_maxpool<
GridwiseGemm,
FloatAB,
FloatC,
remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>,
remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
remove_reference_t<DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx>,
remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
false,
activ_type>;
ave_time = launch_and_time_kernel(kernel,
nrepeat,
dim3(grid_size),
dim3(BlockSize),
0,
p_a_grid,
p_b_grid,
p_bias_grid,
p_c_grid,
p_d_grid,
a_e0_e1_k0_k1_e2_grid_desc,
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc,
c_blockid_to_k_n_h_w_block_cluster_adaptor);
}
#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER
DeviceMem a_e0_e1_k0_k1_e2_grid_desc_dev_buf(sizeof(AGridDesc_E0_E1_K0_K1_E2));
DeviceMem b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc_dev_buf(
sizeof(BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2));
DeviceMem c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc_dev_buf(
sizeof(CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2));
DeviceMem d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc_dev_buf(
sizeof(DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx));
DeviceMem c_blockid_to_k_n_h_w_block_cluster_adaptor_dev_buf(
sizeof(CBlockIdToBlockClusterAdaptor_K_N_H_W));
a_e0_e1_k0_k1_e2_grid_desc_dev_buf.ToDevice(&a_e0_e1_k0_k1_e2_grid_desc);
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc_dev_buf.ToDevice(
&b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc);
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc_dev_buf.ToDevice(
&c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc);
d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc_dev_buf.ToDevice(
&d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc);
c_blockid_to_k_n_h_w_block_cluster_adaptor_dev_buf.ToDevice(
&c_blockid_to_k_n_h_w_block_cluster_adaptor);
if(has_main_e0_block_loop)
{
const auto kernel = kernel_gemm_dlops_v3_maxpool<
GridwiseGemm,
FloatAB,
FloatC,
remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>,
remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
remove_reference_t<DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx>,
remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
true,
activ_type>;
ave_time = launch_and_time_kernel(
kernel,
nrepeat,
dim3(grid_size),
dim3(BlockSize),
0,
p_a_grid,
p_b_grid,
p_bias_grid,
p_c_grid,
p_d_grid,
cast_pointer_to_constant_address_space(
a_e0_e1_k0_k1_e2_grid_desc_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(
d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(
c_blockid_to_k_n_h_w_block_cluster_adaptor_dev_buf.GetDeviceBuffer()));
}
else
{
const auto kernel = kernel_gemm_dlops_v3_maxpool<
GridwiseGemm,
FloatAB,
FloatC,
remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>,
remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
remove_reference_t<DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx>,
remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
false,
activ_type>;
ave_time = launch_and_time_kernel(
kernel,
nrepeat,
dim3(grid_size),
dim3(BlockSize),
0,
p_a_grid,
p_b_grid,
p_bias_grid,
p_c_grid,
p_d_grid,
cast_pointer_to_constant_address_space(
a_e0_e1_k0_k1_e2_grid_desc_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(
d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(
c_blockid_to_k_n_h_w_block_cluster_adaptor_dev_buf.GetDeviceBuffer()));
}
#elif CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR
{
static_assert(a_e0_e1_k_e2_grid_desc.IsKnownAtCompileTime(), "");
static_assert(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc.IsKnownAtCompileTime(), "");
static_assert(d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc.IsKnownAtCompileTime(), "");
static_assert(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc.IsKnownAtCompileTime(), "");
static_assert(c_blockid_to_k_n_h_w_block_cluster_adaptor.IsKnownAtCompileTime(), "");
const auto kernel = kernel_gemm_dlops_v3_maxpool<
GridwiseGemm,
FloatAB,
FloatC,
remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>,
remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
remove_reference_t<DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx>,
remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
has_main_e0_block_loop,
activ_type>;
ave_time = launch_and_time_kernel(kernel,
nrepeat,
dim3(grid_size),
dim3(BlockSize),
0,
p_a_grid,
p_b_grid,
p_bias_grid,
p_c_grid,
p_d_grid);
}
#endif
return ave_time;
}
};
#endif
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "config.hpp"
#include "debug.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "conv_common.hpp"
#include "device_tensor.hpp"
#include "device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp"
#define USE_DYNAMIC_MODE 0
#define USE_CONV_FWD_V5R1_NCHWC 1
enum ConvForwardAlgo
{
V5R1NCHWC // 0
};
template <typename TIn,
typename TWei,
typename TOut,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void host_direct_convolution_add_nchwc(const Tensor<TIn>& in,
const Tensor<TWei>& wei,
const Tensor<TOut>& add,
const Tensor<TOut>& bias,
Tensor<TOut>& add_host,
Tensor<TOut>& out_host,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads&,
const ck::ActivTypeEnum_t activ_type)
{
using namespace ck;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
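// Reference conv+bias+activation in the NC0HWc1 layout: channels are split
// into (c0, c1) / (k0, k1) with the vector component innermost, so the flat
// output channel is k = k0 * K1 + k1. The result is then broadcast-added
// into a 2x-upsampled "add" tensor: each conv output pixel updates the 2x2
// patch at (2*ho, 2*wo), matching the fused resize_add stage.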
auto f_nchw = [&](auto n, auto k0, auto ho, auto wo, auto k1) {
double v = 0;
auto k = k0 * out_host.mDesc.GetLengths()[4] + k1;
for(int c0 = 0; c0 < wei.mDesc.GetLengths()[1]; ++c0)
{
for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y)
{
int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
for(int x = 0; x < wei.mDesc.GetLengths()[3]; ++x)
{
int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
wi < in.mDesc.GetLengths()[3])
{
for(int c1 = 0; c1 < wei.mDesc.GetLengths()[4]; ++c1)
{
v += static_cast<double>(in(n, c0, hi, wi, c1)) *
static_cast<double>(wei(k, c0, y, x, c1));
}
}
}
}
}
v += bias(k0, k1);
v = activ(v, activ_type);
const int hox2 = ho * 2;
const int wox2 = wo * 2;
out_host(n, k0, ho, wo, k1) = v;
add_host(n, k0, hox2, wox2, k1) = v + add(n, k0, hox2, wox2, k1);
add_host(n, k0, hox2, wox2 + 1, k1) = v + add(n, k0, hox2, wox2 + 1, k1);
add_host(n, k0, hox2 + 1, wox2, k1) = v + add(n, k0, hox2 + 1, wox2, k1);
add_host(n, k0, hox2 + 1, wox2 + 1, k1) = v + add(n, k0, hox2 + 1, wox2 + 1, k1);
};
make_ParallelTensorFunctor(f_nchw,
out_host.mDesc.GetLengths()[0],
out_host.mDesc.GetLengths()[1],
out_host.mDesc.GetLengths()[2],
out_host.mDesc.GetLengths()[3],
out_host.mDesc.GetLengths()[4])(std::thread::hardware_concurrency());
}
int main(int argc, char* argv[])
{
using namespace ck;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto I4 = Number<4>{};
constexpr auto I5 = Number<5>{};
constexpr auto I6 = Number<6>{};
constexpr auto I7 = Number<7>{};
#if USE_DYNAMIC_MODE
// dynamic mode
if(argc != 23)
{
printf("arg1 to 5: algo, do_verification, init_method, do_log, nrepeat\n");
printf("rest: N, K0, K1, C0, C1, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
"RightPx\n");
exit(1);
}
constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu;
const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::stoi(argv[1]));
const bool do_verification = std::stoi(argv[2]);
const int init_method = std::stoi(argv[3]);
const bool do_log = std::stoi(argv[4]);
const int nrepeat = std::stoi(argv[5]);
const index_t N = std::stoi(argv[6]);
const index_t K0 = std::stoi(argv[7]);
const index_t K1 = std::stoi(argv[8]);
const index_t C0 = std::stoi(argv[9]);
const index_t C1 = std::stoi(argv[10]);
const index_t Y = std::stoi(argv[11]);
const index_t X = std::stoi(argv[12]);
const index_t Hi = std::stoi(argv[13]);
const index_t Wi = std::stoi(argv[14]);
const index_t conv_stride_h = std::stoi(argv[15]);
const index_t conv_stride_w = std::stoi(argv[16]);
const index_t conv_dilation_h = std::stoi(argv[17]);
const index_t conv_dilation_w = std::stoi(argv[18]);
const index_t in_left_pad_h = std::stoi(argv[19]);
const index_t in_left_pad_w = std::stoi(argv[20]);
const index_t in_right_pad_h = std::stoi(argv[21]);
const index_t in_right_pad_w = std::stoi(argv[22]);
const index_t YEff = (Y - 1) * conv_dilation_h + 1;
const index_t XEff = (X - 1) * conv_dilation_w + 1;
const index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
const auto Hox2 = Ho * 2;
const auto Wox2 = Wo * 2;
#else
// static mode
if(argc < 6)
{
printf("arg1 to 5: algo, do_verification, init_method, do_log, nrepeat\n");
exit(1);
}
const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::stoi(argv[1]));
const bool do_verification = std::stoi(argv[2]);
const int init_method = std::stoi(argv[3]);
const bool do_log = std::stoi(argv[4]);
const int nrepeat = std::stoi(argv[5]);
constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu;
#if 0
constexpr auto N = Number<1>{};
constexpr auto Hi = Number<1080>{};
constexpr auto Wi = Number<1920>{};
constexpr auto Y = Number<3>{};
constexpr auto X = Number<3>{};
constexpr auto C0 = Number<2>{};
constexpr auto C1 = Number<8>{};
constexpr auto K1 = Number<8>{};
constexpr auto K0 = Number<8>{};
#elif 0
constexpr auto N = Number<1>{};
constexpr auto Hi = Number<540>{};
constexpr auto Wi = Number<960>{};
constexpr auto Y = Number<3>{};
constexpr auto X = Number<3>{};
constexpr auto C0 = Number<2>{};
constexpr auto C1 = Number<8>{};
constexpr auto K0 = Number<2>{};
constexpr auto K1 = Number<8>{};
#elif 0
constexpr auto N = Number<1>{};
constexpr auto Hi = Number<270>{};
constexpr auto Wi = Number<480>{};
constexpr auto Y = Number<3>{};
constexpr auto X = Number<3>{};
constexpr auto C0 = Number<2>{};
constexpr auto C1 = Number<8>{};
constexpr auto K0 = Number<2>{};
constexpr auto K1 = Number<8>{};
#elif 1
constexpr auto N = Number<128>{};
constexpr auto Hi = Number<135>{};
constexpr auto Wi = Number<240>{};
constexpr auto Y = Number<3>{};
constexpr auto X = Number<3>{};
constexpr auto C0 = Number<2>{};
constexpr auto C1 = Number<8>{};
constexpr auto K0 = Number<2>{};
constexpr auto K1 = Number<8>{};
#elif 1
constexpr auto N = Number<1>{};
constexpr auto Hi = Number<32>{};
constexpr auto Wi = Number<32>{};
constexpr auto Y = Number<3>{};
constexpr auto X = Number<3>{};
constexpr auto C0 = Number<2>{};
constexpr auto C1 = Number<8>{};
constexpr auto K1 = Number<8>{};
constexpr auto K0 = Number<8>{};
#endif
constexpr auto conv_stride_h = I1;
constexpr auto conv_stride_w = I1;
constexpr auto conv_dilation_h = I1;
constexpr auto conv_dilation_w = I1;
constexpr auto in_left_pad_h = I1;
constexpr auto in_left_pad_w = I1;
constexpr auto in_right_pad_h = I1;
constexpr auto in_right_pad_w = I1;
constexpr auto YEff = (Y - I1) * conv_dilation_h + I1;
constexpr auto XEff = (X - I1) * conv_dilation_w + I1;
constexpr auto Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + I1;
constexpr auto Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + I1;
constexpr auto Hox2 = Number<Ho * 2>{};
constexpr auto Wox2 = Number<Wo * 2>{};
#endif
#if 0
using in_data_t = float;
using acc_data_t = float;
using out_data_t = float;
#elif 1
using in_data_t = half_t;
using acc_data_t = float;
using out_data_t = half_t;
#elif 1
using in_data_t = int8_t;
using acc_data_t = int32_t;
using out_data_t = int8_t;
#endif
std::vector<std::size_t> in_lengths_host(5), wei_lengths_host(5), out_lengths_host(5),
add_lengths_host(5), bias_lengths_host(2);
in_lengths_host[0] = static_cast<std::size_t>(N);
in_lengths_host[1] = static_cast<std::size_t>(C0);
in_lengths_host[2] = static_cast<std::size_t>(Hi);
in_lengths_host[3] = static_cast<std::size_t>(Wi);
in_lengths_host[4] = static_cast<std::size_t>(C1);
wei_lengths_host[0] = static_cast<std::size_t>(K0 * K1);
wei_lengths_host[1] = static_cast<std::size_t>(C0);
wei_lengths_host[2] = static_cast<std::size_t>(Y);
wei_lengths_host[3] = static_cast<std::size_t>(X);
wei_lengths_host[4] = static_cast<std::size_t>(C1);
out_lengths_host[0] = static_cast<std::size_t>(N);
out_lengths_host[1] = static_cast<std::size_t>(K0);
out_lengths_host[2] = static_cast<std::size_t>(Ho);
out_lengths_host[3] = static_cast<std::size_t>(Wo);
out_lengths_host[4] = static_cast<std::size_t>(K1);
add_lengths_host[0] = static_cast<std::size_t>(N);
add_lengths_host[1] = static_cast<std::size_t>(K0);
add_lengths_host[2] = static_cast<std::size_t>(Hox2);
add_lengths_host[3] = static_cast<std::size_t>(Wox2);
add_lengths_host[4] = static_cast<std::size_t>(K1);
bias_lengths_host[0] = static_cast<std::size_t>(K0);
bias_lengths_host[1] = static_cast<std::size_t>(K1);
Tensor<in_data_t> in(in_lengths_host);
Tensor<in_data_t> wei(wei_lengths_host);
Tensor<in_data_t> add(add_lengths_host);
Tensor<in_data_t> add_device(add_lengths_host);
Tensor<in_data_t> add_host(add_lengths_host);
Tensor<out_data_t> bias(bias_lengths_host);
Tensor<out_data_t> out_host(out_lengths_host);
ostream_HostTensorDescriptor(in.mDesc, std::cout << "in: ");
ostream_HostTensorDescriptor(wei.mDesc, std::cout << "wei: ");
ostream_HostTensorDescriptor(add.mDesc, std::cout << "add: ");
print_array("InLeftPads", make_tuple(in_left_pad_h, in_left_pad_w));
print_array("InRightPads", make_tuple(in_right_pad_h, in_right_pad_w));
print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));
std::size_t num_thread = std::thread::hardware_concurrency();
switch(init_method)
{
case 0:
// no initialization
break;
case 1:
in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
break;
case 2:
in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
break;
case 3:
in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
break;
case 4:
in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
break;
case 5:
in.GenerateTensorValue(GeneratorTensor_3<float>{0.0, 1.0}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_3<float>{-0.5, 0.5}, num_thread);
break;
default:
in.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread);
auto gen_wei = [](auto... is) {
return GeneratorTensor_2{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...);
};
wei.GenerateTensorValue(gen_wei, num_thread);
}
bias.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
add.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
auto f_make_for_device_nchwc = [&]() {
const auto in_lengths_dev = make_tuple(N, C0, Hi, Wi, C1);
const auto wei_lengths_dev = make_tuple(K0 * K1, C0, Y, X, C1);
const auto add_lengths_dev = make_tuple(N, K0, Hox2, Wox2, K1);
const auto out_lengths_dev = make_tuple(N, K0, Ho, Wo, K1);
const auto conv_strides_dev = make_tuple(conv_stride_h, conv_stride_w);
const auto conv_dilations_dev = make_tuple(conv_dilation_h, conv_dilation_w);
const auto in_left_pads_dev = make_tuple(in_left_pad_h, in_left_pad_w);
const auto in_right_pads_dev = make_tuple(in_right_pad_h, in_right_pad_w);
return make_tuple(in_lengths_dev,
wei_lengths_dev,
add_lengths_dev,
out_lengths_dev,
conv_strides_dev,
conv_dilations_dev,
in_left_pads_dev,
in_right_pads_dev);
};
#if USE_CONV_FWD_V5R1_NCHWC
if(algo == ConvForwardAlgo::V5R1NCHWC)
{
const auto tmp = f_make_for_device_nchwc();
device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1<in_data_t,
acc_data_t,
out_data_t,
activ_type>(
tmp[I0], // in_lengths_dev
tmp[I1], // wei_lengths_dev
tmp[I2], // add_lengths_dev
tmp[I3], // out_lengths_dev
tmp[I4], // conv_strides_dev
tmp[I5], // conv_dilations_dev
tmp[I6], // in_left_pads_dev
tmp[I7], // in_right_pads_dev
in,
wei,
bias,
add,
add_device,
nrepeat);
}
#endif
if(do_verification)
{
host_direct_convolution_add_nchwc(in,
wei,
add,
bias,
add_host,
out_host,
make_tuple(conv_stride_h, conv_stride_w),
make_tuple(conv_dilation_h, conv_dilation_w),
make_tuple(in_left_pad_h, in_left_pad_w),
make_tuple(in_right_pad_h, in_right_pad_w),
activ_type);
check_error(add_host, add_device);
if(do_log)
{
LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "wei: ", wei.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "add_host: ", add_host.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "add_device: ", add_device.mData, ",") << std::endl;
}
}
}
@@ -15,17 +15,15 @@
#include "device_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp"
#include "device_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk.hpp"
#include "device_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp"
#include "device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp"
#include "device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp"
#include "device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp"
#define USE_DYNAMIC_MODE 1
#define USE_DYNAMIC_MODE 0
#define USE_CONV_FWD_V4R4_NCHW 0
#define USE_CONV_FWD_V4R4R2_NHWC 0
#define USE_CONV_FWD_V6R1_NCHW 0
#define USE_CONV_FWD_V5R1_NCHW 0
#define USE_CONV_FWD_V4R4R2_NHWC 1
#define USE_CONV_FWD_V6R1_NCHW 1
#define USE_CONV_FWD_V4R4R2_XDL_NCHW 0
#define USE_CONV_FWD_V4R4R4_XDL_NHWC 1
#define USE_CONV_FWD_V4R4R4_XDL_NHWC 0
enum ConvTensorLayout
{
@@ -41,9 +39,8 @@ enum ConvForwardAlgo
V4R4NCHW, // 0
V4R4R2NHWC, // 1
V6R1NCHW, // 2
V5R1NCHW, // 3
V4R4R2XDLNCHW, // 4
V4R4R4XDLNHWC // 5
V4R4R2XDLNCHW, // 3
V4R4R4XDLNHWC // 4
};
template <typename TIn,
@@ -237,8 +234,8 @@ int main(int argc, char* argv[])
constexpr auto Y = Number<3>{};
constexpr auto X = Number<3>{};
constexpr auto conv_stride_h = I2;
constexpr auto conv_stride_w = I2;
constexpr auto conv_stride_h = I1;
constexpr auto conv_stride_w = I1;
constexpr auto conv_dilation_h = I1;
constexpr auto conv_dilation_w = I1;
constexpr auto in_left_pad_h = I1;
@@ -253,7 +250,7 @@ int main(int argc, char* argv[])
constexpr auto Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + I1;
#endif
#if 0
#if 1
using in_data_t = float;
using acc_data_t = float;
using out_data_t = float;
@@ -472,33 +469,6 @@ int main(int argc, char* argv[])
}
#endif
#if USE_CONV_FWD_V5R1_NCHW
if(algo == ConvForwardAlgo::V5R1NCHW)
{
if(layout != ConvTensorLayout::NCHW)
{
throw std::runtime_error("wrong! layout");
}
const auto tmp = f_make_for_device_nchw();
device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw<in_data_t,
16,
acc_data_t,
out_data_t>(tmp[I0],
tmp[I1],
tmp[I2],
tmp[I3],
tmp[I4],
tmp[I5],
tmp[I6],
in,
wei,
out_device,
nrepeat);
}
#endif
#if USE_CONV_FWD_V4R4R2_XDL_NCHW
if(algo == ConvForwardAlgo::V4R4R2XDLNCHW)
{
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "config.hpp"
#include "debug.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "conv_common.hpp"
#include "device_tensor.hpp"
#include "device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp"
#define USE_DYNAMIC_MODE 0
#define USE_CONV_FWD_V5R1_NCHWC 1
enum ConvForwardAlgo
{
V5R1NCHWC // 0
};
template <typename TIn,
typename TWei,
typename TOut,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void host_direct_convolution_nchwc(const Tensor<TIn>& in,
const Tensor<TWei>& wei,
const Tensor<TOut>& bias,
Tensor<TOut>& out,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads&,
const ck::ActivTypeEnum_t activ_type)
{
using namespace ck;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
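// Same NC0HWc1 reference convolution as the fused variants, minus the
// add/maxpool stage: accumulate in double, add bias, apply the activation.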
auto f_nchw = [&](auto n, auto k0, auto ho, auto wo, auto k1) {
double v = 0;
const int k = k0 * out.mDesc.GetLengths()[4] + k1;
for(int c0 = 0; c0 < wei.mDesc.GetLengths()[1]; ++c0)
{
for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y)
{
int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
for(int x = 0; x < wei.mDesc.GetLengths()[3]; ++x)
{
int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
wi < in.mDesc.GetLengths()[3])
{
for(int c1 = 0; c1 < wei.mDesc.GetLengths()[4]; ++c1)
{
v += static_cast<double>(in(n, c0, hi, wi, c1)) *
static_cast<double>(wei(k, c0, y, x, c1));
}
}
}
}
}
v += bias(k0, k1);
out(n, k0, ho, wo, k1) = activ(v, activ_type);
};
make_ParallelTensorFunctor(f_nchw,
out.mDesc.GetLengths()[0],
out.mDesc.GetLengths()[1],
out.mDesc.GetLengths()[2],
out.mDesc.GetLengths()[3],
out.mDesc.GetLengths()[4])(std::thread::hardware_concurrency());
}
int main(int argc, char* argv[])
{
using namespace ck;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto I4 = Number<4>{};
constexpr auto I5 = Number<5>{};
constexpr auto I6 = Number<6>{};
#if USE_DYNAMIC_MODE
// dynamic mode
if(argc != 23)
{
printf("arg1 to 5: algo, do_verification, init_method, do_log, nrepeat\n");
printf("rest: N, K0, K1, C0, C1, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
"RightPx\n");
exit(1);
}
constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu;
const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::stoi(argv[1]));
const bool do_verification = std::stoi(argv[2]);
const int init_method = std::stoi(argv[3]);
const bool do_log = std::stoi(argv[4]);
const int nrepeat = std::stoi(argv[5]);
const index_t N = std::stoi(argv[6]);
const index_t K0 = std::stoi(argv[7]);
const index_t K1 = std::stoi(argv[8]);
const index_t C0 = std::stoi(argv[9]);
const index_t C1 = std::stoi(argv[10]);
const index_t Y = std::stoi(argv[11]);
const index_t X = std::stoi(argv[12]);
const index_t Hi = std::stoi(argv[13]);
const index_t Wi = std::stoi(argv[14]);
const index_t conv_stride_h = std::stoi(argv[15]);
const index_t conv_stride_w = std::stoi(argv[16]);
const index_t conv_dilation_h = std::stoi(argv[17]);
const index_t conv_dilation_w = std::stoi(argv[18]);
const index_t in_left_pad_h = std::stoi(argv[19]);
const index_t in_left_pad_w = std::stoi(argv[20]);
const index_t in_right_pad_h = std::stoi(argv[21]);
const index_t in_right_pad_w = std::stoi(argv[22]);
const index_t YEff = (Y - 1) * conv_dilation_h + 1;
const index_t XEff = (X - 1) * conv_dilation_w + 1;
const index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
#else
// static mode
if(argc < 6)
{
printf("arg1 to 5: algo, do_verification, init_method, do_log, nrepeat\n");
exit(1);
}
const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::stoi(argv[1]));
const bool do_verification = std::stoi(argv[2]);
const int init_method = std::stoi(argv[3]);
const bool do_log = std::stoi(argv[4]);
const int nrepeat = std::stoi(argv[5]);
// constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::Sigmoid;
constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu;
#if 0
constexpr auto N = Number<1>{};
constexpr auto Hi = Number<1080>{};
constexpr auto Wi = Number<1920>{};
constexpr auto Y = Number<3>{};
constexpr auto X = Number<3>{};
constexpr auto C0 = Number<2>{};
constexpr auto C1 = Number<8>{};
constexpr auto K0 = Number<1>{};
constexpr auto K1 = Number<4>{};
#elif 1
constexpr auto N = Number<1>{};
constexpr auto Hi = Number<1080>{};
constexpr auto Wi = Number<1920>{};
constexpr auto Y = Number<3>{};
constexpr auto X = Number<3>{};
constexpr auto C0 = Number<2>{};
constexpr auto C1 = Number<8>{};
constexpr auto K0 = Number<2>{};
constexpr auto K1 = Number<8>{};
#elif 0
constexpr auto N = Number<1>{};
constexpr auto Hi = Number<1080>{};
constexpr auto Wi = Number<1920>{};
constexpr auto Y = Number<1>{};
constexpr auto X = Number<1>{};
constexpr auto C0 = Number<2>{};
constexpr auto C1 = Number<8>{};
constexpr auto K0 = Number<2>{};
constexpr auto K1 = Number<8>{};
#elif 0
constexpr auto N = Number<1>{};
constexpr auto Hi = Number<540>{};
constexpr auto Wi = Number<960>{};
constexpr auto Y = Number<1>{};
constexpr auto X = Number<1>{};
constexpr auto C0 = Number<2>{};
constexpr auto C1 = Number<8>{};
constexpr auto K0 = Number<2>{};
constexpr auto K1 = Number<8>{};
#elif 0
constexpr auto N = Number<128>{};
constexpr auto Hi = Number<270>{};
constexpr auto Wi = Number<480>{};
constexpr auto Y = Number<1>{};
constexpr auto X = Number<1>{};
constexpr auto C0 = Number<2>{};
constexpr auto C1 = Number<8>{};
constexpr auto K0 = Number<2>{};
constexpr auto K1 = Number<8>{};
#endif
constexpr auto conv_stride_h = I1;
constexpr auto conv_stride_w = I1;
constexpr auto conv_dilation_h = I1;
constexpr auto conv_dilation_w = I1;
#if 1
constexpr auto in_left_pad_h = I1;
constexpr auto in_left_pad_w = I1;
constexpr auto in_right_pad_h = I1;
constexpr auto in_right_pad_w = I1;
#else
constexpr auto in_left_pad_h = I0;
constexpr auto in_left_pad_w = I0;
constexpr auto in_right_pad_h = I0;
constexpr auto in_right_pad_w = I0;
#endif
constexpr auto YEff = (Y - I1) * conv_dilation_h + I1;
constexpr auto XEff = (X - I1) * conv_dilation_w + I1;
constexpr auto Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + I1;
constexpr auto Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + I1;
#endif
#if 0
using in_data_t = float;
using acc_data_t = float;
using out_data_t = float;
#elif 1
using in_data_t = half_t;
using acc_data_t = float;
using out_data_t = half_t;
#elif 1
using in_data_t = int8_t;
using acc_data_t = int32_t;
using out_data_t = int8_t;
#endif
std::vector<std::size_t> in_lengths_host(5), wei_lengths_host(5), out_lengths_host(5),
bias_lengths_host(2);
in_lengths_host[0] = static_cast<std::size_t>(N);
in_lengths_host[1] = static_cast<std::size_t>(C0);
in_lengths_host[2] = static_cast<std::size_t>(Hi);
in_lengths_host[3] = static_cast<std::size_t>(Wi);
in_lengths_host[4] = static_cast<std::size_t>(C1);
wei_lengths_host[0] = static_cast<std::size_t>(K0 * K1);
wei_lengths_host[1] = static_cast<std::size_t>(C0);
wei_lengths_host[2] = static_cast<std::size_t>(Y);
wei_lengths_host[3] = static_cast<std::size_t>(X);
wei_lengths_host[4] = static_cast<std::size_t>(C1);
out_lengths_host[0] = static_cast<std::size_t>(N);
out_lengths_host[1] = static_cast<std::size_t>(K0);
out_lengths_host[2] = static_cast<std::size_t>(Ho);
out_lengths_host[3] = static_cast<std::size_t>(Wo);
out_lengths_host[4] = static_cast<std::size_t>(K1);
bias_lengths_host[0] = static_cast<std::size_t>(K0);
bias_lengths_host[1] = static_cast<std::size_t>(K1);
Tensor<in_data_t> in(in_lengths_host);
Tensor<in_data_t> wei(wei_lengths_host);
Tensor<out_data_t> bias(bias_lengths_host);
Tensor<out_data_t> out_host(out_lengths_host);
Tensor<out_data_t> out_device(out_lengths_host);
ostream_HostTensorDescriptor(in.mDesc, std::cout << "in: ");
ostream_HostTensorDescriptor(wei.mDesc, std::cout << "wei: ");
ostream_HostTensorDescriptor(bias.mDesc, std::cout << "bias: ");
ostream_HostTensorDescriptor(out_host.mDesc, std::cout << "out: ");
print_array("InLeftPads", make_tuple(in_left_pad_h, in_left_pad_w));
print_array("InRightPads", make_tuple(in_right_pad_h, in_right_pad_w));
print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));
std::size_t num_thread = std::thread::hardware_concurrency();
switch(init_method)
{
case 0:
// no initialization
break;
case 1:
in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
bias.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
break;
case 2:
in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
bias.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
break;
case 3:
in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
bias.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
break;
case 4:
in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
bias.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
break;
case 5:
in.GenerateTensorValue(GeneratorTensor_3<float>{0.0, 1.0}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_3<float>{-0.5, 0.5}, num_thread);
bias.GenerateTensorValue(GeneratorTensor_3<float>{-0.5, 0.5}, num_thread);
break;
default:
in.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread);
auto gen_wei = [](auto... is) {
return GeneratorTensor_2{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...);
};
wei.GenerateTensorValue(gen_wei, num_thread);
}
auto f_make_for_device_nchwc = [&]() {
const auto in_lengths_dev = make_tuple(N, C0, Hi, Wi, C1);
const auto wei_lengths_dev = make_tuple(K0 * K1, C0, Y, X, C1);
const auto out_lengths_dev = make_tuple(N, K0, Ho, Wo, K1);
const auto conv_strides_dev = make_tuple(conv_stride_h, conv_stride_w);
const auto conv_dilations_dev = make_tuple(conv_dilation_h, conv_dilation_w);
const auto in_left_pads_dev = make_tuple(in_left_pad_h, in_left_pad_w);
const auto in_right_pads_dev = make_tuple(in_right_pad_h, in_right_pad_w);
return make_tuple(in_lengths_dev,
wei_lengths_dev,
out_lengths_dev,
conv_strides_dev,
conv_dilations_dev,
in_left_pads_dev,
in_right_pads_dev);
};
#if USE_CONV_FWD_V5R1_NCHWC
if(algo == ConvForwardAlgo::V5R1NCHWC)
{
const auto tmp = f_make_for_device_nchwc();
device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1<in_data_t,
acc_data_t,
out_data_t,
activ_type>(
tmp[I0],
tmp[I1],
tmp[I2],
tmp[I3],
tmp[I4],
tmp[I5],
tmp[I6],
in,
wei,
bias,
out_device,
nrepeat);
}
#endif
if(do_verification)
{
host_direct_convolution_nchwc(in,
wei,
bias,
out_host,
make_tuple(conv_stride_h, conv_stride_w),
make_tuple(conv_dilation_h, conv_dilation_w),
make_tuple(in_left_pad_h, in_left_pad_w),
make_tuple(in_right_pad_h, in_right_pad_w),
activ_type);
check_error(out_host, out_device);
if(do_log)
{
LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "wei: ", wei.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "bias: ", bias.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "out_host : ", out_host.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "out_device: ", out_device.mData, ",") << std::endl;
}
}
}
#include <iostream>
#include <algorithm> // std::max over the 2x2 pooling window
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "config.hpp"
#include "debug.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "conv_common.hpp"
#include "device_tensor.hpp"
#include "device_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp"
#define USE_DYNAMIC_MODE 0
#define USE_CONV_FWD_V5R1_NCHWC 1
enum ConvForwardAlgo
{
V5R1NCHWC // 0
};
template <typename TIn,
typename TWei,
typename TOut,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void host_direct_convolution_maxpool_nchwc(const Tensor<TIn>& in,
const Tensor<TWei>& wei,
const Tensor<TOut>& bias,
Tensor<TOut>& out_host,
Tensor<TOut>& max_host,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads&,
const ck::ActivTypeEnum_t activ_type)
{
using namespace ck;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
auto f_nchw = [&](auto n, auto k0, auto ho, auto wo, auto k1) {
double v = 0;
auto k = k0 * out_host.mDesc.GetLengths()[4] + k1;
for(int c0 = 0; c0 < wei.mDesc.GetLengths()[1]; ++c0)
{
for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y)
{
int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
for(int x = 0; x < wei.mDesc.GetLengths()[3]; ++x)
{
int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
wi < in.mDesc.GetLengths()[3])
{
for(int c1 = 0; c1 < wei.mDesc.GetLengths()[4]; ++c1)
{
v += static_cast<double>(in(n, c0, hi, wi, c1)) *
static_cast<double>(wei(k, c0, y, x, c1));
}
}
}
}
}
v += bias(k0, k1);
v = activ(v, activ_type);
out_host(n, k0, ho, wo, k1) = v;
};
make_ParallelTensorFunctor(f_nchw,
out_host.mDesc.GetLengths()[0],
out_host.mDesc.GetLengths()[1],
out_host.mDesc.GetLengths()[2],
out_host.mDesc.GetLengths()[3],
out_host.mDesc.GetLengths()[4])(std::thread::hardware_concurrency());
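// Second pass: 2x2 max pooling with stride 2 over the activated conv
// output, matching the fused kernel's maxpool stage.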
auto maxpool_nchw = [&](auto n, auto k0, auto ho, auto wo, auto k1) {
auto hx = ho * 2;
auto wx = wo * 2;
auto v0 = out_host(n, k0, hx, wx, k1);
auto v1 = out_host(n, k0, hx, wx + 1, k1);
auto v2 = out_host(n, k0, hx + 1, wx, k1);
auto v3 = out_host(n, k0, hx + 1, wx + 1, k1);
max_host(n, k0, ho, wo, k1) = std::max({v0, v1, v2, v3});
};
make_ParallelTensorFunctor(maxpool_nchw,
max_host.mDesc.GetLengths()[0],
max_host.mDesc.GetLengths()[1],
max_host.mDesc.GetLengths()[2],
max_host.mDesc.GetLengths()[3],
max_host.mDesc.GetLengths()[4])(std::thread::hardware_concurrency());
}
int main(int argc, char* argv[])
{
using namespace ck;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto I4 = Number<4>{};
constexpr auto I5 = Number<5>{};
constexpr auto I6 = Number<6>{};
constexpr auto I7 = Number<7>{};
#if USE_DYNAMIC_MODE
// dynamic mode
if(argc != 23)
{
printf("arg1 to 5: algo, do_verification, init_method, do_log, nrepeat\n");
printf("rest: N, K0, K1, C0, C1, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
"RightPx\n");
exit(1);
}
constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu;
const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::stoi(argv[1]));
const bool do_verification = std::stoi(argv[2]);
const int init_method = std::stoi(argv[3]);
const bool do_log = std::stoi(argv[4]);
const int nrepeat = std::stoi(argv[5]);
const index_t N = std::stoi(argv[6]);
const index_t K0 = std::stoi(argv[7]);
const index_t K1 = std::stoi(argv[8]);
const index_t C0 = std::stoi(argv[9]);
const index_t C1 = std::stoi(argv[10]);
const index_t Y = std::stoi(argv[11]);
const index_t X = std::stoi(argv[12]);
const index_t Hi = std::stoi(argv[13]);
const index_t Wi = std::stoi(argv[14]);
const index_t conv_stride_h = std::stoi(argv[15]);
const index_t conv_stride_w = std::stoi(argv[16]);
const index_t conv_dilation_h = std::stoi(argv[17]);
const index_t conv_dilation_w = std::stoi(argv[18]);
const index_t in_left_pad_h = std::stoi(argv[19]);
const index_t in_left_pad_w = std::stoi(argv[20]);
const index_t in_right_pad_h = std::stoi(argv[21]);
const index_t in_right_pad_w = std::stoi(argv[22]);
const index_t YEff = (Y - 1) * conv_dilation_h + 1;
const index_t XEff = (X - 1) * conv_dilation_w + 1;
const index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
const index_t Ho_2 = Ho / 2;
const index_t Wo_2 = Wo / 2;
#else
// static mode
if(argc < 6)
{
printf("arg1 to 5: algo, do_verification, init_method, do_log, nrepeat\n");
exit(1);
}
const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::stoi(argv[1]));
const bool do_verification = std::stoi(argv[2]);
const int init_method = std::stoi(argv[3]);
const bool do_log = std::stoi(argv[4]);
const int nrepeat = std::stoi(argv[5]);
constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu;
#if 1
constexpr auto N = Number<1>{};
constexpr auto Hi = Number<1080>{};
constexpr auto Wi = Number<1920>{};
constexpr auto Y = Number<3>{};
constexpr auto X = Number<3>{};
constexpr auto C0 = Number<2>{};
constexpr auto C1 = Number<8>{};
constexpr auto K0 = Number<2>{};
constexpr auto K1 = Number<8>{};
#elif 0
constexpr auto N = Number<1>{};
constexpr auto Hi = Number<1080>{};
constexpr auto Wi = Number<1920>{};
constexpr auto Y = Number<3>{};
constexpr auto X = Number<3>{};
constexpr auto C0 = Number<3>{};
constexpr auto C1 = Number<4>{};
constexpr auto K0 = Number<2>{};
constexpr auto K1 = Number<8>{};
#elif 0
constexpr auto N = Number<1>{};
constexpr auto Hi = Number<540>{};
constexpr auto Wi = Number<960>{};
constexpr auto Y = Number<3>{};
constexpr auto X = Number<3>{};
constexpr auto C0 = Number<2>{};
constexpr auto C1 = Number<8>{};
constexpr auto K0 = Number<2>{};
constexpr auto K1 = Number<8>{};
#elif 0
constexpr auto N = Number<128>{};
constexpr auto Hi = Number<270>{};
constexpr auto Wi = Number<480>{};
constexpr auto Y = Number<3>{};
constexpr auto X = Number<3>{};
constexpr auto C0 = Number<2>{};
constexpr auto C1 = Number<8>{};
constexpr auto K0 = Number<2>{};
constexpr auto K1 = Number<8>{};
#endif
constexpr auto conv_stride_h = I1;
constexpr auto conv_stride_w = I1;
constexpr auto conv_dilation_h = I1;
constexpr auto conv_dilation_w = I1;
constexpr auto in_left_pad_h = I1;
constexpr auto in_left_pad_w = I1;
constexpr auto in_right_pad_h = I1;
constexpr auto in_right_pad_w = I1;
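// Same output-size arithmetic as the dynamic path, evaluated at compile time.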
constexpr auto YEff = (Y - I1) * conv_dilation_h + I1;
constexpr auto XEff = (X - I1) * conv_dilation_w + I1;
constexpr auto Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + I1;
constexpr auto Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + I1;
constexpr auto Ho_2 = Number<Ho / 2>{};
constexpr auto Wo_2 = Number<Wo / 2>{};
#endif
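// Data-type configuration: fp32, fp16 (enabled below), or int8 with int32 accumulation.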
#if 0
using in_data_t = float;
using acc_data_t = float;
using out_data_t = float;
#elif 1
using in_data_t = half_t;
using acc_data_t = float;
using out_data_t = half_t;
#elif 1
using in_data_t = int8_t;
using acc_data_t = int32_t;
using out_data_t = int8_t;
#endif
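// Host tensor descriptors. Layouts match the kernel naming: input is
// N-C0-H-W-C1, weights are K-C0-Y-X-C1 (with K = K0 * K1 flattened),
// conv and maxpool outputs are N-K0-H-W-K1, and bias is K0-K1.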
std::vector<std::size_t> in_lengths_host(5), wei_lengths_host(5), out_lengths_host(5),
max_lengths_host(5), bias_lengths_host(2);
in_lengths_host[0] = static_cast<std::size_t>(N);
in_lengths_host[1] = static_cast<std::size_t>(C0);
in_lengths_host[2] = static_cast<std::size_t>(Hi);
in_lengths_host[3] = static_cast<std::size_t>(Wi);
in_lengths_host[4] = static_cast<std::size_t>(C1);
wei_lengths_host[0] = static_cast<std::size_t>(K0 * K1);
wei_lengths_host[1] = static_cast<std::size_t>(C0);
wei_lengths_host[2] = static_cast<std::size_t>(Y);
wei_lengths_host[3] = static_cast<std::size_t>(X);
wei_lengths_host[4] = static_cast<std::size_t>(C1);
out_lengths_host[0] = static_cast<std::size_t>(N);
out_lengths_host[1] = static_cast<std::size_t>(K0);
out_lengths_host[2] = static_cast<std::size_t>(Ho);
out_lengths_host[3] = static_cast<std::size_t>(Wo);
out_lengths_host[4] = static_cast<std::size_t>(K1);
max_lengths_host[0] = static_cast<std::size_t>(N);
max_lengths_host[1] = static_cast<std::size_t>(K0);
max_lengths_host[2] = static_cast<std::size_t>(Ho_2);
max_lengths_host[3] = static_cast<std::size_t>(Wo_2);
max_lengths_host[4] = static_cast<std::size_t>(K1);
bias_lengths_host[0] = static_cast<std::size_t>(K0);
bias_lengths_host[1] = static_cast<std::size_t>(K1);
Tensor<in_data_t> in(in_lengths_host);
Tensor<in_data_t> wei(wei_lengths_host);
Tensor<out_data_t> bias(bias_lengths_host);
Tensor<out_data_t> out_device(out_lengths_host);
Tensor<out_data_t> out_host(out_lengths_host);
Tensor<out_data_t> max_device(max_lengths_host);
Tensor<out_data_t> max_host(max_lengths_host);
ostream_HostTensorDescriptor(in.mDesc, std::cout << "in: ");
ostream_HostTensorDescriptor(wei.mDesc, std::cout << "wei: ");
print_array("InLeftPads", make_tuple(in_left_pad_h, in_left_pad_w));
print_array("InRightPads", make_tuple(in_right_pad_h, in_right_pad_w));
print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));
std::size_t num_thread = std::thread::hardware_concurrency();
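// init_method selects the input/weight initialization: none, constants,
// small random integers, random floats, or a checkerboard pattern for debugging.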
switch(init_method)
{
case 0:
// no initialization
break;
case 1:
in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
break;
case 2:
in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
break;
case 3:
in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
break;
case 4:
in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
break;
case 5:
in.GenerateTensorValue(GeneratorTensor_3<float>{0.0, 1.0}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_3<float>{-0.5, 0.5}, num_thread);
break;
default:
in.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread);
auto gen_wei = [](auto... is) {
return GeneratorTensor_2{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...);
};
wei.GenerateTensorValue(gen_wei, num_thread);
}
bias.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
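// Pack the problem dimensions into tuples in the form the device entry point
// expects; in static mode these are Number<> constants, so the resulting
// tensor descriptors stay compile-time.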
auto f_make_for_device_nchwc = [&]() {
const auto in_lengths_dev = make_tuple(N, C0, Hi, Wi, C1);
const auto wei_lengths_dev = make_tuple(K0 * K1, C0, Y, X, C1);
const auto max_lengths_dev = make_tuple(N, K0, Ho_2, Wo_2, K1);
const auto out_lengths_dev = make_tuple(N, K0, Ho, Wo, K1);
const auto conv_strides_dev = make_tuple(conv_stride_h, conv_stride_w);
const auto conv_dilations_dev = make_tuple(conv_dilation_h, conv_dilation_w);
const auto in_left_pads_dev = make_tuple(in_left_pad_h, in_left_pad_w);
const auto in_right_pads_dev = make_tuple(in_right_pad_h, in_right_pad_w);
return make_tuple(in_lengths_dev,
wei_lengths_dev,
max_lengths_dev,
out_lengths_dev,
conv_strides_dev,
conv_dilations_dev,
in_left_pads_dev,
in_right_pads_dev);
};
#if USE_CONV_FWD_V5R1_NCHWC
if(algo == ConvForwardAlgo::V5R1NCHWC)
{
const auto tmp = f_make_for_device_nchwc();
device_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1<
in_data_t,
acc_data_t,
out_data_t,
activ_type>(tmp[I0], // in_lengths_dev
tmp[I1], // wei_lengths_dev
tmp[I2], // max_lengths_dev
tmp[I3], // out_lengths_dev
tmp[I4], // conv_strides_dev
tmp[I5], // conv_dilations_dev
tmp[I6], // in_left_pads_dev
tmp[I7], // in_right_pads_dev
in,
wei,
bias,
out_device,
max_device,
nrepeat);
}
#endif
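// Validate both fused outputs against the host references: the activated
// conv+bias result and the pooled result.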
if(do_verification)
{
host_direct_convolution_maxpool_nchwc(in,
wei,
bias,
out_host,
max_host,
make_tuple(conv_stride_h, conv_stride_w),
make_tuple(conv_dilation_h, conv_dilation_w),
make_tuple(in_left_pad_h, in_left_pad_w),
make_tuple(in_right_pad_h, in_right_pad_w),
activ_type);
check_error(out_host, out_device);
check_error(max_host, max_device);
if(do_log)
{
// LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl;
// LogRangeAsType<float>(std::cout << "wei: ", wei.mData, ",") << std::endl;
// LogRangeAsType<float>(std::cout << "out_device: ", out_device.mData, ",") <<
// std::endl;
LogRangeAsType<float>(std::cout << "max_host: ", max_host.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "max_device: ", max_device.mData, ",") << std::endl;
}
}
}