gaoqiong / composable_kernel / Commits / 5ed51b71

Commit 5ed51b71 authored Jul 26, 2021 by Jing Zhang
init commit of conv+add
parent b53926e9
Changes 11
Showing 11 changed files with 2672 additions and 122 deletions (+2672 -122)
composable_kernel/include/driver/driver_static_convolution_add_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw_outpad.hpp  +1139 -0
composable_kernel/include/driver/driver_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw_outpad.hpp  +9 -11
composable_kernel/include/tensor_operation/gridwise_static_gemm_v2.hpp  +6 -4
composable_kernel/include/tensor_operation/gridwise_static_gemm_v3.hpp  +476 -0
host/driver_offline/CMakeLists.txt  +6 -0
host/driver_offline/conv_activ_fwd_driver_offline.cpp  +341 -0
host/driver_offline/conv_add_fwd_driver_offline.cpp  +367 -0
host/driver_offline/conv_fwd_driver_offline.cpp  +33 -105
host/driver_offline/include/device_static_convolution_add_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp  +184 -0
host/host_tensor/include/host_conv.hpp  +109 -0
script/run.sh  +2 -2
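Taken together, the new driver and gridwise GEMM implement a fused forward convolution plus "resize-add": an implicit-GEMM convolution, an optional pointwise activation selected by activ_type (1 = ReLU, 2 = sigmoid, anything else leaves the value unchanged), and then an accumulation of each convolution output pixel into the matching 2x2 patch of a double-resolution "add" tensor, whose sum is what the kernel stores. As a reading aid, a minimal host-side sketch of that semantics follows; the function name naive_conv_add_nchw, the single stride/pad arguments, the absence of dilation, and the plain NCHW layout (no K0/K1 channel split) are simplifying assumptions for illustration, not part of this commit.

#include <cstddef>
#include <cmath>
#include <vector>

// Illustrative reference only (not from the commit):
// out2x[n][k][h][w] = add2x[n][k][h][w] + activ(conv(in, wei))[n][k][h/2][w/2]
// Layouts: in = NCHW, wei = KCYX, add_out = N x K x (2*Ho) x (2*Wo).
void naive_conv_add_nchw(const std::vector<float>& in,
                         const std::vector<float>& wei,
                         std::vector<float>& add_out, // holds "add" on entry, result on exit
                         std::size_t N, std::size_t C, std::size_t Hi, std::size_t Wi,
                         std::size_t K, std::size_t Y, std::size_t X,
                         std::size_t stride, std::size_t pad, int activ_type)
{
    const std::size_t Ho = (Hi + 2 * pad - Y) / stride + 1;
    const std::size_t Wo = (Wi + 2 * pad - X) / stride + 1;

    for(std::size_t n = 0; n < N; ++n)
        for(std::size_t k = 0; k < K; ++k)
            for(std::size_t ho = 0; ho < Ho; ++ho)
                for(std::size_t wo = 0; wo < Wo; ++wo)
                {
                    // implicit-GEMM reduction dimension E = C * Y * X
                    float acc = 0.f;
                    for(std::size_t c = 0; c < C; ++c)
                        for(std::size_t y = 0; y < Y; ++y)
                            for(std::size_t x = 0; x < X; ++x)
                            {
                                const std::ptrdiff_t hi = static_cast<std::ptrdiff_t>(ho * stride + y) -
                                                          static_cast<std::ptrdiff_t>(pad);
                                const std::ptrdiff_t wi = static_cast<std::ptrdiff_t>(wo * stride + x) -
                                                          static_cast<std::ptrdiff_t>(pad);

                                if(hi < 0 || wi < 0 || hi >= static_cast<std::ptrdiff_t>(Hi) ||
                                   wi >= static_cast<std::ptrdiff_t>(Wi))
                                    continue; // zero padding

                                acc += in[((n * C + c) * Hi + hi) * Wi + wi] *
                                       wei[((k * C + c) * Y + y) * X + x];
                            }

                    if(activ_type == 1)
                        acc = acc >= 0.f ? acc : 0.f; // ReLU
                    else if(activ_type == 2)
                        acc = 1.f / (1.f + std::exp(-acc)); // sigmoid

                    // resize-add: the conv output pixel feeds the 2x2 patch at (2*ho, 2*wo)
                    for(std::size_t dh = 0; dh < 2; ++dh)
                        for(std::size_t dw = 0; dw < 2; ++dw)
                        {
                            const std::size_t o =
                                ((n * K + k) * (2 * Ho) + (2 * ho + dh)) * (2 * Wo) + (2 * wo + dw);
                            add_out[o] += acc;
                        }
                }
}

The GPU path in gridwise_static_gemm_v3.hpp reaches the same result without materializing the GEMM matrices: it keeps the convolution accumulator in registers, applies the activation in place, loads the matching window of the add tensor, adds, and stores the double-resolution result.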
composable_kernel/include/driver/driver_static_convolution_add_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw_outpad.hpp  0 → 100644
#ifndef CK_DRIVER_STATIC_CONVOLUTION_ADD_FORWARD_IMPLICIT_GEMM_V5R1_NCHW_KCYX_NKHW_OUTPAD_HPP
#define CK_DRIVER_STATIC_CONVOLUTION_ADD_FORWARD_IMPLICIT_GEMM_V5R1_NCHW_KCYX_NKHW_OUTPAD_HPP

#include "common_header.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
#include "gridwise_static_gemm_v3.hpp"
#include "gridwise_operation_wrapper.hpp"

namespace ck {

template <index_t BlockSize,
          typename FloatAB,
          typename FloatAcc,
          typename FloatC,
          index_t KPerBlock,
          index_t HoPerBlock,
          index_t WoPerBlock,
          index_t EPerBlock,
          index_t KPerThread,
          index_t HoPerThread,
          index_t WoPerThread,
          index_t EPerThread,
          typename ABlockTransferThreadSliceLengths_E_K,
          typename ABlockTransferThreadClusterLengths_E_K,
          index_t ABlockTransferSrcScalarPerVector_E,
          index_t ABlockTransferDstScalarPerVector_K,
          index_t BThreadTransferSrcScalarPerVector_W,
          index_t CThreadTransferDstScalarPerVector_W>
struct DriverStaticConvolutionAddForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
{
    template <typename... Wei,
              typename... In,
              typename... Add,
              typename... Out,
              typename ConvStrides,
              typename ConvDilations,
              typename InLeftPads,
              typename InRightPads,
              index_t activ_type>
    __host__ void Run(const DynamicTensorDescriptor<Wei...>& wei_k_c_y_x_global_desc,
                      const DynamicTensorDescriptor<In...>& in_n_c_hi_wi_global_desc,
                      const DynamicTensorDescriptor<Add...>& add_n_k0_hox2_wox2_k1_global_desc,
                      const DynamicTensorDescriptor<Out...>& out_n_k0_ho_wo_k1_global_desc,
                      const ConvStrides& conv_strides,
                      const ConvDilations& conv_dilations,
                      const InLeftPads& in_left_pads,
                      const InRightPads& in_right_pads_,
                      Number<activ_type>,
                      const FloatAB* __restrict__ p_wei_global,
                      const FloatAB* __restrict__ p_in_global,
                      const FloatC* __restrict__ p_add_global,
                      FloatC* __restrict__ p_out_global) const
    {
        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};
        constexpr auto I2 = Number<2>{};
        constexpr auto I3 = Number<3>{};
        constexpr auto I4 = Number<4>{};

        const auto N_  = in_n_c_hi_wi_global_desc.GetLength(I0);
        const auto C_  = in_n_c_hi_wi_global_desc.GetLength(I1);
        const auto Hi_ = in_n_c_hi_wi_global_desc.GetLength(I2);
        const auto Wi_ = in_n_c_hi_wi_global_desc.GetLength(I3);

        const auto K0_ = out_n_k0_ho_wo_k1_global_desc.GetLength(I1);
        const auto Ho_ = out_n_k0_ho_wo_k1_global_desc.GetLength(I2);
        const auto Wo_ = out_n_k0_ho_wo_k1_global_desc.GetLength(I3);
        const auto K1_ = out_n_k0_ho_wo_k1_global_desc.GetLength(I4);

        const auto K_ = wei_k_c_y_x_global_desc.GetLength(I0);
        const auto Y_ = wei_k_c_y_x_global_desc.GetLength(I2);
        const auto X_ = wei_k_c_y_x_global_desc.GetLength(I3);

        constexpr auto N  = Number<N_>{};
        constexpr auto C  = Number<C_>{};
        constexpr auto K0 = Number<K0_>{};
        constexpr auto K1 = Number<K1_>{};
        constexpr auto Hi = Number<Hi_>{};
        constexpr auto Wi = Number<Wi_>{};
        constexpr auto Ho = Number<Ho_>{};
        constexpr auto Wo = Number<Wo_>{};

        constexpr auto Hox2 = Number<Ho * 2>{};
        constexpr auto Wox2 = Number<Wo * 2>{};

        constexpr auto K = Number<K_>{};
        constexpr auto Y = Number<Y_>{};
        constexpr auto X = Number<X_>{};

        const auto ConvStrideH_ = conv_strides[I0];
        const auto ConvStrideW_ = conv_strides[I1];

        const auto ConvDilationH_ = conv_dilations[I0];
        const auto ConvDilationW_ = conv_dilations[I1];

        constexpr auto ConvStrideH = Number<ConvStrideH_>{};
        constexpr auto ConvStrideW = Number<ConvStrideW_>{};

        constexpr auto ConvDilationH = Number<ConvDilationH_>{};
        constexpr auto ConvDilationW = Number<ConvDilationW_>{};

        constexpr auto Hop = Number<(Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock>{};
        constexpr auto Wop = Number<(Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock>{};

        constexpr auto OutRightPadH = Hop - Ho;
        constexpr auto OutRightPadW = Wop - Wo;

        const auto InLeftPadH_ = in_left_pads[I0];
        const auto InLeftPadW_ = in_left_pads[I1];

        constexpr auto InLeftPadH = Number<InLeftPadH_>{};
        constexpr auto InLeftPadW = Number<InLeftPadW_>{};

        constexpr auto in_right_pads = InRightPads{};

        const auto InRightPadH_ = in_right_pads[I0] + OutRightPadH * ConvStrideH;
        const auto InRightPadW_ = in_right_pads[I1] + OutRightPadW * ConvStrideW;

        constexpr auto InRightPadH = Number<InRightPadH_>{};
        constexpr auto InRightPadW = Number<InRightPadW_>{};

        std::cerr << "OutRightPadH = " << OutRightPadH << " OutRightPadW = " << OutRightPadW
                  << std::endl;
        std::cerr << "InRightPadH = " << InRightPadH << " InRightPadW = " << InRightPadW
                  << std::endl;

        // weight tensor
        const auto wei_e_k_global_desc = transform_dynamic_tensor_descriptor(
            make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(K, C * Y * X)),
            make_tuple(make_pass_through_transform(K), make_pass_through_transform(C * Y * X)),
            make_tuple(Sequence<0>{}, Sequence<1>{}),
            make_tuple(Sequence<1>{}, Sequence<0>{}));

        static_assert(wei_e_k_global_desc.IsKnownAtCompileTime(),
                      "wrong! wei_e_k_global_desc need to known at compile-time");

        // input tensor
        const auto in_n_c_hip_wip_global_desc = transform_dynamic_tensor_descriptor(
            in_n_c_hi_wi_global_desc,
            make_tuple(make_pass_through_transform(N),
                       make_pass_through_transform(C),
                       make_pad_transform(Hi, InLeftPadH, InRightPadH),
                       make_pad_transform(Wi, InLeftPadW, InRightPadW)),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));

        static_assert(in_n_c_hip_wip_global_desc.IsKnownAtCompileTime(),
                      "wrong! in_n_c_hip_wip_global_desc need to known at compile-time");

        const auto in_n_c_y_ho_x_wo_global_desc = transform_dynamic_tensor_descriptor(
            in_n_c_hip_wip_global_desc,
            make_tuple(
                make_pass_through_transform(N),
                make_pass_through_transform(C),
                make_embed_transform(make_tuple(Y, Hop), make_tuple(ConvDilationH, ConvStrideH)),
                make_embed_transform(make_tuple(X, Wop), make_tuple(ConvDilationW, ConvStrideW))),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));

        static_assert(in_n_c_y_ho_x_wo_global_desc.IsKnownAtCompileTime(),
                      "wrong! in_n_c_y_ho_x_wo_global_desc need to known at compile-time");

        const auto in_e_n_ho_wo_global_desc = transform_dynamic_tensor_descriptor(
            in_n_c_y_ho_x_wo_global_desc,
            make_tuple(make_merge_transform(make_tuple(C, Y, X)),
                       make_pass_through_transform(N),
                       make_pass_through_transform(Hop),
                       make_pass_through_transform(Wop)),
            make_tuple(Sequence<1, 2, 4>{}, Sequence<0>{}, Sequence<3>{}, Sequence<5>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));

        static_assert(in_e_n_ho_wo_global_desc.IsKnownAtCompileTime(),
                      "wrong! in_e_n_ho_wo_global_desc need to known at compile-time");

        // output tensor
        const auto out_k_n_hop_wop_global_desc = transform_dynamic_tensor_descriptor(
            make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, K0, Ho, Wo, K1)),
            make_tuple(make_merge_transform(make_tuple(K0, K1)),
                       make_pass_through_transform(N),
                       make_right_pad_transform(Ho, OutRightPadH),
                       make_right_pad_transform(Wo, OutRightPadW)),
            make_tuple(Sequence<1, 4>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));

        static_assert(out_k_n_hop_wop_global_desc.IsKnownAtCompileTime(),
                      "wrong! out_k_n_hop_wop_global_desc need to known at compile-time");

        // add tensor
        const auto add_k_n_hopx2_wopx2_global_desc = transform_dynamic_tensor_descriptor(
            make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, K0, Hox2, Wox2)),
            make_tuple(make_pass_through_transform(K0),
                       make_pass_through_transform(N),
                       make_pass_through_transform(Hox2),
                       make_pass_through_transform(Wox2)),
            make_tuple(Sequence<1>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));

        static_assert(add_k_n_hopx2_wopx2_global_desc.IsKnownAtCompileTime(),
                      "wrong! add_k_n_hopx2_wopx2_global_desc need to known at compile-time");

        const auto E = C * Y * X;

        std::cerr << "Hop = " << Hop << " Wop = " << Wop << std::endl;

        if(!((K % KPerBlock) == 0 && (Hop % HoPerBlock) == 0 && (Wop % WoPerBlock) == 0 &&
             (E % EPerBlock) == 0))
        {
            throw std::runtime_error("wrong! GEMM size no divisible");
        }

        // hack to control index calculation when iterating over a_k_m_global tensor
        constexpr auto a_e_k_global_iterator_hacks =
            make_tuple(make_tuple(Sequence<0, 0, 0>{}, Sequence<0, 0, 0>{}),
                       make_tuple(Sequence<0, 0, 0>{}, Sequence<0, 0, 0>{}));

        constexpr auto a_e_k_global_move_slice_window_iterator_hack = Sequence<0, 0, 0>{};

        constexpr auto b_e_n_ho_wo_global_iterator_hacks =
            make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}),
                       make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}));

        constexpr auto b_e_n_ho_wo_global_move_slice_window_iterator_hack =
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{};

        // hack to control index calculation when iterating over c_m0_m1_n0_n1_global tensor
        // hack for NKHW format
        constexpr auto c_k_n_ho_wo_global_tensor_iterator_hacks =
            make_tuple(make_tuple(Sequence<0, 1, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0>{}),
                       make_tuple(Sequence<0, 2, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0>{}));

        // GEMM
        using gridwise_gemm = GridwiseStaticGemm_km_kn_mn_v3<
            BlockSize,
            FloatAB,
            FloatAcc,
            FloatC,
            InMemoryDataOperation::Set,
            decltype(wei_e_k_global_desc),
            decltype(in_e_n_ho_wo_global_desc),
            decltype(add_k_n_hopx2_wopx2_global_desc),
            decltype(out_k_n_hop_wop_global_desc),
            KPerBlock,
            HoPerBlock,
            WoPerBlock,
            EPerBlock,
            KPerThread,
            HoPerThread,
            WoPerThread,
            EPerThread,
            ABlockTransferThreadSliceLengths_E_K,
            ABlockTransferThreadClusterLengths_E_K,
            Sequence<1, 0>,
            Sequence<1, 0>,
            0,
            ABlockTransferSrcScalarPerVector_E,
            ABlockTransferDstScalarPerVector_K,
            false, // don't move back src coordinate after threadwise copy
            Sequence<0, 2, 3, 1>,
            3,
            BThreadTransferSrcScalarPerVector_W,
            false, // don't move back src coordinate after threadwise copy, which will be fused with
                   // MoveSrcSliceWindow() to save addr computation
            Sequence<0, 2, 3, 1>,
            0,
            CThreadTransferDstScalarPerVector_W,
            decltype(a_e_k_global_iterator_hacks),
            decltype(b_e_n_ho_wo_global_iterator_hacks),
            decltype(c_k_n_ho_wo_global_tensor_iterator_hacks),
            decltype(a_e_k_global_move_slice_window_iterator_hack),
            decltype(b_e_n_ho_wo_global_move_slice_window_iterator_hack)>;

        const auto GridSize = (K / KPerBlock) * (Hop / HoPerBlock) * (Wop / WoPerBlock) * N;

        constexpr bool has_main_k_block_loop = (E + EPerBlock) / (2 * EPerBlock) > 1;

        constexpr bool has_double_tail_k_block_loop = (E / EPerBlock) % 2 == 0;

        index_t nrepeat = 100;

        std::cout << "NCHWc" << K1 << "_n" << N << "c" << C << "h" << Hi << "w" << Wi << "-k" << K
                  << "c" << C << "y" << Y << "x" << X << "-u" << conv_strides[I0] << "v"
                  << conv_strides[I1] << "l" << conv_dilations[I0] << "j" << conv_dilations[I1]
                  << "q" << in_left_pads[I0] << "p" << in_right_pads[I0] << std::endl;

        std::cout << "GridSize = " << GridSize << " BlockSize = " << BlockSize << std::endl;

        for(index_t i = 0; i < 5; ++i)
        {
            std::cout << "Start running " << nrepeat << " times..." << std::endl;

            KernelTimer timer;
            timer.Start();

            std::cout << "has_main_k_block_loop: " << has_main_k_block_loop
                      << " has_double_tail_k_block_loop: " << has_double_tail_k_block_loop
                      << std::endl;

            for(index_t j = 0; j < nrepeat; ++j)
            {
                if constexpr(has_main_k_block_loop && has_double_tail_k_block_loop)
                {
                    const auto kernel = run_gridwise_operation<gridwise_gemm,
                                                               const FloatAB*,
                                                               const FloatAB*,
                                                               const FloatC*,
                                                               FloatC*,
                                                               Number<activ_type>,
                                                               integral_constant<bool, true>,
                                                               integral_constant<bool, true>>;

                    launch_kernel(kernel,
                                  dim3(GridSize),
                                  dim3(BlockSize),
                                  0,
                                  0,
                                  p_wei_global,
                                  p_in_global,
                                  p_add_global,
                                  p_out_global,
                                  Number<activ_type>{},
                                  integral_constant<bool, true>{},
                                  integral_constant<bool, true>{});
                }
                else if constexpr(has_main_k_block_loop && !has_double_tail_k_block_loop)
                {
                    const auto kernel = run_gridwise_operation<gridwise_gemm,
                                                               const FloatAB*,
                                                               const FloatAB*,
                                                               const FloatC*,
                                                               FloatC*,
                                                               Number<activ_type>,
                                                               integral_constant<bool, true>,
                                                               integral_constant<bool, false>>;

                    launch_kernel(kernel,
                                  dim3(GridSize),
                                  dim3(BlockSize),
                                  0,
                                  0,
                                  p_wei_global,
                                  p_in_global,
                                  p_add_global,
                                  p_out_global,
                                  Number<activ_type>{},
                                  integral_constant<bool, true>{},
                                  integral_constant<bool, false>{});
                }
                else if constexpr(!has_main_k_block_loop && has_double_tail_k_block_loop)
                {
                    const auto kernel = run_gridwise_operation<gridwise_gemm,
                                                               const FloatAB*,
                                                               const FloatAB*,
                                                               const FloatC*,
                                                               FloatC*,
                                                               Number<activ_type>,
                                                               integral_constant<bool, false>,
                                                               integral_constant<bool, true>>;

                    launch_kernel(kernel,
                                  dim3(GridSize),
                                  dim3(BlockSize),
                                  0,
                                  0,
                                  p_wei_global,
                                  p_in_global,
                                  p_add_global,
                                  p_out_global,
                                  Number<activ_type>{},
                                  integral_constant<bool, false>{},
                                  integral_constant<bool, true>{});
                }
                else
                {
                    const auto kernel = run_gridwise_operation<gridwise_gemm,
                                                               const FloatAB*,
                                                               const FloatAB*,
                                                               const FloatC*,
                                                               FloatC*,
                                                               Number<activ_type>,
                                                               integral_constant<bool, false>,
                                                               integral_constant<bool, false>>;

                    launch_kernel(kernel,
                                  dim3(GridSize),
                                  dim3(BlockSize),
                                  0,
                                  0,
                                  p_wei_global,
                                  p_in_global,
                                  p_add_global,
                                  p_out_global,
                                  Number<activ_type>{},
                                  integral_constant<bool, false>{},
                                  integral_constant<bool, false>{});
                }
            }

            timer.End();

            float ave_time = timer.GetElapsedTime() / nrepeat;

            float perf = (float)calculate_convolution_flops(in_n_c_hi_wi_global_desc,
                                                            wei_k_c_y_x_global_desc,
                                                            out_n_k0_ho_wo_k1_global_desc) /
                         (std::size_t(1000) * 1000 * 1000) / ave_time;

            std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
                      << std::endl;
        }
    }
};
#if 0
template <index_t BlockSize,
typename FloatAB,
typename FloatAcc,
typename FloatC,
index_t KPerBlock,
index_t HoPerBlock,
index_t WoPerBlock,
index_t EPerBlock,
index_t KPerThread,
index_t HoPerThread,
index_t WoPerThread,
index_t EPerThread,
typename ABlockTransferThreadSliceLengths_E_K,
typename ABlockTransferThreadClusterLengths_E_K,
index_t ABlockTransferSrcScalarPerVector_E,
index_t ABlockTransferDstScalarPerVector_K,
index_t BThreadTransferSrcScalarPerVector_W,
index_t CThreadTransferDstScalarPerVector_W>
struct DriverStaticConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad_1x1
{
template <typename... Wei,
typename... In,
typename... Out,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
__host__ void Run(const DynamicTensorDescriptor<Wei...>& wei_k_c_y_x_global_desc,
const DynamicTensorDescriptor<In...>& in_n_c_hi_wi_global_desc,
const DynamicTensorDescriptor<Out...>& out_n_k0_ho_wo_k1_global_desc,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads& in_right_pads_,
const FloatAB* __restrict__ p_wei_global,
const FloatAB* __restrict__ p_in_global,
FloatC* __restrict__ p_out_global) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto I4 = Number<4>{};
const auto N_ = in_n_c_hi_wi_global_desc.GetLength(I0);
const auto C_ = in_n_c_hi_wi_global_desc.GetLength(I1);
const auto Hi_ = in_n_c_hi_wi_global_desc.GetLength(I2);
const auto Wi_ = in_n_c_hi_wi_global_desc.GetLength(I3);
const auto K0_ = out_n_k0_ho_wo_k1_global_desc.GetLength(I1);
const auto Ho_ = out_n_k0_ho_wo_k1_global_desc.GetLength(I2);
const auto Wo_ = out_n_k0_ho_wo_k1_global_desc.GetLength(I3);
const auto K1_ = out_n_k0_ho_wo_k1_global_desc.GetLength(I4);
const auto K_ = wei_k_c_y_x_global_desc.GetLength(I0);
const auto Y_ = wei_k_c_y_x_global_desc.GetLength(I2);
const auto X_ = wei_k_c_y_x_global_desc.GetLength(I3);
constexpr auto N = Number<N_>{};
constexpr auto C = Number<C_>{};
constexpr auto K0 = Number<K0_>{};
constexpr auto K1 = Number<K1_>{};
constexpr auto Hi = Number<Hi_>{};
constexpr auto Wi = Number<Wi_>{};
constexpr auto Ho = Number<Ho_>{};
constexpr auto Wo = Number<Wo_>{};
constexpr auto K = Number<K_>{};
constexpr auto Y = Number<Y_>{};
constexpr auto X = Number<X_>{};
const auto ConvStrideH_ = conv_strides[I0];
const auto ConvStrideW_ = conv_strides[I1];
const auto ConvDilationH_ = conv_dilations[I0];
const auto ConvDilationW_ = conv_dilations[I1];
constexpr auto ConvStrideH = Number<ConvStrideH_>{};
constexpr auto ConvStrideW = Number<ConvStrideW_>{};
constexpr auto ConvDilationH = Number<ConvDilationH_>{};
constexpr auto ConvDilationW = Number<ConvDilationW_>{};
constexpr auto Hop = Number<(Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock>{};
constexpr auto Wop = Number<(Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock>{};
constexpr auto OutRightPadH = Hop - Ho;
constexpr auto OutRightPadW = Wop - Wo;
const auto InLeftPadH_ = in_left_pads[I0];
const auto InLeftPadW_ = in_left_pads[I1];
constexpr auto InLeftPadH = Number<InLeftPadH_>{};
constexpr auto InLeftPadW = Number<InLeftPadW_>{};
static_assert(InLeftPadH == 0 and InLeftPadW == 0, "");
constexpr auto in_right_pads = InRightPads{};
const auto InRightPadH_ = in_right_pads[I0] + OutRightPadH * ConvStrideH;
const auto InRightPadW_ = in_right_pads[I1] + OutRightPadW * ConvStrideW;
constexpr auto InRightPadH = Number<InRightPadH_>{};
constexpr auto InRightPadW = Number<InRightPadW_>{};
std::cerr << "OutRightPadH = " << OutRightPadH << " OutRightPadW = " << OutRightPadW
<< std::endl;
std::cerr << "InRightPadH = " << InRightPadH << " InRightPadW = " << InRightPadW
<< std::endl;
// weight tensor
const auto wei_e_k_global_desc = transform_dynamic_tensor_descriptor(
make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(K, C)),
make_tuple(make_pass_through_transform(K), make_pass_through_transform(C)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<1>{}, Sequence<0>{}));
static_assert(wei_e_k_global_desc.IsKnownAtCompileTime(),
"wrong! wei_e_k_global_desc need to known at compile-time");
// input tensor
const auto in_e_n_ho_wo_global_desc = transform_dynamic_tensor_descriptor(
in_n_c_hi_wi_global_desc,
make_tuple(make_pass_through_transform(C),
make_pass_through_transform(N),
make_right_pad_transform(Hi, InRightPadH),
make_right_pad_transform(Wi, InRightPadW)),
make_tuple(Sequence<1>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
static_assert(in_e_n_ho_wo_global_desc.IsKnownAtCompileTime(),
"wrong! in_e_n_ho_wo_global_desc need to known at compile-time");
// output tensor
const auto out_k_n_hop_wop_global_desc = transform_dynamic_tensor_descriptor(
make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, K0, Ho, Wo, K1)),
make_tuple(make_merge_transform(make_tuple(K0, K1)),
make_pass_through_transform(N),
make_right_pad_transform(Ho, OutRightPadH),
make_right_pad_transform(Wo, OutRightPadW)),
make_tuple(Sequence<1, 4>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
static_assert(out_k_n_hop_wop_global_desc.IsKnownAtCompileTime(),
"wrong! out_k_n_hop_wop_global_desc need to known at compile-time");
const auto E = C;
std::cerr << "Hop = " << Hop << " Wop = " << Wop << std::endl;
if(!((K % KPerBlock) == 0 && (Hop % HoPerBlock) == 0 && (Wop % WoPerBlock) == 0 &&
(E % EPerBlock) == 0))
{
throw std::runtime_error("wrong! GEMM size no divisible");
}
// hack to control index calculation when iterating over a_k_m_global tensor
constexpr auto a_e_k_global_iterator_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0>{}, Sequence<0, 0, 0>{}),
make_tuple(Sequence<0, 0, 0>{}, Sequence<0, 0, 0>{}));
constexpr auto a_e_k_global_move_slice_window_iterator_hack = Sequence<0, 0, 0>{};
constexpr auto b_e_n_ho_wo_global_iterator_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}));
constexpr auto b_e_n_ho_wo_global_move_slice_window_iterator_hack =
Sequence<0, 0, 0, 0, 0>{};
// hack to control index calculation when iterating over c_m0_m1_n0_n1_global tensor
// hack for NKHW format
constexpr auto c_k_n_ho_wo_global_tensor_iterator_hacks =
make_tuple(make_tuple(Sequence<0, 1, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}),
make_tuple(Sequence<0, 2, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}));
// GEMM
using gridwise_gemm = GridwiseStaticGemm_km_kn_mn_v3<
BlockSize,
FloatAB,
FloatAcc,
FloatC,
InMemoryDataOperation::Set,
decltype(wei_e_k_global_desc),
decltype(in_e_n_ho_wo_global_desc),
decltype(out_k_n_hop_wop_global_desc),
KPerBlock,
HoPerBlock,
WoPerBlock,
EPerBlock,
KPerThread,
HoPerThread,
WoPerThread,
EPerThread,
ABlockTransferThreadSliceLengths_E_K,
ABlockTransferThreadClusterLengths_E_K,
Sequence<1, 0>,
Sequence<1, 0>,
0,
ABlockTransferSrcScalarPerVector_E,
ABlockTransferDstScalarPerVector_K,
false, // don't move back src coordinate after threadwise copy
Sequence<0, 2, 3, 1>,
3,
BThreadTransferSrcScalarPerVector_W,
false, // don't move back src coordinate after threadwise copy, which will be fused with
// MoveSrcSliceWindow() to save addr computation
Sequence<0, 2, 3, 1>,
0,
CThreadTransferDstScalarPerVector_W,
decltype(a_e_k_global_iterator_hacks),
decltype(b_e_n_ho_wo_global_iterator_hacks),
decltype(c_k_n_ho_wo_global_tensor_iterator_hacks),
decltype(a_e_k_global_move_slice_window_iterator_hack),
decltype(b_e_n_ho_wo_global_move_slice_window_iterator_hack)>;
const auto GridSize = (K / KPerBlock) * (Hop / HoPerBlock) * (Wop / WoPerBlock) * N;
constexpr bool has_main_k_block_loop = (E + EPerBlock) / (2 * EPerBlock) > 1;
constexpr bool has_double_tail_k_block_loop = (E / EPerBlock) % 2 == 0;
index_t nrepeat = 100;
std::cout << "conv_v5r1__NCHWc" << K1 << "_n" << N << "c" << C << "h" << Hi << "w" << Wi
<< "-k" << K << "c" << C << "y" << Y << "x" << X << "-u" << conv_strides[I0]
<< "v" << conv_strides[I1] << "l" << conv_dilations[I0] << "j"
<< conv_dilations[I1] << "q" << in_left_pads[I0] << "p" << in_right_pads[I0]
<< std::endl;
std::cout << "GridSize = " << GridSize << " BlockSize = " << BlockSize << std::endl;
for(index_t i = 0; i < 5; ++i)
{
std::cout << "Start running " << nrepeat << " times..." << std::endl;
KernelTimer timer;
timer.Start();
std::cout << "has_main_k_block_loop: " << has_main_k_block_loop
<< " has_double_tail_k_block_loop: " << has_double_tail_k_block_loop
<< std::endl;
for(index_t j = 0; j < nrepeat; ++j)
{
if constexpr(has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel = run_gridwise_operation<gridwise_gemm,
const FloatAB*,
const FloatAB*,
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, true>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
p_wei_global,
p_in_global,
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, true>{});
}
else if constexpr(has_main_k_block_loop && !has_double_tail_k_block_loop)
{
const auto kernel = run_gridwise_operation<gridwise_gemm,
const FloatAB*,
const FloatAB*,
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, false>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
p_wei_global,
p_in_global,
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, false>{});
}
else if constexpr(!has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel = run_gridwise_operation<gridwise_gemm,
const FloatAB*,
const FloatAB*,
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, true>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
p_wei_global,
p_in_global,
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, true>{});
}
else
{
const auto kernel = run_gridwise_operation<gridwise_gemm,
const FloatAB*,
const FloatAB*,
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, false>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
p_wei_global,
p_in_global,
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, false>{});
}
}
timer.End();
float ave_time = timer.GetElapsedTime() / nrepeat;
float perf = (float)calculate_convolution_flops(in_n_c_hi_wi_global_desc,
wei_k_c_y_x_global_desc,
out_n_k0_ho_wo_k1_global_desc) /
(std::size_t(1000) * 1000 * 1000) / ave_time;
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
<< std::endl;
}
}
};
template <index_t BlockSize,
typename FloatAB,
typename FloatAcc,
typename FloatC,
index_t KPerBlock,
index_t HoPerBlock,
index_t WoPerBlock,
index_t EPerBlock,
index_t KPerThread,
index_t HoPerThread,
index_t WoPerThread,
index_t EPerThread,
typename ABlockTransferThreadSliceLengths_E_K,
typename ABlockTransferThreadClusterLengths_E_K,
index_t ABlockTransferSrcScalarPerVector_E,
index_t ABlockTransferDstScalarPerVector_K,
index_t BThreadTransferSrcScalarPerVector_W,
index_t CThreadTransferDstScalarPerVector_W>
struct DriverStaticConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_1x1
{
template <typename... Wei,
typename... In,
typename... Out,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
__host__ void Run(const DynamicTensorDescriptor<Wei...>& wei_k_c_y_x_global_desc,
const DynamicTensorDescriptor<In...>& in_n_c_hi_wi_global_desc,
const DynamicTensorDescriptor<Out...>& out_n_k0_ho_wo_k1_global_desc,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads& in_right_pads_,
const FloatAB* __restrict__ p_wei_global,
const FloatAB* __restrict__ p_in_global,
FloatC* __restrict__ p_out_global) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto I4 = Number<4>{};
const auto N_ = in_n_c_hi_wi_global_desc.GetLength(I0);
const auto C_ = in_n_c_hi_wi_global_desc.GetLength(I1);
const auto Hi_ = in_n_c_hi_wi_global_desc.GetLength(I2);
const auto Wi_ = in_n_c_hi_wi_global_desc.GetLength(I3);
const auto K0_ = out_n_k0_ho_wo_k1_global_desc.GetLength(I1);
const auto Ho_ = out_n_k0_ho_wo_k1_global_desc.GetLength(I2);
const auto Wo_ = out_n_k0_ho_wo_k1_global_desc.GetLength(I3);
const auto K1_ = out_n_k0_ho_wo_k1_global_desc.GetLength(I4);
const auto K_ = wei_k_c_y_x_global_desc.GetLength(I0);
const auto Y_ = wei_k_c_y_x_global_desc.GetLength(I2);
const auto X_ = wei_k_c_y_x_global_desc.GetLength(I3);
constexpr auto N = Number<N_>{};
constexpr auto C = Number<C_>{};
constexpr auto K0 = Number<K0_>{};
constexpr auto K1 = Number<K1_>{};
constexpr auto Hi = Number<Hi_>{};
constexpr auto Wi = Number<Wi_>{};
constexpr auto Ho = Number<Ho_>{};
constexpr auto Wo = Number<Wo_>{};
constexpr auto K = Number<K_>{};
constexpr auto Y = Number<Y_>{};
constexpr auto X = Number<X_>{};
const auto ConvStrideH_ = conv_strides[I0];
const auto ConvStrideW_ = conv_strides[I1];
const auto ConvDilationH_ = conv_dilations[I0];
const auto ConvDilationW_ = conv_dilations[I1];
constexpr auto ConvStrideH = Number<ConvStrideH_>{};
constexpr auto ConvStrideW = Number<ConvStrideW_>{};
constexpr auto ConvDilationH = Number<ConvDilationH_>{};
constexpr auto ConvDilationW = Number<ConvDilationW_>{};
constexpr auto Hop = Number<(Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock>{};
constexpr auto Wop = Number<(Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock>{};
constexpr auto OutRightPadH = Hop - Ho;
constexpr auto OutRightPadW = Wop - Wo;
const auto InLeftPadH_ = in_left_pads[I0];
const auto InLeftPadW_ = in_left_pads[I1];
constexpr auto InLeftPadH = Number<InLeftPadH_>{};
constexpr auto InLeftPadW = Number<InLeftPadW_>{};
static_assert(InLeftPadH == 0 and InLeftPadW == 0, "");
constexpr auto in_right_pads = InRightPads{};
const auto InRightPadH_ = in_right_pads[I0] + OutRightPadH * ConvStrideH;
const auto InRightPadW_ = in_right_pads[I1] + OutRightPadW * ConvStrideW;
constexpr auto InRightPadH = Number<InRightPadH_>{};
constexpr auto InRightPadW = Number<InRightPadW_>{};
static_assert(OutRightPadW == 0 and OutRightPadH == 0, "");
// weight tensor
const auto wei_e_k_global_desc = transform_dynamic_tensor_descriptor(
make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(K, C)),
make_tuple(make_pass_through_transform(K), make_pass_through_transform(C)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<1>{}, Sequence<0>{}));
static_assert(wei_e_k_global_desc.IsKnownAtCompileTime(),
"wrong! wei_e_k_global_desc need to known at compile-time");
// input tensor
const auto in_e_n_ho_wo_global_desc = transform_dynamic_tensor_descriptor(
in_n_c_hi_wi_global_desc,
make_tuple(make_pass_through_transform(C),
make_pass_through_transform(N),
make_pass_through_transform(Ho),
make_pass_through_transform(Wo)),
make_tuple(Sequence<1>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
static_assert(in_e_n_ho_wo_global_desc.IsKnownAtCompileTime(),
"wrong! in_e_n_ho_wo_global_desc need to known at compile-time");
// output tensor
const auto out_k_n_hop_wop_global_desc = transform_dynamic_tensor_descriptor(
make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, K0, Ho, Wo, K1)),
make_tuple(make_merge_transform(make_tuple(K0, K1)),
make_pass_through_transform(N),
make_pass_through_transform(Ho),
make_pass_through_transform(Wo)),
make_tuple(Sequence<1, 4>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
static_assert(out_k_n_hop_wop_global_desc.IsKnownAtCompileTime(),
"wrong! out_k_n_hop_wop_global_desc need to known at compile-time");
const auto E = C;
std::cerr << "Hop = " << Hop << " Wop = " << Wop << std::endl;
if(!((K % KPerBlock) == 0 && (Hop % HoPerBlock) == 0 && (Wop % WoPerBlock) == 0 &&
(E % EPerBlock) == 0))
{
throw std::runtime_error("wrong! GEMM size no divisible");
}
// hack to control index calculation when iterating over a_k_m_global tensor
constexpr auto a_e_k_global_iterator_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0>{}, Sequence<0, 0, 0>{}),
make_tuple(Sequence<0, 0, 0>{}, Sequence<0, 0, 0>{}));
constexpr auto a_e_k_global_move_slice_window_iterator_hack = Sequence<0, 0, 0>{};
constexpr auto b_e_n_ho_wo_global_iterator_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}));
constexpr auto b_e_n_ho_wo_global_move_slice_window_iterator_hack =
Sequence<0, 0, 0, 0, 0>{};
// hack to control index calculation when iterating over c_m0_m1_n0_n1_global tensor
// hack for NKHW format
constexpr auto c_k_n_ho_wo_global_tensor_iterator_hacks =
make_tuple(make_tuple(Sequence<0, 1, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}),
make_tuple(Sequence<0, 2, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}));
// GEMM
using gridwise_gemm = GridwiseStaticGemm_km_kn_mn_v3<
BlockSize,
FloatAB,
FloatAcc,
FloatC,
InMemoryDataOperation::Set,
decltype(wei_e_k_global_desc),
decltype(in_e_n_ho_wo_global_desc),
decltype(out_k_n_hop_wop_global_desc),
KPerBlock,
HoPerBlock,
WoPerBlock,
EPerBlock,
KPerThread,
HoPerThread,
WoPerThread,
EPerThread,
ABlockTransferThreadSliceLengths_E_K,
ABlockTransferThreadClusterLengths_E_K,
Sequence<1, 0>,
Sequence<1, 0>,
0,
ABlockTransferSrcScalarPerVector_E,
ABlockTransferDstScalarPerVector_K,
false, // don't move back src coordinate after threadwise copy
Sequence<0, 2, 3, 1>,
3,
BThreadTransferSrcScalarPerVector_W,
false, // don't move back src coordinate after threadwise copy, which will be fused with
// MoveSrcSliceWindow() to save addr computation
Sequence<0, 2, 3, 1>,
0,
CThreadTransferDstScalarPerVector_W,
decltype(a_e_k_global_iterator_hacks),
decltype(b_e_n_ho_wo_global_iterator_hacks),
decltype(c_k_n_ho_wo_global_tensor_iterator_hacks),
decltype(a_e_k_global_move_slice_window_iterator_hack),
decltype(b_e_n_ho_wo_global_move_slice_window_iterator_hack)>;
const auto GridSize = (K / KPerBlock) * (Hop / HoPerBlock) * (Wop / WoPerBlock) * N;
constexpr bool has_main_k_block_loop = (E + EPerBlock) / (2 * EPerBlock) > 1;
constexpr bool has_double_tail_k_block_loop = (E / EPerBlock) % 2 == 0;
index_t nrepeat = 100;
std::cout << "NCHWc" << K1 << "_n" << N << "c" << C << "h" << Hi << "w" << Wi << "-k" << K
<< "c" << C << "y" << Y << "x" << X << "-u" << conv_strides[I0] << "v"
<< conv_strides[I1] << "l" << conv_dilations[I0] << "j" << conv_dilations[I1]
<< "q" << in_left_pads[I0] << "p" << in_right_pads[I0] << std::endl;
std::cout << "GridSize = " << GridSize << " BlockSize = " << BlockSize << std::endl;
for(index_t i = 0; i < 5; ++i)
{
std::cout << "Start running " << nrepeat << " times..." << std::endl;
KernelTimer timer;
timer.Start();
std::cout << "has_main_k_block_loop: " << has_main_k_block_loop
<< " has_double_tail_k_block_loop: " << has_double_tail_k_block_loop
<< std::endl;
for(index_t j = 0; j < nrepeat; ++j)
{
if constexpr(has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel = run_gridwise_operation<gridwise_gemm,
const FloatAB*,
const FloatAB*,
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, true>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
p_wei_global,
p_in_global,
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, true>{});
}
else if constexpr(has_main_k_block_loop && !has_double_tail_k_block_loop)
{
const auto kernel = run_gridwise_operation<gridwise_gemm,
const FloatAB*,
const FloatAB*,
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, false>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
p_wei_global,
p_in_global,
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, false>{});
}
else if constexpr(!has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel = run_gridwise_operation<gridwise_gemm,
const FloatAB*,
const FloatAB*,
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, true>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
p_wei_global,
p_in_global,
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, true>{});
}
else
{
const auto kernel = run_gridwise_operation<gridwise_gemm,
const FloatAB*,
const FloatAB*,
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, false>>;
launch_kernel(kernel,
dim3(GridSize),
dim3(BlockSize),
0,
0,
p_wei_global,
p_in_global,
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, false>{});
}
}
timer.End();
float ave_time = timer.GetElapsedTime() / nrepeat;
float perf = (float)calculate_convolution_flops(in_n_c_hi_wi_global_desc,
wei_k_c_y_x_global_desc,
out_n_k0_ho_wo_k1_global_desc) /
(std::size_t(1000) * 1000 * 1000) / ave_time;
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
<< std::endl;
}
}
};
#endif
} // namespace ck
#endif
composable_kernel/include/driver/driver_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw_outpad.hpp
@@ -226,7 +226,7 @@ struct DriverStaticConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
                                   Sequence<0, 0, 0, 0, 0>{}));
 
         // GEMM
-        using gridwise_gemm = GridwiseStaticGemm_km_kn_mn_v3<
+        using gridwise_gemm = GridwiseStaticGemm_km_kn_mn_v2<
             BlockSize,
             FloatAB,
             FloatAcc,
@@ -273,11 +273,10 @@ struct DriverStaticConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
         index_t nrepeat = 100;
 
-        std::cout << "conv_v5r1__NCHWc" << K1 << "_n" << N << "c" << C << "h" << Hi << "w" << Wi
-                  << "-k" << K << "c" << C << "y" << Y << "x" << X << "-u" << conv_strides[I0]
-                  << "v" << conv_strides[I1] << "l" << conv_dilations[I0] << "j"
-                  << conv_dilations[I1] << "q" << in_left_pads[I0] << "p" << in_right_pads[I0]
-                  << std::endl;
+        std::cout << "NCHWc" << K1 << "_n" << N << "c" << C << "h" << Hi << "w" << Wi << "-k" << K
+                  << "c" << C << "y" << Y << "x" << X << "-u" << conv_strides[I0] << "v"
+                  << conv_strides[I1] << "l" << conv_dilations[I0] << "j" << conv_dilations[I1]
+                  << "q" << in_left_pads[I0] << "p" << in_right_pads[I0] << std::endl;
 
         std::cout << "GridSize = " << GridSize << " BlockSize = " << BlockSize << std::endl;
@@ -990,11 +989,10 @@ struct DriverStaticConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_1x1
         index_t nrepeat = 100;
 
-        std::cout << "conv_v5r1_NCHWc" << K1 << "_n" << N << "c" << C << "h" << Hi << "w" << Wi
-                  << "-k" << K << "c" << C << "y" << Y << "x" << X << "-u" << conv_strides[I0]
-                  << "v" << conv_strides[I1] << "l" << conv_dilations[I0] << "j"
-                  << conv_dilations[I1] << "q" << in_left_pads[I0] << "p" << in_right_pads[I0]
-                  << std::endl;
+        std::cout << "NCHWc" << K1 << "_n" << N << "c" << C << "h" << Hi << "w" << Wi << "-k" << K
+                  << "c" << C << "y" << Y << "x" << X << "-u" << conv_strides[I0] << "v"
+                  << conv_strides[I1] << "l" << conv_dilations[I0] << "j" << conv_dilations[I1]
+                  << "q" << in_left_pads[I0] << "p" << in_right_pads[I0] << std::endl;
 
         std::cout << "GridSize = " << GridSize << " BlockSize = " << BlockSize << std::endl;
composable_kernel/include/tensor_operation/gridwise_static_gemm_v2.hpp
@@ -47,7 +47,7 @@ template <index_t BlockSize,
           typename CGlobalIteratorHacks,
           typename AGlobalMoveSliceWindowIteratorHacks,
           typename BGlobalMoveSliceWindowIteratorHacks>
-struct GridwiseStaticGemm_km_kn_mn_v3
+struct GridwiseStaticGemm_km_kn_mn_v2
 {
     __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
     {
@@ -237,10 +237,12 @@ struct GridwiseStaticGemm_km_kn_mn_v3
             c_thread_buf;
 
         // initialize output thread tensor
-        ThreadwiseDynamicTensorSliceSet_v1<FloatAcc,
-                                           decltype(c_k_n_ho_wo_thread_desc),
-                                           Sequence<KPerThread, 1, HoPerThread, WoPerThread>>{}
-            .Run(c_k_n_ho_wo_thread_desc, make_tuple(I0, I0, I0, I0), c_thread_buf, FloatAcc{0});
+#if 0
+        ThreadwiseDynamicTensorSliceSet_v1<FloatAcc,
+                                           decltype(c_k_n_ho_wo_thread_desc),
+                                           Sequence<KPerThread, 1, HoPerThread, WoPerThread>>{}
+            .Run(c_k_n_ho_wo_thread_desc, make_tuple(I0, I0, I0, I0), c_thread_buf, FloatAcc{0});
+#endif
 
         constexpr auto b_thread_slice_copy_step = make_multi_index(EPerBlock, 0, 0, 0);
composable_kernel/include/tensor_operation/gridwise_static_gemm_v3.hpp  0 → 100644
#ifndef CK_GRIDWISE_STATIC_GEMM_V3_HPP
#define CK_GRIDWISE_STATIC_GEMM_V3_HPP
#include "common_header.hpp"
#include "dynamic_multi_index_transform_helper.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
#include "blockwise_dynamic_tensor_slice_transfer.hpp"
#include "threadwise_dynamic_tensor_slice_transfer.hpp"
#include "blockwise_gemm_v3.hpp"
namespace
ck
{
template
<
index_t
BlockSize
,
typename
FloatAB
,
typename
FloatAcc
,
typename
FloatC
,
InMemoryDataOperation
CGlobalMemoryDataOperation
,
typename
AGlobalDesc
,
typename
BGlobalDesc
,
typename
DGlobalDesc
,
typename
CGlobalDesc
,
index_t
KPerBlock
,
index_t
HoPerBlock
,
index_t
WoPerBlock
,
index_t
EPerBlock
,
index_t
KPerThread
,
index_t
HoPerThread
,
index_t
WoPerThread
,
index_t
EPerThread
,
typename
ABlockTransferThreadSliceLengths_E_K
,
typename
ABlockTransferThreadClusterLengths_E_K
,
typename
ABlockTransferThreadClusterArrangeOrder
,
typename
ABlockTransferSrcAccessOrder
,
index_t
ABlockTransferSrcVectorDim
,
index_t
ABlockTransferSrcScalarPerVector
,
index_t
ABlockTransferDstScalarPerVector_K
,
bool
AThreadTransferSrcResetCoordinateAfterRun
,
typename
BBlockTransferSrcAccessOrder
,
index_t
BBlockTransferSrcVectorDim
,
index_t
BBlockTransferSrcScalarPerVector
,
bool
BThreadTransferSrcResetCoordinateAfterRun
,
typename
CThreadTransferSrcDstAccessOrder
,
index_t
CThreadTransferSrcDstVectorDim
,
index_t
CThreadTransferDstScalarPerVector
,
typename
AGlobalIteratorHacks
,
typename
BGlobalIteratorHacks
,
typename
CGlobalIteratorHacks
,
typename
AGlobalMoveSliceWindowIteratorHacks
,
typename
BGlobalMoveSliceWindowIteratorHacks
>
struct
GridwiseStaticGemm_km_kn_mn_v3
{
__host__
__device__
static
constexpr
index_t
GetSharedMemoryNumberOfByte
()
{
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
a_e_k_global_desc
=
AGlobalDesc
{};
constexpr
auto
E
=
a_e_k_global_desc
.
GetLength
(
I0
);
constexpr
auto
max_lds_align
=
math
::
lcm
(
Number
<
ABlockTransferDstScalarPerVector_K
>
{},
Number
<
KPerBlock
>
{});
// A matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
constexpr
auto
a_e_k_desc
=
make_dynamic_naive_tensor_descriptor_aligned_v2
(
make_tuple
(
Number
<
E
>
{},
Number
<
KPerBlock
>
{}),
max_lds_align
);
// LDS allocation for A and B: be careful of alignment
constexpr
auto
a_block_space_size
=
math
::
integer_least_multiple
(
a_e_k_desc
.
GetElementSpaceSize
(),
max_lds_align
);
return
a_block_space_size
*
sizeof
(
FloatAB
);
}
template
<
index_t
activ_type
,
bool
HasMainKBlockLoop
,
bool
HasDoubleTailKBlockLoop
>
__device__
void
Run
(
const
FloatAB
*
__restrict__
p_a_global
,
const
FloatAB
*
__restrict__
p_b_global
,
const
FloatC
*
__restrict__
p_d_global
,
FloatC
*
__restrict__
p_c_global
,
FloatAB
*
__restrict__
p_shared_block
,
Number
<
activ_type
>
,
integral_constant
<
bool
,
HasMainKBlockLoop
>
,
integral_constant
<
bool
,
HasDoubleTailKBlockLoop
>
)
const
{
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
a_e_k_global_desc
=
AGlobalDesc
{};
constexpr
auto
b_e_n_ho_wo_global_desc
=
BGlobalDesc
{};
constexpr
auto
d_k_n_hox2_wox2_global_desc
=
DGlobalDesc
{};
constexpr
auto
c_k_n_ho_wo_global_desc
=
CGlobalDesc
{};
const
auto
a_global_buf
=
make_dynamic_buffer
<
AddressSpace
::
Global
>
(
p_a_global
,
a_e_k_global_desc
.
GetElementSpaceSize
());
const
auto
b_global_buf
=
make_dynamic_buffer
<
AddressSpace
::
Global
>
(
p_b_global
,
b_e_n_ho_wo_global_desc
.
GetElementSpaceSize
());
auto
d_global_buf
=
make_dynamic_buffer
<
AddressSpace
::
Global
>
(
p_d_global
,
d_k_n_hox2_wox2_global_desc
.
GetElementSpaceSize
());
auto
c_global_buf
=
make_dynamic_buffer
<
AddressSpace
::
Global
>
(
p_c_global
,
c_k_n_ho_wo_global_desc
.
GetElementSpaceSize
());
constexpr
auto
E
=
a_e_k_global_desc
.
GetLength
(
I0
);
constexpr
auto
K
=
a_e_k_global_desc
.
GetLength
(
I1
);
constexpr
auto
N
=
b_e_n_ho_wo_global_desc
.
GetLength
(
I1
);
constexpr
auto
Ho
=
b_e_n_ho_wo_global_desc
.
GetLength
(
I2
);
constexpr
auto
Wo
=
b_e_n_ho_wo_global_desc
.
GetLength
(
I3
);
// divide block work by [M, N]
#if 0
const auto k_block_work_num = K / Number<KPerBlock>{};
const auto ho_block_work_num = Ho / Number<HoPerBlock>{};
const auto wo_block_work_num = Wo / Number<WoPerBlock>{};
const auto hwo_block_work_num = ho_block_work_num * wo_block_work_num;
const index_t k_block_work_id = get_block_1d_id() / hwo_block_work_num;
const index_t hwo_block_work_id = get_block_1d_id() - k_block_work_id * hwo_block_work_num;
const index_t ho_block_work_id = hwo_block_work_id / wo_block_work_num;
const index_t wo_block_work_id = hwo_block_work_id - ho_block_work_id * wo_block_work_num;
#else
// Hack: this force result into SGPR
const
index_t
k_block_work_num
=
__builtin_amdgcn_readfirstlane
(
K
/
KPerBlock
);
const
index_t
ho_block_work_num
=
__builtin_amdgcn_readfirstlane
(
Ho
/
HoPerBlock
);
const
index_t
wo_block_work_num
=
__builtin_amdgcn_readfirstlane
(
Wo
/
WoPerBlock
);
const
index_t
hwo_block_work_num
=
ho_block_work_num
*
wo_block_work_num
;
const
index_t
k_block_work_id
=
__builtin_amdgcn_readfirstlane
(
get_block_1d_id
()
/
hwo_block_work_num
);
const
index_t
hwo_block_work_id
=
get_block_1d_id
()
-
k_block_work_id
*
hwo_block_work_num
;
const
index_t
ho_block_work_id
=
__builtin_amdgcn_readfirstlane
(
hwo_block_work_id
/
wo_block_work_num
);
const
index_t
wo_block_work_id
=
hwo_block_work_id
-
ho_block_work_id
*
wo_block_work_num
;
#endif
// lds max alignment
constexpr
auto
max_lds_align
=
math
::
lcm
(
Number
<
ABlockTransferDstScalarPerVector_K
>
{},
Number
<
KPerBlock
>
{});
// A matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
constexpr
auto
a_e_k_block_desc
=
make_dynamic_naive_tensor_descriptor_aligned_v2
(
make_tuple
(
Number
<
EPerBlock
>
{},
Number
<
KPerBlock
>
{}),
max_lds_align
);
constexpr
auto
a_e_k_desc
=
make_dynamic_naive_tensor_descriptor_aligned_v2
(
make_tuple
(
Number
<
E
>
{},
Number
<
KPerBlock
>
{}),
max_lds_align
);
// B matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
constexpr
auto
b_e_n_ho_wo_block_desc
=
make_dynamic_naive_tensor_descriptor_packed_v2
(
make_tuple
(
Number
<
EPerBlock
>
{},
Number
<
1
>
{},
Number
<
HoPerBlock
>
{},
Number
<
WoPerBlock
>
{}));
// c_thread_mtx definition: this is a mess
// TODO:: more elegent way of defining c_thread_mtx
constexpr
auto
c_k_n_ho_wo_thread_desc
=
make_dynamic_naive_tensor_descriptor_packed_v2
(
make_tuple
(
Number
<
KPerThread
>
{},
Number
<
1
>
{},
Number
<
HoPerThread
>
{},
Number
<
WoPerThread
>
{}));
auto
blockwise_gemm
=
BlockwiseGemm_km_kn_m0m1n0n1_v3
<
BlockSize
,
FloatAB
,
FloatAB
,
FloatAcc
,
decltype
(
a_e_k_block_desc
),
decltype
(
b_e_n_ho_wo_block_desc
),
decltype
(
c_k_n_ho_wo_thread_desc
),
KPerThread
,
HoPerThread
,
WoPerThread
,
EPerThread
,
ABlockTransferSrcScalarPerVector
,
ABlockTransferDstScalarPerVector_K
>
{};
auto
c_thread_mtx_index
=
blockwise_gemm
.
GetBeginOfThreadMatrixC
(
get_thread_local_1d_id
());
const
auto
k_thread_id
=
c_thread_mtx_index
.
k
;
const
auto
ho_thread_id
=
c_thread_mtx_index
.
h
;
const
auto
wo_thread_id
=
c_thread_mtx_index
.
w
;
const
index_t
k_block_data_on_global
=
k_block_work_id
*
KPerBlock
;
const
index_t
ho_block_data_on_global
=
ho_block_work_id
*
HoPerBlock
;
const
index_t
wo_block_data_on_global
=
wo_block_work_id
*
WoPerBlock
;
const
index_t
ho_thread_data_on_global
=
ho_block_data_on_global
+
ho_thread_id
*
HoPerThread
;
const
index_t
wo_thread_data_on_global
=
wo_block_data_on_global
+
wo_thread_id
*
WoPerThread
;
// A matrix blockwise copy
auto
a_blockwise_copy
=
BlockwiseDynamicTensorSliceTransfer_v4
<
BlockSize
,
InMemoryDataOperation
::
Set
,
Sequence
<
E
,
KPerBlock
>
,
ABlockTransferThreadSliceLengths_E_K
,
ABlockTransferThreadClusterLengths_E_K
,
ABlockTransferThreadClusterArrangeOrder
,
FloatAB
,
FloatAB
,
decltype
(
a_e_k_global_desc
),
decltype
(
a_e_k_desc
),
ABlockTransferSrcAccessOrder
,
Sequence
<
0
,
1
>
,
ABlockTransferSrcVectorDim
,
1
,
ABlockTransferSrcScalarPerVector
,
ABlockTransferDstScalarPerVector_K
,
1
,
1
,
AThreadTransferSrcResetCoordinateAfterRun
,
true
>
(
a_e_k_global_desc
,
make_multi_index
(
0
,
k_block_data_on_global
),
a_e_k_desc
,
make_multi_index
(
0
,
0
));
constexpr
auto
b_e_n_ho_wo_thread_desc
=
make_dynamic_naive_tensor_descriptor_packed_v2
(
make_tuple
(
Number
<
EPerBlock
>
{},
Number
<
1
>
{},
Number
<
HoPerThread
>
{},
Number
<
WoPerThread
>
{}));
auto
b_threadwise_transfer
=
ThreadwiseDynamicTensorSliceTransfer_v2
<
FloatAB
,
FloatAB
,
decltype
(
b_e_n_ho_wo_global_desc
),
decltype
(
b_e_n_ho_wo_thread_desc
),
Sequence
<
EPerBlock
,
1
,
HoPerThread
,
WoPerThread
>
,
BBlockTransferSrcAccessOrder
,
BBlockTransferSrcVectorDim
,
BBlockTransferSrcScalarPerVector
,
1
,
true
>
(
b_e_n_ho_wo_global_desc
,
make_multi_index
(
0
,
0
,
ho_thread_data_on_global
,
wo_thread_data_on_global
));
auto
a_block_buf
=
make_dynamic_buffer
<
AddressSpace
::
Lds
>
(
p_shared_block
,
a_e_k_desc
.
GetElementSpaceSize
());
// register allocation for output
StaticBuffer
<
AddressSpace
::
Vgpr
,
FloatAcc
,
c_k_n_ho_wo_thread_desc
.
GetElementSpaceSize
()
>
c_thread_buf
;
// initialize output thread tensor
#if 0
ThreadwiseDynamicTensorSliceSet_v1<FloatAcc,
decltype(c_k_n_ho_wo_thread_desc),
Sequence<KPerThread, 1, HoPerThread, WoPerThread>>{}
.Run(c_k_n_ho_wo_thread_desc, make_tuple(I0, I0, I0, I0), c_thread_buf, FloatAcc{0});
#endif
constexpr
auto
b_thread_slice_copy_step
=
make_multi_index
(
EPerBlock
,
0
,
0
,
0
);
// hack to control index calculation when iterating over A and B matrix for threadwise copy
constexpr
auto
a_e_k_global_iterator_hacks
=
AGlobalIteratorHacks
{};
constexpr
auto
b_e_n_ho_wo_global_iterator_hacks
=
BGlobalIteratorHacks
{};
// hack to control index calculation when move slice window for A and B matrix for
// threadwise copy
constexpr
auto
a_e_k_global_move_slice_window_iterator_hack
=
AGlobalMoveSliceWindowIteratorHacks
{};
constexpr
auto
b_e_n_ho_wo_global_move_slice_window_iterator_hack
=
BGlobalMoveSliceWindowIteratorHacks
{};
// double regsiter buffer for b
StaticBuffer
<
AddressSpace
::
Vgpr
,
FloatAB
,
b_e_n_ho_wo_thread_desc
.
GetElementSpaceSize
()
>
b_thread_even_buf
,
b_thread_odd_buf
;
// LDS double buffer: preload data
{
a_blockwise_copy
.
RunRead
(
a_e_k_global_desc
,
a_global_buf
,
a_e_k_global_iterator_hacks
);
b_threadwise_transfer
.
Run
(
b_e_n_ho_wo_global_desc
,
b_global_buf
,
b_e_n_ho_wo_thread_desc
,
make_tuple
(
I0
,
I0
,
I0
,
I0
),
b_thread_even_buf
,
b_e_n_ho_wo_global_iterator_hacks
);
a_blockwise_copy
.
RunWrite
(
a_e_k_desc
,
a_block_buf
);
}
__syncthreads
();
if
constexpr
(
HasMainKBlockLoop
)
{
index_t
e_block_data_begin
=
0
;
// LDS double buffer: main body
// use Do-While loop instead of For loop to simplify control flow
do
{
// even iteration
b_threadwise_transfer
.
MoveSrcSliceWindow
(
b_e_n_ho_wo_global_desc
,
b_thread_slice_copy_step
);
b_threadwise_transfer
.
Run
(
b_e_n_ho_wo_global_desc
,
b_global_buf
,
b_e_n_ho_wo_thread_desc
,
make_tuple
(
I0
,
I0
,
I0
,
I0
),
b_thread_odd_buf
,
b_e_n_ho_wo_global_iterator_hacks
);
// LDS double buffer: GEMM on current data
// TODO: @Zhang Jing: blockwise gemm should be able to move slice window
blockwise_gemm
.
Run
(
a_block_buf
,
b_thread_even_buf
,
c_thread_buf
);
blockwise_gemm
.
MoveASliceWindow
(
a_e_k_block_desc
,
make_tuple
(
EPerBlock
,
0
));
b_threadwise_transfer
.
MoveSrcSliceWindow
(
b_e_n_ho_wo_global_desc
,
b_thread_slice_copy_step
);
b_threadwise_transfer
.
Run
(
b_e_n_ho_wo_global_desc
,
b_global_buf
,
b_e_n_ho_wo_thread_desc
,
make_tuple
(
I0
,
I0
,
I0
,
I0
),
b_thread_even_buf
,
b_e_n_ho_wo_global_iterator_hacks
);
// LDS double buffer: GEMM on current data
blockwise_gemm
.
Run
(
a_block_buf
,
b_thread_odd_buf
,
c_thread_buf
);
blockwise_gemm
.
MoveASliceWindow
(
a_e_k_block_desc
,
make_tuple
(
EPerBlock
,
0
));
e_block_data_begin
+=
2
*
EPerBlock
;
}
while
(
e_block_data_begin
<
E
-
2
*
EPerBlock
);
}
// LDS double buffer: tail
if
constexpr
(
HasDoubleTailKBlockLoop
)
// if has 2 iteration left
{
b_threadwise_transfer
.
MoveSrcSliceWindow
(
b_e_n_ho_wo_global_desc
,
b_thread_slice_copy_step
);
b_threadwise_transfer
.
Run
(
b_e_n_ho_wo_global_desc
,
b_global_buf
,
b_e_n_ho_wo_thread_desc
,
make_tuple
(
I0
,
I0
,
I0
,
I0
),
b_thread_odd_buf
,
b_e_n_ho_wo_global_iterator_hacks
);
// LDS double buffer: GEMM on 2nd-last data
blockwise_gemm
.
Run
(
a_block_buf
,
b_thread_even_buf
,
c_thread_buf
);
blockwise_gemm
.
MoveASliceWindow
(
a_e_k_block_desc
,
make_tuple
(
EPerBlock
,
0
));
// LDS double buffer: GEMM on last data
blockwise_gemm
.
Run
(
a_block_buf
,
b_thread_odd_buf
,
c_thread_buf
);
}
else
// if has 1 iteration left
{
// LDS double buffer: GEMM on last data
blockwise_gemm
.
Run
(
a_block_buf
,
b_thread_even_buf
,
c_thread_buf
);
}
// activ
{
static_for
<
0
,
c_k_n_ho_wo_thread_desc
.
GetElementSpaceSize
(),
1
>
{}([
&
](
auto
i
)
{
if
constexpr
(
activ_type
==
1
)
c_thread_buf
(
i
)
=
c_thread_buf
[
i
]
>=
0
?
c_thread_buf
[
i
]
:
0.0
;
else
if
constexpr
(
activ_type
==
2
)
c_thread_buf
(
i
)
=
1.0
/
(
1.0
+
exp
(
-
c_thread_buf
[
i
]));
});
}
constexpr
auto
HoPerThreadx2
=
HoPerThread
*
2
;
constexpr
auto
WoPerThreadx2
=
WoPerThread
*
2
;
constexpr
auto
d_k_n_hox2_wox2_thread_desc
=
make_dynamic_naive_tensor_descriptor_packed_v2
(
make_tuple
(
Number
<
KPerThread
>
{},
Number
<
1
>
{},
Number
<
HoPerThreadx2
>
{},
Number
<
WoPerThreadx2
>
{}));
StaticBuffer
<
AddressSpace
::
Vgpr
,
FloatC
,
d_k_n_hox2_wox2_thread_desc
.
GetElementSpaceSize
()
>
d_thread_buf
;
const
index_t
hox2_thread_data_on_global
=
ho_thread_data_on_global
*
2
;
const
index_t
wox2_thread_data_on_global
=
wo_thread_data_on_global
*
2
;
// hack to control index calculation when iterating over c_k_n_ho_wo_global tensor
constexpr
auto
c_k_n_ho_wo_global_tensor_iterator_hacks
=
CGlobalIteratorHacks
{};
const
index_t
k_thread_data_on_global
=
k_block_data_on_global
+
k_thread_id
*
KPerThread
;
        // Resize_Add
        {
            ThreadwiseDynamicTensorSliceTransfer_v2<
                FloatC,
                FloatC,
                decltype(d_k_n_hox2_wox2_global_desc),
                decltype(d_k_n_hox2_wox2_thread_desc),
                Sequence<KPerThread, 1, HoPerThreadx2, WoPerThreadx2>,
                CThreadTransferSrcDstAccessOrder,
                CThreadTransferSrcDstVectorDim,
                CThreadTransferDstScalarPerVector,
                1,
                true>(d_k_n_hox2_wox2_global_desc,
                      make_multi_index(k_thread_data_on_global,
                                       0,
                                       hox2_thread_data_on_global,
                                       wox2_thread_data_on_global))
                .Run(d_k_n_hox2_wox2_global_desc,
                     d_global_buf,
                     d_k_n_hox2_wox2_thread_desc,
                     make_tuple(I0, I0, I0, I0),
                     d_thread_buf,
                     c_k_n_ho_wo_global_tensor_iterator_hacks);

            static_for<0, KPerThread, 1>{}([&](auto k_i) {
                static_for<0, HoPerThreadx2, 1>{}([&](auto h_i) {
                    static_for<0, WoPerThreadx2, 1>{}([&](auto w_i) {
                        d_thread_buf(Number<d_k_n_hox2_wox2_thread_desc.CalculateOffset(
                            make_tuple(k_i, 0, h_i, w_i))>{}) +=
                            c_thread_buf[Number<c_k_n_ho_wo_thread_desc.CalculateOffset(
                                make_tuple(k_i, 0, h_i / 2, w_i / 2))>{}];
                    });
                });
            });
        }
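In plain terms, the Resize_Add block first pulls in the matching tile of the d (skip) tensor, which is stored at twice the convolution output resolution, and then adds each convolution result c[k, ho, wo] into the 2x2 window d[k, 2*ho .. 2*ho+1, 2*wo .. 2*wo+1], i.e. a nearest-neighbour 2x upsample of the convolution output fused with an elementwise add. A stripped-down sketch of that indexing, with hypothetical row-major buffers and names of my own:

#include <vector>

// Nearest-neighbour 2x upsample of `c` fused with an elementwise add into `d`.
// c has shape [K][Ho][Wo]; d has shape [K][2*Ho][2*Wo]; both are row-major.
void resize_add_ref(const std::vector<float>& c, std::vector<float>& d, int K, int Ho, int Wo)
{
    const int Hox2 = 2 * Ho;
    const int Wox2 = 2 * Wo;

    for(int k = 0; k < K; ++k)
        for(int h = 0; h < Hox2; ++h)
            for(int w = 0; w < Wox2; ++w)
                d[(k * Hox2 + h) * Wox2 + w] +=
                    c[(k * Ho + h / 2) * Wo + w / 2]; // each c value feeds a 2x2 window
}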
        // output: register to global memory
        {
            ThreadwiseDynamicTensorSliceTransfer_v1r3<
                FloatC,
                FloatC,
                decltype(d_k_n_hox2_wox2_thread_desc),
                decltype(d_k_n_hox2_wox2_global_desc),
                Sequence<KPerThread, 1, HoPerThreadx2, WoPerThreadx2>,
                CThreadTransferSrcDstAccessOrder,
                CThreadTransferSrcDstVectorDim,
                CThreadTransferDstScalarPerVector,
                CGlobalMemoryDataOperation,
                1,
                true>(d_k_n_hox2_wox2_global_desc,
                      make_multi_index(k_thread_data_on_global,
                                       0,
                                       hox2_thread_data_on_global,
                                       wox2_thread_data_on_global))
                .Run(d_k_n_hox2_wox2_thread_desc,
                     make_tuple(I0, I0, I0, I0),
                     d_thread_buf,
                     d_k_n_hox2_wox2_global_desc,
                     c_global_buf,
                     c_k_n_ho_wo_global_tensor_iterator_hacks);
        }
    }
    // pass tensor descriptor by reference
    template <index_t activ_type, bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
    __device__ void Run(const FloatAB* __restrict__ p_a_global,
                        const FloatAB* __restrict__ p_b_global,
                        const FloatC* __restrict__ p_d_global,
                        FloatC* __restrict__ p_c_global,
                        Number<activ_type>,
                        integral_constant<bool, HasMainKBlockLoop>,
                        integral_constant<bool, HasDoubleTailKBlockLoop>) const
    {
        constexpr index_t shared_block_size = GetSharedMemoryNumberOfByte() / sizeof(FloatAB);

        __shared__ FloatAB p_shared_block[shared_block_size];

        Run(p_a_global,
            p_b_global,
            p_d_global,
            p_c_global,
            p_shared_block,
            Number<activ_type>{},
            integral_constant<bool, HasMainKBlockLoop>{},
            integral_constant<bool, HasDoubleTailKBlockLoop>{});
    }
};

} // namespace ck
#endif
host/driver_offline/CMakeLists.txt
@@ -13,9 +13,15 @@ include_directories(BEFORE
set(CONV_FWD_DRIVER_OFFLINE_SOURCE conv_fwd_driver_offline.cpp)
set(CONV_BWD_DRIVER_OFFLINE_SOURCE conv_bwd_driver_offline.cpp)
set(CONV_ADD_FWD_DRIVER_OFFLINE_SOURCE conv_add_fwd_driver_offline.cpp)
set(CONV_ACTIV_FWD_DRIVER_OFFLINE_SOURCE conv_activ_fwd_driver_offline.cpp)

add_executable(conv_fwd_driver_offline ${CONV_FWD_DRIVER_OFFLINE_SOURCE})
add_executable(conv_bwd_driver_offline ${CONV_BWD_DRIVER_OFFLINE_SOURCE})
add_executable(conv_add_fwd_driver_offline ${CONV_ADD_FWD_DRIVER_OFFLINE_SOURCE})
add_executable(conv_activ_fwd_driver_offline ${CONV_ACTIV_FWD_DRIVER_OFFLINE_SOURCE})

target_link_libraries(conv_fwd_driver_offline PRIVATE host_tensor)
target_link_libraries(conv_bwd_driver_offline PRIVATE host_tensor)
target_link_libraries(conv_add_fwd_driver_offline PRIVATE host_tensor)
target_link_libraries(conv_activ_fwd_driver_offline PRIVATE host_tensor)
host/driver_offline/conv_activ_fwd_driver_offline.cpp
0 → 100644
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "conv_common.hpp"
#include "host_conv.hpp"
#include "device_tensor.hpp"
#include "device_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp"

#define USE_DYNAMIC_MODE 0
#define USE_CONV_FWD_V5R1_NCHW 1

enum ConvForwardAlgo
{
    V5R1NCHW
};

int main(int argc, char* argv[])
{
    using namespace ck;

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};
    constexpr auto I4 = Number<4>{};
    constexpr auto I5 = Number<5>{};
    constexpr auto I6 = Number<6>{};

    // static mode
    if(argc < 7)
    {
        printf("arg1 to 6: layout, algo, do_verification, init_method, do_log, nrepeat\n");
        exit(1);
    }

    const ConvTensorLayout layout = static_cast<ConvTensorLayout>(atoi(argv[1]));
    const ConvForwardAlgo algo    = static_cast<ConvForwardAlgo>(atoi(argv[2]));
    const bool do_verification    = atoi(argv[3]);
    const int init_method         = atoi(argv[4]);
    const bool do_log             = atoi(argv[5]);
    const int nrepeat             = atoi(argv[6]);

#if 1
    constexpr index_t N = 1, C = 16, Hi = 1080, Wi = 1920, K = 16, Y = 3, X = 3;
#elif 0
    constexpr index_t N = 1, C = 16, Hi = 540, Wi = 960, K = 16, Y = 3, X = 3;
#elif 0
    constexpr index_t N = 1, C = 16, Hi = 480, Wi = 270, K = 16, Y = 3, X = 3;
#elif 0
    constexpr index_t N = 1, C = 16, Hi = 240, Wi = 135, K = 16, Y = 3, X = 3;
#elif 0
    constexpr index_t N = 1, C = 16, Hi = 1080, Wi = 1920, K = 16, Y = 1, X = 1;
#elif 0
    constexpr index_t N = 1, C = 16, Hi = 540, Wi = 960, K = 16, Y = 1, X = 1;
#elif 0
    constexpr index_t N = 1, C = 16, Hi = 480, Wi = 270, K = 16, Y = 1, X = 1;
#elif 0
    constexpr index_t N = 1, C = 8, Hi = 1080, Wi = 1920, K = 16, Y = 3, X = 3;
#elif 0
    constexpr index_t N = 1, C = 16, Hi = 1080, Wi = 1920, K = 4, Y = 3, X = 3;
#endif

    const index_t conv_stride_h   = 1;
    const index_t conv_stride_w   = 1;
    const index_t conv_dilation_h = 1;
    const index_t conv_dilation_w = 1;
    const index_t in_left_pad_h   = 1;
    const index_t in_left_pad_w   = 1;
    const index_t in_right_pad_h  = 1;
    const index_t in_right_pad_w  = 1;

    const index_t YEff = (Y - 1) * conv_dilation_h + 1;
    const index_t XEff = (X - 1) * conv_dilation_w + 1;

    const index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
    const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;

#if 0
    using in_data_t  = float;
    using acc_data_t = float;
    using out_data_t = float;
#elif 1
    using in_data_t  = half_t;
    using acc_data_t = float;
    using out_data_t = half_t;
#elif 1
    using in_data_t  = int8_t;
    using acc_data_t = int32_t;
    using out_data_t = int8_t;
#endif

    std::vector<std::size_t> in_lengths_host(4), wei_lengths_host(4), out_lengths_host(4);

    switch(layout)
    {
    case ConvTensorLayout::NCHW:
        // NCHW
        in_lengths_host[0]  = static_cast<std::size_t>(N);
        in_lengths_host[1]  = static_cast<std::size_t>(C);
        in_lengths_host[2]  = static_cast<std::size_t>(Hi);
        in_lengths_host[3]  = static_cast<std::size_t>(Wi);
        wei_lengths_host[0] = static_cast<std::size_t>(K);
        wei_lengths_host[1] = static_cast<std::size_t>(C);
        wei_lengths_host[2] = static_cast<std::size_t>(Y);
        wei_lengths_host[3] = static_cast<std::size_t>(X);
        out_lengths_host[0] = static_cast<std::size_t>(N);
        out_lengths_host[1] = static_cast<std::size_t>(K);
        out_lengths_host[2] = static_cast<std::size_t>(Ho);
        out_lengths_host[3] = static_cast<std::size_t>(Wo);
        break;
    case ConvTensorLayout::NHWC:
        // NHWC
        in_lengths_host[0]  = static_cast<std::size_t>(N);
        in_lengths_host[1]  = static_cast<std::size_t>(Hi);
        in_lengths_host[2]  = static_cast<std::size_t>(Wi);
        in_lengths_host[3]  = static_cast<std::size_t>(C);
        wei_lengths_host[0] = static_cast<std::size_t>(K);
        wei_lengths_host[1] = static_cast<std::size_t>(Y);
        wei_lengths_host[2] = static_cast<std::size_t>(X);
        wei_lengths_host[3] = static_cast<std::size_t>(C);
        out_lengths_host[0] = static_cast<std::size_t>(N);
        out_lengths_host[1] = static_cast<std::size_t>(Ho);
        out_lengths_host[2] = static_cast<std::size_t>(Wo);
        out_lengths_host[3] = static_cast<std::size_t>(K);
        break;
    default: throw std::runtime_error("wrong! not implemented");
    }

    Tensor<in_data_t> in(in_lengths_host);
    Tensor<in_data_t> wei(wei_lengths_host);
    Tensor<out_data_t> out_host(out_lengths_host);
    Tensor<out_data_t> out_device(out_lengths_host);
    Tensor<out_data_t> add_device(out_lengths_host);

    std::cout << "layout: " << layout << std::endl;
    ostream_HostTensorDescriptor(in.mDesc, std::cout << "in: ");
    ostream_HostTensorDescriptor(wei.mDesc, std::cout << "wei: ");
    ostream_HostTensorDescriptor(out_host.mDesc, std::cout << "out: ");
    print_array("InLeftPads", make_tuple(in_left_pad_h, in_left_pad_w));
    print_array("InRightPads", make_tuple(in_right_pad_h, in_right_pad_w));
    print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
    print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));

    std::size_t num_thread = std::thread::hardware_concurrency();

    switch(init_method)
    {
    case 0:
        // no initialization
        break;
    case 1:
        in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
        wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
        break;
    case 2:
        in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
        wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
        break;
    case 3:
        in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
        wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
        break;
    case 4:
        in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
        wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
        break;
    case 5:
        in.GenerateTensorValue(GeneratorTensor_3<float>{0.0, 1.0}, num_thread);
        wei.GenerateTensorValue(GeneratorTensor_3<float>{-0.5, 0.5}, num_thread);
        break;
    default:
        in.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread);

        auto gen_wei = [](auto... is) {
            return GeneratorTensor_2{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...);
        };
        wei.GenerateTensorValue(gen_wei, num_thread);
    }

    auto f_make_for_device_nchw = [&]() {
        const auto in_lengths_dev  = make_tuple(Number<N>{}, Number<C>{}, Number<Hi>{}, Number<Wi>{});
        const auto wei_lengths_dev = make_tuple(Number<K>{}, Number<C>{}, Number<Y>{}, Number<X>{});
        const auto out_lengths_dev = make_tuple(Number<N>{}, Number<K>{}, Number<Ho>{}, Number<Wo>{});
        const auto conv_strides_dev   = make_tuple(Number<conv_stride_h>{}, Number<conv_stride_w>{});
        const auto conv_dilations_dev = make_tuple(Number<conv_dilation_h>{}, Number<conv_dilation_w>{});
        const auto in_left_pads_dev   = make_tuple(Number<in_left_pad_h>{}, Number<in_left_pad_w>{});
        const auto in_right_pads_dev  = make_tuple(Number<in_right_pad_h>{}, Number<in_right_pad_w>{});

        return make_tuple(in_lengths_dev,
                          wei_lengths_dev,
                          out_lengths_dev,
                          conv_strides_dev,
                          conv_dilations_dev,
                          in_left_pads_dev,
                          in_right_pads_dev);
    };

    auto f_make_for_device_nhwc = [&]() {
        const auto in_lengths_dev  = make_tuple(Number<N>{}, Number<Hi>{}, Number<Wi>{}, Number<C>{});
        const auto wei_lengths_dev = make_tuple(Number<K>{}, Number<Y>{}, Number<X>{}, Number<C>{});
        const auto out_lengths_dev = make_tuple(Number<N>{}, Number<Ho>{}, Number<Wo>{}, Number<K>{});
        const auto conv_strides_dev   = make_tuple(Number<conv_stride_h>{}, Number<conv_stride_w>{});
        const auto conv_dilations_dev = make_tuple(Number<conv_dilation_h>{}, Number<conv_dilation_w>{});
        const auto in_left_pads_dev   = make_tuple(Number<in_left_pad_h>{}, Number<in_left_pad_w>{});
        const auto in_right_pads_dev  = make_tuple(Number<in_right_pad_h>{}, Number<in_right_pad_w>{});

        return make_tuple(in_lengths_dev,
                          wei_lengths_dev,
                          out_lengths_dev,
                          conv_strides_dev,
                          conv_dilations_dev,
                          in_left_pads_dev,
                          in_right_pads_dev);
    };

    constexpr ck::index_t activ_type = 2;

#if USE_CONV_FWD_V5R1_NCHW
    if(algo == ConvForwardAlgo::V5R1NCHW)
    {
        if(layout != ConvTensorLayout::NCHW)
        {
            throw std::runtime_error("wrong! layout");
        }

        const auto tmp = f_make_for_device_nchw();

        device_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw<in_data_t,
                                                                            8,
                                                                            8,
                                                                            activ_type,
                                                                            acc_data_t,
                                                                            out_data_t>(
            tmp[I0], tmp[I1], tmp[I2], tmp[I3], tmp[I4], tmp[I5], tmp[I6], in, wei, out_device, nrepeat);
    }
#endif

    if(do_verification)
    {
        host_direct_convolution_activ(in,
                                      wei,
                                      out_host,
                                      make_tuple(conv_stride_h, conv_stride_w),
                                      make_tuple(conv_dilation_h, conv_dilation_w),
                                      make_tuple(in_left_pad_h, in_left_pad_w),
                                      make_tuple(in_right_pad_h, in_right_pad_w),
                                      activ_type,
                                      layout);

        check_error(out_host, out_device);

#if 0
        if(do_log)
        {
            LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl;
            LogRangeAsType<float>(std::cout << "wei: ", wei.mData, ",") << std::endl;
            LogRangeAsType<float>(std::cout << "out_host  : ", out_host.mData, ",") << std::endl;
            LogRangeAsType<float>(std::cout << "out_device: ", out_device.mData, ",") << std::endl;
        }
#endif
    }
}
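A quick sanity check on the output-size arithmetic the driver uses: with the default #if 1 problem (Hi = 1080, Wi = 1920, 3x3 filter, stride 1, dilation 1, pad 1 on each side) the effective filter extent is YEff = XEff = 3, so Ho = (1080 + 1 + 1 - 3) / 1 + 1 = 1080 and Wo = 1920, i.e. the spatial size is preserved. A standalone snippet of my own that checks this at compile time:

#include <cassert>

// Standard convolution output-length formula, matching the driver's Ho/Wo computation.
constexpr int conv_out_len(int in_len, int pad_l, int pad_r, int filter, int dilation, int stride)
{
    const int eff = (filter - 1) * dilation + 1; // effective filter extent
    return (in_len + pad_l + pad_r - eff) / stride + 1;
}

int main()
{
    static_assert(conv_out_len(1080, 1, 1, 3, 1, 1) == 1080, "Ho");
    static_assert(conv_out_len(1920, 1, 1, 3, 1, 1) == 1920, "Wo");
    return 0;
}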
host/driver_offline/conv_add_fwd_driver_offline.cpp
0 → 100644
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "conv_common.hpp"
#include "host_conv.hpp"
#include "device_tensor.hpp"
#include "device_static_convolution_add_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp"

#define USE_DYNAMIC_MODE 0
#define USE_CONV_FWD_V5R1_NCHW 1

enum ConvForwardAlgo
{
    V5R1NCHW
};

int main(int argc, char* argv[])
{
    using namespace ck;

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};
    constexpr auto I4 = Number<4>{};
    constexpr auto I5 = Number<5>{};
    constexpr auto I6 = Number<6>{};
    constexpr auto I7 = Number<7>{};

    // static mode
    if(argc < 7)
    {
        printf("arg1 to 6: layout, algo, do_verification, init_method, do_log, nrepeat\n");
        exit(1);
    }

    const ConvTensorLayout layout = static_cast<ConvTensorLayout>(atoi(argv[1]));
    const ConvForwardAlgo algo    = static_cast<ConvForwardAlgo>(atoi(argv[2]));
    const bool do_verification    = atoi(argv[3]);
    const int init_method         = atoi(argv[4]);
    const bool do_log             = atoi(argv[5]);
    const int nrepeat             = atoi(argv[6]);

#if 0
    constexpr index_t N = 1, C = 16, Hi = 1080, Wi = 1920, K = 16, Y = 3, X = 3;
#elif 0
    constexpr index_t N = 1, C = 16, Hi = 540, Wi = 960, K = 16, Y = 3, X = 3;
#elif 0
    constexpr index_t N = 1, C = 16, Hi = 480, Wi = 270, K = 16, Y = 3, X = 3;
#elif 1
    constexpr index_t N = 1, C = 16, Hi = 240, Wi = 135, K = 16, Y = 3, X = 3;
#elif 0
    constexpr index_t N = 1, C = 16, Hi = 1080, Wi = 1920, K = 16, Y = 1, X = 1;
#elif 0
    constexpr index_t N = 1, C = 16, Hi = 540, Wi = 960, K = 16, Y = 1, X = 1;
#elif 0
    constexpr index_t N = 1, C = 16, Hi = 480, Wi = 270, K = 16, Y = 1, X = 1;
#elif 0
    constexpr index_t N = 1, C = 8, Hi = 1080, Wi = 1920, K = 16, Y = 3, X = 3;
#elif 0
    constexpr index_t N = 1, C = 16, Hi = 1080, Wi = 1920, K = 4, Y = 3, X = 3;
#endif

    const index_t conv_stride_h   = 1;
    const index_t conv_stride_w   = 1;
    const index_t conv_dilation_h = 1;
    const index_t conv_dilation_w = 1;
    const index_t in_left_pad_h   = 1;
    const index_t in_left_pad_w   = 1;
    const index_t in_right_pad_h  = 1;
    const index_t in_right_pad_w  = 1;

    const index_t YEff = (Y - 1) * conv_dilation_h + 1;
    const index_t XEff = (X - 1) * conv_dilation_w + 1;

    const index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
    const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;

    const index_t Hox2 = Ho * 2;
    const index_t Wox2 = Wo * 2;

#if 0
    using in_data_t  = float;
    using acc_data_t = float;
    using out_data_t = float;
#elif 1
    using in_data_t  = half_t;
    using acc_data_t = float;
    using out_data_t = half_t;
#elif 1
    using in_data_t  = int8_t;
    using acc_data_t = int32_t;
    using out_data_t = int8_t;
#endif

    std::vector<std::size_t> in_lengths_host(4), wei_lengths_host(4), out_lengths_host(4),
        add_lengths_host(4);

    switch(layout)
    {
    case ConvTensorLayout::NCHW:
        // NCHW
        in_lengths_host[0]  = static_cast<std::size_t>(N);
        in_lengths_host[1]  = static_cast<std::size_t>(C);
        in_lengths_host[2]  = static_cast<std::size_t>(Hi);
        in_lengths_host[3]  = static_cast<std::size_t>(Wi);
        wei_lengths_host[0] = static_cast<std::size_t>(K);
        wei_lengths_host[1] = static_cast<std::size_t>(C);
        wei_lengths_host[2] = static_cast<std::size_t>(Y);
        wei_lengths_host[3] = static_cast<std::size_t>(X);
        out_lengths_host[0] = static_cast<std::size_t>(N);
        out_lengths_host[1] = static_cast<std::size_t>(K);
        out_lengths_host[2] = static_cast<std::size_t>(Ho);
        out_lengths_host[3] = static_cast<std::size_t>(Wo);
        add_lengths_host[0] = static_cast<std::size_t>(N);
        add_lengths_host[1] = static_cast<std::size_t>(K);
        add_lengths_host[2] = static_cast<std::size_t>(Hox2);
        add_lengths_host[3] = static_cast<std::size_t>(Wox2);
        break;
    case ConvTensorLayout::NHWC:
        // NHWC
        in_lengths_host[0]  = static_cast<std::size_t>(N);
        in_lengths_host[1]  = static_cast<std::size_t>(Hi);
        in_lengths_host[2]  = static_cast<std::size_t>(Wi);
        in_lengths_host[3]  = static_cast<std::size_t>(C);
        wei_lengths_host[0] = static_cast<std::size_t>(K);
        wei_lengths_host[1] = static_cast<std::size_t>(Y);
        wei_lengths_host[2] = static_cast<std::size_t>(X);
        wei_lengths_host[3] = static_cast<std::size_t>(C);
        out_lengths_host[0] = static_cast<std::size_t>(N);
        out_lengths_host[1] = static_cast<std::size_t>(Ho);
        out_lengths_host[2] = static_cast<std::size_t>(Wo);
        out_lengths_host[3] = static_cast<std::size_t>(K);
        add_lengths_host[0] = static_cast<std::size_t>(N);
        add_lengths_host[1] = static_cast<std::size_t>(Hox2);
        add_lengths_host[2] = static_cast<std::size_t>(Wox2);
        add_lengths_host[3] = static_cast<std::size_t>(K);
        break;
    default: throw std::runtime_error("wrong! not implemented");
    }

    Tensor<in_data_t> in(in_lengths_host);
    Tensor<in_data_t> wei(wei_lengths_host);
    Tensor<in_data_t> add(add_lengths_host);
    Tensor<out_data_t> out_host(add_lengths_host);
    Tensor<out_data_t> out_device(add_lengths_host);

    std::cout << "layout: " << layout << std::endl;
    ostream_HostTensorDescriptor(in.mDesc, std::cout << "in: ");
    ostream_HostTensorDescriptor(wei.mDesc, std::cout << "wei: ");
    ostream_HostTensorDescriptor(add.mDesc, std::cout << "add: ");
    ostream_HostTensorDescriptor(out_host.mDesc, std::cout << "out: ");
    print_array("InLeftPads", make_tuple(in_left_pad_h, in_left_pad_w));
    print_array("InRightPads", make_tuple(in_right_pad_h, in_right_pad_w));
    print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
    print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));

    std::size_t num_thread = std::thread::hardware_concurrency();

    switch(init_method)
    {
    case 0:
        // no initialization
        break;
    case 1:
        in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
        wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
        break;
    case 2:
        in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
        wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
        break;
    case 3:
        in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
        wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
        break;
    case 4:
        in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
        wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
        break;
    case 5:
        in.GenerateTensorValue(GeneratorTensor_3<float>{0.0, 1.0}, num_thread);
        wei.GenerateTensorValue(GeneratorTensor_3<float>{-0.5, 0.5}, num_thread);
        break;
    default:
        in.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread);

        auto gen_wei = [](auto... is) {
            return GeneratorTensor_2{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...);
        };
        wei.GenerateTensorValue(gen_wei, num_thread);
    }

    auto f_make_for_device_nchw = [&]() {
        const auto in_lengths_dev  = make_tuple(Number<N>{}, Number<C>{}, Number<Hi>{}, Number<Wi>{});
        const auto wei_lengths_dev = make_tuple(Number<K>{}, Number<C>{}, Number<Y>{}, Number<X>{});
        const auto out_lengths_dev = make_tuple(Number<N>{}, Number<K>{}, Number<Ho>{}, Number<Wo>{});
        const auto add_lengths_dev = make_tuple(Number<N>{}, Number<K>{}, Number<Hox2>{}, Number<Wox2>{});
        const auto conv_strides_dev   = make_tuple(Number<conv_stride_h>{}, Number<conv_stride_w>{});
        const auto conv_dilations_dev = make_tuple(Number<conv_dilation_h>{}, Number<conv_dilation_w>{});
        const auto in_left_pads_dev   = make_tuple(Number<in_left_pad_h>{}, Number<in_left_pad_w>{});
        const auto in_right_pads_dev  = make_tuple(Number<in_right_pad_h>{}, Number<in_right_pad_w>{});

        return make_tuple(in_lengths_dev,
                          wei_lengths_dev,
                          add_lengths_dev,
                          out_lengths_dev,
                          conv_strides_dev,
                          conv_dilations_dev,
                          in_left_pads_dev,
                          in_right_pads_dev);
    };

    auto f_make_for_device_nhwc = [&]() {
        const auto in_lengths_dev  = make_tuple(Number<N>{}, Number<Hi>{}, Number<Wi>{}, Number<C>{});
        const auto wei_lengths_dev = make_tuple(Number<K>{}, Number<Y>{}, Number<X>{}, Number<C>{});
        const auto out_lengths_dev = make_tuple(Number<N>{}, Number<Ho>{}, Number<Wo>{}, Number<K>{});
        const auto add_lengths_dev = make_tuple(Number<N>{}, Number<Hox2>{}, Number<Wox2>{}, Number<K>{});
        const auto conv_strides_dev   = make_tuple(Number<conv_stride_h>{}, Number<conv_stride_w>{});
        const auto conv_dilations_dev = make_tuple(Number<conv_dilation_h>{}, Number<conv_dilation_w>{});
        const auto in_left_pads_dev   = make_tuple(Number<in_left_pad_h>{}, Number<in_left_pad_w>{});
        const auto in_right_pads_dev  = make_tuple(Number<in_right_pad_h>{}, Number<in_right_pad_w>{});

        return make_tuple(in_lengths_dev,
                          wei_lengths_dev,
                          add_lengths_dev,
                          out_lengths_dev,
                          conv_strides_dev,
                          conv_dilations_dev,
                          in_left_pads_dev,
                          in_right_pads_dev);
    };

    constexpr ck::index_t activ_type = 2;

#if USE_CONV_FWD_V5R1_NCHW
    if(algo == ConvForwardAlgo::V5R1NCHW)
    {
        if(layout != ConvTensorLayout::NCHW)
        {
            throw std::runtime_error("wrong! layout");
        }

        const auto tmp = f_make_for_device_nchw();

#if 1
        device_static_convolution_add_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw<in_data_t,
                                                                                8,
                                                                                8,
                                                                                activ_type,
                                                                                acc_data_t,
                                                                                out_data_t>(
            tmp[I0], // in_lengths_dev
            tmp[I1], // wei_lengths_dev
            tmp[I2], // add_lengths_dev
            tmp[I3], // out_lengths_dev
            tmp[I4],
            tmp[I5],
            tmp[I6],
            tmp[I7],
            in,
            wei,
            add,
            out_device,
            nrepeat);
#endif
    }
#endif

    if(do_verification)
    {
        host_direct_convolution_add(in,
                                    wei,
                                    add,
                                    out_host,
                                    make_tuple(conv_stride_h, conv_stride_w),
                                    make_tuple(conv_dilation_h, conv_dilation_w),
                                    make_tuple(in_left_pad_h, in_left_pad_w),
                                    make_tuple(in_right_pad_h, in_right_pad_w),
                                    activ_type,
                                    layout);

        check_error(out_host, out_device);

#if 0
        if(do_log)
        {
            LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl;
            LogRangeAsType<float>(std::cout << "wei: ", wei.mData, ",") << std::endl;
            LogRangeAsType<float>(std::cout << "out_host  : ", out_host.mData, ",") << std::endl;
            LogRangeAsType<float>(std::cout << "out_device: ", out_device.mData, ",") << std::endl;
        }
#endif
    }
}
host/driver_offline/conv_fwd_driver_offline.cpp
@@ -15,16 +15,15 @@
#include "device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
#include "device_dynamic_convolution_forward_implicit_gemm_v4r4r2_nhwc_kyxc_nhwk.hpp"
#include "device_dynamic_convolution_forward_implicit_gemm_v6r1_nchw_kcyx_nkhw.hpp"
#include "device_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp"
#include "device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp"
#include "device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp"
#include "device_dynamic_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp"
#define USE_DYNAMIC_MODE
0
#define USE_CONV_FWD_V4R4_NCHW
0
#define USE_DYNAMIC_MODE
1
#define USE_CONV_FWD_V4R4_NCHW
1
#define USE_CONV_FWD_V4R4R2_NHWC 0
#define USE_CONV_FWD_V6R1_NCHW 0
#define USE_CONV_FWD_V5R1_NCHW
1
#define USE_CONV_FWD_V5R1_NCHW
0
#define USE_CONV_FWD_V4R4R2_XDL_NCHW 0
#define USE_CONV_FWD_V4R4R4_XDL_NHWC 0
...
...
@@ -103,82 +102,16 @@ int main(int argc, char* argv[])
const
bool
do_log
=
atoi
(
argv
[
5
]);
const
int
nrepeat
=
atoi
(
argv
[
6
]);
#if 1
constexpr
index_t
N
=
1
;
constexpr
index_t
C
=
16
;
constexpr
index_t
Hi
=
1080
;
constexpr
index_t
Wi
=
1920
;
constexpr
index_t
K
=
16
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
X
=
3
;
#elif 0
constexpr
index_t
N
=
1
;
constexpr
index_t
C
=
16
;
constexpr
index_t
Hi
=
540
;
constexpr
index_t
Wi
=
960
;
constexpr
index_t
K
=
16
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
X
=
3
;
#elif 0
constexpr
index_t
N
=
1
;
constexpr
index_t
C
=
16
;
constexpr
index_t
Hi
=
480
;
constexpr
index_t
Wi
=
270
;
constexpr
index_t
K
=
16
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
X
=
3
;
#elif 0
constexpr
index_t
N
=
1
;
constexpr
index_t
C
=
16
;
constexpr
index_t
Hi
=
240
;
constexpr
index_t
Wi
=
135
;
constexpr
index_t
K
=
16
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
X
=
3
;
#elif 0
constexpr
index_t
N
=
1
;
constexpr
index_t
C
=
16
;
constexpr
index_t
Hi
=
1080
;
constexpr
index_t
Wi
=
1920
;
constexpr
index_t
K
=
16
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
X
=
1
;
#elif 0
constexpr
index_t
N
=
1
;
constexpr
index_t
C
=
16
;
constexpr
index_t
Hi
=
540
;
constexpr
index_t
Wi
=
960
;
constexpr
index_t
K
=
16
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
X
=
1
;
#elif 0
constexpr
index_t
N
=
1
;
constexpr
index_t
C
=
16
;
constexpr
index_t
Hi
=
480
;
constexpr
index_t
Wi
=
270
;
constexpr
index_t
K
=
16
;
constexpr
index_t
Y
=
1
;
constexpr
index_t
X
=
1
;
#elif 0
constexpr
index_t
N
=
1
;
constexpr
index_t
C
=
8
;
constexpr
index_t
Hi
=
1080
;
constexpr
index_t
Wi
=
1920
;
constexpr
index_t
K
=
16
;
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
192
;
constexpr
index_t
Hi
=
71
;
constexpr
index_t
Wi
=
71
;
constexpr
index_t
K
=
256
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
X
=
3
;
#elif 0
constexpr
index_t
N
=
1
;
constexpr
index_t
C
=
16
;
constexpr
index_t
Hi
=
1080
;
constexpr
index_t
Wi
=
1920
;
constexpr
index_t
K
=
4
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
X
=
3
;
#endif
const
index_t
conv_stride_h
=
1
;
const
index_t
conv_stride_w
=
1
;
const
index_t
conv_stride_h
=
2
;
const
index_t
conv_stride_w
=
2
;
const
index_t
conv_dilation_h
=
1
;
const
index_t
conv_dilation_w
=
1
;
const
index_t
in_left_pad_h
=
1
;
...
...
@@ -193,7 +126,7 @@ int main(int argc, char* argv[])
const
index_t
Wo
=
(
Wi
+
in_left_pad_w
+
in_right_pad_w
-
XEff
)
/
conv_stride_w
+
1
;
#endif
#if
0
#if
1
using
in_data_t
=
float
;
using
acc_data_t
=
float
;
using
out_data_t
=
float
;
...
...
@@ -437,8 +370,6 @@ int main(int argc, char* argv[])
}
#endif
constexpr
ck
::
index_t
activ_type
=
2
;
#if USE_CONV_FWD_V5R1_NCHW
if
(
algo
==
ConvForwardAlgo
::
V5R1NCHW
)
{
...
...
@@ -449,22 +380,20 @@ int main(int argc, char* argv[])
const
auto
tmp
=
f_make_for_device_nchw
();
#if 1
device_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw
#else
device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw
#endif
<
in_data_t
,
8
,
8
,
activ_type
,
acc_data_t
,
out_data_t
>
(
tmp
[
I0
],
tmp
[
I1
],
tmp
[
I2
],
tmp
[
I3
],
tmp
[
I4
],
tmp
[
I5
],
tmp
[
I6
],
in
,
wei
,
out_device
,
nrepeat
);
device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw
<
in_data_t
,
16
,
acc_data_t
,
out_data_t
>
(
tmp
[
I0
],
tmp
[
I1
],
tmp
[
I2
],
tmp
[
I3
],
tmp
[
I4
],
tmp
[
I5
],
tmp
[
I6
],
in
,
wei
,
out_device
,
nrepeat
);
}
#endif
...
...
@@ -524,15 +453,14 @@ int main(int argc, char* argv[])
if
(
do_verification
)
{
host_direct_convolution_activ
(
in
,
wei
,
out_host
,
make_tuple
(
conv_stride_h
,
conv_stride_w
),
make_tuple
(
conv_dilation_h
,
conv_dilation_w
),
make_tuple
(
in_left_pad_h
,
in_left_pad_w
),
make_tuple
(
in_right_pad_h
,
in_right_pad_w
),
activ_type
,
layout
);
host_direct_convolution
(
in
,
wei
,
out_host
,
make_tuple
(
conv_stride_h
,
conv_stride_w
),
make_tuple
(
conv_dilation_h
,
conv_dilation_w
),
make_tuple
(
in_left_pad_h
,
in_left_pad_w
),
make_tuple
(
in_right_pad_h
,
in_right_pad_w
),
layout
);
check_error
(
out_host
,
out_device
);
...
...
host/driver_offline/include/device_static_convolution_add_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp
0 → 100644
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "driver_static_convolution_add_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw_outpad.hpp"

template <typename TInWei,
          ck::index_t InWeiVectorSize,
          ck::index_t OutVectorSize,
          ck::index_t activ_type,
          typename TAcc,
          typename TOut,
          typename InLengths,
          typename WeiLengths,
          typename AddLengths,
          typename OutLengths,
          typename ConvStrides,
          typename ConvDilations,
          typename InLeftPads,
          typename InRightPads>
void device_static_convolution_add_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
    const InLengths& in_n_c_hi_wi_lengths,
    const WeiLengths& wei_k_c_y_x_lengths,
    const AddLengths& add_n_k_hox2_wox2_lengths,
    const OutLengths& out_n_k_ho_wo_lengths,
    const ConvStrides& conv_strides,
    const ConvDilations& conv_dilations,
    const InLeftPads& in_left_pads,
    const InRightPads& in_right_pads,
    const Tensor<TInWei>& in_n_c_hi_wi,
    const Tensor<TInWei>& wei_k_c_y_x,
    const Tensor<TOut>& add_n_k_hox2_wox2,
    Tensor<TOut>& out_n_k_hox2_wox2,
    ck::index_t nrepeat)
{
    using namespace ck;

    std::cout << __func__ << std::endl;

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    const auto N = out_n_k_ho_wo_lengths[I0];
    const auto K = out_n_k_ho_wo_lengths[I1];
    const auto C = wei_k_c_y_x_lengths[I1];

    const auto Hi = in_n_c_hi_wi_lengths[I2];
    const auto Wi = in_n_c_hi_wi_lengths[I3];

    const auto Ho = out_n_k_ho_wo_lengths[I2];
    const auto Wo = out_n_k_ho_wo_lengths[I3];

    const auto Hox2 = Ho * 2;
    const auto Wox2 = Wo * 2;

    const auto Y = wei_k_c_y_x_lengths[I2];
    const auto X = wei_k_c_y_x_lengths[I3];

    const auto C0 = C / Number<InWeiVectorSize>{};
    const auto C1 = Number<InWeiVectorSize>{};

    const auto K0 = K / Number<OutVectorSize>{};
    const auto K1 = Number<OutVectorSize>{};

    Tensor<TInWei> in_n_c0_hi_wi_c1(
        HostTensorDescriptor(std::initializer_list<index_t>{N, C0, Hi, Wi, C1}));
    Tensor<TInWei> wei_k_c0_y_x_c1(
        HostTensorDescriptor(std::initializer_list<index_t>{K, C0, Y, X, C1}));
    Tensor<TOut> out_n_k0_hox2_wox2_k1(
        HostTensorDescriptor(std::initializer_list<index_t>{N, K0, Hox2, Wox2, K1}));
    Tensor<TOut> add_n_k0_hox2_wox2_k1(
        HostTensorDescriptor(std::initializer_list<index_t>{N, K0, Hox2, Wox2, K1}));

    auto f_nchw2nc0hwc1 = [&](auto n, auto hi, auto wi, auto c) {
        in_n_c0_hi_wi_c1(n, c / InWeiVectorSize, hi, wi, c % InWeiVectorSize) =
            in_n_c_hi_wi(n, c, hi, wi);
    };

    auto f_kcyx2kc0yxc1 = [&](auto k, auto y, auto x, auto c) {
        wei_k_c0_y_x_c1(k, c / InWeiVectorSize, y, x, c % InWeiVectorSize) =
            wei_k_c_y_x(k, c, y, x);
    };

    auto f_nchx2wx2_to_nc0hx2wx2c1 = [&](auto n, auto ho, auto wo, auto c) {
        add_n_k0_hox2_wox2_k1(n, c / InWeiVectorSize, ho, wo, c % InWeiVectorSize) =
            add_n_k_hox2_wox2(n, c, ho, wo);
    };

    make_ParallelTensorFunctor(f_nchw2nc0hwc1, N, Hi, Wi, C)();
    make_ParallelTensorFunctor(f_kcyx2kc0yxc1, K, Y, X, C)();
    make_ParallelTensorFunctor(f_nchx2wx2_to_nc0hx2wx2c1, N, Hox2, Wox2, K)();

    DeviceMem in_n_c0_hi_wi_c1_device_buf(sizeof(TInWei) *
                                          in_n_c0_hi_wi_c1.mDesc.GetElementSpace());
    DeviceMem wei_k_c0_y_x_c1_device_buf(sizeof(TInWei) *
                                         wei_k_c0_y_x_c1.mDesc.GetElementSpace());
    DeviceMem add_n_k0_hox2_wox2_k1_device_buf(sizeof(TOut) *
                                               add_n_k0_hox2_wox2_k1.mDesc.GetElementSpace());
    DeviceMem out_n_k0_hox2_wox2_k1_device_buf(sizeof(TOut) *
                                               out_n_k0_hox2_wox2_k1.mDesc.GetElementSpace());

    in_n_c0_hi_wi_c1_device_buf.ToDevice(in_n_c0_hi_wi_c1.mData.data());
    wei_k_c0_y_x_c1_device_buf.ToDevice(wei_k_c0_y_x_c1.mData.data());

    const auto in_n_c0_hi_wi_desc =
        make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, C0, Hi, Wi));
    const auto wei_k_c0_y_x_desc =
        make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(K, C0, Y, X));
    const auto out_n_k0_ho_wo_k1_desc =
        make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, K0, Ho, Wo, K1));
    const auto add_n_k0_hox2_wox2_k1_desc =
        make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, K0, Hox2, Wox2, K1));

    // cdata = 64, BlockSize = 64, 16x8x32x4
    constexpr index_t BlockSize = 64;

    constexpr index_t KPerBlock  = K;
    constexpr index_t HoPerBlock = 8;
    constexpr index_t WoPerBlock = 32;
    constexpr index_t EPerBlock  = C0;

    constexpr index_t KPerThread  = KPerBlock;
    constexpr index_t HoPerThread = 2;
    constexpr index_t WoPerThread = 2;
    constexpr index_t EPerThread  = EPerBlock;

    using ABlockTransferThreadSliceLengths_E_K   = Sequence<Y * X, 1>;
    using ABlockTransferThreadClusterLengths_E_K = Sequence<EPerBlock, KPerBlock>;

    constexpr index_t ABlockTransferSrcScalarPerVector_E = 1;
    constexpr index_t ABlockTransferDstScalarPerVector_K = 1;

    constexpr index_t BThreadTransferSrcScalarPerVector_W = 1;

    constexpr index_t CThreadTransferDstScalarPerVector_W = K1;

    static_assert(KPerThread % CThreadTransferDstScalarPerVector_W == 0, "");

    constexpr auto conv_driver =
        DriverStaticConvolutionAddForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad<
            BlockSize,
            typename vector_type<TInWei, InWeiVectorSize>::type,
            TAcc,
            TOut,
            KPerBlock,
            HoPerBlock,
            WoPerBlock,
            EPerBlock,
            KPerThread,
            HoPerThread,
            WoPerThread,
            EPerThread,
            ABlockTransferThreadSliceLengths_E_K,
            ABlockTransferThreadClusterLengths_E_K,
            ABlockTransferSrcScalarPerVector_E,
            ABlockTransferDstScalarPerVector_K,
            BThreadTransferSrcScalarPerVector_W,
            CThreadTransferDstScalarPerVector_W>{};

    conv_driver.Run(wei_k_c0_y_x_desc,
                    in_n_c0_hi_wi_desc,
                    add_n_k0_hox2_wox2_k1_desc,
                    out_n_k0_ho_wo_k1_desc,
                    conv_strides,
                    conv_dilations,
                    in_left_pads,
                    in_right_pads,
                    Number<activ_type>{},
                    static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
                        wei_k_c0_y_x_c1_device_buf.GetDeviceBuffer()),
                    static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
                        in_n_c0_hi_wi_c1_device_buf.GetDeviceBuffer()),
                    static_cast<TOut*>(add_n_k0_hox2_wox2_k1_device_buf.GetDeviceBuffer()),
                    static_cast<TOut*>(out_n_k0_hox2_wox2_k1_device_buf.GetDeviceBuffer()));

    out_n_k0_hox2_wox2_k1_device_buf.FromDevice(out_n_k0_hox2_wox2_k1.mData.data());

    auto f_nk0hwk1_to_nkhw = [&](auto n, auto k, auto ho, auto wo) {
        out_n_k_hox2_wox2(n, k, ho, wo) =
            out_n_k0_hox2_wox2_k1(n, k / InWeiVectorSize, ho, wo, k % InWeiVectorSize);
    };

    make_ParallelTensorFunctor(f_nk0hwk1_to_nkhw, N, K, Hox2, Wox2)();
}
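The host-side repacking above splits the channel dimension into a vectorised pair C = C0 * C1 with C1 = InWeiVectorSize, so the kernel can read C1 channels as a single vector; with the template arguments the drivers pass (InWeiVectorSize = 8, C = 16) that gives C0 = 2 and C1 = 8. A stripped-down sketch of the same NCHW -> N,C0,H,W,C1 shuffle, with plain arrays and names of my own, mirroring f_nchw2nc0hwc1:

#include <vector>

// Repack an NCHW tensor into N,C0,H,W,C1 layout with C = C0 * C1.
std::vector<float> pack_nchw_to_nc0hwc1(const std::vector<float>& in,
                                        int N, int C, int H, int W, int C1)
{
    const int C0 = C / C1; // assumes C is divisible by C1
    std::vector<float> out(in.size());

    for(int n = 0; n < N; ++n)
        for(int c = 0; c < C; ++c)
            for(int h = 0; h < H; ++h)
                for(int w = 0; w < W; ++w)
                    out[(((n * C0 + c / C1) * H + h) * W + w) * C1 + c % C1] =
                        in[((n * C + c) * H + h) * W + w];

    return out;
}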
host/host_tensor/include/host_conv.hpp
@@ -188,6 +188,115 @@ void host_direct_convolution_activ(const Tensor<TIn>& in,
    }
}

template <typename TIn,
          typename TWei,
          typename TOut,
          typename ConvStrides,
          typename ConvDilations,
          typename InLeftPads,
          typename InRightPads>
void host_direct_convolution_add(const Tensor<TIn>& in,
                                 const Tensor<TWei>& wei,
                                 const Tensor<TOut>& add,
                                 Tensor<TOut>& out,
                                 const ConvStrides& conv_strides,
                                 const ConvDilations& conv_dilations,
                                 const InLeftPads& in_left_pads,
                                 const InRightPads& in_right_pads,
                                 const ck::index_t activ_type,
                                 const ConvTensorLayout layout = ConvTensorLayout::NCHW)
{
    using namespace ck;

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    auto f_nchw = [&](auto n, auto k, auto ho, auto wo) {
        double v = 0;
        for(int c = 0; c < wei.mDesc.GetLengths()[1]; ++c)
        {
            for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y)
            {
                int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
                for(int x = 0; x < wei.mDesc.GetLengths()[3]; ++x)
                {
                    int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
                    if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
                       wi < in.mDesc.GetLengths()[3])
                    {
                        v += static_cast<const double>(in(n, c, hi, wi)) *
                             static_cast<const double>(wei(k, c, y, x));
                    }
                }
            }
        }

        index_t hox2 = ho * 2;
        index_t wox2 = wo * 2;

        v = activ(v, activ_type);

        out(n, k, hox2, wox2)         = v + add(n, k, hox2, wox2);
        out(n, k, hox2, wox2 + 1)     = v + add(n, k, hox2, wox2 + 1);
        out(n, k, hox2 + 1, wox2)     = v + add(n, k, hox2 + 1, wox2);
        out(n, k, hox2 + 1, wox2 + 1) = v + add(n, k, hox2 + 1, wox2 + 1);
    };

    auto f_nhwc = [&](auto n, auto ho, auto wo, auto k) {
        double v = 0;
        for(int c = 0; c < wei.mDesc.GetLengths()[3]; ++c)
        {
            for(int y = 0; y < wei.mDesc.GetLengths()[1]; ++y)
            {
                int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
                for(int x = 0; x < wei.mDesc.GetLengths()[2]; ++x)
                {
                    int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
                    if(hi >= 0 && hi < in.mDesc.GetLengths()[1] && wi >= 0 &&
                       wi < in.mDesc.GetLengths()[2])
                    {
                        v += static_cast<const double>(in(n, hi, wi, c)) *
                             static_cast<const double>(wei(k, y, x, c));
                    }
                }
            }
        }

        index_t hox2 = ho * 2;
        index_t wox2 = wo * 2;

        v = activ(v, activ_type);

        out(n, k, hox2, wox2)         = v + add(n, k, hox2, wox2);
        out(n, k, hox2, wox2 + 1)     = v + add(n, k, hox2, wox2 + 1);
        out(n, k, hox2 + 1, wox2)     = v + add(n, k, hox2 + 1, wox2);
        out(n, k, hox2 + 1, wox2 + 1) = v + add(n, k, hox2 + 1, wox2 + 1);
    };

    switch(layout)
    {
    case ConvTensorLayout::NCHW:
        make_ParallelTensorFunctor(f_nchw,
                                   out.mDesc.GetLengths()[0],
                                   out.mDesc.GetLengths()[1],
                                   out.mDesc.GetLengths()[2] / 2,
                                   out.mDesc.GetLengths()[3] / 2)(
            std::thread::hardware_concurrency());
        break;
    case ConvTensorLayout::NHWC:
        make_ParallelTensorFunctor(f_nhwc,
                                   out.mDesc.GetLengths()[0],
                                   out.mDesc.GetLengths()[1],
                                   out.mDesc.GetLengths()[2] / 2,
                                   out.mDesc.GetLengths()[3] / 2)(
            std::thread::hardware_concurrency());
        break;
    default: throw std::runtime_error("wrong! not supported layout");
    }
}
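Written as a formula (my notation: s_h, s_w are the strides, \delta_h, \delta_w the dilations, p_h, p_w the left pads), the reference above computes, for every coarse output position (h_o, w_o) and for d_h, d_w \in \{0, 1\}:

$$\text{out}(n,k,\,2h_o+d_h,\,2w_o+d_w) \;=\; \operatorname{activ}\!\Big(\sum_{c,y,x}\text{in}(n,c,h_i,w_i)\,\text{wei}(k,c,y,x)\Big) \;+\; \text{add}(n,k,\,2h_o+d_h,\,2w_o+d_w)$$

with $h_i = h_o s_h + y\,\delta_h - p_h$ and $w_i = w_o s_w + x\,\delta_w - p_w$; terms whose $(h_i, w_i)$ fall outside the input are skipped (zero padding). The NHWC branch is identical up to the index order of in and wei.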
template <typename TIn,
          typename TWei,
          typename TOut,
          typename InLeftPads,
          typename InRightPads>
void host_winograd_3x3_convolution(const Tensor<TIn>& in_nchw,
                                   const Tensor<TWei>& wei_kcyx,
script/run.sh
@@ -12,7 +12,7 @@
 #export OLC_DEBUG_HIP_DUMP=1
 #export OLC_DEBUG_SAVE_TEMP_DIR=1

-make -j conv_fwd_driver_offline
+make -j conv_add_fwd_driver_offline
 #make -j conv_bwd_driver_offline
 #make -j conv_fwd_driver_online

@@ -26,7 +26,7 @@ INIT=$4
 LOG=$5
 REPEAT=$6

-./host/driver_offline/conv_fwd_driver_offline     $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT
+./host/driver_offline/conv_add_fwd_driver_offline $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT

 ################################################ layout algo verify init log repeat N__ K___ C___ Y X Hi_ Wi__ Strides Dilations LeftPads RightPads
 #./host/driver_offline/conv_fwd_driver_offline    $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 128 128 192 3 3 71 71 2 2 1 1 1 1 1 1