make static v5r1

78740e43 · Jing Zhang · 12649254 · 78740e43 · 78740e43 · 78740e43
Commit 78740e43 authored Jul 22, 2021 by Jing Zhang
8 changed files
--- a/composable_kernel/include/driver/driver_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw_outpad.hpp
+++ b/composable_kernel/include/driver/driver_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw_outpad.hpp
+#ifndef CK_DRIVER_STATIC_CONVOLUTION_FORWARD_IMPLICIT_GEMM_V5R1_NCHW_KCYX_NKHW_OUTPAD_HPP
+#define CK_DRIVER_STATIC_CONVOLUTION_FORWARD_IMPLICIT_GEMM_V5R1_NCHW_KCYX_NKHW_OUTPAD_HPP
+#include "common_header.hpp"
+#include "dynamic_tensor_descriptor.hpp"
+#include "dynamic_tensor_descriptor_helper.hpp"
+#include "gridwise_static_gemm_v2.hpp"
+#include "gridwise_operation_wrapper.hpp"
+namespace ck {
+template <index_t BlockSize,
+          typename FloatAB,
+          typename FloatAcc,
+          typename FloatC,
+          index_t KPerBlock,
+          index_t HoPerBlock,
+          index_t WoPerBlock,
+          index_t EPerBlock,
+          index_t KPerThread,
+          index_t HoPerThread,
+          index_t WoPerThread,
+          index_t EPerThread,
+          typename ABlockTransferThreadSliceLengths_E_K,
+          typename ABlockTransferThreadClusterLengths_E_K,
+          index_t ABlockTransferSrcScalarPerVector_E,
+          index_t ABlockTransferDstScalarPerVector_K,
+          index_t BThreadTransferSrcScalarPerVector_W,
+          index_t CThreadTransferDstScalarPerVector_W>
+struct DriverStaticConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
+{
+    template <typename... Wei,
+              typename... In,
+              typename... Out,
+              typename ConvStrides,
+              typename ConvDilations,
+              typename InLeftPads,
+              typename InRightPads>
+    __host__ void Run(const DynamicTensorDescriptor<Wei...>& wei_k_c_y_x_global_desc,
+                      const DynamicTensorDescriptor<In...>& in_n_c_hi_wi_global_desc,
+                      const DynamicTensorDescriptor<Out...>& out_n_k0_ho_wo_global_desc,
+                      const ConvStrides& conv_strides,
+                      const ConvDilations& conv_dilations,
+                      const InLeftPads& in_left_pads,
+                      const InRightPads& in_right_pads_,
+                      const FloatAB* __restrict__ p_wei_global,
+                      const FloatAB* __restrict__ p_in_global,
+                      FloatC* __restrict__ p_out_global) const
+    {
+        constexpr auto I0 = Number<0>{};
+        constexpr auto I1 = Number<1>{};
+        constexpr auto I2 = Number<2>{};
+        constexpr auto I3 = Number<3>{};
+        constexpr auto I4 = Number<4>{};
+        const auto N_  = in_n_c_hi_wi_global_desc.GetLength(I0);
+        const auto C_  = in_n_c_hi_wi_global_desc.GetLength(I1);
+        const auto K0_ = out_n_k0_ho_wo_global_desc.GetLength(I1);
+        const auto Hi_ = in_n_c_hi_wi_global_desc.GetLength(I2);
+        const auto Wi_ = in_n_c_hi_wi_global_desc.GetLength(I3);
+        const auto Ho_ = out_n_k0_ho_wo_global_desc.GetLength(I2);
+        const auto Wo_ = out_n_k0_ho_wo_global_desc.GetLength(I3);
+        const auto K_ = wei_k_c_y_x_global_desc.GetLength(I0);
+        const auto Y_ = wei_k_c_y_x_global_desc.GetLength(I2);
+        const auto X_ = wei_k_c_y_x_global_desc.GetLength(I3);
+        constexpr auto N  = Number<N_>{};
+        constexpr auto C  = Number<C_>{};
+        constexpr auto K0 = Number<K0_>{};
+        constexpr auto Hi = Number<Hi_>{};
+        constexpr auto Wi = Number<Wi_>{};
+        constexpr auto Ho = Number<Ho_>{};
+        constexpr auto Wo = Number<Wo_>{};
+        constexpr auto K = Number<K_>{};
+        constexpr auto Y = Number<Y_>{};
+        constexpr auto X = Number<X_>{};
+        const auto ConvStrideH_ = conv_strides[I0];
+        const auto ConvStrideW_ = conv_strides[I1];
+        const auto ConvDilationH_ = conv_dilations[I0];
+        const auto ConvDilationW_ = conv_dilations[I1];
+        constexpr auto ConvStrideH = Number<ConvStrideH_>{};
+        constexpr auto ConvStrideW = Number<ConvStrideW_>{};
+        constexpr auto ConvDilationH = Number<ConvDilationH_>{};
+        constexpr auto ConvDilationW = Number<ConvDilationW_>{};
+        constexpr auto Hop = Number<(Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock>{};
+        constexpr auto Wop = Number<(Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock>{};
+        constexpr auto OutRightPadH = Hop - Ho;
+        constexpr auto OutRightPadW = Wop - Wo;
+        const auto InLeftPadH_ = in_left_pads[I0];
+        const auto InLeftPadW_ = in_left_pads[I1];
+        constexpr auto InLeftPadH = Number<InLeftPadH_>{};
+        constexpr auto InLeftPadW = Number<InLeftPadW_>{};
+        constexpr auto in_right_pads = InRightPads{};
+        const auto InRightPadH_ = in_right_pads[I0] + OutRightPadH * ConvStrideH;
+        const auto InRightPadW_ = in_right_pads[I1] + OutRightPadW * ConvStrideW;
+        constexpr auto InRightPadH = Number<InRightPadH_>{};
+        constexpr auto InRightPadW = Number<InRightPadW_>{};
+        std::cerr << "OutRightPadH = " << OutRightPadH << " OutRightPadW = " << OutRightPadW
+                  << std::endl;
+        std::cerr << "InRightPadH = " << InRightPadH << " InRightPadW = " << InRightPadW
+                  << std::endl;
+        // weight tensor
+        const auto wei_e_k_global_desc = transform_dynamic_tensor_descriptor(
+            make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(K, C * Y * X)),
+            make_tuple(make_pass_through_transform(K), make_pass_through_transform(C * Y * X)),
+            make_tuple(Sequence<0>{}, Sequence<1>{}),
+            make_tuple(Sequence<1>{}, Sequence<0>{}));
+        static_assert(wei_e_k_global_desc.IsKnownAtCompileTime(),
+                      "wrong! wei_e_k_global_desc need to known at compile-time");
+        // input tensor
+        const auto in_n_c_hip_wip_global_desc = transform_dynamic_tensor_descriptor(
+            in_n_c_hi_wi_global_desc,
+            make_tuple(make_pass_through_transform(N),
+                       make_pass_through_transform(C),
+                       make_pad_transform(Hi, InLeftPadH, InRightPadH),
+                       make_pad_transform(Wi, InLeftPadW, InRightPadW)),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
+        static_assert(in_n_c_hip_wip_global_desc.IsKnownAtCompileTime(),
+                      "wrong! in_n_c_hip_wip_global_desc need to known at compile-time");
+        const auto in_n_c_y_ho_x_wo_global_desc = transform_dynamic_tensor_descriptor(
+            in_n_c_hip_wip_global_desc,
+            make_tuple(
+                make_pass_through_transform(N),
+                make_pass_through_transform(C),
+                make_embed_transform(make_tuple(Y, Hop), make_tuple(ConvDilationH, ConvStrideH)),
+                make_embed_transform(make_tuple(X, Wop), make_tuple(ConvDilationW, ConvStrideW))),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));
+        static_assert(in_n_c_y_ho_x_wo_global_desc.IsKnownAtCompileTime(),
+                      "wrong! in_n_c_y_ho_x_wo_global_desc need to known at compile-time");
+        const auto in_e_n_ho_wo_global_desc = transform_dynamic_tensor_descriptor(
+            in_n_c_y_ho_x_wo_global_desc,
+            make_tuple(make_merge_transform(make_tuple(C, Y, X)),
+                       make_pass_through_transform(N),
+                       make_pass_through_transform(Hop),
+                       make_pass_through_transform(Wop)),
+            make_tuple(Sequence<1, 2, 4>{}, Sequence<0>{}, Sequence<3>{}, Sequence<5>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
+        static_assert(in_e_n_ho_wo_global_desc.IsKnownAtCompileTime(),
+                      "wrong! in_e_n_ho_wo_global_desc need to known at compile-time");
+        // output tensor
+        const auto out_k_n_hop_wop_global_desc = transform_dynamic_tensor_descriptor(
+            make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, K0, Ho, Wo)),
+            make_tuple(make_pass_through_transform(K0),
+                       make_pass_through_transform(N),
+                       make_pad_transform(Ho, I0, OutRightPadH),
+                       make_pad_transform(Wo, I0, OutRightPadW)),
+            make_tuple(Sequence<1>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
+        static_assert(out_k_n_hop_wop_global_desc.IsKnownAtCompileTime(),
+                      "wrong! out_k_n_hop_wop_global_desc need to known at compile-time");
+        const auto E = C * Y * X;
+        std::cerr << "Hop = " << Hop << " Wop = " << Wop << std::endl;
+        if(!((K % KPerBlock) == 0 && (Hop % HoPerBlock) == 0 && (Wop % WoPerBlock) == 0 &&
+             (E % EPerBlock) == 0))
+        {
+            throw std::runtime_error("wrong! GEMM size no divisible");
+        }
+        // hack to control index calculation when iterating over a_k_m_global tensor
+        constexpr auto a_e_k_global_iterator_hacks =
+            make_tuple(make_tuple(Sequence<0, 0, 0>{}, Sequence<0, 0, 0>{}),
+                       make_tuple(Sequence<0, 0, 0>{}, Sequence<0, 0, 0>{}));
+        constexpr auto a_e_k_global_move_slice_window_iterator_hack = Sequence<0, 0, 0>{};
+        constexpr auto b_e_n_ho_wo_global_iterator_hacks =
+            make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{},
+                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
+                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
+                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}),
+                       make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{},
+                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
+                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
+                                  Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}));
+        constexpr auto b_e_n_ho_wo_global_move_slice_window_iterator_hack =
+            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{};
+        // hack to control index calculation when iterating over c_m0_m1_n0_n1_global tensor
+        // hack for NKHW format
+        constexpr auto c_k_n_ho_wo_global_tensor_iterator_hacks =
+            make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},
+                                  Sequence<0, 0, 0, 0, 0>{},
+                                  Sequence<0, 0, 0, 0, 0>{},
+                                  Sequence<0, 0, 0, 0, 0>{}),
+                       make_tuple(Sequence<0, 0, 0, 0, 0>{},
+                                  Sequence<0, 0, 0, 0, 0>{},
+                                  Sequence<0, 0, 0, 0, 0>{},
+                                  Sequence<0, 0, 0, 0, 0>{}));
+        // GEMM
+        using gridwise_gemm = GridwiseStaticGemm_km_kn_mn_v3<
+            BlockSize,
+            FloatAB,
+            FloatAcc,
+            FloatC,
+            InMemoryDataOperation::Set,
+            decltype(wei_e_k_global_desc),
+            decltype(in_e_n_ho_wo_global_desc),
+            decltype(out_k_n_hop_wop_global_desc),
+            KPerBlock,
+            HoPerBlock,
+            WoPerBlock,
+            EPerBlock,
+            KPerThread,
+            HoPerThread,
+            WoPerThread,
+            EPerThread,
+            ABlockTransferThreadSliceLengths_E_K,
+            ABlockTransferThreadClusterLengths_E_K,
+            Sequence<1, 0>,
+            Sequence<1, 0>,
+            0,
+            ABlockTransferSrcScalarPerVector_E,
+            ABlockTransferDstScalarPerVector_K,
+            false, // don't move back src coordinate after threadwise copy
+            Sequence<0, 2, 3, 1>,
+            3,
+            BThreadTransferSrcScalarPerVector_W,
+            false, // don't move back src coordinate after threadwise copy, which will be fused with
+                   // MoveSrcSliceWindow() to save addr computation
+            Sequence<0, 2, 3, 1>,
+            0,
+            CThreadTransferDstScalarPerVector_W,
+            decltype(a_e_k_global_iterator_hacks),
+            decltype(b_e_n_ho_wo_global_iterator_hacks),
+            decltype(c_k_n_ho_wo_global_tensor_iterator_hacks),
+            decltype(a_e_k_global_move_slice_window_iterator_hack),
+            decltype(b_e_n_ho_wo_global_move_slice_window_iterator_hack)>;
+        const auto GridSize = (K / KPerBlock) * (Hop / HoPerBlock) * (Wop / WoPerBlock) * N;
+        constexpr bool has_main_k_block_loop = (E + EPerBlock) / (2 * EPerBlock) > 1;
+        constexpr bool has_double_tail_k_block_loop = (E / EPerBlock) % 2 == 0;
+        index_t nrepeat = 100;
+#if 1
+        for(index_t i = 0; i < 5; ++i)
+        {
+            std::cout << "Start running " << nrepeat << " times..." << std::endl;
+            KernelTimer timer;
+            timer.Start();
+            std::cout << "has_main_k_block_loop: " << has_main_k_block_loop
+                      << " has_double_tail_k_block_loop: " << has_double_tail_k_block_loop
+                      << std::endl;
+            for(index_t j = 0; j < nrepeat; ++j)
+            {
+                if constexpr(has_main_k_block_loop && has_double_tail_k_block_loop)
+                {
+                    const auto kernel = run_gridwise_operation<gridwise_gemm,
+                                                               const FloatAB*,
+                                                               const FloatAB*,
+                                                               FloatC*,
+                                                               integral_constant<bool, true>,
+                                                               integral_constant<bool, true>>;
+                    launch_kernel(kernel,
+                                  dim3(GridSize),
+                                  dim3(BlockSize),
+                                  0,
+                                  0,
+                                  p_wei_global,
+                                  p_in_global,
+                                  p_out_global,
+                                  integral_constant<bool, true>{},
+                                  integral_constant<bool, true>{});
+                }
+                else if constexpr(has_main_k_block_loop && !has_double_tail_k_block_loop)
+                {
+                    const auto kernel = run_gridwise_operation<gridwise_gemm,
+                                                               const FloatAB*,
+                                                               const FloatAB*,
+                                                               FloatC*,
+                                                               integral_constant<bool, true>,
+                                                               integral_constant<bool, false>>;
+                    launch_kernel(kernel,
+                                  dim3(GridSize),
+                                  dim3(BlockSize),
+                                  0,
+                                  0,
+                                  p_wei_global,
+                                  p_in_global,
+                                  p_out_global,
+                                  integral_constant<bool, true>{},
+                                  integral_constant<bool, false>{});
+                }
+                else if constexpr(!has_main_k_block_loop && has_double_tail_k_block_loop)
+                {
+                    const auto kernel = run_gridwise_operation<gridwise_gemm,
+                                                               const FloatAB*,
+                                                               const FloatAB*,
+                                                               FloatC*,
+                                                               integral_constant<bool, false>,
+                                                               integral_constant<bool, true>>;
+                    launch_kernel(kernel,
+                                  dim3(GridSize),
+                                  dim3(BlockSize),
+                                  0,
+                                  0,
+                                  p_wei_global,
+                                  p_in_global,
+                                  p_out_global,
+                                  integral_constant<bool, false>{},
+                                  integral_constant<bool, true>{});
+                }
+                else
+                {
+                    const auto kernel = run_gridwise_operation<gridwise_gemm,
+                                                               const FloatAB*,
+                                                               const FloatAB*,
+                                                               FloatC*,
+                                                               integral_constant<bool, false>,
+                                                               integral_constant<bool, false>>;
+                    launch_kernel(kernel,
+                                  dim3(GridSize),
+                                  dim3(BlockSize),
+                                  0,
+                                  0,
+                                  p_wei_global,
+                                  p_in_global,
+                                  p_out_global,
+                                  integral_constant<bool, false>{},
+                                  integral_constant<bool, false>{});
+                }
+            }
+            timer.End();
+            float ave_time = timer.GetElapsedTime() / nrepeat;
+            float perf = (float)calculate_convolution_flops(in_n_c_hi_wi_global_desc,
+                                                            wei_k_c_y_x_global_desc,
+                                                            out_n_k0_ho_wo_global_desc) /
+                         (std::size_t(1000) * 1000 * 1000) / ave_time;
+            std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
+                      << std::endl;
+        }
+#endif
+    }
+};
+} // namespace ck
+#endif
--- a/composable_kernel/include/tensor_operation/gridwise_static_gemm_v2.hpp
+++ b/composable_kernel/include/tensor_operation/gridwise_static_gemm_v2.hpp
+#ifndef CK_GRIDWISE_STATIC_GEMM_V2_HPP
+#define CK_GRIDWISE_STATIC_GEMM_V2_HPP
+#include "common_header.hpp"
+#include "dynamic_multi_index_transform_helper.hpp"
+#include "dynamic_tensor_descriptor.hpp"
+#include "dynamic_tensor_descriptor_helper.hpp"
+#include "blockwise_dynamic_tensor_slice_transfer.hpp"
+#include "threadwise_dynamic_tensor_slice_transfer.hpp"
+#include "blockwise_gemm_v3.hpp"
+namespace ck {
+template <index_t BlockSize,
+          typename FloatAB,
+          typename FloatAcc,
+          typename FloatC,
+          InMemoryDataOperation CGlobalMemoryDataOperation,
+          typename AGlobalDesc,
+          typename BGlobalDesc,
+          typename CGlobalDesc,
+          index_t KPerBlock,
+          index_t HoPerBlock,
+          index_t WoPerBlock,
+          index_t EPerBlock,
+          index_t KPerThread,
+          index_t HoPerThread,
+          index_t WoPerThread,
+          index_t EPerThread,
+          typename ABlockTransferThreadSliceLengths_E_K,
+          typename ABlockTransferThreadClusterLengths_E_K,
+          typename ABlockTransferThreadClusterArrangeOrder,
+          typename ABlockTransferSrcAccessOrder,
+          index_t ABlockTransferSrcVectorDim,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t ABlockTransferDstScalarPerVector_K,
+          bool AThreadTransferSrcResetCoordinateAfterRun,
+          typename BBlockTransferSrcAccessOrder,
+          index_t BBlockTransferSrcVectorDim,
+          index_t BBlockTransferSrcScalarPerVector,
+          bool BThreadTransferSrcResetCoordinateAfterRun,
+          typename CThreadTransferSrcDstAccessOrder,
+          index_t CThreadTransferSrcDstVectorDim,
+          index_t CThreadTransferDstScalarPerVector,
+          typename AGlobalIteratorHacks,
+          typename BGlobalIteratorHacks,
+          typename CGlobalIteratorHacks,
+          typename AGlobalMoveSliceWindowIteratorHacks,
+          typename BGlobalMoveSliceWindowIteratorHacks>
+struct GridwiseStaticGemm_km_kn_mn_v3
+{
+    __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
+    {
+        constexpr auto E = EPerBlock * 3 * 3;
+        constexpr auto max_lds_align =
+            math::lcm(Number<ABlockTransferDstScalarPerVector_K>{}, Number<KPerBlock>{});
+        // A matrix in LDS memory, dst of blockwise copy
+        //   be careful of LDS alignment
+        constexpr auto a_e_k_desc = make_dynamic_naive_tensor_descriptor_aligned_v2(
+            make_tuple(Number<E>{}, Number<KPerBlock>{}), max_lds_align);
+        // LDS allocation for A and B: be careful of alignment
+        constexpr auto a_block_space_size =
+            math::integer_least_multiple(a_e_k_desc.GetElementSpaceSize(), max_lds_align);
+        return a_block_space_size * sizeof(FloatAB);
+    }
+    template <bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
+    __device__ void Run(const FloatAB* __restrict__ p_a_global,
+                        const FloatAB* __restrict__ p_b_global,
+                        FloatC* __restrict__ p_c_global,
+                        FloatAB* __restrict__ p_shared_block,
+                        integral_constant<bool, HasMainKBlockLoop>,
+                        integral_constant<bool, HasDoubleTailKBlockLoop>) const
+    {
+        constexpr auto I0 = Number<0>{};
+        constexpr auto I1 = Number<1>{};
+        constexpr auto I2 = Number<2>{};
+        constexpr auto I3 = Number<3>{};
+        constexpr auto a_e_k_global_desc       = AGlobalDesc{};
+        constexpr auto b_e_n_ho_wo_global_desc = BGlobalDesc{};
+        constexpr auto c_k_n_ho_wo_global_desc = CGlobalDesc{};
+        const auto a_global_buf = make_dynamic_buffer<AddressSpace::Global>(
+            p_a_global, a_e_k_global_desc.GetElementSpaceSize());
+        const auto b_global_buf = make_dynamic_buffer<AddressSpace::Global>(
+            p_b_global, b_e_n_ho_wo_global_desc.GetElementSpaceSize());
+        auto c_global_buf = make_dynamic_buffer<AddressSpace::Global>(
+            p_c_global, c_k_n_ho_wo_global_desc.GetElementSpaceSize());
+        constexpr auto E = a_e_k_global_desc.GetLength(I0);
+        constexpr auto K = a_e_k_global_desc.GetLength(I1);
+        constexpr auto N  = b_e_n_ho_wo_global_desc.GetLength(I1);
+        constexpr auto Ho = b_e_n_ho_wo_global_desc.GetLength(I2);
+        constexpr auto Wo = b_e_n_ho_wo_global_desc.GetLength(I3);
+        // divide block work by [M, N]
+#if 0
+        const auto k_block_work_num   = K / Number<KPerBlock>{};
+        const auto ho_block_work_num  = Ho / Number<HoPerBlock>{};
+        const auto wo_block_work_num  = Wo / Number<WoPerBlock>{};
+        const auto hwo_block_work_num = ho_block_work_num * wo_block_work_num;
+        const index_t k_block_work_id   = get_block_1d_id() / hwo_block_work_num;
+        const index_t hwo_block_work_id = get_block_1d_id() - k_block_work_id * hwo_block_work_num;
+        const index_t ho_block_work_id = hwo_block_work_id / wo_block_work_num;
+        const index_t wo_block_work_id = hwo_block_work_id - ho_block_work_id * wo_block_work_num;
+#else
+        // Hack: this force result into SGPR
+        const index_t k_block_work_num   = __builtin_amdgcn_readfirstlane(K / KPerBlock);
+        const index_t ho_block_work_num  = __builtin_amdgcn_readfirstlane(Ho / HoPerBlock);
+        const index_t wo_block_work_num  = __builtin_amdgcn_readfirstlane(Wo / WoPerBlock);
+        const index_t hwo_block_work_num = ho_block_work_num * wo_block_work_num;
+        const index_t k_block_work_id =
+            __builtin_amdgcn_readfirstlane(get_block_1d_id() / hwo_block_work_num);
+        const index_t hwo_block_work_id = get_block_1d_id() - k_block_work_id * hwo_block_work_num;
+        const index_t ho_block_work_id =
+            __builtin_amdgcn_readfirstlane(hwo_block_work_id / wo_block_work_num);
+        const index_t wo_block_work_id = hwo_block_work_id - ho_block_work_id * wo_block_work_num;
+#endif
+        // lds max alignment
+        constexpr auto max_lds_align =
+            math::lcm(Number<ABlockTransferDstScalarPerVector_K>{}, Number<KPerBlock>{});
+        // A matrix in LDS memory, dst of blockwise copy
+        //   be careful of LDS alignment
+        constexpr auto a_e_k_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2(
+            make_tuple(Number<EPerBlock>{}, Number<KPerBlock>{}), max_lds_align);
+        constexpr auto a_e_k_desc = make_dynamic_naive_tensor_descriptor_aligned_v2(
+            make_tuple(Number<E>{}, Number<KPerBlock>{}), max_lds_align);
+        // B matrix in LDS memory, dst of blockwise copy
+        //   be careful of LDS alignment
+        constexpr auto b_e_n_ho_wo_block_desc =
+            make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(
+                Number<EPerBlock>{}, Number<1>{}, Number<HoPerBlock>{}, Number<WoPerBlock>{}));
+        // c_thread_mtx definition: this is a mess
+        // TODO:: more elegent way of defining c_thread_mtx
+        constexpr auto c_k_n_ho_wo_thread_desc =
+            make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(
+                Number<KPerThread>{}, Number<1>{}, Number<HoPerThread>{}, Number<WoPerThread>{}));
+        auto blockwise_gemm = BlockwiseGemm_km_kn_m0m1n0n1_v3<BlockSize,
+                                                              FloatAB,
+                                                              FloatAB,
+                                                              FloatAcc,
+                                                              decltype(a_e_k_block_desc),
+                                                              decltype(b_e_n_ho_wo_block_desc),
+                                                              decltype(c_k_n_ho_wo_thread_desc),
+                                                              KPerThread,
+                                                              HoPerThread,
+                                                              WoPerThread,
+                                                              EPerThread,
+                                                              ABlockTransferSrcScalarPerVector,
+                                                              ABlockTransferDstScalarPerVector_K>{};
+        auto c_thread_mtx_index = blockwise_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id());
+        const auto k_thread_id  = c_thread_mtx_index.k;
+        const auto ho_thread_id = c_thread_mtx_index.h;
+        const auto wo_thread_id = c_thread_mtx_index.w;
+        const index_t k_block_data_on_global  = k_block_work_id * KPerBlock;
+        const index_t ho_block_data_on_global = ho_block_work_id * HoPerBlock;
+        const index_t wo_block_data_on_global = wo_block_work_id * WoPerBlock;
+        const index_t ho_thread_data_on_global =
+            ho_block_data_on_global + ho_thread_id * HoPerThread;
+        const index_t wo_thread_data_on_global =
+            wo_block_data_on_global + wo_thread_id * WoPerThread;
+        // A matrix blockwise copy
+        auto a_blockwise_copy =
+            BlockwiseDynamicTensorSliceTransfer_v4<BlockSize,
+                                                   InMemoryDataOperation::Set,
+                                                   Sequence<E, KPerBlock>,
+                                                   ABlockTransferThreadSliceLengths_E_K,
+                                                   ABlockTransferThreadClusterLengths_E_K,
+                                                   ABlockTransferThreadClusterArrangeOrder,
+                                                   FloatAB,
+                                                   FloatAB,
+                                                   decltype(a_e_k_global_desc),
+                                                   decltype(a_e_k_desc),
+                                                   ABlockTransferSrcAccessOrder,
+                                                   Sequence<0, 1>,
+                                                   ABlockTransferSrcVectorDim,
+                                                   1,
+                                                   ABlockTransferSrcScalarPerVector,
+                                                   ABlockTransferDstScalarPerVector_K,
+                                                   1,
+                                                   1,
+                                                   AThreadTransferSrcResetCoordinateAfterRun,
+                                                   true>(
+                a_e_k_global_desc,
+                make_multi_index(0, k_block_data_on_global),
+                a_e_k_desc,
+                make_multi_index(0, 0));
+        constexpr auto b_e_n_ho_wo_thread_desc =
+            make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(
+                Number<EPerBlock>{}, Number<1>{}, Number<HoPerThread>{}, Number<WoPerThread>{}));
+        auto b_threadwise_transfer = ThreadwiseDynamicTensorSliceTransfer_v2<
+            FloatAB,
+            FloatAB,
+            decltype(b_e_n_ho_wo_global_desc),
+            decltype(b_e_n_ho_wo_thread_desc),
+            Sequence<EPerBlock, 1, HoPerThread, WoPerThread>,
+            BBlockTransferSrcAccessOrder,
+            BBlockTransferSrcVectorDim,
+            BBlockTransferSrcScalarPerVector,
+            1,
+            true>(b_e_n_ho_wo_global_desc,
+                  make_multi_index(0, 0, ho_thread_data_on_global, wo_thread_data_on_global));
+        auto a_block_buf = make_dynamic_buffer<AddressSpace::Lds>(p_shared_block,
+                                                                  a_e_k_desc.GetElementSpaceSize());
+        // register allocation for output
+        StaticBuffer<AddressSpace::Vgpr, FloatAcc, c_k_n_ho_wo_thread_desc.GetElementSpaceSize()>
+            c_thread_buf;
+        // initialize output thread tensor
+        ThreadwiseDynamicTensorSliceSet_v1<FloatAcc,
+                                           decltype(c_k_n_ho_wo_thread_desc),
+                                           Sequence<KPerThread, 1, HoPerThread, WoPerThread>>{}
+            .Run(c_k_n_ho_wo_thread_desc, make_tuple(I0, I0, I0, I0), c_thread_buf, FloatAcc{0});
+        constexpr auto b_thread_slice_copy_step = make_multi_index(EPerBlock, 0, 0, 0);
+        // hack to control index calculation when iterating over A and B matrix for threadwise copy
+        constexpr auto a_e_k_global_iterator_hacks       = AGlobalIteratorHacks{};
+        constexpr auto b_e_n_ho_wo_global_iterator_hacks = BGlobalIteratorHacks{};
+        // hack to control index calculation when move slice window for A and B matrix for
+        // threadwise copy
+        constexpr auto a_e_k_global_move_slice_window_iterator_hack =
+            AGlobalMoveSliceWindowIteratorHacks{};
+        constexpr auto b_e_n_ho_wo_global_move_slice_window_iterator_hack =
+            BGlobalMoveSliceWindowIteratorHacks{};
+        // double regsiter buffer for b
+        StaticBuffer<AddressSpace::Vgpr, FloatAB, b_e_n_ho_wo_thread_desc.GetElementSpaceSize()>
+            b_thread_even_buf, b_thread_odd_buf;
+        // LDS double buffer: preload data
+        {
+            a_blockwise_copy.RunRead(a_e_k_global_desc, a_global_buf, a_e_k_global_iterator_hacks);
+            b_threadwise_transfer.Run(b_e_n_ho_wo_global_desc,
+                                      b_global_buf,
+                                      b_e_n_ho_wo_thread_desc,
+                                      make_tuple(I0, I0, I0, I0),
+                                      b_thread_even_buf,
+                                      b_e_n_ho_wo_global_iterator_hacks);
+            a_blockwise_copy.RunWrite(a_e_k_desc, a_block_buf);
+        }
+        __syncthreads();
+        if constexpr(HasMainKBlockLoop)
+        {
+            index_t e_block_data_begin = 0;
+            // LDS double buffer: main body
+            // use Do-While loop instead of For loop to simplify control flow
+            do
+            {
+                // even iteration
+                b_threadwise_transfer.MoveSrcSliceWindow(b_e_n_ho_wo_global_desc,
+                                                         b_thread_slice_copy_step);
+                b_threadwise_transfer.Run(b_e_n_ho_wo_global_desc,
+                                          b_global_buf,
+                                          b_e_n_ho_wo_thread_desc,
+                                          make_tuple(I0, I0, I0, I0),
+                                          b_thread_odd_buf,
+                                          b_e_n_ho_wo_global_iterator_hacks);
+                // LDS double buffer: GEMM on current data
+                // TODO: @Zhang Jing: blockwise gemm should be able to move slice window
+                blockwise_gemm.Run(a_block_buf, b_thread_even_buf, c_thread_buf);
+                blockwise_gemm.MoveASliceWindow(a_e_k_block_desc, make_tuple(EPerBlock, 0));
+                b_threadwise_transfer.MoveSrcSliceWindow(b_e_n_ho_wo_global_desc,
+                                                         b_thread_slice_copy_step);
+                b_threadwise_transfer.Run(b_e_n_ho_wo_global_desc,
+                                          b_global_buf,
+                                          b_e_n_ho_wo_thread_desc,
+                                          make_tuple(I0, I0, I0, I0),
+                                          b_thread_even_buf,
+                                          b_e_n_ho_wo_global_iterator_hacks);
+                // LDS double buffer: GEMM on current data
+                blockwise_gemm.Run(a_block_buf, b_thread_odd_buf, c_thread_buf);
+                blockwise_gemm.MoveASliceWindow(a_e_k_block_desc, make_tuple(EPerBlock, 0));
+                e_block_data_begin += 2 * EPerBlock;
+            } while(e_block_data_begin < E - 2 * EPerBlock);
+        }
+        // LDS double buffer: tail
+        if constexpr(HasDoubleTailKBlockLoop) // if has 2 iteration left
+        {
+            b_threadwise_transfer.MoveSrcSliceWindow(b_e_n_ho_wo_global_desc,
+                                                     b_thread_slice_copy_step);
+            b_threadwise_transfer.Run(b_e_n_ho_wo_global_desc,
+                                      b_global_buf,
+                                      b_e_n_ho_wo_thread_desc,
+                                      make_tuple(I0, I0, I0, I0),
+                                      b_thread_odd_buf,
+                                      b_e_n_ho_wo_global_iterator_hacks);
+            // LDS double buffer: GEMM on 2nd-last data
+            blockwise_gemm.Run(a_block_buf, b_thread_even_buf, c_thread_buf);
+            blockwise_gemm.MoveASliceWindow(a_e_k_block_desc, make_tuple(EPerBlock, 0));
+            // LDS double buffer: GEMM on last data
+            blockwise_gemm.Run(a_block_buf, b_thread_odd_buf, c_thread_buf);
+        }
+        else // if has 1 iteration left
+        {
+            // LDS double buffer: GEMM on last data
+            blockwise_gemm.Run(a_block_buf, b_thread_even_buf, c_thread_buf);
+        }
+        // output: register to global memory
+        {
+            // hack to control index calculation when iterating over c_k_n_ho_wo_global tensor
+            constexpr auto c_k_n_ho_wo_global_tensor_iterator_hacks = CGlobalIteratorHacks{};
+            const index_t k_thread_data_on_global =
+                k_block_data_on_global + k_thread_id * KPerThread;
+            ThreadwiseDynamicTensorSliceTransfer_v1r3<
+                FloatAcc,
+                FloatC,
+                decltype(c_k_n_ho_wo_thread_desc),
+                decltype(c_k_n_ho_wo_global_desc),
+                Sequence<KPerThread, 1, HoPerThread, WoPerThread>,
+                CThreadTransferSrcDstAccessOrder,
+                CThreadTransferSrcDstVectorDim,
+                CThreadTransferDstScalarPerVector,
+                CGlobalMemoryDataOperation,
+                1,
+                true>(
+                c_k_n_ho_wo_global_desc,
+                make_multi_index(
+                    k_thread_data_on_global, 0, ho_thread_data_on_global, wo_thread_data_on_global))
+                .Run(c_k_n_ho_wo_thread_desc,
+                     make_tuple(I0, I0, I0, I0),
+                     c_thread_buf,
+                     c_k_n_ho_wo_global_desc,
+                     c_global_buf,
+                     c_k_n_ho_wo_global_tensor_iterator_hacks);
+        }
+    }
+    // pass tensor descriptor by reference
+    template <bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
+    __device__ void Run(const FloatAB* __restrict__ p_a_global,
+                        const FloatAB* __restrict__ p_b_global,
+                        FloatC* __restrict__ p_c_global,
+                        integral_constant<bool, HasMainKBlockLoop>,
+                        integral_constant<bool, HasDoubleTailKBlockLoop>) const
+    {
+        constexpr index_t shared_block_size = GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
+        __shared__ FloatAB p_shared_block[shared_block_size];
+        Run(p_a_global,
+            p_b_global,
+            p_c_global,
+            p_shared_block,
+            integral_constant<bool, HasMainKBlockLoop>{},
+            integral_constant<bool, HasDoubleTailKBlockLoop>{});
+    }
+#if 0
+    // pass tensor descriptors by their pointers
+    template <bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
+    __device__ void Run(const AGlobalDesc* p_a_e_k_global_desc,
+                        const FloatAB* __restrict__ p_a_global,
+                        const BGlobalDesc* p_b_e_n_ho_wo_global_desc,
+                        const FloatAB* __restrict__ p_b_global,
+                        const CGlobalDesc* p_c_k_n_ho_wo_global_desc,
+                        FloatC* __restrict__ p_c_global,
+                        integral_constant<bool, HasMainKBlockLoop>,
+                        integral_constant<bool, HasDoubleTailKBlockLoop>) const
+    {
+        const auto a_e_k_global_desc       = *p_a_e_k_global_desc;
+        const auto b_e_n_ho_wo_global_desc = *p_b_e_n_ho_wo_global_desc;
+        const auto c_k_n_ho_wo_global_desc = *p_c_k_n_ho_wo_global_desc;
+        Run(a_e_k_global_desc,
+            p_a_global,
+            b_e_n_ho_wo_global_desc,
+            p_b_global,
+            c_k_n_ho_wo_global_desc,
+            p_c_global,
+            integral_constant<bool, HasMainKBlockLoop>{},
+            integral_constant<bool, HasDoubleTailKBlockLoop>{});
+    }
+    // pass tensor descriptors by void*
+    template <bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
+    __device__ void Run(const void* p_a_e_k_global_desc,
+                        const FloatAB* __restrict__ p_a_global,
+                        const void* p_b_e_n_ho_wo_global_desc,
+                        const FloatAB* __restrict__ p_b_global,
+                        const void* p_c_k_n_ho_wo_global_desc,
+                        FloatC* __restrict__ p_c_global,
+                        integral_constant<bool, HasMainKBlockLoop>,
+                        integral_constant<bool, HasDoubleTailKBlockLoop>) const
+    {
+        const auto a_e_k_global_desc = *reinterpret_cast<const AGlobalDesc*>(p_a_e_k_global_desc);
+        const auto b_e_n_ho_wo_global_desc =
+            *reinterpret_cast<const BGlobalDesc*>(p_b_e_n_ho_wo_global_desc);
+        const auto c_k_n_ho_wo_global_desc =
+            *reinterpret_cast<const CGlobalDesc*>(p_c_k_n_ho_wo_global_desc);
+        Run(a_e_k_global_desc,
+            p_a_global,
+            b_e_n_ho_wo_global_desc,
+            p_b_global,
+            c_k_n_ho_wo_global_desc,
+            p_c_global,
+            integral_constant<bool, HasMainKBlockLoop>{},
+            integral_constant<bool, HasDoubleTailKBlockLoop>{});
+    }
+#endif
+};
+} // namespace ck
+#endif
--- a/composable_kernel/include/utility/config.hpp
+++ b/composable_kernel/include/utility/config.hpp
@@ -16,9 +16,9 @@
 // GPU ID
 #if 0
 #define CK_AMD_GPU_GFX906 1
-#elif 1
-#define CK_AMD_GPU_GFX908 1
 #elif 0
+#define CK_AMD_GPU_GFX908 1
+#elif 1
 #define CK_AMD_GPU_GFX1030 1
 #endif
@@ -93,7 +93,7 @@
 // experimental implementation
 #ifndef CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
-#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 0
+#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
 #endif
 #ifndef CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK

--- a/host/driver_offline/conv_fwd_driver_offline.cpp
+++ b/host/driver_offline/conv_fwd_driver_offline.cpp
@@ -15,15 +15,15 @@
 #include "device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
 #include "device_dynamic_convolution_forward_implicit_gemm_v4r4r2_nhwc_kyxc_nhwk.hpp"
 #include "device_dynamic_convolution_forward_implicit_gemm_v6r1_nchw_kcyx_nkhw.hpp"
-#include "device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp"
+#include "device_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp"
 #include "device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp"
 #include "device_dynamic_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp"
-#define USE_DYNAMIC_MODE 1
+#define USE_DYNAMIC_MODE 0
-#define USE_CONV_FWD_V4R4_NCHW 1
+#define USE_CONV_FWD_V4R4_NCHW 0
 #define USE_CONV_FWD_V4R4R2_NHWC 0
 #define USE_CONV_FWD_V6R1_NCHW 0
-#define USE_CONV_FWD_V5R1_NCHW 0
+#define USE_CONV_FWD_V5R1_NCHW 1
 #define USE_CONV_FWD_V4R4R2_XDL_NCHW 0
 #define USE_CONV_FWD_V4R4R4_XDL_NHWC 0
@@ -102,16 +102,16 @@ int main(int argc, char* argv[])
    const bool do_log             = atoi(argv[5]);
    const int nrepeat             = atoi(argv[6]);
-    constexpr index_t N  = 128;
+    constexpr index_t N  = 1;
-    constexpr index_t C  = 192;
+    constexpr index_t C  = 16;
-    constexpr index_t Hi = 71;
+    constexpr index_t Hi = 540;
-    constexpr index_t Wi = 71;
+    constexpr index_t Wi = 960;
-    constexpr index_t K  = 256;
+    constexpr index_t K  = 16;
    constexpr index_t Y  = 3;
    constexpr index_t X  = 3;
-    const index_t conv_stride_h   = 2;
+    const index_t conv_stride_h   = 1;
-    const index_t conv_stride_w   = 2;
+    const index_t conv_stride_w   = 1;
    const index_t conv_dilation_h = 1;
    const index_t conv_dilation_w = 1;
    const index_t in_left_pad_h   = 1;
@@ -126,7 +126,7 @@ int main(int argc, char* argv[])
    const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
 #endif
-#if 1
+#if 0
    using in_data_t  = float;
    using acc_data_t = float;
    using out_data_t = float;
@@ -380,20 +380,20 @@ int main(int argc, char* argv[])
        const auto tmp = f_make_for_device_nchw();
-        device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw<in_data_t,
+        device_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw<in_data_t,
-                                                                             16,
+                                                                            8,
-                                                                             acc_data_t,
+                                                                            acc_data_t,
-                                                                             out_data_t>(tmp[I0],
+                                                                            out_data_t>(tmp[I0],
-                                                                                         tmp[I1],
+                                                                                        tmp[I1],
-                                                                                         tmp[I2],
+                                                                                        tmp[I2],
-                                                                                         tmp[I3],
+                                                                                        tmp[I3],
-                                                                                         tmp[I4],
+                                                                                        tmp[I4],
-                                                                                         tmp[I5],
+                                                                                        tmp[I5],
-                                                                                         tmp[I6],
+                                                                                        tmp[I6],
-                                                                                         in,
+                                                                                        in,
-                                                                                         wei,
+                                                                                        wei,
-                                                                                         out_device,
+                                                                                        out_device,
-                                                                                         nrepeat);
+                                                                                        nrepeat);
    }
 #endif

--- a/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp
+++ b/host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp
@@ -96,25 +96,25 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
    // cdata = 64, BlockSize = 64, 16x8x32x4
    constexpr index_t BlockSize = 64;
-    constexpr index_t KPerBlock  = 16;
+    constexpr index_t KPerBlock  = K;
    constexpr index_t HoPerBlock = 8;
    constexpr index_t WoPerBlock = 32;
-    constexpr index_t EPerBlock  = 1;
+    constexpr index_t EPerBlock  = C0;
    constexpr index_t KPerThread  = KPerBlock;
    constexpr index_t HoPerThread = 2;
    constexpr index_t WoPerThread = 2;
    constexpr index_t EPerThread  = EPerBlock;
-    using ABlockTransferThreadSliceLengths_E_K   = Sequence<3, 1>;
+    using ABlockTransferThreadSliceLengths_E_K   = Sequence<9, 1>;
-    using ABlockTransferThreadClusterLengths_E_K = Sequence<3 * EPerBlock, KPerBlock>;
+    using ABlockTransferThreadClusterLengths_E_K = Sequence<EPerBlock, KPerBlock>;
    constexpr index_t ABlockTransferSrcScalarPerVector_E = 1;
    constexpr index_t ABlockTransferDstScalarPerVector_K = 1;
    constexpr index_t BThreadTransferSrcScalarPerVector_W = 1;
-    constexpr index_t CThreadTransferDstScalarPerVector_W = 16;
+    constexpr index_t CThreadTransferDstScalarPerVector_W = 8;
    static_assert(KPerThread % CThreadTransferDstScalarPerVector_W == 0, "");
 #else

--- a/host/driver_offline/include/device_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp
+++ b/host/driver_offline/include/device_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp
+#include <unistd.h>
+#include "device.hpp"
+#include "host_tensor.hpp"
+//#include "driver_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp"
+#include "driver_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw_outpad.hpp"
+template <typename TInWei,
+          ck::index_t InWeiVectorSize,
+          typename TAcc,
+          typename TOut,
+          typename InLengths,
+          typename WeiLengths,
+          typename OutLengths,
+          typename ConvStrides,
+          typename ConvDilations,
+          typename InLeftPads,
+          typename InRightPads>
+void device_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
+    const InLengths& in_n_c_hi_wi_lengths,
+    const WeiLengths& wei_k_c_y_x_lengths,
+    const OutLengths& out_n_k_ho_wo_lengths,
+    const ConvStrides& conv_strides,
+    const ConvDilations& conv_dilations,
+    const InLeftPads& in_left_pads,
+    const InRightPads& in_right_pads,
+    const Tensor<TInWei>& in_n_c_hi_wi,
+    const Tensor<TInWei>& wei_k_c_y_x,
+    Tensor<TOut>& out_n_k_ho_wo,
+    ck::index_t nrepeat)
+{
+    using namespace ck;
+    std::cout << "device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw"
+              << std::endl;
+    DeviceMem in_n_c_hi_wi_device_buf(sizeof(TInWei) * in_n_c_hi_wi.mDesc.GetElementSpace());
+    DeviceMem wei_k_c_y_x_device_buf(sizeof(TInWei) * wei_k_c_y_x.mDesc.GetElementSpace());
+    DeviceMem out_n_k_ho_wo_device_buf(sizeof(TOut) * out_n_k_ho_wo.mDesc.GetElementSpace());
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
+    constexpr auto N = OutLengths{}[I0];
+    constexpr auto K = OutLengths{}[I1];
+    constexpr auto C = WeiLengths{}[I1];
+    constexpr auto Hi = InLengths{}[I2];
+    constexpr auto Wi = InLengths{}[I3];
+    constexpr auto Ho = OutLengths{}[I2];
+    constexpr auto Wo = OutLengths{}[I3];
+    constexpr auto Y = WeiLengths{}[I2];
+    constexpr auto X = WeiLengths{}[I3];
+    constexpr auto C0 = C / Number<InWeiVectorSize>{};
+    constexpr auto C1 = Number<InWeiVectorSize>{};
+    constexpr auto K0 = K / Number<InWeiVectorSize>{};
+    constexpr auto K1 = Number<InWeiVectorSize>{};
+#if 0
+    // run-time variables
+    const auto in_n_c_hi_wi_desc =
+        make_dynamic_naive_tensor_descriptor_packed_v2(to_multi_index(InDesc::GetLengths()));
+    const auto wei_k_c_y_x_desc =
+        make_dynamic_naive_tensor_descriptor_packed_v2(to_multi_index(WeiDesc::GetLengths()));
+    const auto out_n_k_ho_wo_desc =
+        make_dynamic_naive_tensor_descriptor_packed_v2(to_multi_index(OutDesc::GetLengths()));
+    const auto conv_strides   = to_multi_index(ConvStrides{});
+    const auto conv_dilations = to_multi_index(ConvDilations{});
+    const auto in_left_pads   = to_multi_index(InLeftPads{});
+    const auto in_right_pads  = to_multi_index(InRightPads{});
+#else
+    // compile-time variables
+    constexpr auto in_n_c0_hi_wi_desc = make_dynamic_naive_tensor_descriptor_packed_v2(
+        make_tuple(Number<N>{}, Number<C0>{}, Number<Hi>{}, Number<Wi>{}));
+    constexpr auto wei_k_c0_y_x_desc = make_dynamic_naive_tensor_descriptor_packed_v2(
+        make_tuple(Number<K>{}, Number<C0>{}, Number<Y>{}, Number<X>{}));
+    constexpr auto out_n_k0_ho_wo_desc = make_dynamic_naive_tensor_descriptor_packed_v2(
+        make_tuple(Number<N>{}, Number<K0>{}, Number<Ho>{}, Number<Wo>{}));
+    // constexpr auto conv_strides   = sequence_to_tuple_of_number(ConvStrides{});
+    // constexpr auto conv_dilations = sequence_to_tuple_of_number(ConvDilations{});
+    // constexpr auto in_left_pads   = sequence_to_tuple_of_number(InLeftPads{});
+    // constexpr auto in_right_pads  = sequence_to_tuple_of_number(InRightPads{});
+#endif
+    Tensor<TInWei> in_n_c0_hi_wi_c1(
+        HostTensorDescriptor(std::initializer_list<index_t>{N, C0, Hi, Wi, C1}));
+    Tensor<TInWei> wei_k_c0_y_x_c1(
+        HostTensorDescriptor(std::initializer_list<index_t>{K, C0, Y, X, C1}));
+    Tensor<TOut> out_n_k0_ho_wo_k1(
+        HostTensorDescriptor(std::initializer_list<index_t>{N, K0, Ho, Wo, K1}));
+    auto f_nchw2nc0hwc1 = [&](auto n, auto hi, auto wi, auto c) {
+        in_n_c0_hi_wi_c1(n, c / InWeiVectorSize, hi, wi, c % InWeiVectorSize) =
+            in_n_c_hi_wi(n, c, hi, wi);
+    };
+    auto f_kcyx2kc0yxc1 = [&](auto k, auto y, auto x, auto c) {
+        wei_k_c0_y_x_c1(k, c / InWeiVectorSize, y, x, c % InWeiVectorSize) =
+            wei_k_c_y_x(k, c, y, x);
+    };
+    make_ParallelTensorFunctor(f_nchw2nc0hwc1, N, Hi, Wi, C)();
+    make_ParallelTensorFunctor(f_kcyx2kc0yxc1, K, Y, X, C)();
+    in_n_c_hi_wi_device_buf.ToDevice(in_n_c0_hi_wi_c1.mData.data());
+    wei_k_c_y_x_device_buf.ToDevice(wei_k_c0_y_x_c1.mData.data());
+    // cdata = 64, BlockSize = 64, 16x8x32x4
+    constexpr index_t BlockSize = 64;
+    constexpr index_t KPerBlock  = K;
+    constexpr index_t HoPerBlock = 8;
+    constexpr index_t WoPerBlock = 32;
+    constexpr index_t EPerBlock  = C0;
+    constexpr index_t KPerThread  = KPerBlock;
+    constexpr index_t HoPerThread = 2;
+    constexpr index_t WoPerThread = 2;
+    constexpr index_t EPerThread  = EPerBlock;
+    using ABlockTransferThreadSliceLengths_E_K   = Sequence<9, 1>;
+    using ABlockTransferThreadClusterLengths_E_K = Sequence<EPerBlock, KPerBlock>;
+    constexpr index_t ABlockTransferSrcScalarPerVector_E = 1;
+    constexpr index_t ABlockTransferDstScalarPerVector_K = 1;
+    constexpr index_t BThreadTransferSrcScalarPerVector_W = 1;
+    constexpr index_t CThreadTransferDstScalarPerVector_W = 1;
+    std::cerr << "conv_fp16_nchwc" << C1 << "_n" << N << "c" << C << "h" << Hi << "w" << Wi << "-k"
+              << K << "c" << C << "y" << Y << "x" << X << "-u" << conv_strides[I0] << "v"
+              << conv_strides[I1] << "l" << conv_dilations[I0] << "j" << conv_dilations[I1] << "q"
+              << in_left_pads[I0] << "p" << in_right_pads[I0] << std::endl;
+    constexpr auto conv_driver =
+#if 0
+        DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
+#else
+        DriverStaticConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
+#endif
+        <BlockSize,
+         typename vector_type<TInWei, InWeiVectorSize>::type,
+         TAcc,
+         typename vector_type<TOut, InWeiVectorSize>::type,
+         KPerBlock,
+         HoPerBlock,
+         WoPerBlock,
+         EPerBlock,
+         KPerThread,
+         HoPerThread,
+         WoPerThread,
+         EPerThread,
+         ABlockTransferThreadSliceLengths_E_K,
+         ABlockTransferThreadClusterLengths_E_K,
+         ABlockTransferSrcScalarPerVector_E,
+         ABlockTransferDstScalarPerVector_K,
+         BThreadTransferSrcScalarPerVector_W,
+         CThreadTransferDstScalarPerVector_W>{};
+    conv_driver.Run(wei_k_c0_y_x_desc,
+                    in_n_c0_hi_wi_desc,
+                    out_n_k0_ho_wo_desc,
+                    conv_strides,
+                    conv_dilations,
+                    in_left_pads,
+                    in_right_pads,
+                    static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
+                        wei_k_c_y_x_device_buf.GetDeviceBuffer()),
+                    static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
+                        in_n_c_hi_wi_device_buf.GetDeviceBuffer()),
+                    static_cast<typename vector_type<TOut, InWeiVectorSize>::type*>(
+                        out_n_k_ho_wo_device_buf.GetDeviceBuffer()));
+    out_n_k_ho_wo_device_buf.FromDevice(out_n_k0_ho_wo_k1.mData.data());
+#if 1
+    auto f_nk0hwk1_to_nkhw = [&](auto n, auto k, auto ho, auto wo) {
+        out_n_k_ho_wo(n, k, ho, wo) =
+            out_n_k0_ho_wo_k1(n, k / InWeiVectorSize, ho, wo, k % InWeiVectorSize);
+    };
+    make_ParallelTensorFunctor(f_nk0hwk1_to_nkhw, N, K, Ho, Wo)();
+#endif
+}
--- a/script/cmake-rocm.sh
+++ b/script/cmake-rocm.sh
@@ -3,13 +3,13 @@ rm -f CMakeCache.txt
 rm -f *.cmake
 rm -rf CMakeFiles
-MY_PROJECT_SOURCE=../../../
+MY_PROJECT_SOURCE=../
 MY_PROJECT_INSTALL=../install.dir
 cmake                                                                                                                              \
 -D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL}                                                                                      \
 -D CMAKE_BUILD_TYPE=Release                                                                                                        \
-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx908 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -gline-tables-only -save-temps=$PWD"           \
+-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx1030 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -gline-tables-only -save-temps=$PWD"           \
 -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc                                                                                          \
 -D CMAKE_PREFIX_PATH=/opt/rocm                                                                                                     \
 -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON                                                                                                  \

--- a/script/run.sh
+++ b/script/run.sh
@@ -13,8 +13,8 @@
 #export OLC_DEBUG_SAVE_TEMP_DIR=1
 make -j conv_fwd_driver_offline
- make -j conv_bwd_driver_offline
+ #make -j conv_bwd_driver_offline
- make -j conv_fwd_driver_online
+ #make -j conv_fwd_driver_online
 #rm -rf /root/_hip_binary_kernels_/
 #rm -rf /tmp/olCompile*
@@ -26,11 +26,13 @@ INIT=$4
 LOG=$5
 REPEAT=$6
+./host/driver_offline/conv_fwd_driver_offline  $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT
 ################################################ layout  algo  verify  init  log  repeat  N__ K___ C___ Y X Hi_ Wi__ Strides Dilations LeftPads RightPads
 #./host/driver_offline/conv_fwd_driver_offline  $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT  128  128  192 3 3  71   71     2 2       1 1      1 1       1 1
 #./host/driver_offline/conv_fwd_driver_offline  $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT  128  256  192 3 3  71   71     2 2       1 1      1 1       1 1
 #./host/driver_offline/conv_fwd_driver_offline  $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT  128  256 1024 1 7  17   17     1 1       1 1      0 3       0 3
- ./host/driver_offline/conv_fwd_driver_offline  $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT  256  256  256 3 3  14   14     1 1       1 1      1 1       1 1
+#./host/driver_offline/conv_fwd_driver_offline  $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT  256  256  256 3 3  14   14     1 1       1 1      1 1       1 1
 #./host/driver_offline/conv_fwd_driver_offline  $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT  128  128  128 3 3  14   14     1 1       1 1      1 1       1 1
 #./host/driver_offline/conv_fwd_driver_offline  $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT  256  512  512 3 3   7    7     1 1       1 1      1 1       1 1