Commit f6ec737c authored by Chao Liu's avatar Chao Liu

changed blockwise and threadwise gemm to use new tensor descriptor

parent a7c587ee
#ifndef CK_DUMMY_DYNAMIC_TRANSFORM_HPP
#define CK_DUMMY_DYNAMIC_TRANSFORM_HPP
#include "common_header.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
namespace ck {
template <typename... Wei, typename... In, typename... Out>
__host__ __device__ constexpr auto
map_convolution_into_gemm_fwd_v4r4(const DynamicTensorDescriptor<Wei...>& wei_k_c_y_x_global_desc,
const DynamicTensorDescriptor<In...>& in_n_c_hi_wi_global_desc,
const DynamicTensorDescriptor<Out...>& out_n_k_ho_wo_global_desc,
const MultiIndex<2> conv_strides,
const MultiIndex<2> conv_dilations,
const MultiIndex<2> in_left_pads,
const MultiIndex<2> in_right_pads)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
const index_t N = in_n_c_hi_wi_global_desc.GetLength(I0);
const index_t C = in_n_c_hi_wi_global_desc.GetLength(I1);
const index_t K = out_n_k_ho_wo_global_desc.GetLength(I1);
const index_t Y = wei_k_c_y_x_global_desc.GetLength(I2);
const index_t X = wei_k_c_y_x_global_desc.GetLength(I3);
const index_t Hi = in_n_c_hi_wi_global_desc.GetLength(I2);
const index_t Wi = in_n_c_hi_wi_global_desc.GetLength(I3);
const index_t Ho = out_n_k_ho_wo_global_desc.GetLength(I2);
const index_t Wo = out_n_k_ho_wo_global_desc.GetLength(I3);
const index_t ConvStrideH = conv_strides[I0];
const index_t ConvStrideW = conv_strides[I1];
const index_t ConvDilationH = conv_dilations[I0];
const index_t ConvDilationW = conv_dilations[I1];
const index_t InLeftPadH = in_left_pads[I0];
const index_t InLeftPadW = in_left_pads[I1];
const index_t InRightPadH = in_right_pads[I0];
const index_t InRightPadW = in_right_pads[I1];
// input tensor
const auto in_n_c_hip_wip_global_desc = transform_dynamic_tensor_descriptor(
transform_dynamic_tensor_descriptor(
in_n_c_hi_wi_global_desc,
make_tuple(DynamicPassThrough{N},
DynamicPassThrough{C},
DynamicLeftPad{Hi, InLeftPadH},
DynamicLeftPad{Wi, InLeftPadW}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})),
make_tuple(DynamicPassThrough{N},
DynamicPassThrough{C},
DynamicRightPad{Hi + InLeftPadH, InRightPadH},
DynamicRightPad{Wi + InLeftPadW, InRightPadW}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
const index_t Hip = in_n_c_hip_wip_global_desc.GetLength(I2);
const index_t Wip = in_n_c_hip_wip_global_desc.GetLength(I3);
const auto in_n_c_y_ho_x_wo_global_desc = transform_dynamic_tensor_descriptor(
in_n_c_hip_wip_global_desc,
make_tuple(
DynamicPassThrough{N},
DynamicPassThrough{C},
DynamicEmbed<2>{make_multi_index(Y, Ho), make_multi_index(ConvDilationH, ConvStrideH)},
DynamicEmbed<2>{make_multi_index(X, Wo), make_multi_index(ConvDilationW, ConvStrideW)}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));
const auto in_gemmktotal_gemmn_global_desc = transform_dynamic_tensor_descriptor(
in_n_c_y_ho_x_wo_global_desc,
make_tuple(DynamicMerge<3>{make_multi_index(C, Y, X)},
DynamicMerge<3>{make_multi_index(N, Ho, Wo)}),
make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
const index_t gemmktotal = in_gemmktotal_gemmn_global_desc.GetLength(I0);
const index_t gemmn = in_gemmktotal_gemmn_global_desc.GetLength(I1);
constexpr index_t GemmKPack = 8;
const index_t gemmk = gemmktotal / GemmKPack;
const auto in_gemmk_gemmn_gemmkpack_global_desc = transform_dynamic_tensor_descriptor(
in_gemmktotal_gemmn_global_desc,
make_tuple(DynamicUnMerge<2>{make_multi_index(gemmk, GemmKPack)},
DynamicPassThrough{gemmn}),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
static_assert(decltype(in_gemmk_gemmn_gemmkpack_global_desc)::GetNumOfDimension() == 3,
"wrong!");
return make_tuple(in_gemmk_gemmn_gemmkpack_global_desc);
}
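// Worked example (hypothetical sizes, for illustration only): with N = 1, C = 16,
// Hi = Wi = 28, K = 32, Y = X = 3, unit strides/dilations and 1-element pads on each side,
// the padded input is 30 x 30 and Ho = Wo = 28. The transforms above then give
// GemmKTotal = C * Y * X = 144 and GemmN = N * Ho * Wo = 784; with GemmKPack = 8 the final
// descriptor has lengths (GemmK, GemmN, GemmKPack) = (18, 784, 8), i.e. the 3 dimensions
// checked by the static_assert.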
#if 0
template <typename... Wei, typename... In, typename... Out>
__host__ __device__ constexpr auto map_convolution_into_gemm_bwd_v4r1(
const DynamicTensorDescriptor<Wei...>& wei_k_c_y_x_global_desc,
const DynamicTensorDescriptor<In...>& in_n_c_hi_wi_global_desc,
const DynamicTensorDescriptor<Out...>& out_n_k_ho_wo_global_desc,
const MultiIndex<2> conv_strides,
const MultiIndex<2> conv_dilations,
const MultiIndex<2> in_left_pads,
const MultiIndex<2> in_right_pads)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
const index_t N = in_n_c_hi_wi_global_desc.GetLength(I0);
const index_t C = in_n_c_hi_wi_global_desc.GetLength(I1);
const index_t K = out_n_k_ho_wo_global_desc.GetLength(I1);
const index_t Y = wei_k_c_y_x_global_desc.GetLength(I2);
const index_t X = wei_k_c_y_x_global_desc.GetLength(I3);
const index_t Hi = in_n_c_hi_wi_global_desc.GetLength(I2);
const index_t Wi = in_n_c_hi_wi_global_desc.GetLength(I3);
const index_t Ho = out_n_k_ho_wo_global_desc.GetLength(I2);
const index_t Wo = out_n_k_ho_wo_global_desc.GetLength(I3);
const index_t ConvStrideH = conv_strides[I0];
const index_t ConvStrideW = conv_strides[I1];
const index_t ConvDilationH = conv_dilations[I0];
const index_t ConvDilationW = conv_dilations[I1];
const index_t InLeftPadH = in_left_pads[I0];
const index_t InLeftPadW = in_left_pads[I1];
const index_t InRightPadH = in_right_pads[I0];
const index_t InRightPadW = in_right_pads[I1];
#if !CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_OUTPUT_SKIP_OUT_OF_BOUND_CHECK
constexpr bool out_skip_out_of_bound_check = false;
#else
constexpr bool out_skip_out_of_bound_check = true;
#endif
constexpr auto out_n_k_ydot_htilda_xdot_wtilda_global_desc = transform_tensor_descriptor(
out_n_k_ho_wo_global_desc,
make_tuple(PassThrough{N},
PassThrough{K},
Embed<2>{make_multi_index(YDot, HTilda), make_multi_index(-ConvDilationH / GcdStrideDilationH, 1)},
Embed<2>{make_multi_index(XDot, WTilda), make_multi_index(-ConvDilationW / GcdStrideDilationW, 1)}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));
constexpr auto out_n_k_ydot_htildaslice_xdot_wtildaslice_global_desc =
transform_tensor_descriptor(
out_n_k_ydot_htilda_xdot_wtilda_global_desc,
make_tuple(PassThrough{N},
PassThrough{K},
PassThrough{YDot},
PassThrough{XDot},
Slice<Sequence<HTilda, WTilda>,
Sequence<iHTildaLeft, iWTildaLeft>,
Sequence<iHTildaRight, iWTildaRight>>{}),
make_tuple(
Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<4>{}, Sequence<3, 5>{}),
make_tuple(
Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<4>{}, Sequence<3, 5>{}));
constexpr auto out_n_k_ydotslice_htildaslice_xdotslice_wtildaslice_global_desc =
transform_tensor_descriptor(
out_n_k_ydot_htildaslice_xdot_wtildaslice_global_desc,
make_tuple(
PassThrough<N>{},
PassThrough<K>{},
PassThrough<HTildaSlice>{},
PassThrough<WTildaSlice>{},
Slice<Sequence<YDot, XDot>, Sequence<0, 0>, Sequence<YDotSlice, XDotSlice>>{}),
make_tuple(
Sequence<0>{}, Sequence<1>{}, Sequence<3>{}, Sequence<5>{}, Sequence<2, 4>{}),
make_tuple(
Sequence<0>{}, Sequence<1>{}, Sequence<3>{}, Sequence<5>{}, Sequence<2, 4>{}));
constexpr auto out_gemmk_gemmn_global_desc = transform_tensor_descriptor(
out_n_k_ydotslice_htildaslice_xdotslice_wtildaslice_global_desc,
make_tuple(Merge<Sequence<K, YDotSlice, XDotSlice>>{},
Merge<Sequence<N, HTildaSlice, WTildaSlice>>{}),
make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
return make_tuple(out_gemmk_gemmn_global_desc);
}
#endif
template <index_t BlockSize>
struct DummyDynamicTransform_1
{
template <typename WeiDesc, typename InDesc, typename OutDesc>
__device__ void Run_1(index_t* const __restrict__ p_wei_global,
float* const __restrict__ p_in_global,
float* const __restrict__ p_out_global,
const WeiDesc wei_k_c_y_x_global_desc,
const InDesc in_n_c_hi_wi_global_desc,
const OutDesc out_n_k_ho_wo_global_desc,
const MultiIndex<2> conv_strides,
const MultiIndex<2> conv_dilations,
const MultiIndex<2> in_left_pads,
const MultiIndex<2> in_right_pads) const
{
const auto transformed_tensor_descs =
map_convolution_into_gemm(move(wei_k_c_y_x_global_desc),
move(in_n_c_hi_wi_global_desc),
move(out_n_k_ho_wo_global_desc),
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads);
const auto in_gemmk_gemmn_global_desc = transformed_tensor_descs.At(Number<0>{});
MultiIndex<2> idx;
// initialize idx
static_for<0, 2, 1>{}([&](auto i) { idx(i) = p_wei_global[get_thread_local_1d_id() + i]; });
const index_t niter = p_wei_global[10];
auto in_gemmk_gemmn_coord = make_dynamic_tensor_coordinate(in_gemmk_gemmn_global_desc, idx);
const auto in_gemmk_gemmn_coord_iterator = make_dynamic_tensor_coordinate_iterator(
in_gemmk_gemmn_global_desc, make_multi_index(1, 0));
for(index_t iter = 0; iter < niter; ++iter)
{
move_dynamic_tensor_coordinate(
in_gemmk_gemmn_global_desc, in_gemmk_gemmn_coord, in_gemmk_gemmn_coord_iterator);
// write
float value = 1;
transfer_data<float,
1,
AddressSpace::Vgpr,
AddressSpace::Global,
InMemoryDataOperation::Set,
1,
1>(&value,
0,
true,
1,
p_out_global,
in_gemmk_gemmn_coord.GetOffset(),
#if 1
coordinate_has_valid_offset_assuming_visible_index_is_valid(
in_gemmk_gemmn_global_desc, in_gemmk_gemmn_coord),
#else
true,
#endif
in_gemmk_gemmn_global_desc.GetElementSpaceSize());
}
}
template <typename WeiDesc, typename InDesc, typename OutDesc>
__device__ void Run_2(index_t* const __restrict__ p_wei_global,
float* const __restrict__ p_in_global,
float* const __restrict__ p_out_global,
const WeiDesc wei_k_c_y_x_global_desc,
const InDesc in_n_c_hi_wi_global_desc,
const OutDesc out_n_k_ho_wo_global_desc,
const MultiIndex<2> conv_strides,
const MultiIndex<2> conv_dilations,
const MultiIndex<2> in_left_pads,
const MultiIndex<2> in_right_pads) const
{
constexpr auto i0 = Number<0>{};
constexpr auto i1 = Number<1>{};
constexpr auto i2 = Number<2>{};
constexpr auto i3 = Number<3>{};
const index_t N = in_n_c_hi_wi_global_desc.GetLength(i0);
const index_t C = in_n_c_hi_wi_global_desc.GetLength(i1);
const index_t K = out_n_k_ho_wo_global_desc.GetLength(i1);
const index_t Y = wei_k_c_y_x_global_desc.GetLength(i2);
const index_t X = wei_k_c_y_x_global_desc.GetLength(i3);
const index_t Hi = in_n_c_hi_wi_global_desc.GetLength(i2);
const index_t Wi = in_n_c_hi_wi_global_desc.GetLength(i3);
const index_t Ho = out_n_k_ho_wo_global_desc.GetLength(i2);
const index_t Wo = out_n_k_ho_wo_global_desc.GetLength(i3);
const index_t ConvStrideH = conv_strides[i0];
const index_t ConvStrideW = conv_strides[i1];
const index_t ConvDilationH = conv_dilations[i0];
const index_t ConvDilationW = conv_dilations[i1];
const index_t InLeftPadH = in_left_pads[i0];
const index_t InLeftPadW = in_left_pads[i1];
const index_t InRightPadH = in_right_pads[i0];
const index_t InRightPadW = in_right_pads[i1];
#if 0
const auto in_n_c_hip_wip_global_desc = transform_dynamic_tensor_descriptor(
move(in_n_c_hi_wi_global_desc),
make_tuple(DynamicPassThrough{N},
DynamicPassThrough{C},
DynamicPassThrough{Hi},
DynamicPassThrough{Wi}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
#elif 0
const auto in_n_c_hip_wip_global_desc = transform_dynamic_tensor_descriptor(
move(in_n_c_hi_wi_global_desc),
make_tuple(DynamicPassThrough{N},
DynamicPassThrough{C},
DynamicLeftPad{Hi, InLeftPadH},
DynamicLeftPad{Wi, InLeftPadW}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
#else
const auto in_n_c_hip_wip_global_desc = transform_dynamic_tensor_descriptor(
transform_dynamic_tensor_descriptor(
move(in_n_c_hi_wi_global_desc),
make_tuple(DynamicPassThrough{N},
DynamicPassThrough{C},
DynamicLeftPad{Hi, InLeftPadH},
DynamicLeftPad{Wi, InLeftPadW}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})),
make_tuple(DynamicPassThrough{N},
DynamicPassThrough{C},
DynamicRightPad{Hi + InLeftPadH, InRightPadH},
DynamicRightPad{Wi + InLeftPadW, InRightPadW}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
#endif
MultiIndex<4> idx;
// initialize idx
static_for<0, 4, 1>{}([&](auto i) { idx(i) = p_wei_global[get_thread_local_1d_id() + i]; });
#if 1
const index_t niter = p_wei_global[10];
auto in_coord = make_dynamic_tensor_coordinate(in_n_c_hip_wip_global_desc, idx);
const auto in_coord_iterator = make_dynamic_tensor_coordinate_iterator(
in_n_c_hip_wip_global_desc, make_multi_index(1, 0, 0, 0));
for(index_t iter = 0; iter < niter; ++iter)
{
move_dynamic_tensor_coordinate(in_n_c_hip_wip_global_desc, in_coord, in_coord_iterator);
// write
float value = 1;
transfer_data<float,
1,
AddressSpace::Vgpr,
AddressSpace::Global,
InMemoryDataOperation::Set,
1,
1>(&value,
0,
true,
1,
p_out_global,
in_coord.GetOffset(),
coordinate_has_valid_offset_assuming_visible_index_is_valid(
in_n_c_hip_wip_global_desc, in_coord),
in_n_c_hip_wip_global_desc.GetElementSpaceSize());
}
#else
// write
// auto in_coord = make_dynamic_tensor_coordinate(in_n_c_hi_wi_global_desc, idx);
p_out_global[in_n_c_hip_wip_global_desc.CalculateOffset(idx)] = 1;
#endif
}
template <typename WeiDesc, typename InDesc, typename OutDesc>
__device__ void Run(index_t* const __restrict__ p_wei_global,
float* const __restrict__ p_in_global,
float* const __restrict__ p_out_global,
const WeiDesc wei_k_c_y_x_global_desc,
const InDesc in_n_c_hi_wi_global_desc,
const OutDesc out_n_k_ho_wo_global_desc,
const MultiIndex<2> conv_strides,
const MultiIndex<2> conv_dilations,
const MultiIndex<2> in_left_pads,
const MultiIndex<2> in_right_pads) const
{
Run_2(p_wei_global,
p_in_global,
p_out_global,
wei_k_c_y_x_global_desc,
in_n_c_hi_wi_global_desc,
out_n_k_ho_wo_global_desc,
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads);
}
};
template <index_t BlockSize>
struct DummyDynamicTransform_fwd_v4r4
{
template <typename TransformInDesc>
__device__ void Run(index_t* const __restrict__ p_wei_global,
float* const __restrict__ p_in_global,
float* const __restrict__ p_out_global,
const TransformInDesc in_gemmk_gemmn_gemmkpack_global_desc) const
{
MultiIndex<3> idx;
// initialize idx
static_for<0, 3, 1>{}([&](auto i) { idx(i) = p_wei_global[get_thread_local_1d_id() + i]; });
const index_t niter = p_wei_global[10];
auto in_gemmk_gemmn_gemmkpack_coord =
make_dynamic_tensor_coordinate(in_gemmk_gemmn_gemmkpack_global_desc, idx);
const auto in_gemmk_gemmn_gemmkpack_coord_iterator_0_0_1 =
make_dynamic_tensor_coordinate_iterator(in_gemmk_gemmn_gemmkpack_global_desc,
make_multi_index(0, 0, 1));
const auto in_gemmk_gemmn_gemmkpack_coord_iterator_0_1_0 =
make_dynamic_tensor_coordinate_iterator(in_gemmk_gemmn_gemmkpack_global_desc,
make_multi_index(0, 1, 0));
const auto in_gemmk_gemmn_gemmkpack_coord_iterator_1_0_0 =
make_dynamic_tensor_coordinate_iterator(in_gemmk_gemmn_gemmkpack_global_desc,
make_multi_index(1, 0, 0));
// move (0, 0, 1)
for(index_t iter = 0; iter < niter; ++iter)
{
move_dynamic_tensor_coordinate(in_gemmk_gemmn_gemmkpack_global_desc,
in_gemmk_gemmn_gemmkpack_coord,
in_gemmk_gemmn_gemmkpack_coord_iterator_0_0_1);
// write
float value = 1;
transfer_data<float,
1,
AddressSpace::Vgpr,
AddressSpace::Global,
InMemoryDataOperation::Set,
1,
1>(
&value,
0,
true,
1,
p_out_global,
in_gemmk_gemmn_gemmkpack_coord.GetOffset(),
#if 1
coordinate_has_valid_offset_assuming_visible_index_is_valid(
in_gemmk_gemmn_gemmkpack_global_desc, in_gemmk_gemmn_gemmkpack_coord),
#else
true,
#endif
in_gemmk_gemmn_gemmkpack_global_desc.GetElementSpaceSize());
}
// move (0, 1, 0)
for(index_t iter = 0; iter < niter; ++iter)
{
move_dynamic_tensor_coordinate(in_gemmk_gemmn_gemmkpack_global_desc,
in_gemmk_gemmn_gemmkpack_coord,
in_gemmk_gemmn_gemmkpack_coord_iterator_0_1_0);
// write
float value = 1;
transfer_data<float,
1,
AddressSpace::Vgpr,
AddressSpace::Global,
InMemoryDataOperation::Set,
1,
1>(
&value,
0,
true,
1,
p_out_global,
in_gemmk_gemmn_gemmkpack_coord.GetOffset(),
#if 1
coordinate_has_valid_offset_assuming_visible_index_is_valid(
in_gemmk_gemmn_gemmkpack_global_desc, in_gemmk_gemmn_gemmkpack_coord),
#else
true,
#endif
in_gemmk_gemmn_gemmkpack_global_desc.GetElementSpaceSize());
}
// move (1, 0, 0)
for(index_t iter = 0; iter < niter; ++iter)
{
move_dynamic_tensor_coordinate(in_gemmk_gemmn_gemmkpack_global_desc,
in_gemmk_gemmn_gemmkpack_coord,
in_gemmk_gemmn_gemmkpack_coord_iterator_1_0_0);
// write
float value = 1;
transfer_data<float,
1,
AddressSpace::Vgpr,
AddressSpace::Global,
InMemoryDataOperation::Set,
1,
1>(
&value,
0,
true,
1,
p_out_global,
in_gemmk_gemmn_gemmkpack_coord.GetOffset(),
#if 1
coordinate_has_valid_offset_assuming_visible_index_is_valid(
in_gemmk_gemmn_gemmkpack_global_desc, in_gemmk_gemmn_gemmkpack_coord),
#else
true,
#endif
in_gemmk_gemmn_gemmkpack_global_desc.GetElementSpaceSize());
}
}
};
} // namespace ck
#endif
#ifndef CK_DUMMY_DYNAMIC_TRANSFORM_V1_HPP
#define CK_DUMMY_DYNAMIC_TRANSFORM_V1_HPP
#include "common_header.hpp"
#include "dynamic_tensor_descriptor_v1.hpp"
#include "dynamic_tensor_descriptor_helper_v1.hpp"
#include "dynamic_tensor_coordinate_v1.hpp"
namespace ck {
template <typename WeiDesc, typename InDesc, typename OutDesc>
__host__ __device__ constexpr auto
map_convolution_into_gemm_v1(const WeiDesc& wei_k_c_y_x_global_desc,
const InDesc& in_n_c_hi_wi_global_desc,
const OutDesc& out_n_k_ho_wo_global_desc,
const MultiIndex<2>& conv_strides,
const MultiIndex<2>& conv_dilations,
const MultiIndex<2>& in_left_pads,
const MultiIndex<2>& in_right_pads)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
const index_t N = in_n_c_hi_wi_global_desc.GetLength(I0);
const index_t C = in_n_c_hi_wi_global_desc.GetLength(I1);
const index_t K = out_n_k_ho_wo_global_desc.GetLength(I1);
const index_t Y = wei_k_c_y_x_global_desc.GetLength(I2);
const index_t X = wei_k_c_y_x_global_desc.GetLength(I3);
const index_t Hi = in_n_c_hi_wi_global_desc.GetLength(I2);
const index_t Wi = in_n_c_hi_wi_global_desc.GetLength(I3);
const index_t Ho = out_n_k_ho_wo_global_desc.GetLength(I2);
const index_t Wo = out_n_k_ho_wo_global_desc.GetLength(I3);
const index_t ConvStrideH = conv_strides[I0];
const index_t ConvStrideW = conv_strides[I1];
const index_t ConvDilationH = conv_dilations[I0];
const index_t ConvDilationW = conv_dilations[I1];
const index_t InLeftPadH = in_left_pads[I0];
const index_t InLeftPadW = in_left_pads[I1];
const index_t InRightPadH = in_right_pads[I0];
const index_t InRightPadW = in_right_pads[I1];
// input tensor
const auto in_n_c_hip_wip_global_desc = transform_dynamic_tensor_descriptor(
transform_dynamic_tensor_descriptor(
in_n_c_hi_wi_global_desc,
make_tuple(DynamicPassThrough{N},
DynamicPassThrough{C},
DynamicLeftPad{Hi, InLeftPadH},
DynamicLeftPad{Wi, InLeftPadW}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})),
make_tuple(DynamicPassThrough{N},
DynamicPassThrough{C},
DynamicRightPad{Hi + InLeftPadH, InRightPadH},
DynamicRightPad{Wi + InLeftPadW, InRightPadW}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
const index_t Hip = in_n_c_hip_wip_global_desc.GetLength(I2);
const index_t Wip = in_n_c_hip_wip_global_desc.GetLength(I3);
const auto in_n_c_y_ho_x_wo_global_desc = transform_dynamic_tensor_descriptor(
in_n_c_hip_wip_global_desc,
make_tuple(
DynamicPassThrough{N},
DynamicPassThrough{C},
DynamicEmbed<2>{make_multi_index(Y, Ho), make_multi_index(ConvDilationH, ConvStrideH)},
DynamicEmbed<2>{make_multi_index(X, Wo), make_multi_index(ConvDilationW, ConvStrideW)}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));
const auto in_gemmk_gemmn_global_desc = transform_dynamic_tensor_descriptor(
in_n_c_y_ho_x_wo_global_desc,
make_tuple(DynamicMerge<3>{make_multi_index(C, Y, X)},
DynamicMerge<3>{make_multi_index(N, Ho, Wo)}),
make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
return make_tuple(in_gemmk_gemmn_global_desc);
}
template <index_t BlockSize>
struct DummyDynamicTransform_v1
{
template <typename WeiDesc, typename InDesc, typename OutDesc>
__device__ void Run_1(index_t* const __restrict__ p_wei_global,
float* const __restrict__ p_in_global,
float* const __restrict__ p_out_global,
const WeiDesc wei_k_c_y_x_global_desc,
const InDesc in_n_c_hi_wi_global_desc,
const OutDesc out_n_k_ho_wo_global_desc,
const MultiIndex<2> conv_strides,
const MultiIndex<2> conv_dilations,
const MultiIndex<2> in_left_pads,
const MultiIndex<2> in_right_pads) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
#if 1
const index_t N = in_n_c_hi_wi_global_desc.GetLength(I0);
const index_t C = in_n_c_hi_wi_global_desc.GetLength(I1);
const index_t K = out_n_k_ho_wo_global_desc.GetLength(I1);
const index_t Y = wei_k_c_y_x_global_desc.GetLength(I2);
const index_t X = wei_k_c_y_x_global_desc.GetLength(I3);
const index_t Hi = in_n_c_hi_wi_global_desc.GetLength(I2);
const index_t Wi = in_n_c_hi_wi_global_desc.GetLength(I3);
const index_t Ho = out_n_k_ho_wo_global_desc.GetLength(I2);
const index_t Wo = out_n_k_ho_wo_global_desc.GetLength(I3);
const index_t ConvStrideH = conv_strides[I0];
const index_t ConvStrideW = conv_strides[I1];
const index_t ConvDilationH = conv_dilations[I0];
const index_t ConvDilationW = conv_dilations[I1];
const index_t InLeftPadH = in_left_pads[I0];
const index_t InLeftPadW = in_left_pads[I1];
const index_t InRightPadH = in_right_pads[I0];
const index_t InRightPadW = in_right_pads[I1];
#else
const index_t N = in_n_c_hi_wi_global_desc.GetLength(I0);
const index_t C = in_n_c_hi_wi_global_desc.GetLength(I1);
const index_t Y = 3;
const index_t X = 3;
const index_t Hi = in_n_c_hi_wi_global_desc.GetLength(I2);
const index_t Wi = in_n_c_hi_wi_global_desc.GetLength(I3);
const index_t ConvStrideH = conv_strides[I0];
const index_t ConvStrideW = conv_strides[I1];
const index_t ConvDilationH = conv_dilations[I0];
const index_t ConvDilationW = conv_dilations[I1];
const index_t InLeftPadH = in_left_pads[I0];
const index_t InLeftPadW = in_left_pads[I1];
const index_t InRightPadH = in_right_pads[I0];
const index_t InRightPadW = in_right_pads[I1];
#endif
// define transform
// pass through
auto f_lower_idx_diff_passthrough = [](index_t& idx_low_diff, const index_t& idx_up_diff) {
idx_low_diff = idx_up_diff;
};
// pad
auto f_lower_idx_diff_pad = [](index_t& idx_low_diff, const index_t& idx_up_diff) {
idx_low_diff = idx_up_diff;
};
// embed
auto f_lower_idx_diff_embed = [](index_t& idx_low_diff,
const index_t& idx_up_diff_0,
const index_t& idx_up_diff_1,
const index_t coeff0,
const index_t coeff1) {
idx_low_diff = coeff0 * idx_up_diff_0 + coeff1 * idx_up_diff_1;
};
// unmerge
auto f_lower_idx_diff_unmerge = [](index_t& idx_low_diff,
const index_t& idx_up_diff_0,
const index_t& idx_up_diff_1,
const index_t up_length_1) {
idx_low_diff = up_length_1 * idx_up_diff_0 + idx_up_diff_1;
};
// merge
auto f_lower_idx_diff_merge_v1 = [](index_t& idx_low_diff_0,
index_t& idx_low_diff_1,
index_t& idx_low_diff_2,
const index_t& idx_up_diff,
const index_t& idx_low_old_0,
const index_t& idx_low_old_1,
const index_t& idx_low_old_2,
const index_t& idx_low_diff_const_0,
const index_t& idx_low_diff_const_1,
const index_t& idx_low_diff_const_2,
const index_t& idx_low_bound_0,
const index_t& idx_low_bound_1,
const index_t& idx_low_bound_2) {
auto f_carry_arithmetic = [](index_t& idx_low_diff,
index_t& carry,
const index_t& idx_low_old,
const index_t& idx_low_diff_const,
const index_t& idx_low_bound) {
index_t idx_low_tmp = idx_low_old + carry + idx_low_diff_const;
#if 1 // positive
bool do_carry = idx_low_tmp >= idx_low_bound;
index_t idx_low_new = do_carry ? idx_low_tmp - idx_low_bound : idx_low_tmp;
carry = do_carry ? 1 : 0;
#else // negative
bool do_borrow = idx_low_tmp < 0;
index_t idx_low_new = do_borrow ? idx_low_tmp + idx_low_bound : idx_low_tmp;
carry = do_borrow ? -1 : 0;
#endif
idx_low_diff = idx_low_new - idx_low_old;
};
index_t carry = 0;
f_carry_arithmetic(
idx_low_diff_2, carry, idx_low_old_2, idx_low_diff_const_2, idx_low_bound_2);
f_carry_arithmetic(
idx_low_diff_1, carry, idx_low_old_1, idx_low_diff_const_1, idx_low_bound_1);
idx_low_diff_0 = idx_low_diff_const_0 + carry;
};
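// Worked example of the carry arithmetic above (hypothetical numbers, assuming each
// per-dimension constant diff is smaller than its bound so a single carry suffices):
// lower lengths (C, Y, X) = (16, 3, 3), old lower index (c, y, x) = (5, 2, 1), and an
// upper diff of 8 decomposed into constants (0, 2, 2).
//   X: 1 + 0 + 2 = 3 >= 3  -> carry = 1, new x = 0, diff_x = -1
//   Y: 2 + 1 + 2 = 5 >= 3  -> carry = 1, new y = 2, diff_y =  0
//   C: diff_c = 0 + 1 = 1
// New lower index (6, 2, 0); its linearized value 6*9 + 2*3 + 0 = 60 is indeed the old
// value 5*9 + 2*3 + 1 = 52 plus the upper diff of 8.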
auto f_lower_idx_diff_merge_v2 = [](index_t& idx_low_diff_0,
index_t& idx_low_diff_1,
index_t& idx_low_diff_2,
const index_t& idx_up_diff,
const index_t& idx_low_old_0,
const index_t& idx_low_old_1,
const index_t& idx_low_old_2,
const index_t& idx_low_diff_const_0,
const index_t& idx_low_diff_const_1,
const index_t& idx_low_diff_const_2,
const index_t& idx_low_bound_0,
const index_t& idx_low_bound_1,
const index_t& idx_low_bound_2) {
auto f_carry_arithmetic = [](index_t& idx_low_diff,
index_t& carry,
const index_t& idx_low_old,
const index_t& idx_low_diff_const,
const index_t& idx_low_bound) {
index_t idx_low_tmp = idx_low_old + carry;
index_t idx_low_bound_minus_idx_low_diff_const = idx_low_bound - idx_low_diff_const;
#if 1 // positive
bool do_carry = idx_low_tmp >= idx_low_bound_minus_idx_low_diff_const;
idx_low_diff =
do_carry ? -idx_low_bound_minus_idx_low_diff_const : idx_low_diff_const;
idx_low_diff += carry;
carry = do_carry ? 1 : 0;
#else // negative
bool do_borrow = idx_low_tmp < -idx_low_diff_const;
idx_low_diff = do_borrow ? idx_low_diff_const + idx_low_bound : idx_low_diff_const;
idx_low_diff -= carry;
carry = do_borrow ? 1 : carry;
#endif
};
index_t carry = 0;
f_carry_arithmetic(
idx_low_diff_2, carry, idx_low_old_2, idx_low_diff_const_2, idx_low_bound_2);
f_carry_arithmetic(
idx_low_diff_1, carry, idx_low_old_1, idx_low_diff_const_1, idx_low_bound_1);
idx_low_diff_0 = idx_low_diff_const_0 + carry;
};
auto f_lower_idx_diff_merge_v3 = [](index_t& idx_low_diff_0,
index_t& idx_low_diff_1,
index_t& idx_low_diff_2,
const index_t& idx_up_diff,
const index_t& idx_low_old_0,
const index_t& idx_low_old_1,
const index_t& idx_low_old_2,
const index_t& idx_low_diff_const_0,
const index_t& idx_low_diff_const_1,
const index_t& idx_low_diff_const_2,
const index_t& idx_low_bound_0,
const index_t& idx_low_bound_1,
const index_t& idx_low_bound_2) {
auto f_carry_arithmetic = [](index_t& idx_low_diff,
index_t& negative_carry,
const index_t& idx_low_old,
const index_t& idx_low_diff_const,
const index_t& idx_low_bound) {
index_t neg_idx_low_tmp = negative_carry - idx_low_old;
index_t idx_low_diff_const_minus_idx_low_bound = idx_low_diff_const - idx_low_bound;
#if 1
bool do_carry = neg_idx_low_tmp <= idx_low_diff_const_minus_idx_low_bound;
idx_low_diff =
do_carry ? idx_low_diff_const_minus_idx_low_bound : idx_low_diff_const;
idx_low_diff -= negative_carry;
negative_carry = do_carry ? -1 : 0;
#else
bool do_borrow = neg_idx_low_tmp > idx_low_diff_const;
idx_low_diff = do_borrow ? idx_low_diff_const + idx_low_bound : idx_low_diff_const;
idx_low_diff -= negative_carry;
negative_carry = do_borrow ? 1 : negative_carry;
#endif
};
index_t negative_carry = 0;
f_carry_arithmetic(idx_low_diff_2,
negative_carry,
idx_low_old_2,
idx_low_diff_const_2,
idx_low_bound_2);
f_carry_arithmetic(idx_low_diff_1,
negative_carry,
idx_low_old_1,
idx_low_diff_const_1,
idx_low_bound_1);
idx_low_diff_0 = idx_low_diff_const_0 - negative_carry;
};
index_t idx[20];
index_t idx_diff[20];
index_t const_tmp[6];
// populate const
const index_t GemmKPack = p_wei_global[0];
#if 1
for(index_t i = 0; i < 6; ++i)
{
const_tmp[i] = p_wei_global[i + 1];
}
#else
const_tmp[0] = 0;
const_tmp[1] = 2;
const_tmp[2] = 2;
#endif
// initialize idx
for(index_t i = 0; i < 20; ++i)
{
idx[i] = p_wei_global[get_thread_local_1d_id() + i];
}
// offset
idx[0] = idx[1] * in_n_c_hi_wi_global_desc.GetStride(0) +
idx[2] * in_n_c_hi_wi_global_desc.GetStride(1) +
idx[3] * in_n_c_hi_wi_global_desc.GetStride(2) +
idx[4] * in_n_c_hi_wi_global_desc.GetStride(3);
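// Layout of idx[] / idx_diff[] used by the stages below (derived from the transform
// chain; the slot numbering itself is an implementation choice of this dummy kernel):
//   idx[0]      : linear offset into the N-C-Hi-Wi tensor
//   idx[1..4]   : (N, C, Hi, Wi)
//   idx[5..8]   : (N, C, Hip, Wip), i.e. after padding
//   idx[9..14]  : (N, C, Y, Ho, X, Wo)
//   idx[15..16] : (GemmKTotal, GemmN)
//   idx[17..19] : (GemmK, GemmKPack, GemmN)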
// start lowering diff
#pragma unroll 1
for(index_t i = 0; i < 100; ++i)
{
for(index_t j = 0; j < 20; ++j)
{
idx_diff[j] = 0;
}
idx_diff[17] = 8;
// stage 4
// Unmerge(GemmKTotal) => GemmK, GemmKPack
f_lower_idx_diff_unmerge(idx_diff[15], idx_diff[17], idx_diff[18], GemmKPack);
// PassThrough GemmN => GemmN
f_lower_idx_diff_passthrough(idx_diff[16], idx_diff[19]);
// stage 3
// Merge(C, Y, X) => GemmKTotal
#if 0
f_lower_idx_diff_merge_v2(idx_diff[10],
idx_diff[11],
idx_diff[13],
idx_diff[15],
idx[10],
idx[11],
idx[13],
const_tmp[0],
const_tmp[1],
const_tmp[2],
C,
Y,
X);
#elif 1
index_t tmp = idx_diff[15];
const index_t const_tmp_0 = tmp / (Y * X);
tmp -= const_tmp_0 * (Y * X);
const index_t const_tmp_1 = tmp / X;
const index_t const_tmp_2 = tmp - const_tmp_1 * X;
f_lower_idx_diff_merge_v2(idx_diff[10],
idx_diff[11],
idx_diff[13],
idx_diff[15],
idx[10],
idx[11],
idx[13],
const_tmp_0,
const_tmp_1,
const_tmp_2,
C,
Y,
X);
#elif 0
index_t tmp = idx_diff[15];
const index_t const_tmp_0 = __llvm_amdgcn_readfirstlane_i32(tmp / (Y * X));
tmp -= const_tmp_0 * (Y * X);
const index_t const_tmp_1 = __llvm_amdgcn_readfirstlane_i32(tmp / X);
const index_t const_tmp_2 = __llvm_amdgcn_readfirstlane_i32(tmp - const_tmp_1 * X);
f_lower_idx_diff_merge_v2(idx_diff[10],
idx_diff[11],
idx_diff[13],
idx_diff[15],
idx[10],
idx[11],
idx[13],
const_tmp_0,
const_tmp_1,
const_tmp_2,
C,
Y,
X);
#endif
// stage 2
// PassThrough(N) => N
f_lower_idx_diff_passthrough(idx_diff[5], idx_diff[9]);
// PassThrough(C) => C
f_lower_idx_diff_passthrough(idx_diff[6], idx_diff[10]);
// Embed(Hip) => Y, Ho
f_lower_idx_diff_embed(
idx_diff[7], idx_diff[11], idx_diff[12], ConvDilationH, ConvStrideH);
// Embed(Wip) => X, Wo
f_lower_idx_diff_embed(
idx_diff[8], idx_diff[13], idx_diff[14], ConvDilationW, ConvStrideW);
// stage 1
// PassThrough(N) => N
f_lower_idx_diff_passthrough(idx_diff[1], idx_diff[5]);
// PassThrough(C) => C
f_lower_idx_diff_passthrough(idx_diff[2], idx_diff[6]);
// Pad(Hi) => Hip
f_lower_idx_diff_pad(idx_diff[3], idx_diff[7]);
// Pad(Wi) => Wip
f_lower_idx_diff_pad(idx_diff[4], idx_diff[8]);
// stage 0
// offset_diff
idx_diff[0] = idx_diff[1] * in_n_c_hi_wi_global_desc.GetStride(0) +
idx_diff[2] * in_n_c_hi_wi_global_desc.GetStride(1) +
idx_diff[3] * in_n_c_hi_wi_global_desc.GetStride(2) +
idx_diff[4] * in_n_c_hi_wi_global_desc.GetStride(3);
#if 0
// update idx
for(index_t i = 0; i < 20; ++ i)
{
idx[i] += idx_diff[i];
}
// padding check
bool is_in_bound = idx[3] >= 0 && idx[3] < Hi && idx[4] >= 0 && idx[4] < Wi;
#elif 0 // no pad
// offset
idx[0] += idx_diff[0];
// C, Y, X
idx[10] += idx_diff[10];
idx[11] += idx_diff[11];
idx[13] += idx_diff[13];
// padding check
bool is_in_bound = true;
#else // pad
// offset
idx[0] += idx_diff[0];
// C, Y, X
idx[10] += idx_diff[10];
idx[11] += idx_diff[11];
idx[13] += idx_diff[13];
// Hi, Wi
idx[3] += idx_diff[3];
idx[4] += idx_diff[4];
// padding check
bool is_in_bound = idx[3] >= 0 && idx[3] < Hi && idx[4] >= 0 && idx[4] < Wi;
#endif
float value = 1;
transfer_data<float,
1,
AddressSpace::Vgpr,
AddressSpace::Global,
InMemoryDataOperation::Set,
1,
1>(&value,
0,
true,
1,
p_out_global,
idx[0],
is_in_bound,
out_n_k_ho_wo_global_desc.GetElementSpace());
}
}
template <typename WeiDesc, typename InDesc, typename OutDesc>
__device__ void Run_2(index_t* const __restrict__ p_wei_global,
float* const __restrict__ p_in_global,
float* const __restrict__ p_out_global,
const WeiDesc wei_k_c_y_x_global_desc,
const InDesc in_n_c_hi_wi_global_desc,
const OutDesc out_n_k_ho_wo_global_desc,
const MultiIndex<2> conv_strides,
const MultiIndex<2> conv_dilations,
const MultiIndex<2> in_left_pads,
const MultiIndex<2> in_right_pads) const
{
const auto transformed_tensor_descs =
map_convolution_into_gemm_v1(wei_k_c_y_x_global_desc,
in_n_c_hi_wi_global_desc,
out_n_k_ho_wo_global_desc,
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads);
const auto in_gemmk_gemmn_global_desc = transformed_tensor_descs.At(Number<0>{});
MultiIndex<2> idx;
static_for<0, 2, 1>{}([&](auto i) { idx(i) = p_wei_global[get_thread_local_1d_id() + i]; });
const index_t niter = p_wei_global[10];
auto in_gemmk_gemmn_coord =
make_dynamic_tensor_coordinate_v1(in_gemmk_gemmn_global_desc, idx);
for(index_t iter = 0; iter < niter; ++iter)
{
constexpr auto gemmk1_gemmn0 = make_multi_index(1, 0);
in_gemmk_gemmn_coord += gemmk1_gemmn0;
// write
float value = 1;
transfer_data<float,
1,
AddressSpace::Vgpr,
AddressSpace::Global,
InMemoryDataOperation::Set,
1,
1>(&value,
0,
true,
1,
p_out_global,
in_gemmk_gemmn_coord.GetOffset(),
#if 1
in_gemmk_gemmn_coord.IsOffsetValidAssumingUpperIndexIsValid(),
#else
true,
#endif
in_gemmk_gemmn_global_desc.GetElementSpace());
}
}
template <typename WeiDesc, typename InDesc, typename OutDesc>
__device__ void Run(index_t* const __restrict__ p_wei_global,
float* const __restrict__ p_in_global,
float* const __restrict__ p_out_global,
const WeiDesc wei_k_c_y_x_global_desc,
const InDesc in_n_c_hi_wi_global_desc,
const OutDesc out_n_k_ho_wo_global_desc,
const MultiIndex<2> conv_strides,
const MultiIndex<2> conv_dilations,
const MultiIndex<2> in_left_pads,
const MultiIndex<2> in_right_pads) const
{
Run_2(p_wei_global,
p_in_global,
p_out_global,
wei_k_c_y_x_global_desc,
in_n_c_hi_wi_global_desc,
out_n_k_ho_wo_global_desc,
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads);
}
};
} // namespace ck
#endif
#ifndef CK_DUMMY_STATIC_TRANSFORM_HPP
#define CK_DUMMY_STATIC_TRANSFORM_HPP
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
namespace ck {
// GemmM = K
// GemmN = N * Ho * Wo
// GemmK = C * Y * X
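// For example (hypothetical sizes, for illustration only): N = 1, C = 16, K = 32,
// Ho = Wo = 28, Y = X = 3 gives GemmM = 32, GemmN = 784, GemmK = 144, so the implicit
// GEMM treats the weight as a 144 x 32 (GemmK x GemmM) matrix A, the input as a
// 144 x 784 (GemmK x GemmN) matrix B, and computes the 32 x 784 output as transpose(A) * B.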
template <index_t GridSize,
index_t BlockSize,
typename Float,
typename InGlobalDesc,
typename WeiGlobalDesc,
typename OutGlobalDesc,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
struct DummyStaticTransform
{
__device__ void Run(Float* const __restrict__ p_in_global,
Float* const __restrict__ p_wei_global,
Float* const __restrict__ p_out_global) const
{
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto in_n_c_hi_wi_global_desc = InGlobalDesc{};
constexpr auto wei_k_c_y_x_global_desc = WeiGlobalDesc{};
constexpr auto out_n_k_ho_wo_global_desc = OutGlobalDesc{};
constexpr index_t N = in_n_c_hi_wi_global_desc.GetLengths()[0];
constexpr index_t C = in_n_c_hi_wi_global_desc.GetLengths()[1];
constexpr index_t Hi = in_n_c_hi_wi_global_desc.GetLengths()[2];
constexpr index_t Wi = in_n_c_hi_wi_global_desc.GetLengths()[3];
constexpr index_t K = out_n_k_ho_wo_global_desc.GetLengths()[1];
constexpr index_t Ho = out_n_k_ho_wo_global_desc.GetLengths()[2];
constexpr index_t Wo = out_n_k_ho_wo_global_desc.GetLengths()[3];
constexpr index_t Y = wei_k_c_y_x_global_desc.GetLengths()[2];
constexpr index_t X = wei_k_c_y_x_global_desc.GetLengths()[3];
constexpr index_t ConvStrideH = ConvStrides{}[0];
constexpr index_t ConvStrideW = ConvStrides{}[1];
constexpr index_t ConvDilationH = ConvDilations{}[0];
constexpr index_t ConvDilationW = ConvDilations{}[1];
// weight tensor
constexpr auto wei_gemmk_gemmm_global_desc = reorder_tensor_descriptor_given_upper2lower(
unfold_tensor_descriptor(wei_k_c_y_x_global_desc, I1, I3), Sequence<1, 0>{});
// input tensor
constexpr auto in_n_c_hip_wip_global_desc = transform_tensor_descriptor(
in_n_c_hi_wi_global_desc,
make_tuple(PassThrough<N>{},
PassThrough<C>{},
Pad<Sequence<Hi, Wi>, InLeftPads, InRightPads>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}));
constexpr index_t Hip = in_n_c_hip_wip_global_desc.GetLengths()[2];
constexpr index_t Wip = in_n_c_hip_wip_global_desc.GetLengths()[3];
constexpr auto in_n_c_y_ho_x_wo_global_desc = transform_tensor_descriptor(
in_n_c_hip_wip_global_desc,
make_tuple(PassThrough<N>{},
PassThrough<C>{},
Embed<Hip, Sequence<Y, Ho>, Sequence<ConvDilationH, ConvStrideH, 0>>{},
Embed<Wip, Sequence<X, Wo>, Sequence<ConvDilationW, ConvStrideW, 0>>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));
constexpr auto in_gemmk_gemmn_global_desc = transform_tensor_descriptor(
in_n_c_y_ho_x_wo_global_desc,
make_tuple(Merge<Sequence<C, Y, X>>{}, Merge<Sequence<N, Ho, Wo>>{}),
make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
// output tensor
constexpr auto out_gemmm_gemmn_global_desc =
transform_tensor_descriptor(unfold_tensor_descriptor(out_n_k_ho_wo_global_desc, I2, I3),
make_tuple(PassThrough<K>{}, Merge<Sequence<N, Ho * Wo>>{}),
make_tuple(Sequence<1>{}, Sequence<0, 2>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
// input
const index_t k0 = p_in_global[get_thread_local_1d_id()];
const index_t n0 = p_in_global[get_thread_local_1d_id()];
auto coord = typename TensorCoordinate<decltype(in_gemmk_gemmn_global_desc)>::type(k0, n0);
#pragma unroll 1
for(index_t k = 0; k < 100; ++k)
{
coord += make_multi_index(8, 0);
Float value = 1;
transfer_data<Float,
1,
AddressSpace::Vgpr,
AddressSpace::Global,
InMemoryDataOperation::Set,
1,
1>(&value,
0,
true,
1,
p_in_global,
coord.GetOffset(),
coord.IsOffsetValidAssumingUpperIndexIsValid(),
in_gemmk_gemmn_global_desc.GetElementSpace());
}
}
};
} // namespace ck
#endif
@@ -95,28 +95,6 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
level1_n_id * NPerLevel0Cluster + level0_n_id * NPerThreadSubC};
}
#if 0
__device__ static MatrixIndex GetDistanceFromBeginOfThreadMatrixC(index_t m_in_c,
index_t n_in_c)
{
constexpr auto c_thread_mtx = ThreadMatrixC{};
constexpr index_t MPerLevel1Cluster =
MPerThreadSubC * MLevel0ThreadCluster * MLevel1ThreadCluster;
constexpr index_t NPerLevel1Cluster =
NPerThreadSubC * NLevel0ThreadCluster * NLevel1ThreadCluster;
index_t m_repeat = m_in_c / MPerThreadSubC;
index_t n_repeat = n_in_c / NPerThreadSubC;
index_t m_in_sub_c = m_in_c % MPerThreadSubC;
index_t n_in_sub_c = n_in_c % NPerThreadSubC;
return MatrixIndex{m_repeat * MPerLevel1Cluster + m_in_sub_c,
n_repeat * NPerLevel1Cluster + n_in_sub_c};
}
#endif
template <typename FloatA, typename FloatB, typename FloatC>
__device__ void
Run_naive(const FloatA* p_a_block, const FloatB* p_b_block, FloatC* p_c_thread) const
@@ -352,5 +330,366 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
}
};
// blockwise GEMM: C += transpose(A) * B
// A and B are visible to the whole block, C is distributed among the threads
// If the following numbers are powers of 2, the index calculation cost is greatly reduced:
// MPerThreadSubC, NPerThreadSubC, MLevel0ThreadCluster, NLevel0ThreadCluster,
// MLevel1ThreadCluster, NLevel1ThreadCluster
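// Example of the thread-to-C-tile mapping computed by GetBeginOfThreadMatrixC below
// (hypothetical cluster sizes, for illustration only): with MPerThreadSubC =
// NPerThreadSubC = 4 and MLevel0ThreadCluster = NLevel0ThreadCluster =
// MLevel1ThreadCluster = NLevel1ThreadCluster = 4 (BlockSize = 256), thread 100 gets
// level1_id = 100 / 16 = 6 -> (level1_m_id, level1_n_id) = (1, 2) and
// level0_id = 100 % 16 = 4 -> (level0_m_id, level0_n_id) = (1, 0), so its 4x4 C sub-tiles
// start at row 1 * 16 + 1 * 4 = 20 and column 2 * 16 + 0 * 4 = 32 of the block tile.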
template <index_t BlockSize,
typename BlockMatrixA,
typename BlockMatrixB,
typename ThreadMatrixC,
index_t MPerThreadSubC,
index_t NPerThreadSubC,
index_t KPerThreadLoop,
index_t MLevel0ThreadCluster,
index_t NLevel0ThreadCluster,
index_t MLevel1ThreadCluster,
index_t NLevel1ThreadCluster,
index_t ThreadGemmADataPerRead_M,
index_t ThreadGemmBDataPerRead_N>
struct BlockwiseGemm_km_kn_m0m1n0n1_v1
{
struct MatrixIndex
{
index_t row;
index_t col;
};
index_t mMyThreadOffsetA;
index_t mMyThreadOffsetB;
__device__ BlockwiseGemm_km_kn_m0m1n0n1_v1()
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr index_t ThreadPerLevel1Cluster = MLevel0ThreadCluster * NLevel0ThreadCluster *
MLevel1ThreadCluster * NLevel1ThreadCluster;
static_assert(BlockSize == ThreadPerLevel1Cluster, "wrong! wrong blocksize\n");
static_assert(BlockMatrixA{}.GetLength(I0) == BlockMatrixB{}.GetLength(I0),
"wrong! K dimension not consistent\n");
constexpr index_t M = BlockMatrixA{}.GetLength(I1); // A is transposed
constexpr index_t N = BlockMatrixB{}.GetLength(I1);
static_assert(M % (MPerThreadSubC * MLevel0ThreadCluster * MLevel1ThreadCluster) == 0 &&
N % (NPerThreadSubC * NLevel0ThreadCluster * NLevel1ThreadCluster) == 0,
"wrong! Cannot evenly divide work among\n");
static_assert(ThreadMatrixC{}.GetLength(I0) == GetThreadMatrixCLengths()[I0] &&
ThreadMatrixC{}.GetLength(I1) == GetThreadMatrixCLengths()[I1],
"wrong! ThreadMatrixC lengths is wrong");
auto c_thread_mtx_index = GetBeginOfThreadMatrixC(get_thread_local_1d_id());
mMyThreadOffsetA = BlockMatrixA{}.CalculateOffset(make_tuple(0, c_thread_mtx_index.row));
mMyThreadOffsetB = BlockMatrixB{}.CalculateOffset(make_tuple(0, c_thread_mtx_index.col));
}
__device__ static constexpr auto GetThreadMatrixCLengths()
{
constexpr auto I1 = Number<1>{};
constexpr index_t M = BlockMatrixA{}.GetLength(I1); // A is transposed
constexpr index_t N = BlockMatrixB{}.GetLength(I1);
constexpr index_t MRepeat =
M / (MPerThreadSubC * MLevel0ThreadCluster * MLevel1ThreadCluster);
constexpr index_t NRepeat =
N / (NPerThreadSubC * NLevel0ThreadCluster * NLevel1ThreadCluster);
static_assert(M == 128, "wrong!");
static_assert(MPerThreadSubC == 4, "wrong!");
static_assert(MRepeat == 2, "wrong!");
static_assert(NRepeat == 2, "wrong!");
static_assert(NPerThreadSubC == 4, "wrong!");
return Sequence<MRepeat * MPerThreadSubC, NRepeat * NPerThreadSubC>{};
}
__device__ static MatrixIndex GetBeginOfThreadMatrixC(index_t thread_id)
{
constexpr index_t ThreadPerLevel0Cluster = MLevel0ThreadCluster * NLevel0ThreadCluster;
index_t level1_id = thread_id / ThreadPerLevel0Cluster;
index_t level1_m_id = level1_id / NLevel1ThreadCluster;
index_t level1_n_id = level1_id % NLevel1ThreadCluster;
index_t level0_id = thread_id % ThreadPerLevel0Cluster;
index_t level0_m_id = level0_id / NLevel0ThreadCluster;
index_t level0_n_id = level0_id % NLevel0ThreadCluster;
constexpr index_t MPerLevel0Cluster = MPerThreadSubC * MLevel0ThreadCluster;
constexpr index_t NPerLevel0Cluster = NPerThreadSubC * NLevel0ThreadCluster;
return MatrixIndex{level1_m_id * MPerLevel0Cluster + level0_m_id * MPerThreadSubC,
level1_n_id * NPerLevel0Cluster + level0_n_id * NPerThreadSubC};
}
template <typename FloatA, typename FloatB, typename FloatC>
__device__ void
Run_naive(const FloatA* p_a_block, const FloatB* p_b_block, FloatC* p_c_thread) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto a_block_mtx = BlockMatrixA{};
constexpr auto b_block_mtx = BlockMatrixB{};
constexpr auto c_thread_mtx = ThreadMatrixC{};
constexpr auto K = a_block_mtx.GetLength(I0);
constexpr auto MPerThread = c_thread_mtx.GetLength(I0);
constexpr auto NPerThread = c_thread_mtx.GetLength(I1);
constexpr index_t MPerLevel1Cluster =
MPerThreadSubC * MLevel0ThreadCluster * MLevel1ThreadCluster;
constexpr index_t NPerLevel1Cluster =
NPerThreadSubC * NLevel0ThreadCluster * NLevel1ThreadCluster;
constexpr index_t MRepeat = MPerThread / MPerThreadSubC;
constexpr index_t NRepeat = NPerThread / NPerThreadSubC;
// thread A, B for GEMM
constexpr auto a_thread_mtx = make_dynamic_naive_tensor_descriptor_packed_v2(
make_tuple(Number<KPerThreadLoop>{}, Number<MPerThread>{}));
constexpr auto b_thread_mtx = make_dynamic_naive_tensor_descriptor_packed_v2(
make_tuple(Number<KPerThreadLoop>{}, Number<NPerThread>{}));
FloatA p_a_thread[a_thread_mtx.GetElementSpaceSize()];
FloatB p_b_thread[b_thread_mtx.GetElementSpaceSize()];
constexpr auto a_thread_copy = ThreadwiseMatrixSliceCopy_v2<BlockMatrixA,
decltype(a_thread_mtx),
KPerThreadLoop,
MPerThreadSubC,
ThreadGemmADataPerRead_M>{};
constexpr auto b_thread_copy = ThreadwiseMatrixSliceCopy_v2<BlockMatrixB,
decltype(b_thread_mtx),
KPerThreadLoop,
NPerThreadSubC,
ThreadGemmBDataPerRead_N>{};
constexpr auto threadwise_gemm = ThreadwiseGemm_km_kn_mn_v1<decltype(a_thread_mtx),
decltype(b_thread_mtx),
decltype(c_thread_mtx)>{};
#pragma unroll
// loop over k
for(index_t k_begin = 0; k_begin < K; k_begin += KPerThreadLoop)
{
#pragma unroll
// read A
for(index_t m_repeat = 0; m_repeat < MRepeat; ++m_repeat)
{
a_thread_copy.Run(p_a_block +
a_block_mtx.CalculateOffset(
make_tuple(k_begin, m_repeat * MPerLevel1Cluster)) +
mMyThreadOffsetA,
p_a_thread + a_thread_mtx.CalculateOffset(
make_tuple(0, m_repeat * MPerThreadSubC)));
}
#pragma unroll
// read B
for(index_t n_repeat = 0; n_repeat < NRepeat; ++n_repeat)
{
b_thread_copy.Run(p_b_block +
b_block_mtx.CalculateOffset(
make_tuple(k_begin, n_repeat * NPerLevel1Cluster)) +
mMyThreadOffsetB,
p_b_thread + b_thread_mtx.CalculateOffset(
make_tuple(0, n_repeat * NPerThreadSubC)));
}
// C += A * B
threadwise_gemm.Run(p_a_thread, p_b_thread, p_c_thread);
}
}
template <typename FloatA, typename FloatB, typename FloatC>
__device__ void
Run_pipelined_2x2(const FloatA* p_a_block, const FloatB* p_b_block, FloatC* p_c_thread) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto a_block_mtx = BlockMatrixA{};
constexpr auto b_block_mtx = BlockMatrixB{};
constexpr auto c_thread_mtx = ThreadMatrixC{};
constexpr auto K = a_block_mtx.GetLength(I0);
constexpr auto MPerThread = c_thread_mtx.GetLength(I0);
constexpr auto NPerThread = c_thread_mtx.GetLength(I1);
constexpr index_t MPerLevel1Cluster =
MPerThreadSubC * MLevel0ThreadCluster * MLevel1ThreadCluster;
constexpr index_t NPerLevel1Cluster =
NPerThreadSubC * NLevel0ThreadCluster * NLevel1ThreadCluster;
constexpr index_t MRepeat = MPerThread / MPerThreadSubC;
constexpr index_t NRepeat = NPerThread / NPerThreadSubC;
static_assert(MRepeat == 2 && NRepeat == 2,
"wrong! inline asm cannot deal with this GEMM config yet");
// thread A, B
constexpr auto a_thread_mtx = make_dynamic_naive_tensor_descriptor_packed_v2(
make_tuple(Number<KPerThreadLoop>{}, Number<MPerThread>{}));
constexpr auto b_thread_mtx = make_dynamic_naive_tensor_descriptor_packed_v2(
make_tuple(Number<KPerThreadLoop>{}, Number<NPerThread>{}));
// thread A-sub, B-sub
constexpr auto a_thread_sub_mtx = make_dynamic_naive_tensor_descriptor_v2(
make_tuple(Number<KPerThreadLoop>{}, Number<MPerThreadSubC>{}),
make_tuple(Number<MPerThread>{}, Number<1>{}));
constexpr auto b_thread_sub_mtx = make_dynamic_naive_tensor_descriptor_v2(
make_tuple(Number<KPerThreadLoop>{}, Number<NPerThreadSubC>{}),
make_tuple(Number<NPerThread>{}, Number<1>{}));
constexpr auto c_thread_sub_mtx = make_dynamic_naive_tensor_descriptor_v2(
make_tuple(Number<MPerThreadSubC>{}, Number<NPerThreadSubC>{}),
make_tuple(Number<NPerThread>{}, Number<1>{}));
FloatA p_a_thread[a_thread_mtx.GetElementSpaceSize()];
FloatB p_b_thread[b_thread_mtx.GetElementSpaceSize()];
constexpr auto a_thread_copy = ThreadwiseMatrixSliceCopy_v2<BlockMatrixA,
decltype(a_thread_mtx),
KPerThreadLoop,
MPerThreadSubC,
ThreadGemmADataPerRead_M>{};
constexpr auto b_thread_copy = ThreadwiseMatrixSliceCopy_v2<BlockMatrixB,
decltype(b_thread_mtx),
KPerThreadLoop,
NPerThreadSubC,
ThreadGemmBDataPerRead_N>{};
constexpr auto threadwise_gemm = ThreadwiseGemm_km_kn_mn_v1<decltype(a_thread_sub_mtx),
decltype(b_thread_sub_mtx),
decltype(c_thread_sub_mtx)>{};
const FloatA* p_a_block_off = p_a_block + mMyThreadOffsetA;
const FloatB* p_b_block_off = p_b_block + mMyThreadOffsetB;
// read A_sub_0
a_thread_copy.Run(p_a_block_off, p_a_thread);
// read B_sub_0
b_thread_copy.Run(p_b_block_off, p_b_thread);
// read B_sub_1
b_thread_copy.Run(p_b_block_off +
b_block_mtx.CalculateOffset(make_tuple(0, NPerLevel1Cluster)),
p_b_thread + b_thread_mtx.CalculateOffset(make_tuple(0, NPerThreadSubC)));
// read A_sub_1
a_thread_copy.Run(p_a_block_off +
a_block_mtx.CalculateOffset(make_tuple(0, MPerLevel1Cluster)),
p_a_thread + a_thread_mtx.CalculateOffset(make_tuple(0, MPerThreadSubC)));
// C_sub_00 += transpose(A_sub_0) * B_sub_0
threadwise_gemm.Run(p_a_thread, p_b_thread, p_c_thread);
// C_sub_01 += transpose(A_sub_0) * B_sub_1
threadwise_gemm.Run(
p_a_thread,
p_b_thread + b_thread_mtx.CalculateOffset(make_tuple(0, NPerThreadSubC)),
p_c_thread + c_thread_mtx.CalculateOffset(make_tuple(0, NPerThreadSubC)));
#pragma unroll
// loop over rest of k
for(index_t k = KPerThreadLoop; k < K; k += KPerThreadLoop)
{
// read A_sub_0
a_thread_copy.Run(p_a_block_off + a_block_mtx.CalculateOffset(make_tuple(k, 0)),
p_a_thread);
// C_sub_10 += transpose(A_sub_1) * B_sub_0
threadwise_gemm.Run(
p_a_thread + a_thread_mtx.CalculateOffset(make_tuple(0, MPerThreadSubC)),
p_b_thread,
p_c_thread + c_thread_mtx.CalculateOffset(make_tuple(MPerThreadSubC, 0)));
// read B_sub_0
b_thread_copy.Run(p_b_block_off + b_block_mtx.CalculateOffset(make_tuple(k, 0)),
p_b_thread);
// C_sub_11 += transpose(A_sub_1) * B_sub_1
threadwise_gemm.Run(
p_a_thread + a_thread_mtx.CalculateOffset(make_tuple(0, MPerThreadSubC)),
p_b_thread + b_thread_mtx.CalculateOffset(make_tuple(0, NPerThreadSubC)),
p_c_thread +
c_thread_mtx.CalculateOffset(make_tuple(MPerThreadSubC, NPerThreadSubC)));
// read B_sub_1
b_thread_copy.Run(
p_b_block_off + b_block_mtx.CalculateOffset(make_tuple(k, NPerLevel1Cluster)),
p_b_thread + b_thread_mtx.CalculateOffset(make_tuple(0, NPerThreadSubC)));
// read A_sub_1
a_thread_copy.Run(
p_a_block_off + a_block_mtx.CalculateOffset(make_tuple(k, MPerLevel1Cluster)),
p_a_thread + a_thread_mtx.CalculateOffset(make_tuple(0, MPerThreadSubC)));
// C_sub_00 += transpose(A_sub_0) * B_sub_0
threadwise_gemm.Run(p_a_thread, p_b_thread, p_c_thread);
// C_sub_01 += transpose(A_sub_0) * B_sub_1
threadwise_gemm.Run(
p_a_thread,
p_b_thread + b_thread_mtx.CalculateOffset(make_tuple(0, NPerThreadSubC)),
p_c_thread + c_thread_mtx.CalculateOffset(make_tuple(0, NPerThreadSubC)));
}
// C_sub_10 += transpose(A_sub_1) * B_sub_0
threadwise_gemm.Run(
p_a_thread + a_thread_mtx.CalculateOffset(make_tuple(0, MPerThreadSubC)),
p_b_thread,
p_c_thread + c_thread_mtx.CalculateOffset(make_tuple(MPerThreadSubC, 0)));
// C_sub_11 += transpose(A_sub_1) * B_sub_1
threadwise_gemm.Run(
p_a_thread + a_thread_mtx.CalculateOffset(make_tuple(0, MPerThreadSubC)),
p_b_thread + b_thread_mtx.CalculateOffset(make_tuple(0, NPerThreadSubC)),
p_c_thread + c_thread_mtx.CalculateOffset(make_tuple(MPerThreadSubC, NPerThreadSubC)));
}
template <typename FloatA, typename FloatB, typename FloatC>
__device__ void Run(const FloatA* p_a_block, const FloatB* p_b_block, FloatC* p_c_thread) const
{
#if CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr index_t MPerThread = ThreadMatrixC{}.GetLength(I0);
constexpr index_t NPerThread = ThreadMatrixC{}.GetLength(I1);
constexpr index_t MRepeat = MPerThread / MPerThreadSubC;
constexpr index_t NRepeat = NPerThread / NPerThreadSubC;
if constexpr(MRepeat == 2 && NRepeat == 2)
{
Run_pipelined_2x2(p_a_block, p_b_block, p_c_thread);
}
else
{
Run_naive(p_a_block, p_b_block, p_c_thread);
}
#else
Run_naive(p_a_block, p_b_block, p_c_thread);
#endif
}
};
} // namespace ck
#endif
@@ -130,12 +130,12 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
// A matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
constexpr auto a_k_m_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2(
make_multi_index(KPerBlock, MPerBlock), max_lds_align);
make_tuple(Number<KPerBlock>{}, Number<MPerBlock>{}), Number<max_lds_align>{});
// B matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
constexpr auto b_k_n_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2(
make_multi_index(KPerBlock, NPerBlock), max_lds_align);
make_tuple(Number<KPerBlock>{}, Number<NPerBlock>{}), Number<max_lds_align>{});
// A matrix blockwise copy
auto a_blockwise_copy =
@@ -201,18 +201,6 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
// b_mtx[KPerBlock, NPerBlock] is in LDS
// c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in
// register
constexpr index_t a_k_m_block_mtx_stride =
a_k_m_block_desc.CalculateOffset(make_multi_index(1, 0)) -
a_k_m_block_desc.CalculateOffset(make_multi_index(0, 0));
constexpr index_t b_k_n_block_mtx_stride =
b_k_n_block_desc.CalculateOffset(make_multi_index(1, 0)) -
b_k_n_block_desc.CalculateOffset(make_multi_index(0, 0));
constexpr auto a_k_m_block_mtx_desc = make_ConstantMatrixDescriptor(
Number<KPerBlock>{}, Number<MPerBlock>{}, Number<a_k_m_block_mtx_stride>{});
constexpr auto b_k_n_block_mtx_desc = make_ConstantMatrixDescriptor(
Number<KPerBlock>{}, Number<NPerBlock>{}, Number<b_k_n_block_mtx_stride>{});
// sanity check
static_assert(MPerBlock % (MPerThread * MLevel0Cluster * MLevel1Cluster) == 0 &&
NPerBlock % (NPerThread * NLevel0Cluster * NLevel1Cluster) == 0,
@@ -223,14 +211,14 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
// c_thread_mtx definition: this is a mess
// TODO: find a more elegant way of defining c_thread_mtx
constexpr auto c_m0m1_n0n1_thread_mtx_desc = make_ConstantMatrixDescriptor_packed(
Number<MRepeat * MPerThread>{}, Number<NRepeat * NPerThread>{});
const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2<
BlockSize,
decltype(a_k_m_block_mtx_desc),
decltype(b_k_n_block_mtx_desc),
decltype(c_m0m1_n0n1_thread_mtx_desc),
constexpr auto c_m0m1_n0n1_thread_desc = make_dynamic_naive_tensor_descriptor_packed_v2(
make_tuple(Number<MRepeat * MPerThread>{}, Number<NRepeat * NPerThread>{}));
const auto blockwise_gemm =
BlockwiseGemm_km_kn_m0m1n0n1_v1<BlockSize,
decltype(a_k_m_block_desc),
decltype(b_k_n_block_desc),
decltype(c_m0m1_n0n1_thread_desc),
MPerThread,
NPerThread,
KPerThread,
@@ -252,10 +240,10 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
Float* p_b_block_double = p_shared_block + 2 * a_block_space_size;
// register allocation for output
AccFloat p_c_thread[c_m0m1_n0n1_thread_mtx_desc.GetElementSpace()];
AccFloat p_c_thread[c_m0m1_n0n1_thread_desc.GetElementSpaceSize()];
// zero out threadwise output
threadwise_matrix_set_zero(c_m0m1_n0n1_thread_mtx_desc, p_c_thread);
threadwise_matrix_set_zero_v2(c_m0m1_n0n1_thread_desc, p_c_thread);
constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock, 0);
constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock, 0);
@@ -422,7 +410,6 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
AddressSpace::Global,
CGlobalMemoryDataOperation,
1,
true,
true>(c_m0_m1_n0_n1_global_desc,
make_multi_index(m_thread_data_on_global / M1,
m_thread_data_on_global % M1,
@@ -44,11 +44,11 @@ template <typename SrcData,
AddressSpace DstAddressSpace,
InMemoryDataOperation DstInMemOp,
index_t DstScalarStrideInVector,
bool SrcResetCoordinateAfterRun,
bool DstResetCoordinateAfterRun>
struct ThreadwiseDynamicTensorSliceTransfer_v1r3
{
static constexpr index_t nDim = SliceLengths::Size();
using Index = MultiIndex<nDim>;
using DstCoord = decltype(make_dynamic_tensor_coordinate(DstDesc{}, Index{}));
@@ -61,11 +61,6 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r3
{
}
__device__ constexpr ThreadwiseDynamicTensorSliceTransfer_v1r3()
: ThreadwiseDynamicTensorSliceTransfer_v1r3(DstDesc{}, make_zero_multi_index<nDim>())
{
}
__device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx)
{
dst_slice_origin_coord_ = make_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_idx);
@@ -297,7 +292,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r3
return forward_sweep;
}();
// calculate dst data index after last iteration in RunWrite(), if it has not been reset by
// calculate dst data index after last iteration in Run(), if it has not been reset by
// RunWrite()
constexpr auto dst_data_idx = [&]() {
Index ordered_idx;
@@ -328,7 +323,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r3
__device__ void MoveDstSliceWindow(const DstDesc& dst_desc,
const Index& dst_slice_origin_step_idx)
{
// if dst coord was not reset by RunWrite(), then need to adjust the step here
// if dst coord was not reset by Run(), then need to adjust the step here
const auto adjusted_step_idx =
DstResetCoordinateAfterRun ? dst_slice_origin_step_idx
: dst_slice_origin_step_idx + GetDstCoordinateResetStep();
@@ -344,6 +339,319 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r3
DstCoord dst_slice_origin_coord_;
}; // namespace ck
// this version is less likely to have scratch memory issues, because:
// 1. It does not keep a reference to the tensor descriptor
// 2. It does not construct a new tensor coordinate for this->Run()
// Assume dst_slice_origin_idx is 0
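// A minimal usage sketch (illustrative only; the names "src_desc", "thread_src_origin",
// "p_src_global", "p_dst_thread_buf" and "src_iterator_hacks" are hypothetical, and the
// template arguments are elided):
//
//   auto threadwise_load = ThreadwiseDynamicTensorSliceTransfer_v2<...>(
//       src_desc, thread_src_origin);
//   threadwise_load.Run(src_desc, p_src_global, p_dst_thread_buf, src_iterator_hacks);
//
// The descriptor is passed into Run() on every call rather than stored by reference,
// which is what makes the coordinate state less likely to spill to scratch memory.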
template <typename SrcData,
typename DstData,
typename SrcDesc,
typename DstDesc,
typename SliceLengths,
typename DimAccessOrder,
index_t SrcVectorDim,
index_t SrcScalarPerVector,
AddressSpace SrcAddressSpace,
AddressSpace DstAddressSpace,
index_t SrcScalarStrideInVector,
bool SrcResetCoordinateAfterRun>
struct ThreadwiseDynamicTensorSliceTransfer_v2
{
static constexpr index_t nDim = SliceLengths::Size();
using Index = MultiIndex<nDim>;
using SrcCoord = decltype(make_dynamic_tensor_coordinate(SrcDesc{}, Index{}));
using SrcCoordIterator = decltype(make_dynamic_tensor_coordinate_iterator(SrcDesc{}, Index{}));
__device__ constexpr ThreadwiseDynamicTensorSliceTransfer_v2(const SrcDesc& src_desc,
const Index& src_slice_origin_idx)
: src_slice_origin_coord_(make_dynamic_tensor_coordinate(src_desc, src_slice_origin_idx))
{
}
__device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx)
{
src_slice_origin_coord_ = make_dynamic_tensor_coordinate(src_desc, src_slice_origin_idx);
}
template <typename SrcIteratorHacks>
__device__ void Run(const SrcDesc& src_desc,
const SrcData* p_src,
DstData* p_dst,
const SrcIteratorHacks& src_iterator_hacks)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
// note: DstDesc is known at compile time, so dst_desc is constexpr
constexpr auto dst_desc = remove_cv_t<remove_reference_t<DstDesc>>{};
// scalar per access on each dim
// TODO: don't use lambda_scalar_per_access
constexpr auto src_scalar_per_access = generate_sequence(
lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector>{}, Number<nDim>{});
constexpr auto src_scalar_step_in_vector =
generate_sequence(lambda_scalar_step_in_vector<SrcVectorDim>{}, Number<nDim>{});
constexpr auto access_lengths = SliceLengths{} / src_scalar_per_access;
constexpr auto dim_access_order = DimAccessOrder{};
constexpr auto ordered_access_lengths =
container_reorder_given_new2old(access_lengths, dim_access_order);
// make forward iterators
const auto src_forward_iterators = generate_tuple(
[&](auto i) {
Index forward_step;
static_for<0, nDim, 1>{}([&](auto j) {
forward_step(j) = (i.value == j.value) ? src_scalar_per_access[i] : 0;
});
return make_dynamic_tensor_coordinate_iterator(
src_desc, forward_step, src_iterator_hacks[I0][i]);
},
Number<nDim>{});
// make backward iterators
const auto src_backward_iterators = generate_tuple(
[&](auto i) {
Index backward_step;
static_for<0, nDim, 1>{}([&](auto j) {
backward_step(j) = (i.value == j.value) ? -src_scalar_per_access[i] : 0;
});
return make_dynamic_tensor_coordinate_iterator(
src_desc, backward_step, src_iterator_hacks[I1][i]);
},
Number<nDim>{});
// loop over tensor and copy
static_ford<decltype(ordered_access_lengths)>{}([&](auto ordered_access_idx) {
// decide whether to move forward or backward on each dimension
constexpr auto forward_sweep = [&]() {
StaticallyIndexedArray<bool, nDim> forward_sweep;
forward_sweep(I0) = true;
static_for<1, nDim, 1>{}([&](auto i) {
index_t tmp = ordered_access_idx[I0];
static_for<0, i, 1>{}([&](auto j) {
tmp = tmp * ordered_access_lengths[j] + ordered_access_idx[j];
});
forward_sweep(i) = tmp % 2 == 0;
});
return forward_sweep;
}();
// calculate src data index
constexpr auto src_data_idx = [&]() {
Index ordered_idx;
static_for<0, nDim, 1>{}([&](auto i) {
ordered_idx(i) = forward_sweep[i]
? ordered_access_idx[i]
: ordered_access_lengths[i] - 1 - ordered_access_idx[i];
});
auto src_data_idx = container_reorder_given_old2new(ordered_idx, dim_access_order) *
src_scalar_per_access;
return src_data_idx;
}();
// copy data
// hardcoding for buffer_store
// TODO refactor transfer_data() to encapsulate this
static_assert(DstAddressSpace == AddressSpace::Vgpr, "wrong! hardcode for ds_read");
vector_type<SrcData, SrcScalarPerVector> src_vector;
using src_vector_t = typename vector_type<SrcData, SrcScalarPerVector>::MemoryType;
if constexpr(SrcAddressSpace == AddressSpace::Global)
{
src_vector.Vector() = amd_buffer_load<SrcData, SrcScalarPerVector>(
p_src,
src_slice_origin_coord_.GetOffset(),
true,
src_desc.GetElementSpaceSize());
const bool is_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
src_desc, src_slice_origin_coord_);
src_vector.Vector() = is_valid ? src_vector.Vector() : src_vector_t{0};
}
else
{
const bool is_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
src_desc, src_slice_origin_coord_);
src_vector.Vector() = is_valid ? *reinterpret_cast<const src_vector_t*>(
&p_src[src_slice_origin_coord_.GetOffset()])
: src_vector_t{0};
}
// this is hardcoded for a dst that has a compile-time tensor descriptor
static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
// assume dst_slice_origin_idx is 0
// TODO: support non-zero dst_slice_origin_idx
constexpr index_t dst_offset =
dst_desc.CalculateOffset(src_data_idx + i * src_scalar_step_in_vector);
p_dst[Number<dst_offset>{}] = src_vector[i];
});
constexpr auto move_on_dim = [&]() constexpr
{
StaticallyIndexedArray<bool, nDim> move_on_dim;
static_for<0, nDim, 1>{}([&](auto i) {
move_on_dim(i) = ordered_access_idx[i] < ordered_access_lengths[i] - 1;
static_for<i + 1, nDim, 1>{}([&](auto j) {
move_on_dim(i) &= ordered_access_idx[j] == ordered_access_lengths[j] - 1;
});
});
return move_on_dim;
}
();
// move
static_for<0, nDim, 1>{}([&](auto i) {
if constexpr(move_on_dim[i])
{
if constexpr(forward_sweep[i])
{
move_dynamic_tensor_coordinate(src_desc,
src_slice_origin_coord_,
src_forward_iterators[dim_access_order[i]]);
}
else
{
move_dynamic_tensor_coordinate(src_desc,
src_slice_origin_coord_,
src_backward_iterators[dim_access_order[i]]);
}
}
});
});
// move src coordinate back to slice origin (or not)
if constexpr(SrcResetCoordinateAfterRun)
{
const auto src_reset_iterator =
make_dynamic_tensor_coordinate_iterator(src_desc, GetSrcCoordinateResetStep());
move_dynamic_tensor_coordinate(src_desc, src_slice_origin_coord_, src_reset_iterator);
}
}
__device__ void Run(const SrcDesc& src_desc, const SrcData* p_src, DstData* p_dst)
{
constexpr index_t ntransform_src = SrcDesc::GetNumOfTransform();
constexpr auto zeros = typename uniform_sequence_gen<ntransform_src, 0>::type{};
constexpr auto src_iterator_hacks =
make_tuple(generate_tuple([&](auto) { return zeros; }, Number<nDim>{}),
generate_tuple([&](auto) { return zeros; }, Number<nDim>{}));
Run(src_desc, p_src, p_dst, src_iterator_hacks);
}
__device__ static constexpr auto GetSrcCoordinateResetStep()
{
constexpr auto I0 = Number<0>{};
// scalar per access on each dim
// TODO: don't use lambda_scalar_per_access
constexpr auto src_scalar_per_access = generate_sequence(
lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector>{}, Number<nDim>{});
constexpr auto access_lengths = SliceLengths{} / src_scalar_per_access;
constexpr auto dim_access_order = DimAccessOrder{};
constexpr auto ordered_access_lengths =
container_reorder_given_new2old(access_lengths, dim_access_order);
// decide whether the last iteration moved forward or backward on each dimension
constexpr auto forward_sweep = [&]() {
StaticallyIndexedArray<bool, nDim> forward_sweep;
forward_sweep(I0) = true;
static_for<1, nDim, 1>{}([&](auto i) {
index_t tmp = ordered_access_lengths[I0] - 1;
static_for<0, i, 1>{}([&](auto j) {
tmp = tmp * ordered_access_lengths[j] + ordered_access_lengths[j] - 1;
});
forward_sweep(i) = tmp % 2 == 0;
});
return forward_sweep;
}();
// calculate src data index after the last iteration in Run(), if it has not been reset
// by Run()
constexpr auto src_data_idx = [&]() {
Index ordered_idx;
static_for<0, nDim, 1>{}([&](auto i) {
ordered_idx(i) = forward_sweep[i] ? ordered_access_lengths[i] - 1 : 0;
});
auto src_data_idx = container_reorder_given_old2new(ordered_idx, dim_access_order) *
src_scalar_per_access;
return src_data_idx;
}();
//
constexpr auto reset_src_data_step = [&]() {
Index reset_src_data_step;
static_for<0, nDim, 1>{}([&](auto i) { reset_src_data_step(i) = -src_data_idx[i]; });
return reset_src_data_step;
}();
return reset_src_data_step;
}
// dst_slice_origin_step_idx need to be known at compile-time, for performance reason
__device__ void MoveSrcSliceWindow(const SrcDesc& src_desc,
const Index& src_slice_origin_step_idx)
{
// if src coord was not reset by Run(), then need to adjust the step here
const auto adjusted_step_idx =
SrcResetCoordinateAfterRun ? src_slice_origin_step_idx
: src_slice_origin_step_idx + GetSrcCoordinateResetStep();
// is it OK to construct a new step every time?
const auto adjusted_step =
make_dynamic_tensor_coordinate_iterator(src_desc, adjusted_step_idx);
move_dynamic_tensor_coordinate(src_desc, src_slice_origin_coord_, adjusted_step);
}
private:
SrcCoord src_slice_origin_coord_;
}; // struct ThreadwiseDynamicTensorSliceTransfer_v2
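// Standalone illustrative sketch (not part of this commit or the CK API): the
// forward_sweep / move_on_dim logic in Run() above walks the ordered access
// space in zig-zag ("snake") order, so each access is reached by moving along
// exactly one dimension, i.e. one coordinate-iterator move per step. The 3x4
// access lengths below are an arbitrary assumption. The index reached after the
// last access is also what GetSrcCoordinateResetStep() negates to return the
// coordinate to the slice origin.
#include <array>
#include <cstdio>

int main()
{
    constexpr int nDim = 2;
    const std::array<int, nDim> lengths = {3, 4}; // ordered access lengths (assumed)

    std::array<int, nDim> idx = {0, 0};
    bool forward1 = true; // sweep direction of the inner (last) dimension

    for(int step = 0; step < lengths[0] * lengths[1]; ++step)
    {
        std::printf("access (%d, %d)\n", idx[0], idx[1]);

        if(forward1 && idx[1] + 1 < lengths[1])
        {
            ++idx[1]; // move forward on the inner dimension
        }
        else if(!forward1 && idx[1] > 0)
        {
            --idx[1]; // move backward on the inner dimension
        }
        else
        {
            ++idx[0];             // inner dimension exhausted: move the outer dimension once
            forward1 = !forward1; // and reverse the inner sweep direction
        }
    }
    return 0;
}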
// this version does the following to avoid "alloca" in LLVM-IR, which would cause scratch memory
// and sometimes useless instructions
// 1. It does not keep a reference to the tensor descriptor
......@@ -398,12 +706,6 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
"wrong!");
}
__device__ constexpr ThreadwiseDynamicTensorSliceTransfer_v3()
: ThreadwiseDynamicTensorSliceTransfer_v3(
SrcDesc{}, make_zero_multi_index<nDim>(), DstDesc{}, make_zero_multi_index<nDim>())
{
}
__device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx)
{
src_slice_origin_coord_ = make_dynamic_tensor_coordinate(src_desc, src_slice_origin_idx);
......@@ -512,7 +814,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
vector_type<SrcData, SrcScalarPerVector> src_vector;
using SrcVectorType = typename vector_type<SrcData, SrcScalarPerVector>::MemoryType;
using src_vector_t = typename vector_type<SrcData, SrcScalarPerVector>::MemoryType;
#if 1
src_vector.Vector() = amd_buffer_load<SrcData, SrcScalarPerVector>(
......@@ -521,7 +823,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
const bool is_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
src_desc, src_slice_origin_coord_);
src_vector.Vector() = is_valid ? src_vector.Vector() : SrcVectorType{0};
src_vector.Vector() = is_valid ? src_vector.Vector() : src_vector_t{0};
static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
constexpr index_t buffer_offset =
......
......@@ -161,5 +161,160 @@ struct ThreadwiseGemmTransANormalBNormalC
}
};
template <typename Float, class Matrix>
__device__ void threadwise_matrix_set_zero_v2(Matrix, Float* __restrict__ p_thread)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto M = Matrix{}.GetLength(I0);
constexpr auto N = Matrix{}.GetLength(I1);
static_for<0, M, 1>{}([&](auto i) {
static_for<0, N, 1>{}([&](auto j) {
constexpr auto offset = Matrix{}.CalculateOffset(make_tuple(i, j));
p_thread[offset] = Float(0);
});
});
}
template <typename SrcMatrix,
typename DstMatrix,
index_t NSliceRow,
index_t NSliceCol,
index_t DataPerAccess>
struct ThreadwiseMatrixSliceCopy_v2
{
template <typename Data>
__device__ static void Run(const Data* p_src, Data* p_dst)
{
using vector_t = typename vector_type<Data, DataPerAccess>::MemoryType;
static_for<0, NSliceRow, 1>{}([&](auto i) {
static_for<0, NSliceCol, DataPerAccess>{}([&](auto j) {
constexpr auto src_offset = SrcMatrix{}.CalculateOffset(make_tuple(i, j));
constexpr auto dst_offset = DstMatrix{}.CalculateOffset(make_tuple(i, j));
*reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
*reinterpret_cast<const vector_t*>(&p_src[src_offset]);
});
});
}
};
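// Standalone illustrative sketch (not part of this commit or the CK API) of the
// same idea as ThreadwiseMatrixSliceCopy_v2: copy an NSliceRow x NSliceCol tile
// between two row-major matrices, DataPerAccess contiguous elements per access.
// std::memcpy stands in for the reinterpret_cast'ed vector load/store; all
// names and sizes here are illustrative assumptions.
#include <cstdio>
#include <cstring>

template <int SrcLeadingDim, int DstLeadingDim, int NSliceRow, int NSliceCol, int DataPerAccess>
void copy_slice(const float* p_src, float* p_dst)
{
    static_assert(NSliceCol % DataPerAccess == 0, "column count must be a multiple of the vector width");

    for(int i = 0; i < NSliceRow; ++i)
        for(int j = 0; j < NSliceCol; j += DataPerAccess)
        {
            const int src_offset = i * SrcLeadingDim + j; // row-major offset
            const int dst_offset = i * DstLeadingDim + j;
            std::memcpy(&p_dst[dst_offset], &p_src[src_offset], DataPerAccess * sizeof(float));
        }
}

int main()
{
    float src[4 * 8], dst[4 * 8] = {};
    for(int i = 0; i < 4 * 8; ++i)
        src[i] = float(i);
    copy_slice<8, 8, 4, 8, 4>(src, dst); // copy the whole 4x8 tile, 4 floats per access
    std::printf("dst[10] = %f\n", dst[10]);
    return 0;
}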
// C += transpose(A) * B
// Elements of the matrices can be vectorized data
template <typename MatrixA, typename MatrixB, typename MatrixC>
struct ThreadwiseGemm_km_kn_mn_v1
{
template <typename FloatA, typename FloatB, typename FloatC>
__device__ static void Run_source(const FloatA* p_a, const FloatB* p_b, FloatC* p_c)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr index_t M = MatrixC{}[I0];
constexpr index_t N = MatrixC{}[I1];
constexpr index_t K = MatrixA{}[I0];
static_for<0, K, 1>{}([&](auto k) {
static_for<0, M, 1>{}([&](auto m) {
static_for<0, N, 1>{}([&](auto n) {
const index_t a_offset =
MatrixA{}.CalculateOffset(make_tuple(k, m)); // A is transposed
const index_t b_offset = MatrixB{}.CalculateOffset(make_tuple(k, n));
const index_t c_offset = MatrixC{}.CalculateOffset(make_tuple(m, n));
p_c[c_offset] +=
inner_product_with_conversion<FloatC>{}(p_a[a_offset], p_b[b_offset]);
});
});
});
}
#if CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM
template <typename FloatA, typename FloatB, typename FloatC>
__device__ static void Run_amd_asm(const FloatA* p_a, const FloatB* p_b, FloatC* p_c)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr index_t M = MatrixC{}.GetLength(I0);
constexpr index_t N = MatrixC{}.GetLength(I1);
constexpr index_t K = MatrixA{}.GetLength(I0);
static_assert(N == 4 || N == 2, "wrong! this config not supported by asm yet");
static_for<0, K, 1>{}([&](auto k) {
static_for<0, M, 1>{}([&](auto m) {
constexpr auto a_offset = MatrixA{}.CalculateOffset(make_tuple(k, m));
if constexpr(N == 2)
{
constexpr auto b_offset_0 = MatrixB{}.CalculateOffset(make_tuple(k, I0));
constexpr auto b_offset_1 = MatrixB{}.CalculateOffset(make_tuple(k, I1));
constexpr auto c_offset_0 = MatrixC{}.CalculateOffset(make_tuple(m, I0));
constexpr auto c_offset_1 = MatrixC{}.CalculateOffset(make_tuple(m, I1));
amd_assembly_outer_product_1x2(p_a[a_offset],
p_b[b_offset_0],
p_b[b_offset_1],
p_c[c_offset_0],
p_c[c_offset_1]);
}
else if constexpr(N == 4)
{
constexpr auto b_offset_0 = MatrixB{}.CalculateOffset(make_tuple(k, I0));
constexpr auto b_offset_1 = MatrixB{}.CalculateOffset(make_tuple(k, I1));
constexpr auto b_offset_2 = MatrixB{}.CalculateOffset(make_tuple(k, I2));
constexpr auto b_offset_3 = MatrixB{}.CalculateOffset(make_tuple(k, I3));
constexpr auto c_offset_0 = MatrixC{}.CalculateOffset(make_tuple(m, I0));
constexpr auto c_offset_1 = MatrixC{}.CalculateOffset(make_tuple(m, I1));
constexpr auto c_offset_2 = MatrixC{}.CalculateOffset(make_tuple(m, I2));
constexpr auto c_offset_3 = MatrixC{}.CalculateOffset(make_tuple(m, I3));
amd_assembly_outer_product_1x4(p_a[a_offset],
p_b[b_offset_0],
p_b[b_offset_1],
p_b[b_offset_2],
p_b[b_offset_3],
p_c[c_offset_0],
p_c[c_offset_1],
p_c[c_offset_2],
p_c[c_offset_3]);
}
});
});
}
#endif
template <typename FloatA, typename FloatB, typename FloatC>
__device__ static void Run(const FloatA* p_a, const FloatB* p_b, FloatC* p_c)
{
#if CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM
constexpr bool has_amd_asm = is_same<FloatC, float>{} &&
((is_same<FloatA, float>{} && is_same<FloatB, float>{}) ||
(is_same<FloatA, half2_t>{} && is_same<FloatB, half2_t>{}) ||
(is_same<FloatA, half4_t>{} && is_same<FloatB, half4_t>{}));
if constexpr(has_amd_asm)
{
Run_amd_asm(p_a, p_b, p_c);
}
else
{
Run_source(p_a, p_b, p_c);
}
#else
Run_source(p_a, p_b, p_c);
#endif
}
};
} // namespace ck
#endif
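// Standalone illustrative sketch (not part of this commit or the CK API) of the
// "C += transpose(A) * B" loop structure in ThreadwiseGemm_km_kn_mn_v1::Run_source
// above: A is stored K x M (i.e. already transposed), B is K x N, C is M x N,
// all row-major. Sizes and values are arbitrary assumptions.
#include <cstdio>

int main()
{
    constexpr int K = 2, M = 3, N = 4;
    float a[K][M] = {{1, 2, 3}, {4, 5, 6}};         // A^T layout: a[k][m]
    float b[K][N] = {{1, 0, 0, 1}, {0, 1, 1, 0}};
    float c[M][N] = {};                             // accumulator starts at zero

    for(int k = 0; k < K; ++k)
        for(int m = 0; m < M; ++m)
            for(int n = 0; n < N; ++n)
                c[m][n] += a[k][m] * b[k][n];       // same k-m-n loop order as Run_source

    std::printf("c[0][0] = %f\n", c[0][0]);         // expect 1*1 + 4*0 = 1
    return 0;
}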
......@@ -114,8 +114,8 @@ __host__ __device__ constexpr T min(T x, Ts... xs)
}
// greatest common divisor, aka highest common factor
template <typename X, typename Y>
__host__ __device__ constexpr auto gcd(X x, Y y)
template <typename T>
__host__ __device__ constexpr T gcd(T x, T y)
{
if(x == y || x == 0)
{
......@@ -135,6 +135,14 @@ __host__ __device__ constexpr auto gcd(X x, Y y)
}
}
template <index_t X, index_t Y>
__host__ __device__ constexpr auto gcd(Number<X>, Number<Y>)
{
constexpr auto r = gcd(X, Y);
return Number<r>{};
}
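// Standalone illustrative sketch (not part of this commit or the CK API) of the
// pattern used by the Number<X>/Number<Y> overload above: a constexpr scalar
// gcd is reused to define an overload on a compile-time integer wrapper, so the
// result is itself a compile-time constant. "Num" is a stand-in here, not
// ck::Number.
#include <cstdio>

template <int V>
struct Num
{
    static constexpr int value = V;
};

constexpr int gcd_scalar(int x, int y) { return y == 0 ? x : gcd_scalar(y, x % y); }

template <int X, int Y>
constexpr auto gcd_num(Num<X>, Num<Y>)
{
    return Num<gcd_scalar(X, Y)>{};
}

int main()
{
    constexpr auto g = gcd_num(Num<12>{}, Num<18>{});
    static_assert(decltype(g)::value == 6, "compile-time gcd");
    std::printf("gcd(12, 18) = %d\n", decltype(g)::value);
    return 0;
}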
template <typename X, typename... Ys>
__host__ __device__ constexpr auto gcd(X x, Ys... ys)
{
......
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "gridwise_operation_wrapper.hpp"
#include "dummy_dynamic_transform.hpp"
template <class T,
class InDesc,
class WeiDesc,
class OutDesc,
class ConvStrides,
class ConvDilations,
class InLeftPads,
class InRightPads>
void device_dummy_dynamic_transform(InDesc,
const Tensor<T>& in_nchw,
WeiDesc,
const Tensor<T>& wei_kcyx,
OutDesc,
Tensor<T>& out_nkhw,
ConvStrides,
ConvDilations,
InLeftPads,
InRightPads,
ck::index_t nrepeat)
{
using namespace ck;
using TDevice = typename conditional<is_same<half_float::half, T>::value, half_t, T>::type;
const auto in_nchw_desc = make_dynamic_naive_tensor_descriptor<4>(
to_multi_index(InDesc::GetLengths()), to_multi_index(InDesc::GetStrides()));
const auto wei_kcyx_desc = make_dynamic_naive_tensor_descriptor<4>(
to_multi_index(WeiDesc::GetLengths()), to_multi_index(WeiDesc::GetStrides()));
const auto out_nkhw_desc = make_dynamic_naive_tensor_descriptor<4>(
to_multi_index(OutDesc::GetLengths()), to_multi_index(OutDesc::GetStrides()));
const auto conv_strides = to_multi_index(ConvStrides{});
const auto conv_dilations = to_multi_index(ConvDilations{});
const auto in_left_pads = to_multi_index(InLeftPads{});
const auto in_right_pads = to_multi_index(InRightPads{});
const auto tensor_descs = map_convolution_into_gemm_fwd_v4r4(wei_kcyx_desc,
in_nchw_desc,
out_nkhw_desc,
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads);
const auto in_gemmk_gemmn_gemmkpack_global_desc = tensor_descs.At(Number<0>{});
// test on cpu
{
auto in_gemmk_gemmn_gemmkpack_coord = make_dynamic_tensor_coordinate(
in_gemmk_gemmn_gemmkpack_global_desc, make_multi_index(0, 0, 0));
const auto in_gemmk_gemmn_gemmkpack_coord_iterator_0_0_1 =
make_dynamic_tensor_coordinate_iterator(in_gemmk_gemmn_gemmkpack_global_desc,
make_multi_index(0, 0, 1));
print_array_v2("do_tansforms 0 0 1: ",
in_gemmk_gemmn_gemmkpack_coord_iterator_0_0_1.do_transforms_);
for(index_t iter = 0; iter < 10; ++iter)
{
printf("iter %d\n", iter);
print_array_v2("idx: ", in_gemmk_gemmn_gemmkpack_coord.GetIndex());
print_array_v2("hidden idx: ", in_gemmk_gemmn_gemmkpack_coord.GetHiddenIndex());
printf("offset: %d\n", in_gemmk_gemmn_gemmkpack_coord.GetOffset());
printf("\n");
move_dynamic_tensor_coordinate(in_gemmk_gemmn_gemmkpack_global_desc,
in_gemmk_gemmn_gemmkpack_coord,
in_gemmk_gemmn_gemmkpack_coord_iterator_0_0_1);
}
}
{
auto in_gemmk_gemmn_gemmkpack_coord = make_dynamic_tensor_coordinate(
in_gemmk_gemmn_gemmkpack_global_desc, make_multi_index(0, 0, 0));
const auto in_gemmk_gemmn_gemmkpack_coord_iterator_0_1_0 =
make_dynamic_tensor_coordinate_iterator(in_gemmk_gemmn_gemmkpack_global_desc,
make_multi_index(0, 1, 0));
print_array_v2("do_tansforms 0 1 0: ",
in_gemmk_gemmn_gemmkpack_coord_iterator_0_1_0.do_transforms_);
for(index_t iter = 0; iter < 10; ++iter)
{
printf("iter %d\n", iter);
print_array_v2("idx: ", in_gemmk_gemmn_gemmkpack_coord.GetIndex());
print_array_v2("hidden idx: ", in_gemmk_gemmn_gemmkpack_coord.GetHiddenIndex());
printf("offset: %d\n", in_gemmk_gemmn_gemmkpack_coord.GetOffset());
printf("\n");
move_dynamic_tensor_coordinate(in_gemmk_gemmn_gemmkpack_global_desc,
in_gemmk_gemmn_gemmkpack_coord,
in_gemmk_gemmn_gemmkpack_coord_iterator_0_1_0);
}
}
{
auto in_gemmk_gemmn_gemmkpack_coord = make_dynamic_tensor_coordinate(
in_gemmk_gemmn_gemmkpack_global_desc, make_multi_index(0, 0, 0));
const auto in_gemmk_gemmn_gemmkpack_coord_iterator_1_0_0 =
make_dynamic_tensor_coordinate_iterator(in_gemmk_gemmn_gemmkpack_global_desc,
make_multi_index(1, 0, 0));
print_array_v2("do_tansforms 1 0 0: ",
in_gemmk_gemmn_gemmkpack_coord_iterator_1_0_0.do_transforms_);
for(index_t iter = 0; iter < 10; ++iter)
{
printf("iter %d\n", iter);
print_array_v2("idx: ", in_gemmk_gemmn_gemmkpack_coord.GetIndex());
print_array_v2("hidden idx: ", in_gemmk_gemmn_gemmkpack_coord.GetHiddenIndex());
printf("offset: %d\n", in_gemmk_gemmn_gemmkpack_coord.GetOffset());
printf("\n");
move_dynamic_tensor_coordinate(in_gemmk_gemmn_gemmkpack_global_desc,
in_gemmk_gemmn_gemmkpack_coord,
in_gemmk_gemmn_gemmkpack_coord_iterator_1_0_0);
}
}
std::size_t data_sz = sizeof(T);
DeviceMem in_nchw_device_buf(data_sz * in_nchw.mDesc.GetElementSpace());
DeviceMem wei_kcyx_device_buf(data_sz * wei_kcyx.mDesc.GetElementSpace());
DeviceMem out_nkhw_device_buf(data_sz * out_nkhw.mDesc.GetElementSpace());
in_nchw_device_buf.ToDevice(in_nchw.mData.data());
wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
constexpr index_t BlockSize = 256;
constexpr index_t GridSize = 1;
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
for(index_t i = 0; i < 5; ++i)
{
std::cout << "Start running " << nrepeat << " times..." << std::endl;
KernelTimer timer;
timer.Start();
for(index_t j = 0; j < nrepeat; ++j)
{
#if 0
launch_kernel(run_gridwise_operation<DummyDynamicTransform_1<BlockSize>,
index_t* const,
float* const,
float* const,
const decltype(wei_kcyx_desc),
const decltype(in_nchw_desc),
const decltype(out_nkhw_desc),
const MultiIndex<2>,
const MultiIndex<2>,
const MultiIndex<2>,
const MultiIndex<2>>,
dim3(GridSize),
dim3(BlockSize),
0,
0,
static_cast<index_t*>(wei_kcyx_device_buf.GetDeviceBuffer()),
static_cast<float*>(in_nchw_device_buf.GetDeviceBuffer()),
static_cast<float*>(out_nkhw_device_buf.GetDeviceBuffer()),
wei_kcyx_desc,
in_nchw_desc,
out_nkhw_desc,
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads);
#else
launch_kernel(
run_gridwise_operation<DummyDynamicTransform_fwd_v4r4<BlockSize>,
index_t* const,
float* const,
float* const,
const decltype(in_gemmk_gemmn_gemmkpack_global_desc)>,
dim3(GridSize),
dim3(BlockSize),
0,
0,
static_cast<index_t*>(wei_kcyx_device_buf.GetDeviceBuffer()),
static_cast<float*>(in_nchw_device_buf.GetDeviceBuffer()),
static_cast<float*>(out_nkhw_device_buf.GetDeviceBuffer()),
in_gemmk_gemmn_gemmkpack_global_desc);
#endif
}
}
out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
}
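// Standalone illustrative sketch (not part of this commit or the CK API) of
// what the CPU test above prints, for a plain strided layout only (the real
// descriptor here is a transformed one): a coordinate holds an index, its
// offset is the dot product with the strides, and moving by a step adds the
// step to the index. Lengths, strides and the step below are made-up
// assumptions.
#include <array>
#include <cstdio>

int main()
{
    constexpr int nDim = 3;
    const std::array<int, nDim> strides = {64, 8, 1}; // row-major 3-D strides (assumed)

    std::array<int, nDim> idx        = {0, 0, 0};
    const std::array<int, nDim> step = {0, 0, 1};     // like the (0, 0, 1) iterator

    for(int iter = 0; iter < 10; ++iter)
    {
        int offset = 0;
        for(int d = 0; d < nDim; ++d)
            offset += idx[d] * strides[d];            // offset = sum(idx[d] * stride[d])

        std::printf("iter %d idx (%d, %d, %d) offset %d\n", iter, idx[0], idx[1], idx[2], offset);

        for(int d = 0; d < nDim; ++d)
            idx[d] += step[d];                        // move the coordinate by the step
    }
    return 0;
}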
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "gridwise_operation_wrapper.hpp"
#include "dummy_dynamic_transform_v1.hpp"
template <class T,
class InDesc,
class WeiDesc,
class OutDesc,
class ConvStrides,
class ConvDilations,
class InLeftPads,
class InRightPads>
void device_dummy_dynamic_transform_v1(InDesc,
const Tensor<T>& in_nchw,
WeiDesc,
const Tensor<T>& wei_kcyx,
OutDesc,
Tensor<T>& out_nkhw,
ConvStrides,
ConvDilations,
InLeftPads,
InRightPads,
ck::index_t nrepeat)
{
using namespace ck;
using TDevice = typename conditional<is_same<half_float::half, T>::value, half_t, T>::type;
const auto in_nchw_desc = make_dynamic_naive_tensor_descriptor_v1(
to_multi_index(InDesc::GetLengths()), to_multi_index(InDesc::GetStrides()));
const auto wei_kcyx_desc = make_dynamic_naive_tensor_descriptor_v1(
to_multi_index(WeiDesc::GetLengths()), to_multi_index(WeiDesc::GetStrides()));
const auto out_nkhw_desc = make_dynamic_naive_tensor_descriptor_v1(
to_multi_index(OutDesc::GetLengths()), to_multi_index(OutDesc::GetStrides()));
const auto conv_strides = to_multi_index(ConvStrides{});
const auto conv_dilations = to_multi_index(ConvDilations{});
const auto in_left_pads = to_multi_index(InLeftPads{});
const auto in_right_pads = to_multi_index(InRightPads{});
{
const auto tensor_descs = map_convolution_into_gemm_v1(wei_kcyx_desc,
in_nchw_desc,
out_nkhw_desc,
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads);
const auto in_gemmk_gemmn_global_desc = tensor_descs.At(Number<0>{});
auto in_gemmk_gemmn_coord =
make_dynamic_tensor_coordinate(in_gemmk_gemmn_global_desc, make_multi_index(0, 0));
for(index_t iter = 0; iter < 10; ++iter)
{
constexpr auto gemmk1_gemmn0 = make_multi_index(1, 0);
printf("iter %d\n", iter);
print_array("idx0: ", in_gemmk_gemmn_coord.GetIndex());
print_array("idx1: ", in_gemmk_gemmn_coord.GetLowerCoordinate().GetIndex());
print_array("idx2: ",
in_gemmk_gemmn_coord.GetLowerCoordinate().GetLowerCoordinate().GetIndex());
print_array("idx3: ",
in_gemmk_gemmn_coord.GetLowerCoordinate()
.GetLowerCoordinate()
.GetLowerCoordinate()
.GetIndex());
print_array("idx4: ",
in_gemmk_gemmn_coord.GetLowerCoordinate()
.GetLowerCoordinate()
.GetLowerCoordinate()
.GetLowerCoordinate()
.GetIndex());
printf("offset: %d\n", in_gemmk_gemmn_coord.GetOffset());
printf("\n");
in_gemmk_gemmn_coord += gemmk1_gemmn0;
}
}
std::size_t data_sz = sizeof(T);
DeviceMem in_nchw_device_buf(data_sz * in_nchw.mDesc.GetElementSpace());
DeviceMem wei_kcyx_device_buf(data_sz * wei_kcyx.mDesc.GetElementSpace());
DeviceMem out_nkhw_device_buf(data_sz * out_nkhw.mDesc.GetElementSpace());
in_nchw_device_buf.ToDevice(in_nchw.mData.data());
wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
constexpr index_t BlockSize = 256;
constexpr index_t GridSize = 1;
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
using dummy_transform = DummyDynamicTransform_v1<BlockSize>;
for(index_t i = 0; i < 5; ++i)
{
std::cout << "Start running " << nrepeat << " times..." << std::endl;
KernelTimer timer;
timer.Start();
for(index_t j = 0; j < nrepeat; ++j)
{
launch_kernel(run_gridwise_operation<dummy_transform,
index_t* const,
float* const,
float* const,
const DynamicNativeTensorDescriptor_v1<4>,
const DynamicNativeTensorDescriptor_v1<4>,
const DynamicNativeTensorDescriptor_v1<4>,
const MultiIndex<2>,
const MultiIndex<2>,
const MultiIndex<2>,
const MultiIndex<2>>,
dim3(GridSize),
dim3(BlockSize),
0,
0,
static_cast<index_t*>(wei_kcyx_device_buf.GetDeviceBuffer()),
static_cast<float*>(in_nchw_device_buf.GetDeviceBuffer()),
static_cast<float*>(out_nkhw_device_buf.GetDeviceBuffer()),
wei_kcyx_desc,
in_nchw_desc,
out_nkhw_desc,
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads);
}
}
out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
}
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "gridwise_operation_wrapper.hpp"
#include "dummy_static_transform.hpp"
template <class T,
class InDesc,
class WeiDesc,
class OutDesc,
class ConvStrides,
class ConvDilations,
class InLeftPads,
class InRightPads>
void device_dummy_static_transform(InDesc,
const Tensor<T>& in_nchw,
WeiDesc,
const Tensor<T>& wei_kcyx,
OutDesc,
Tensor<T>& out_nkhw,
ConvStrides,
ConvDilations,
InLeftPads,
InRightPads,
ck::index_t nrepeat)
{
using namespace ck;
using TDevice = typename conditional<is_same<half_float::half, T>::value, half_t, T>::type;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto in_nchw_desc =
make_native_tensor_descriptor(InDesc::GetLengths(), InDesc::GetStrides());
constexpr auto wei_kcyx_desc =
make_native_tensor_descriptor(WeiDesc::GetLengths(), WeiDesc::GetStrides());
constexpr auto out_nkhw_desc =
make_native_tensor_descriptor(OutDesc::GetLengths(), OutDesc::GetStrides());
constexpr index_t N = out_nkhw_desc.GetLength(I0);
constexpr index_t K = out_nkhw_desc.GetLength(I1);
constexpr index_t Ho = out_nkhw_desc.GetLength(I2);
constexpr index_t Wo = out_nkhw_desc.GetLength(I3);
std::size_t data_sz = sizeof(T);
DeviceMem in_nchw_device_buf(data_sz * in_nchw.mDesc.GetElementSpace());
DeviceMem wei_kcyx_device_buf(data_sz * wei_kcyx.mDesc.GetElementSpace());
DeviceMem out_nkhw_device_buf(data_sz * out_nkhw.mDesc.GetElementSpace());
in_nchw_device_buf.ToDevice(in_nchw.mData.data());
wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
constexpr index_t BlockSize = 256;
constexpr index_t GridSize = 1;
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
using dummy_transform = DummyStaticTransform<GridSize,
BlockSize,
float,
decltype(in_nchw_desc),
decltype(wei_kcyx_desc),
decltype(out_nkhw_desc),
ConvStrides,
ConvDilations,
InLeftPads,
InRightPads>;
for(index_t i = 0; i < 5; ++i)
{
std::cout << "Start running " << nrepeat << " times..." << std::endl;
KernelTimer timer;
timer.Start();
for(index_t j = 0; j < nrepeat; ++j)
{
launch_kernel(run_gridwise_operation<dummy_transform,
float* const __restrict__,
float* const __restrict__,
float* const __restrict__>,
dim3(GridSize),
dim3(BlockSize),
0,
0,
static_cast<float*>(in_nchw_device_buf.GetDeviceBuffer()),
static_cast<float*>(wei_kcyx_device_buf.GetDeviceBuffer()),
static_cast<float*>(out_nkhw_device_buf.GetDeviceBuffer()));
}
}
out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
}
......@@ -14,9 +14,6 @@
#include "device_convolution_forward_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
#include "device_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
#include "device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
#include "device_dummy_static_transform.hpp"
#include "device_dummy_dynamic_transform_v1.hpp"
#include "device_dummy_dynamic_transform.hpp"
int main(int argc, char* argv[])
{
......@@ -616,42 +613,6 @@ int main(int argc, char* argv[])
LeftPads{},
RightPads{},
nrepeat);
#elif 0
device_dummy_static_transform(in_nchw_desc,
in_nchw,
wei_kcyx_desc,
wei_kcyx,
out_nkhw_desc,
out_nkhw_device,
ConvStrides{},
ConvDilations{},
LeftPads{},
RightPads{},
nrepeat);
#elif 0
device_dummy_dynamic_transform_v1(in_nchw_desc,
in_nchw,
wei_kcyx_desc,
wei_kcyx,
out_nkhw_desc,
out_nkhw_device,
ConvStrides{},
ConvDilations{},
LeftPads{},
RightPads{},
nrepeat);
#elif 1
device_dummy_dynamic_transform(in_nchw_desc,
in_nchw,
wei_kcyx_desc,
wei_kcyx,
out_nkhw_desc,
out_nkhw_device,
ConvStrides{},
ConvDilations{},
LeftPads{},
RightPads{},
nrepeat);
#endif
if(do_verification)
......