refactor

fc7a6c85 · Chao Liu · f3099df8 · fc7a6c85 · fc7a6c85 · fc7a6c85
Commit fc7a6c85 authored Sep 23, 2020 by Chao Liu
6 changed files
--- a/composable_kernel/include/kernel_algorithm/dummy_dynamic_transform.hpp
+++ b/composable_kernel/include/kernel_algorithm/dummy_dynamic_transform.hpp
-#ifndef CK_DUMMY_DYNAMIC_TRANSFORM_HPP
+#ifndef CK_DUMMY_DYNAMIC_TRANSFORM_V1_HPP
-#define CK_DUMMY_DYNAMIC_TRANSFORM_HPP
+#define CK_DUMMY_DYNAMIC_TRANSFORM_V1_HPP
 #include "common_header.hpp"
 #include "dynamic_tensor_descriptor.hpp"
-#include "dynamic_tensor_descriptor_v2.hpp"
 #include "dynamic_tensor_descriptor_helper.hpp"
-#include "dynamic_tensor_descriptor_helper_v2.hpp"
 #include "dynamic_tensor_coordinate.hpp"
 namespace ck {
 template <typename WeiDesc, typename InDesc, typename OutDesc>
 __host__ __device__ constexpr auto
-map_convolution_into_gemm(const WeiDesc& wei_k_c_y_x_global_desc,
+map_convolution_into_gemm_v1(const WeiDesc& wei_k_c_y_x_global_desc,
                             const InDesc& in_n_c_hi_wi_global_desc,
                             const OutDesc& out_n_k_ho_wo_global_desc,
                             const Array<index_t, 2> conv_strides,
@@ -82,84 +81,11 @@ map_convolution_into_gemm(const WeiDesc& wei_k_c_y_x_global_desc,
    return make_tuple(in_gemmk_gemmn_global_desc);
 }
-template <typename WeiDesc, typename InDesc, typename OutDesc>
-__host__ __device__ constexpr auto
-map_convolution_into_gemm_v2(const WeiDesc& wei_k_c_y_x_global_desc,
-                             const InDesc& in_n_c_hi_wi_global_desc,
-                             const OutDesc& out_n_k_ho_wo_global_desc,
-                             const Array<index_t, 2> conv_strides,
-                             const Array<index_t, 2> conv_dilations,
-                             const Array<index_t, 2> in_left_pads,
-                             const Array<index_t, 2> in_right_pads)
-{
-    const index_t N = in_n_c_hi_wi_global_desc.GetLength(0);
-    const index_t C = in_n_c_hi_wi_global_desc.GetLength(1);
-    const index_t K = out_n_k_ho_wo_global_desc.GetLength(1);
-    const index_t Y = wei_k_c_y_x_global_desc.GetLength(2);
-    const index_t X = wei_k_c_y_x_global_desc.GetLength(3);
-    const index_t Hi = in_n_c_hi_wi_global_desc.GetLength(2);
-    const index_t Wi = in_n_c_hi_wi_global_desc.GetLength(3);
-    const index_t Ho = out_n_k_ho_wo_global_desc.GetLength(2);
-    const index_t Wo = out_n_k_ho_wo_global_desc.GetLength(3);
-    const index_t ConvStrideH = conv_strides[0];
-    const index_t ConvStrideW = conv_strides[1];
-    const index_t ConvDilationH = conv_dilations[0];
-    const index_t ConvDilationW = conv_dilations[1];
-    const index_t InLeftPadH  = in_left_pads[0];
-    const index_t InLeftPadW  = in_left_pads[1];
-    const index_t InRightPadH = in_right_pads[0];
-    const index_t InRightPadW = in_right_pads[1];
-    // input tensor
-    const auto in_n_c_hip_wip_global_desc = transform_dynamic_tensor_descriptor_v2(
-        transform_dynamic_tensor_descriptor_v2(
-            in_n_c_hi_wi_global_desc,
-            make_tuple(DynamicPassThrough{N},
-                       DynamicPassThrough{C},
-                       DynamicLeftPad{Hi, InLeftPadH},
-                       DynamicLeftPad{Wi, InLeftPadW}),
-            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
-            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})),
-        make_tuple(DynamicPassThrough{N},
-                   DynamicPassThrough{C},
-                   DynamicRightPad{Hi + InLeftPadH, InRightPadH},
-                   DynamicRightPad{Wi + InLeftPadW, InRightPadW}),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
-    const index_t Hip = in_n_c_hip_wip_global_desc.GetLength(2);
-    const index_t Wip = in_n_c_hip_wip_global_desc.GetLength(3);
-    const auto in_n_c_y_ho_x_wo_global_desc = transform_dynamic_tensor_descriptor_v2(
-        in_n_c_hip_wip_global_desc,
-        make_tuple(DynamicPassThrough{N},
-                   DynamicPassThrough{C},
-                   DynamicEmbed<2>{{Y, Ho}, {ConvDilationH, ConvStrideH, 0}},
-                   DynamicEmbed<2>{{X, Wo}, {ConvDilationW, ConvStrideW, 0}}),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
-        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));
-    const auto in_gemmk_gemmn_global_desc = transform_dynamic_tensor_descriptor_v2(
-        in_n_c_y_ho_x_wo_global_desc,
-        make_tuple(DynamicMerge<3>{MultiIndex<3>{{C, Y, X}}},
-                   DynamicMerge<3>{MultiIndex<3>{{N, Ho, Wo}}}),
-        make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
-        make_tuple(Sequence<0>{}, Sequence<1>{}));
-    return make_tuple(in_gemmk_gemmn_global_desc);
-}
 template <index_t BlockSize>
-struct DummyDynamicTransform_1
+struct DummyDynamicTransform_v1
 {
    template <typename WeiDesc, typename InDesc, typename OutDesc>
-    __device__ void Run_0(index_t* const __restrict__ p_wei_global,
+    __device__ void Run_1(index_t* const __restrict__ p_wei_global,
                          float* const __restrict__ p_in_global,
                          float* const __restrict__ p_out_global,
                          const WeiDesc wei_k_c_y_x_global_desc,
@@ -599,7 +525,7 @@ struct DummyDynamicTransform_1
    }
    template <typename WeiDesc, typename InDesc, typename OutDesc>
-    __device__ void Run_1(index_t* const __restrict__ p_wei_global,
+    __device__ void Run_2(index_t* const __restrict__ p_wei_global,
                          float* const __restrict__ p_in_global,
                          float* const __restrict__ p_out_global,
                          const WeiDesc wei_k_c_y_x_global_desc,
@@ -610,7 +536,8 @@ struct DummyDynamicTransform_1
                          const Array<index_t, 2> in_left_pads,
                          const Array<index_t, 2> in_right_pads) const
    {
-        const auto transformed_tensor_descs = map_convolution_into_gemm(wei_k_c_y_x_global_desc,
+        const auto transformed_tensor_descs =
+            map_convolution_into_gemm_v1(wei_k_c_y_x_global_desc,
                                         in_n_c_hi_wi_global_desc,
                                         out_n_k_ho_wo_global_desc,
                                         conv_strides,
@@ -662,71 +589,6 @@ struct DummyDynamicTransform_1
        }
    }
-    template <typename WeiDesc, typename InDesc, typename OutDesc>
-    __device__ void Run_2(index_t* const __restrict__ p_wei_global,
-                          float* const __restrict__ p_in_global,
-                          float* const __restrict__ p_out_global,
-                          const WeiDesc wei_k_c_y_x_global_desc,
-                          const InDesc in_n_c_hi_wi_global_desc,
-                          const OutDesc out_n_k_ho_wo_global_desc,
-                          const Array<index_t, 2> conv_strides,
-                          const Array<index_t, 2> conv_dilations,
-                          const Array<index_t, 2> in_left_pads,
-                          const Array<index_t, 2> in_right_pads) const
-    {
-        const auto transformed_tensor_descs =
-            map_convolution_into_gemm_v2(wei_k_c_y_x_global_desc,
-                                         in_n_c_hi_wi_global_desc,
-                                         out_n_k_ho_wo_global_desc,
-                                         conv_strides,
-                                         conv_dilations,
-                                         in_left_pads,
-                                         in_right_pads);
-        const auto in_gemmk_gemmn_global_desc = transformed_tensor_descs.At(Number<0>{});
-        MultiIndex<2> idx;
-        // initialize idx
-        for(index_t i = 0; i < 2; ++i)
-        {
-            idx(i) = p_wei_global[get_thread_local_1d_id() + i];
-        }
-        const index_t niter = p_wei_global[10];
-        auto in_gemmk_gemmn_coord =
-            make_dynamic_tensor_coordinate_v2(in_gemmk_gemmn_global_desc, idx);
-        const auto in_gemmk_gemmn_coord_step = make_dynamic_tensor_coordinate_step_v2(
-            in_gemmk_gemmn_global_desc, MultiIndex<2>{{1, 0}});
-        for(index_t iter = 0; iter < niter; ++iter)
-        {
-            move_dynamic_tensor_coordinate_v2(
-                in_gemmk_gemmn_global_desc, in_gemmk_gemmn_coord, in_gemmk_gemmn_coord_step);
-            // write
-            float value = 1;
-            transfer_data<float,
-                          1,
-                          AddressSpace::Vgpr,
-                          AddressSpace::Global,
-                          InMemoryDataOperation::Set,
-                          1,
-                          1>(&value,
-                             0,
-                             true,
-                             1,
-                             p_out_global,
-                             in_gemmk_gemmn_coord.GetOffset(),
-                             coordinate_has_valid_offset_assuming_visible_index_is_valid(
-                                 in_gemmk_gemmn_global_desc, in_gemmk_gemmn_coord),
-                             in_gemmk_gemmn_global_desc.GetElementSpaceSize());
-        }
-    }
    template <typename WeiDesc, typename InDesc, typename OutDesc>
    __device__ void Run(index_t* const __restrict__ p_wei_global,
                        float* const __restrict__ p_in_global,
@@ -752,153 +614,5 @@ struct DummyDynamicTransform_1
    }
 };
-template <index_t BlockSize>
-struct DummyDynamicTransform_2
-{
-    template <typename WeiDesc, typename InDesc, typename OutDesc>
-    __device__ void Run(index_t* const __restrict__ p_wei_global,
-                        float* const __restrict__ p_in_global,
-                        float* const __restrict__ p_out_global,
-                        const WeiDesc wei_k_c_y_x_global_desc,
-                        const InDesc in_n_c_hi_wi_global_desc,
-                        const OutDesc out_n_k_ho_wo_global_desc,
-                        const Array<index_t, 2> conv_strides,
-                        const Array<index_t, 2> conv_dilations,
-                        const Array<index_t, 2> in_left_pads,
-                        const Array<index_t, 2> in_right_pads) const
-    {
-        const index_t N = in_n_c_hi_wi_global_desc.GetLength(0);
-        const index_t C = in_n_c_hi_wi_global_desc.GetLength(1);
-        const index_t K = out_n_k_ho_wo_global_desc.GetLength(1);
-        const index_t Y = wei_k_c_y_x_global_desc.GetLength(2);
-        const index_t X = wei_k_c_y_x_global_desc.GetLength(3);
-        const index_t Hi = in_n_c_hi_wi_global_desc.GetLength(2);
-        const index_t Wi = in_n_c_hi_wi_global_desc.GetLength(3);
-        const index_t Ho = out_n_k_ho_wo_global_desc.GetLength(2);
-        const index_t Wo = out_n_k_ho_wo_global_desc.GetLength(3);
-        const index_t ConvStrideH = conv_strides[0];
-        const index_t ConvStrideW = conv_strides[1];
-        const index_t ConvDilationH = conv_dilations[0];
-        const index_t ConvDilationW = conv_dilations[1];
-        const index_t InLeftPadH  = in_left_pads[0];
-        const index_t InLeftPadW  = in_left_pads[1];
-        const index_t InRightPadH = in_right_pads[0];
-        const index_t InRightPadW = in_right_pads[1];
-        const auto in_n_c_hip_wip_global_desc = transform_dynamic_tensor_descriptor_v2(
-            in_n_c_hi_wi_global_desc,
-            make_tuple(DynamicPassThrough{N},
-                       DynamicPassThrough{C},
-                       DynamicPassThrough{Hi},
-                       DynamicPassThrough{Wi}),
-            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
-            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
-        MultiIndex<4> idx;
-        // initialize idx
-        for(index_t i = 0; i < 4; ++i)
-        {
-            idx(i) = p_wei_global[get_thread_local_1d_id() + i];
-        }
-        const index_t niter = p_wei_global[10];
-        auto in_coord = make_dynamic_tensor_coordinate_v2(in_n_c_hip_wip_global_desc, idx);
-        const auto in_coord_step = make_dynamic_tensor_coordinate_step_v2(
-            in_n_c_hip_wip_global_desc, MultiIndex<4>{{1, 0, 0, 0}});
-        for(index_t iter = 0; iter < niter; ++iter)
-        {
-            move_dynamic_tensor_coordinate_v2(in_n_c_hip_wip_global_desc, in_coord, in_coord_step);
-            // write
-            float value = 1;
-            transfer_data<float,
-                          1,
-                          AddressSpace::Vgpr,
-                          AddressSpace::Global,
-                          InMemoryDataOperation::Set,
-                          1,
-                          1>(&value,
-                             0,
-                             true,
-                             1,
-                             p_out_global,
-                             in_coord.GetOffset(),
-                             coordinate_has_valid_offset_assuming_visible_index_is_valid(
-                                 in_n_c_hip_wip_global_desc, in_coord),
-                             in_n_c_hip_wip_global_desc.GetElementSpaceSize());
-        }
-    }
-};
-template <index_t BlockSize>
-struct DummyDynamicTransform_3
-{
-    template <typename WeiDesc, typename InDesc, typename OutDesc, typename TransformInDesc>
-    __device__ void Run(index_t* const __restrict__ p_wei_global,
-                        float* const __restrict__ p_in_global,
-                        float* const __restrict__ p_out_global,
-                        const WeiDesc wei_k_c_y_x_global_desc,
-                        const InDesc in_n_c_hi_wi_global_desc,
-                        const OutDesc out_n_k_ho_wo_global_desc,
-                        const TransformInDesc in_gemmk_gemmn_global_desc,
-                        const Array<index_t, 2> conv_strides,
-                        const Array<index_t, 2> conv_dilations,
-                        const Array<index_t, 2> in_left_pads,
-                        const Array<index_t, 2> in_right_pads) const
-    {
-        MultiIndex<2> idx;
-        // initialize idx
-        for(index_t i = 0; i < 2; ++i)
-        {
-            idx(i) = p_wei_global[get_thread_local_1d_id() + i];
-        }
-        const index_t niter = p_wei_global[10];
-        auto in_gemmk_gemmn_coord =
-            make_dynamic_tensor_coordinate_v2(in_gemmk_gemmn_global_desc, idx);
-        const auto in_gemmk_gemmn_coord_step = make_dynamic_tensor_coordinate_step_v2(
-            in_gemmk_gemmn_global_desc, MultiIndex<2>{{1, 0}});
-        for(index_t iter = 0; iter < niter; ++iter)
-        {
-            move_dynamic_tensor_coordinate_v2(
-                in_gemmk_gemmn_global_desc, in_gemmk_gemmn_coord, in_gemmk_gemmn_coord_step);
-            // write
-            float value = 1;
-            transfer_data<float,
-                          1,
-                          AddressSpace::Vgpr,
-                          AddressSpace::Global,
-                          InMemoryDataOperation::Set,
-                          1,
-                          1>(&value,
-                             0,
-                             true,
-                             1,
-                             p_out_global,
-                             in_gemmk_gemmn_coord.GetOffset(),
-                             coordinate_has_valid_offset_assuming_visible_index_is_valid(
-                                 in_gemmk_gemmn_global_desc, in_gemmk_gemmn_coord),
-                             in_gemmk_gemmn_global_desc.GetElementSpaceSize());
-        }
-    }
-};
 } // namespace ck
 #endif
--- a/composable_kernel/include/kernel_algorithm/dummy_dynamic_transform_v2.hpp
+++ b/composable_kernel/include/kernel_algorithm/dummy_dynamic_transform_v2.hpp
+#ifndef CK_DUMMY_DYNAMIC_TRANSFORM_V2_HPP
+#define CK_DUMMY_DYNAMIC_TRANSFORM_V2_HPP
+#include "common_header.hpp"
+#include "dynamic_tensor_descriptor_v2.hpp"
+#include "dynamic_tensor_descriptor_helper_v2.hpp"
+namespace ck {
+template <typename WeiDesc, typename InDesc, typename OutDesc>
+__host__ __device__ constexpr auto
+map_convolution_into_gemm_v2(const WeiDesc& wei_k_c_y_x_global_desc,
+                             const InDesc& in_n_c_hi_wi_global_desc,
+                             const OutDesc& out_n_k_ho_wo_global_desc,
+                             const Array<index_t, 2> conv_strides,
+                             const Array<index_t, 2> conv_dilations,
+                             const Array<index_t, 2> in_left_pads,
+                             const Array<index_t, 2> in_right_pads)
+{
+    const index_t N = in_n_c_hi_wi_global_desc.GetLength(0);
+    const index_t C = in_n_c_hi_wi_global_desc.GetLength(1);
+    const index_t K = out_n_k_ho_wo_global_desc.GetLength(1);
+    const index_t Y = wei_k_c_y_x_global_desc.GetLength(2);
+    const index_t X = wei_k_c_y_x_global_desc.GetLength(3);
+    const index_t Hi = in_n_c_hi_wi_global_desc.GetLength(2);
+    const index_t Wi = in_n_c_hi_wi_global_desc.GetLength(3);
+    const index_t Ho = out_n_k_ho_wo_global_desc.GetLength(2);
+    const index_t Wo = out_n_k_ho_wo_global_desc.GetLength(3);
+    const index_t ConvStrideH = conv_strides[0];
+    const index_t ConvStrideW = conv_strides[1];
+    const index_t ConvDilationH = conv_dilations[0];
+    const index_t ConvDilationW = conv_dilations[1];
+    const index_t InLeftPadH  = in_left_pads[0];
+    const index_t InLeftPadW  = in_left_pads[1];
+    const index_t InRightPadH = in_right_pads[0];
+    const index_t InRightPadW = in_right_pads[1];
+    // input tensor
+    const auto in_n_c_hip_wip_global_desc = transform_dynamic_tensor_descriptor_v2(
+        transform_dynamic_tensor_descriptor_v2(
+            in_n_c_hi_wi_global_desc,
+            make_tuple(DynamicPassThrough{N},
+                       DynamicPassThrough{C},
+                       DynamicLeftPad{Hi, InLeftPadH},
+                       DynamicLeftPad{Wi, InLeftPadW}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})),
+        make_tuple(DynamicPassThrough{N},
+                   DynamicPassThrough{C},
+                   DynamicRightPad{Hi + InLeftPadH, InRightPadH},
+                   DynamicRightPad{Wi + InLeftPadW, InRightPadW}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
+    const index_t Hip = in_n_c_hip_wip_global_desc.GetLength(2);
+    const index_t Wip = in_n_c_hip_wip_global_desc.GetLength(3);
+    const auto in_n_c_y_ho_x_wo_global_desc = transform_dynamic_tensor_descriptor_v2(
+        in_n_c_hip_wip_global_desc,
+        make_tuple(DynamicPassThrough{N},
+                   DynamicPassThrough{C},
+                   DynamicEmbed<2>{{Y, Ho}, {ConvDilationH, ConvStrideH, 0}},
+                   DynamicEmbed<2>{{X, Wo}, {ConvDilationW, ConvStrideW, 0}}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));
+    const auto in_gemmk_gemmn_global_desc = transform_dynamic_tensor_descriptor_v2(
+        in_n_c_y_ho_x_wo_global_desc,
+        make_tuple(DynamicMerge<3>{MultiIndex<3>{{C, Y, X}}},
+                   DynamicMerge<3>{MultiIndex<3>{{N, Ho, Wo}}}),
+        make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}));
+    return make_tuple(in_gemmk_gemmn_global_desc);
+}
+template <index_t BlockSize>
+struct DummyDynamicTransform_v2
+{
+    template <typename WeiDesc, typename InDesc, typename OutDesc, typename TransformInDesc>
+    __device__ void Run_1(index_t* const __restrict__ p_wei_global,
+                          float* const __restrict__ p_in_global,
+                          float* const __restrict__ p_out_global,
+                          const WeiDesc wei_k_c_y_x_global_desc,
+                          const InDesc in_n_c_hi_wi_global_desc,
+                          const OutDesc out_n_k_ho_wo_global_desc,
+                          const TransformInDesc /* in_gemmk_gemmn_global_desc */,
+                          const Array<index_t, 2> conv_strides,
+                          const Array<index_t, 2> conv_dilations,
+                          const Array<index_t, 2> in_left_pads,
+                          const Array<index_t, 2> in_right_pads) const
+    {
+        const auto transformed_tensor_descs =
+            map_convolution_into_gemm_v2(move(wei_k_c_y_x_global_desc),
+                                         move(in_n_c_hi_wi_global_desc),
+                                         move(out_n_k_ho_wo_global_desc),
+                                         conv_strides,
+                                         conv_dilations,
+                                         in_left_pads,
+                                         in_right_pads);
+        const auto in_gemmk_gemmn_global_desc = transformed_tensor_descs.At(Number<0>{});
+        MultiIndex<2> idx;
+        // initialize idx
+        for(index_t i = 0; i < 2; ++i)
+        {
+            idx(i) = p_wei_global[get_thread_local_1d_id() + i];
+        }
+        auto in_gemmk_gemmn_coord =
+            make_dynamic_tensor_coordinate_v2(in_gemmk_gemmn_global_desc, idx);
+        const auto in_gemmk_gemmn_coord_step = make_dynamic_tensor_coordinate_step_v2(
+            in_gemmk_gemmn_global_desc, MultiIndex<2>{{1, 0}});
+#pragma unroll
+        for(index_t i = 0; i < 10; ++i)
+        {
+            move_dynamic_tensor_coordinate_v2(
+                in_gemmk_gemmn_global_desc, in_gemmk_gemmn_coord, in_gemmk_gemmn_coord_step);
+            p_out_global[in_gemmk_gemmn_coord.GetOffset()] = 1;
+        }
+    }
+    template <typename WeiDesc, typename InDesc, typename OutDesc, typename TransformInDesc>
+    __device__ void Run_2(index_t* const __restrict__ p_wei_global,
+                          float* const __restrict__ p_in_global,
+                          float* const __restrict__ p_out_global,
+                          const WeiDesc wei_k_c_y_x_global_desc,
+                          const InDesc in_n_c_hi_wi_global_desc,
+                          const OutDesc out_n_k_ho_wo_global_desc,
+                          const TransformInDesc /* in_gemmk_gemmn_global_desc */,
+                          const Array<index_t, 2> conv_strides,
+                          const Array<index_t, 2> conv_dilations,
+                          const Array<index_t, 2> in_left_pads,
+                          const Array<index_t, 2> in_right_pads) const
+    {
+        const index_t N = in_n_c_hi_wi_global_desc.GetLength(0);
+        const index_t C = in_n_c_hi_wi_global_desc.GetLength(1);
+        const index_t K = out_n_k_ho_wo_global_desc.GetLength(1);
+        const index_t Y = wei_k_c_y_x_global_desc.GetLength(2);
+        const index_t X = wei_k_c_y_x_global_desc.GetLength(3);
+        const index_t Hi = in_n_c_hi_wi_global_desc.GetLength(2);
+        const index_t Wi = in_n_c_hi_wi_global_desc.GetLength(3);
+        const index_t Ho = out_n_k_ho_wo_global_desc.GetLength(2);
+        const index_t Wo = out_n_k_ho_wo_global_desc.GetLength(3);
+        const index_t ConvStrideH = conv_strides[0];
+        const index_t ConvStrideW = conv_strides[1];
+        const index_t ConvDilationH = conv_dilations[0];
+        const index_t ConvDilationW = conv_dilations[1];
+        const index_t InLeftPadH  = in_left_pads[0];
+        const index_t InLeftPadW  = in_left_pads[1];
+        const index_t InRightPadH = in_right_pads[0];
+        const index_t InRightPadW = in_right_pads[1];
+#if 0
+        const auto in_n_c_hip_wip_global_desc = transform_dynamic_tensor_descriptor_v2(
+            move(in_n_c_hi_wi_global_desc),
+            make_tuple(DynamicPassThrough{N},
+                       DynamicPassThrough{C},
+                       DynamicPassThrough{Hi},
+                       DynamicPassThrough{Wi}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
+#elif 0
+        const auto in_n_c_hip_wip_global_desc = transform_dynamic_tensor_descriptor_v2(
+            move(in_n_c_hi_wi_global_desc),
+            make_tuple(DynamicPassThrough{N},
+                       DynamicPassThrough{C},
+                       DynamicLeftPad{Hi, InLeftPadH},
+                       DynamicLeftPad{Wi, InLeftPadW}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
+#else
+    const auto in_n_c_hip_wip_global_desc = transform_dynamic_tensor_descriptor_v2(
+        transform_dynamic_tensor_descriptor_v2(
+            move(in_n_c_hi_wi_global_desc),
+            make_tuple(DynamicPassThrough{N},
+                       DynamicPassThrough{C},
+                       DynamicLeftPad{Hi, InLeftPadH},
+                       DynamicLeftPad{Wi, InLeftPadW}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})),
+        make_tuple(DynamicPassThrough{N},
+                   DynamicPassThrough{C},
+                   DynamicRightPad{Hi + InLeftPadH, InRightPadH},
+                   DynamicRightPad{Wi + InLeftPadW, InRightPadW}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
+#endif
+        MultiIndex<4> idx;
+        // initialize idx
+        for(index_t i = 0; i < 4; ++i)
+        {
+            idx(i) = p_wei_global[get_thread_local_1d_id() + i];
+        }
+#if 0
+        const index_t niter = p_wei_global[10];
+        auto in_coord = make_dynamic_tensor_coordinate_v2(in_n_c_hip_wip_global_desc, idx);
+        const auto in_coord_step = make_dynamic_tensor_coordinate_step_v2(
+            in_n_c_hip_wip_global_desc, MultiIndex<4>{{1, 0, 0, 0}});
+        for(index_t iter = 0; iter < niter; ++iter)
+        {
+            move_dynamic_tensor_coordinate_v2(in_n_c_hip_wip_global_desc, in_coord, in_coord_step);
+            // write
+            float value = 1;
+            transfer_data<float,
+                          1,
+                          AddressSpace::Vgpr,
+                          AddressSpace::Global,
+                          InMemoryDataOperation::Set,
+                          1,
+                          1>(&value,
+                             0,
+                             true,
+                             1,
+                             p_out_global,
+                             in_coord.GetOffset(),
+                             coordinate_has_valid_offset_assuming_visible_index_is_valid(
+                                 in_n_c_hip_wip_global_desc, in_coord),
+                             in_n_c_hip_wip_global_desc.GetElementSpaceSize());
+        }
+#else
+        // write
+        // auto in_coord = make_dynamic_tensor_coordinate_v2(in_n_c_hi_wi_global_desc, idx);
+        p_out_global[in_n_c_hip_wip_global_desc.CalculateOffset(idx)] = 1;
+#endif
+    }
+    template <typename WeiDesc, typename InDesc, typename OutDesc, typename TransformInDesc>
+    __device__ void Run_3(index_t* const __restrict__ p_wei_global,
+                          float* const __restrict__ p_in_global,
+                          float* const __restrict__ p_out_global,
+                          const WeiDesc /* wei_k_c_y_x_global_desc */,
+                          const InDesc /* in_n_c_hi_wi_global_desc */,
+                          const OutDesc /* out_n_k_ho_wo_global_desc */,
+                          const TransformInDesc in_gemmk_gemmn_global_desc,
+                          const Array<index_t, 2> conv_strides,
+                          const Array<index_t, 2> conv_dilations,
+                          const Array<index_t, 2> in_left_pads,
+                          const Array<index_t, 2> in_right_pads) const
+    {
+        MultiIndex<2> idx;
+        // initialize idx
+        for(index_t i = 0; i < 2; ++i)
+        {
+            idx(i) = p_wei_global[get_thread_local_1d_id() + i];
+        }
+#if 0
+        const index_t niter = p_wei_global[10];
+        auto in_gemmk_gemmn_coord =
+            make_dynamic_tensor_coordinate_v2(in_gemmk_gemmn_global_desc, idx);
+        const auto in_gemmk_gemmn_coord_step = make_dynamic_tensor_coordinate_step_v2(
+            in_gemmk_gemmn_global_desc, MultiIndex<2>{{1, 0}});
+        for(index_t iter = 0; iter < niter; ++iter)
+        {
+            move_dynamic_tensor_coordinate_v2(
+                in_gemmk_gemmn_global_desc, in_gemmk_gemmn_coord, in_gemmk_gemmn_coord_step);
+            // write
+            float value = 1;
+            transfer_data<float,
+                          1,
+                          AddressSpace::Vgpr,
+                          AddressSpace::Global,
+                          InMemoryDataOperation::Set,
+                          1,
+                          1>(&value,
+                             0,
+                             true,
+                             1,
+                             p_out_global,
+                             in_gemmk_gemmn_coord.GetOffset(),
+                             coordinate_has_valid_offset_assuming_visible_index_is_valid(
+                                 in_gemmk_gemmn_global_desc, in_gemmk_gemmn_coord),
+                             in_gemmk_gemmn_global_desc.GetElementSpaceSize());
+        }
+#else
+        p_out_global[in_gemmk_gemmn_global_desc.CalculateOffset(idx)] = 1;
+#endif
+    }
+    template <typename WeiDesc, typename InDesc, typename OutDesc, typename TransformInDesc>
+    __device__ void Run(index_t* const __restrict__ p_wei_global,
+                        float* const __restrict__ p_in_global,
+                        float* const __restrict__ p_out_global,
+                        const WeiDesc wei_k_c_y_x_global_desc,
+                        const InDesc in_n_c_hi_wi_global_desc,
+                        const OutDesc out_n_k_ho_wo_global_desc,
+                        const TransformInDesc in_gemmk_gemmn_global_desc,
+                        const Array<index_t, 2> conv_strides,
+                        const Array<index_t, 2> conv_dilations,
+                        const Array<index_t, 2> in_left_pads,
+                        const Array<index_t, 2> in_right_pads) const
+    {
+        Run_1(p_wei_global,
+              p_in_global,
+              p_out_global,
+              wei_k_c_y_x_global_desc,
+              in_n_c_hi_wi_global_desc,
+              out_n_k_ho_wo_global_desc,
+              in_gemmk_gemmn_global_desc,
+              conv_strides,
+              conv_dilations,
+              in_left_pads,
+              in_right_pads);
+    }
+};
+} // namespace ck
+#endif
--- a/composable_kernel/include/tensor_description/dynamic_tensor_descriptor_v2.hpp
+++ b/composable_kernel/include/tensor_description/dynamic_tensor_descriptor_v2.hpp
@@ -125,7 +125,34 @@ struct DynamicTensorDescriptor_v2
    {
        static_assert(Idx::Size() == GetNumOfDimension(), "wrong! inconsistent # of dimension");
+#if 0 // debug
        return make_dynamic_tensor_coordinate_v2(*this, idx).GetOffset();
+#else
+        constexpr index_t ntransform   = GetNumOfTransform();
+        constexpr index_t ndim_hidden  = GetNumOfHiddenDimension();
+        constexpr index_t ndim_visible = GetNumOfVisibleDimension();
+        constexpr auto visible_dim_ids = GetVisibleDimensionIds();
+        MultiIndex<ndim_hidden> idx_hidden;
+        // initialize visible index
+        auto idx_hidden_pick_visible = pick_array_element(idx_hidden, visible_dim_ids);
+        idx_hidden_pick_visible      = idx;
+        // calculate hidden index
+        static_for<ntransform - 1, -1, -1>{}([this, &idx_hidden](auto itran) {
+            const auto& tran        = this->GetTransforms().At(itran);
+            constexpr auto dims_low = GetLowerDimensionIdss().At(itran);
+            constexpr auto dims_up  = GetUpperDimensionIdss().At(itran);
+            const auto idx_up = pick_array_element(idx_hidden, dims_up);
+            auto idx_low      = pick_array_element(idx_hidden, dims_low);
+            tran.CalculateLowerIndex(idx_low, idx_up);
+        });
+        return idx_hidden[0];
+#endif
    }
    // private:

--- a/driver/include/device_dummy_dynamic_transform.hpp
+++ b/driver/include/device_dummy_dynamic_transform.hpp
@@ -2,7 +2,7 @@
 #include "device.hpp"
 #include "host_tensor.hpp"
 #include "gridwise_operation_wrapper.hpp"
-#include "dummy_dynamic_transform.hpp"
+#include "dummy_dynamic_transform_v1.hpp"
 template <class T,
          class InDesc,
@@ -12,7 +12,7 @@ template <class T,
          class ConvDilations,
          class InLeftPads,
          class InRightPads>
-void device_dummy_dynamic_transform_1(InDesc,
+void device_dummy_dynamic_transform_v1(InDesc,
                                       const Tensor<T>& in_nchw,
                                       WeiDesc,
                                       const Tensor<T>& wei_kcyx,
@@ -41,7 +41,7 @@ void device_dummy_dynamic_transform_1(InDesc,
    const auto in_right_pads  = to_array(InRightPads{});
    {
-        const auto tensor_descs = map_convolution_into_gemm(wei_kcyx_desc,
+        const auto tensor_descs = map_convolution_into_gemm_v1(wei_kcyx_desc,
                                                               in_nchw_desc,
                                                               out_nkhw_desc,
                                                               conv_strides,
@@ -97,7 +97,7 @@ void device_dummy_dynamic_transform_1(InDesc,
    printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
-    using dummy_transform = DummyDynamicTransform_1<BlockSize>;
+    using dummy_transform = DummyDynamicTransform_v1<BlockSize>;
    for(index_t i = 0; i < 5; ++i)
    {
@@ -138,229 +138,3 @@ void device_dummy_dynamic_transform_1(InDesc,
    out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
 }
-template <class T,
-          class InDesc,
-          class WeiDesc,
-          class OutDesc,
-          class ConvStrides,
-          class ConvDilations,
-          class InLeftPads,
-          class InRightPads>
-void device_dummy_dynamic_transform_2(InDesc,
-                                      const Tensor<T>& in_nchw,
-                                      WeiDesc,
-                                      const Tensor<T>& wei_kcyx,
-                                      OutDesc,
-                                      Tensor<T>& out_nkhw,
-                                      ConvStrides,
-                                      ConvDilations,
-                                      InLeftPads,
-                                      InRightPads,
-                                      ck::index_t nrepeat)
-{
-    using namespace ck;
-    using TDevice = typename conditional<is_same<half_float::half, T>::value, half_t, T>::type;
-    const auto in_nchw_desc = make_dynamic_native_tensor_descriptor_v2(
-        to_array(InDesc::GetLengths()), to_array(InDesc::GetStrides()));
-    const auto wei_kcyx_desc = make_dynamic_native_tensor_descriptor_v2(
-        to_array(WeiDesc::GetLengths()), to_array(WeiDesc::GetStrides()));
-    const auto out_nkhw_desc = make_dynamic_native_tensor_descriptor_v2(
-        to_array(OutDesc::GetLengths()), to_array(OutDesc::GetStrides()));
-    const auto conv_strides   = to_array(ConvStrides{});
-    const auto conv_dilations = to_array(ConvDilations{});
-    const auto in_left_pads   = to_array(InLeftPads{});
-    const auto in_right_pads  = to_array(InRightPads{});
-    {
-        const auto tensor_descs = map_convolution_into_gemm_v2(wei_kcyx_desc,
-                                                               in_nchw_desc,
-                                                               out_nkhw_desc,
-                                                               conv_strides,
-                                                               conv_dilations,
-                                                               in_left_pads,
-                                                               in_right_pads);
-        const auto in_gemmk_gemmn_global_desc = tensor_descs.At(Number<0>{});
-        auto in_gemmk_gemmn_coord =
-            make_dynamic_tensor_coordinate_v2(in_gemmk_gemmn_global_desc, MultiIndex<2>{{0, 0}});
-        const auto in_gemmk_gemmn_coord_step = make_dynamic_tensor_coordinate_step_v2(
-            in_gemmk_gemmn_global_desc, MultiIndex<2>{{0, 1}});
-        for(index_t iter = 0; iter < 20; ++iter)
-        {
-            printf("iter %d\n", iter);
-            print_array_v2("visible idx: ", in_gemmk_gemmn_coord.GetIndex());
-            print_array_v2("hidden idx: ", in_gemmk_gemmn_coord.GetHiddenIndex());
-            printf("offset: %d\n", in_gemmk_gemmn_coord.GetOffset());
-            printf("\n");
-            move_dynamic_tensor_coordinate_v2(
-                in_gemmk_gemmn_global_desc, in_gemmk_gemmn_coord, in_gemmk_gemmn_coord_step);
-        }
-    }
-    std::size_t data_sz = sizeof(T);
-    DeviceMem in_nchw_device_buf(data_sz * in_nchw.mDesc.GetElementSpace());
-    DeviceMem wei_kcyx_device_buf(data_sz * wei_kcyx.mDesc.GetElementSpace());
-    DeviceMem out_nkhw_device_buf(data_sz * out_nkhw.mDesc.GetElementSpace());
-    in_nchw_device_buf.ToDevice(in_nchw.mData.data());
-    wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
-    out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
-    constexpr index_t BlockSize = 256;
-    constexpr index_t GridSize  = 1;
-    printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
-    using dummy_transform = DummyDynamicTransform_2<BlockSize>;
-    for(index_t i = 0; i < 5; ++i)
-    {
-        std::cout << "Start running " << nrepeat << " times..." << std::endl;
-        KernelTimer timer;
-        timer.Start();
-        for(index_t j = 0; j < nrepeat; ++j)
-        {
-            launch_kernel(run_gridwise_operation<dummy_transform,
-                                                 index_t* const,
-                                                 float* const,
-                                                 float* const,
-                                                 const decltype(wei_kcyx_desc),
-                                                 const decltype(in_nchw_desc),
-                                                 const decltype(out_nkhw_desc),
-                                                 const Array<index_t, 2>,
-                                                 const Array<index_t, 2>,
-                                                 const Array<index_t, 2>,
-                                                 const Array<index_t, 2>>,
-                          dim3(GridSize),
-                          dim3(BlockSize),
-                          0,
-                          0,
-                          static_cast<index_t*>(wei_kcyx_device_buf.GetDeviceBuffer()),
-                          static_cast<float*>(in_nchw_device_buf.GetDeviceBuffer()),
-                          static_cast<float*>(out_nkhw_device_buf.GetDeviceBuffer()),
-                          wei_kcyx_desc,
-                          in_nchw_desc,
-                          out_nkhw_desc,
-                          conv_strides,
-                          conv_dilations,
-                          in_left_pads,
-                          in_right_pads);
-        }
-    }
-    out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
-}
-template <class T,
-          class InDesc,
-          class WeiDesc,
-          class OutDesc,
-          class ConvStrides,
-          class ConvDilations,
-          class InLeftPads,
-          class InRightPads>
-void device_dummy_dynamic_transform_3(InDesc,
-                                      const Tensor<T>& in_nchw,
-                                      WeiDesc,
-                                      const Tensor<T>& wei_kcyx,
-                                      OutDesc,
-                                      Tensor<T>& out_nkhw,
-                                      ConvStrides,
-                                      ConvDilations,
-                                      InLeftPads,
-                                      InRightPads,
-                                      ck::index_t nrepeat)
-{
-    using namespace ck;
-    using TDevice = typename conditional<is_same<half_float::half, T>::value, half_t, T>::type;
-    const auto in_nchw_desc = make_dynamic_native_tensor_descriptor_v2(
-        to_array(InDesc::GetLengths()), to_array(InDesc::GetStrides()));
-    const auto wei_kcyx_desc = make_dynamic_native_tensor_descriptor_v2(
-        to_array(WeiDesc::GetLengths()), to_array(WeiDesc::GetStrides()));
-    const auto out_nkhw_desc = make_dynamic_native_tensor_descriptor_v2(
-        to_array(OutDesc::GetLengths()), to_array(OutDesc::GetStrides()));
-    const auto conv_strides   = to_array(ConvStrides{});
-    const auto conv_dilations = to_array(ConvDilations{});
-    const auto in_left_pads   = to_array(InLeftPads{});
-    const auto in_right_pads  = to_array(InRightPads{});
-    const auto tensor_descs = map_convolution_into_gemm_v2(wei_kcyx_desc,
-                                                           in_nchw_desc,
-                                                           out_nkhw_desc,
-                                                           conv_strides,
-                                                           conv_dilations,
-                                                           in_left_pads,
-                                                           in_right_pads);
-    const auto in_gemmk_gemmn_global_desc = tensor_descs.At(Number<0>{});
-    std::size_t data_sz = sizeof(T);
-    DeviceMem in_nchw_device_buf(data_sz * in_nchw.mDesc.GetElementSpace());
-    DeviceMem wei_kcyx_device_buf(data_sz * wei_kcyx.mDesc.GetElementSpace());
-    DeviceMem out_nkhw_device_buf(data_sz * out_nkhw.mDesc.GetElementSpace());
-    in_nchw_device_buf.ToDevice(in_nchw.mData.data());
-    wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
-    out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
-    constexpr index_t BlockSize = 256;
-    constexpr index_t GridSize  = 1;
-    printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
-    using dummy_transform = DummyDynamicTransform_3<BlockSize>;
-    for(index_t i = 0; i < 5; ++i)
-    {
-        std::cout << "Start running " << nrepeat << " times..." << std::endl;
-        KernelTimer timer;
-        timer.Start();
-        for(index_t j = 0; j < nrepeat; ++j)
-        {
-            launch_kernel(run_gridwise_operation<dummy_transform,
-                                                 index_t* const,
-                                                 float* const,
-                                                 float* const,
-                                                 const decltype(wei_kcyx_desc),
-                                                 const decltype(in_nchw_desc),
-                                                 const decltype(out_nkhw_desc),
-                                                 const decltype(in_gemmk_gemmn_global_desc),
-                                                 const Array<index_t, 2>,
-                                                 const Array<index_t, 2>,
-                                                 const Array<index_t, 2>,
-                                                 const Array<index_t, 2>>,
-                          dim3(GridSize),
-                          dim3(BlockSize),
-                          0,
-                          0,
-                          static_cast<index_t*>(wei_kcyx_device_buf.GetDeviceBuffer()),
-                          static_cast<float*>(in_nchw_device_buf.GetDeviceBuffer()),
-                          static_cast<float*>(out_nkhw_device_buf.GetDeviceBuffer()),
-                          wei_kcyx_desc,
-                          in_nchw_desc,
-                          out_nkhw_desc,
-                          in_gemmk_gemmn_global_desc,
-                          conv_strides,
-                          conv_dilations,
-                          in_left_pads,
-                          in_right_pads);
-        }
-    }
-    out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
-}
--- a/driver/include/device_dummy_dynamic_transform_v2.hpp
+++ b/driver/include/device_dummy_dynamic_transform_v2.hpp
+#include <unistd.h>
+#include "device.hpp"
+#include "host_tensor.hpp"
+#include "gridwise_operation_wrapper.hpp"
+#include "dummy_dynamic_transform_v2.hpp"
+template <class T,
+          class InDesc,
+          class WeiDesc,
+          class OutDesc,
+          class ConvStrides,
+          class ConvDilations,
+          class InLeftPads,
+          class InRightPads>
+void device_dummy_dynamic_transform_v2(InDesc,
+                                       const Tensor<T>& in_nchw,
+                                       WeiDesc,
+                                       const Tensor<T>& wei_kcyx,
+                                       OutDesc,
+                                       Tensor<T>& out_nkhw,
+                                       ConvStrides,
+                                       ConvDilations,
+                                       InLeftPads,
+                                       InRightPads,
+                                       ck::index_t nrepeat)
+{
+    using namespace ck;
+    using TDevice = typename conditional<is_same<half_float::half, T>::value, half_t, T>::type;
+    const auto in_nchw_desc = make_dynamic_native_tensor_descriptor_v2(
+        to_array(InDesc::GetLengths()), to_array(InDesc::GetStrides()));
+    const auto wei_kcyx_desc = make_dynamic_native_tensor_descriptor_v2(
+        to_array(WeiDesc::GetLengths()), to_array(WeiDesc::GetStrides()));
+    const auto out_nkhw_desc = make_dynamic_native_tensor_descriptor_v2(
+        to_array(OutDesc::GetLengths()), to_array(OutDesc::GetStrides()));
+    const auto conv_strides   = to_array(ConvStrides{});
+    const auto conv_dilations = to_array(ConvDilations{});
+    const auto in_left_pads   = to_array(InLeftPads{});
+    const auto in_right_pads  = to_array(InRightPads{});
+    const auto tensor_descs = map_convolution_into_gemm_v2(wei_kcyx_desc,
+                                                           in_nchw_desc,
+                                                           out_nkhw_desc,
+                                                           conv_strides,
+                                                           conv_dilations,
+                                                           in_left_pads,
+                                                           in_right_pads);
+    const auto in_gemmk_gemmn_global_desc = tensor_descs.At(Number<0>{});
+    std::size_t data_sz = sizeof(T);
+    DeviceMem in_nchw_device_buf(data_sz * in_nchw.mDesc.GetElementSpace());
+    DeviceMem wei_kcyx_device_buf(data_sz * wei_kcyx.mDesc.GetElementSpace());
+    DeviceMem out_nkhw_device_buf(data_sz * out_nkhw.mDesc.GetElementSpace());
+    in_nchw_device_buf.ToDevice(in_nchw.mData.data());
+    wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
+    out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
+    constexpr index_t BlockSize = 256;
+    constexpr index_t GridSize  = 1;
+    printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
+    using dummy_transform = DummyDynamicTransform_v2<BlockSize>;
+    for(index_t i = 0; i < 5; ++i)
+    {
+        std::cout << "Start running " << nrepeat << " times..." << std::endl;
+        KernelTimer timer;
+        timer.Start();
+        for(index_t j = 0; j < nrepeat; ++j)
+        {
+            launch_kernel(run_gridwise_operation<dummy_transform,
+                                                 index_t* const,
+                                                 float* const,
+                                                 float* const,
+                                                 const decltype(wei_kcyx_desc),
+                                                 const decltype(in_nchw_desc),
+                                                 const decltype(out_nkhw_desc),
+                                                 const decltype(in_gemmk_gemmn_global_desc),
+                                                 const Array<index_t, 2>,
+                                                 const Array<index_t, 2>,
+                                                 const Array<index_t, 2>,
+                                                 const Array<index_t, 2>>,
+                          dim3(GridSize),
+                          dim3(BlockSize),
+                          0,
+                          0,
+                          static_cast<index_t*>(wei_kcyx_device_buf.GetDeviceBuffer()),
+                          static_cast<float*>(in_nchw_device_buf.GetDeviceBuffer()),
+                          static_cast<float*>(out_nkhw_device_buf.GetDeviceBuffer()),
+                          wei_kcyx_desc,
+                          in_nchw_desc,
+                          out_nkhw_desc,
+                          in_gemmk_gemmn_global_desc,
+                          conv_strides,
+                          conv_dilations,
+                          in_left_pads,
+                          in_right_pads);
+        }
+    }
+    out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
+}
--- a/driver/src/conv_driver.cpp
+++ b/driver/src/conv_driver.cpp
@@ -14,7 +14,8 @@
 #include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
 #include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
 #include "device_dummy_static_transform.hpp"
-#include "device_dummy_dynamic_transform.hpp"
+#include "device_dummy_dynamic_transform_v1.hpp"
+#include "device_dummy_dynamic_transform_v2.hpp"
 int main(int argc, char* argv[])
 {
@@ -585,7 +586,7 @@ int main(int argc, char* argv[])
                                  RightPads{},
                                  nrepeat);
 #elif 0
-    device_dummy_dynamic_transform_1(in_nchw_desc,
+    device_dummy_dynamic_transform_v1(in_nchw_desc,
                                      in_nchw,
                                      wei_kcyx_desc,
                                      wei_kcyx,
@@ -597,19 +598,7 @@ int main(int argc, char* argv[])
                                      RightPads{},
                                      nrepeat);
 #elif 1
-    device_dummy_dynamic_transform_2(in_nchw_desc,
+    device_dummy_dynamic_transform_v2(in_nchw_desc,
-                                     in_nchw,
-                                     wei_kcyx_desc,
-                                     wei_kcyx,
-                                     out_nkhw_desc,
-                                     out_nkhw_device,
-                                     ConvStrides{},
-                                     ConvDilations{},
-                                     LeftPads{},
-                                     RightPads{},
-                                     nrepeat);
-#elif 1
-    device_dummy_dynamic_transform_3(in_nchw_desc,
                                      in_nchw,
                                      wei_kcyx_desc,
                                      wei_kcyx,