fixed a bug; tested more transformation, no scrach mem

a578ff93 · Chao Liu · c15a3c09 · a578ff93 · a578ff93 · a578ff93
Commit a578ff93 authored Oct 01, 2020 by Chao Liu
8 changed files
--- a/composable_kernel/include/kernel_algorithm/dummy_dynamic_transform_v2.hpp
+++ b/composable_kernel/include/kernel_algorithm/dummy_dynamic_transform_v2.hpp
@@ -7,44 +7,44 @@
 namespace ck {
-template <typename WeiDesc, typename InDesc, typename OutDesc>
+template <typename... Wei, typename... In, typename... Out>
-__host__ __device__ constexpr auto
+__host__ __device__ constexpr auto map_convolution_into_gemm_fwd_v4r4(
-map_convolution_into_gemm_v2(const WeiDesc& wei_k_c_y_x_global_desc,
+    const DynamicTensorDescriptor_v2<Wei...>& wei_k_c_y_x_global_desc,
-                             const InDesc& in_n_c_hi_wi_global_desc,
+    const DynamicTensorDescriptor_v2<In...>& in_n_c_hi_wi_global_desc,
-                             const OutDesc& out_n_k_ho_wo_global_desc,
+    const DynamicTensorDescriptor_v2<Out...>& out_n_k_ho_wo_global_desc,
-                             const MultiIndex<2> conv_strides,
+    const MultiIndex<2> conv_strides,
-                             const MultiIndex<2> conv_dilations,
+    const MultiIndex<2> conv_dilations,
-                             const MultiIndex<2> in_left_pads,
+    const MultiIndex<2> in_left_pads,
-                             const MultiIndex<2> in_right_pads)
+    const MultiIndex<2> in_right_pads)
 {
-    constexpr auto i0 = Number<0>{};
+    constexpr auto I0 = Number<0>{};
-    constexpr auto i1 = Number<1>{};
+    constexpr auto I1 = Number<1>{};
-    constexpr auto i2 = Number<2>{};
+    constexpr auto I2 = Number<2>{};
-    constexpr auto i3 = Number<3>{};
+    constexpr auto I3 = Number<3>{};
-    const index_t N = in_n_c_hi_wi_global_desc.GetLength(i0);
+    const index_t N = in_n_c_hi_wi_global_desc.GetLength(I0);
-    const index_t C = in_n_c_hi_wi_global_desc.GetLength(i1);
+    const index_t C = in_n_c_hi_wi_global_desc.GetLength(I1);
-    const index_t K = out_n_k_ho_wo_global_desc.GetLength(i1);
+    const index_t K = out_n_k_ho_wo_global_desc.GetLength(I1);
-    const index_t Y = wei_k_c_y_x_global_desc.GetLength(i2);
+    const index_t Y = wei_k_c_y_x_global_desc.GetLength(I2);
-    const index_t X = wei_k_c_y_x_global_desc.GetLength(i3);
+    const index_t X = wei_k_c_y_x_global_desc.GetLength(I3);
-    const index_t Hi = in_n_c_hi_wi_global_desc.GetLength(i2);
+    const index_t Hi = in_n_c_hi_wi_global_desc.GetLength(I2);
-    const index_t Wi = in_n_c_hi_wi_global_desc.GetLength(i3);
+    const index_t Wi = in_n_c_hi_wi_global_desc.GetLength(I3);
-    const index_t Ho = out_n_k_ho_wo_global_desc.GetLength(i2);
+    const index_t Ho = out_n_k_ho_wo_global_desc.GetLength(I2);
-    const index_t Wo = out_n_k_ho_wo_global_desc.GetLength(i3);
+    const index_t Wo = out_n_k_ho_wo_global_desc.GetLength(I3);
-    const index_t ConvStrideH = conv_strides[i0];
+    const index_t ConvStrideH = conv_strides[I0];
-    const index_t ConvStrideW = conv_strides[i1];
+    const index_t ConvStrideW = conv_strides[I1];
-    const index_t ConvDilationH = conv_dilations[i0];
+    const index_t ConvDilationH = conv_dilations[I0];
-    const index_t ConvDilationW = conv_dilations[i1];
+    const index_t ConvDilationW = conv_dilations[I1];
-    const index_t InLeftPadH  = in_left_pads[i0];
+    const index_t InLeftPadH  = in_left_pads[I0];
-    const index_t InLeftPadW  = in_left_pads[i1];
+    const index_t InLeftPadW  = in_left_pads[I1];
-    const index_t InRightPadH = in_right_pads[i0];
+    const index_t InRightPadH = in_right_pads[I0];
-    const index_t InRightPadW = in_right_pads[i1];
+    const index_t InRightPadW = in_right_pads[I1];
    // input tensor
    const auto in_n_c_hip_wip_global_desc = transform_dynamic_tensor_descriptor_v2(
@@ -63,8 +63,8 @@ map_convolution_into_gemm_v2(const WeiDesc& wei_k_c_y_x_global_desc,
        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
-    const index_t Hip = in_n_c_hip_wip_global_desc.GetLength(i2);
+    const index_t Hip = in_n_c_hip_wip_global_desc.GetLength(I2);
-    const index_t Wip = in_n_c_hip_wip_global_desc.GetLength(i3);
+    const index_t Wip = in_n_c_hip_wip_global_desc.GetLength(I3);
    const auto in_n_c_y_ho_x_wo_global_desc = transform_dynamic_tensor_descriptor_v2(
        in_n_c_hip_wip_global_desc,
@@ -76,16 +76,127 @@ map_convolution_into_gemm_v2(const WeiDesc& wei_k_c_y_x_global_desc,
        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));
-    const auto in_gemmk_gemmn_global_desc = transform_dynamic_tensor_descriptor_v2(
+    const auto in_gemmktotal_gemmn_global_desc = transform_dynamic_tensor_descriptor_v2(
        in_n_c_y_ho_x_wo_global_desc,
        make_tuple(DynamicMerge<3>{make_multi_index(C, Y, X)},
                   DynamicMerge<3>{make_multi_index(N, Ho, Wo)}),
        make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
        make_tuple(Sequence<0>{}, Sequence<1>{}));
-    return make_tuple(in_gemmk_gemmn_global_desc);
+    const index_t gemmktotal = in_gemmktotal_gemmn_global_desc.GetLength(I0);
+    const index_t gemmn      = in_gemmktotal_gemmn_global_desc.GetLength(I1);
+    constexpr index_t GemmKPack = 8;
+    const index_t gemmk         = gemmktotal / GemmKPack;
+    const auto in_gemmk_gemmn_gemmkpack_global_desc = transform_dynamic_tensor_descriptor_v2(
+        in_gemmktotal_gemmn_global_desc,
+        make_tuple(DynamicUnMerge<2>{make_multi_index(gemmk, GemmKPack)},
+                   DynamicPassThrough{gemmn}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}),
+        make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+    static_assert(decltype(in_gemmk_gemmn_gemmkpack_global_desc)::GetNumOfDimension() == 3,
+                  "wrong!");
+    return make_tuple(in_gemmk_gemmn_gemmkpack_global_desc);
 }
+#if 0
+template <typename... Wei, typename... In, typename... Out>
+__host__ __device__ constexpr auto map_convolution_into_gemm_bwd_v4r1(
+    const DynamicTensorDescriptor_v2<Wei...>& wei_k_c_y_x_global_desc,
+    const DynamicTensorDescriptor_v2<In...>& in_n_c_hi_wi_global_desc,
+    const DynamicTensorDescriptor_v2<Out...>& out_n_k_ho_wo_global_desc,
+    const MultiIndex<2> conv_strides,
+    const MultiIndex<2> conv_dilations,
+    const MultiIndex<2> in_left_pads,
+    const MultiIndex<2> in_right_pads)
+{
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
+    const index_t N = in_n_c_hi_wi_global_desc.GetLength(I0);
+    const index_t C = in_n_c_hi_wi_global_desc.GetLength(I1);
+    const index_t K = out_n_k_ho_wo_global_desc.GetLength(I1);
+    const index_t Y = wei_k_c_y_x_global_desc.GetLength(I2);
+    const index_t X = wei_k_c_y_x_global_desc.GetLength(I3);
+    const index_t Hi = in_n_c_hi_wi_global_desc.GetLength(I2);
+    const index_t Wi = in_n_c_hi_wi_global_desc.GetLength(I3);
+    const index_t Ho = out_n_k_ho_wo_global_desc.GetLength(I2);
+    const index_t Wo = out_n_k_ho_wo_global_desc.GetLength(I3);
+    const index_t ConvStrideH = conv_strides[I0];
+    const index_t ConvStrideW = conv_strides[I1];
+    const index_t ConvDilationH = conv_dilations[I0];
+    const index_t ConvDilationW = conv_dilations[I1];
+    const index_t InLeftPadH  = in_left_pads[I0];
+    const index_t InLeftPadW  = in_left_pads[I1];
+    const index_t InRightPadH = in_right_pads[I0];
+    const index_t InRightPadW = in_right_pads[I1];
+#if !CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_OUTPUT_SKIP_OUT_OF_BOUND_CHECK
+        constexpr bool out_skip_out_of_bound_check = false;
+#else
+        constexpr bool out_skip_out_of_bound_check = true;
+#endif
+        constexpr auto out_n_k_ydot_htilda_xdot_wtilda_global_desc = transform_tensor_descriptor_v2(
+            out_n_k_ho_wo_global_desc,
+            make_tuple(PassThrough{N},
+                       PassThrough{K},
+                       Embed<2>{make_multi_index(YDot, HTilda), make_multi_index(-ConvDilationH / GcdStrideDilationH, 1)},
+                       Embed<2>{make_multi_index(XDot, WTilda), make_multi_index(-ConvDilationW / GcdStrideDilationW, 1)}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));
+        constexpr auto out_n_k_ydot_htildaslice_xdot_wtildaslice_global_desc =
+            transform_tensor_descriptor_v2(
+                out_n_k_ydot_htilda_xdot_wtilda_global_desc,
+                make_tuple(PassThrough{N},
+                           PassThrough{K},
+                           PassThrough{YDot},
+                           PassThrough{XDot},
+                           Slice<Sequence<HTilda, WTilda>,
+                                 Sequence<iHTildaLeft, iWTildaLeft>,
+                                 Sequence<iHTildaRight, iWTildaRight>>{}),
+                make_tuple(
+                    Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<4>{}, Sequence<3, 5>{}),
+                make_tuple(
+                    Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<4>{}, Sequence<3, 5>{}));
+        constexpr auto out_n_k_ydotslice_htildaslice_xdotslice_wtildaslice_global_desc =
+            transform_tensor_descriptor(
+                out_n_k_ydot_htildaslice_xdot_wtildaslice_global_desc,
+                make_tuple(
+                    PassThrough<N>{},
+                    PassThrough<K>{},
+                    PassThrough<HTildaSlice>{},
+                    PassThrough<WTildaSlice>{},
+                    Slice<Sequence<YDot, XDot>, Sequence<0, 0>, Sequence<YDotSlice, XDotSlice>>{}),
+                make_tuple(
+                    Sequence<0>{}, Sequence<1>{}, Sequence<3>{}, Sequence<5>{}, Sequence<2, 4>{}),
+                make_tuple(
+                    Sequence<0>{}, Sequence<1>{}, Sequence<3>{}, Sequence<5>{}, Sequence<2, 4>{}));
+        constexpr auto out_gemmk_gemmn_global_desc = transform_tensor_descriptor(
+            out_n_k_ydotslice_htildaslice_xdotslice_wtildaslice_global_desc,
+            make_tuple(Merge<Sequence<K, YDotSlice, XDotSlice>>{},
+                       Merge<Sequence<N, HTildaSlice, WTildaSlice>>{}),
+            make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}));
+    return make_tuple(in_gemmk_gemmn_gemmkpack_global_desc);
+}
+#endif
 template <index_t BlockSize>
 struct DummyDynamicTransform_v2_1
 {
@@ -102,13 +213,13 @@ struct DummyDynamicTransform_v2_1
                          const MultiIndex<2> in_right_pads) const
    {
        const auto transformed_tensor_descs =
-            map_convolution_into_gemm_v2(move(wei_k_c_y_x_global_desc),
+            map_convolution_into_gemm(move(wei_k_c_y_x_global_desc),
-                                         move(in_n_c_hi_wi_global_desc),
+                                      move(in_n_c_hi_wi_global_desc),
-                                         move(out_n_k_ho_wo_global_desc),
+                                      move(out_n_k_ho_wo_global_desc),
-                                         conv_strides,
+                                      conv_strides,
-                                         conv_dilations,
+                                      conv_dilations,
-                                         in_left_pads,
+                                      in_left_pads,
-                                         in_right_pads);
+                                      in_right_pads);
        const auto in_gemmk_gemmn_global_desc = transformed_tensor_descs.At(Number<0>{});
@@ -302,31 +413,42 @@ struct DummyDynamicTransform_v2_1
 };
 template <index_t BlockSize>
-struct DummyDynamicTransform_v2_2
+struct DummyDynamicTransform_v2_fwd_v4r4
 {
    template <typename TransformInDesc>
    __device__ void Run(index_t* const __restrict__ p_wei_global,
                        float* const __restrict__ p_in_global,
                        float* const __restrict__ p_out_global,
-                        const TransformInDesc in_gemmk_gemmn_global_desc) const
+                        const TransformInDesc in_gemmk_gemmn_gemmkpack_global_desc) const
    {
-        MultiIndex<2> idx;
+        MultiIndex<3> idx;
        // initialize idx
-        static_for<0, 2, 1>{}([&](auto i) { idx(i) = p_wei_global[get_thread_local_1d_id() + i]; });
+        static_for<0, 3, 1>{}([&](auto i) { idx(i) = p_wei_global[get_thread_local_1d_id() + i]; });
        const index_t niter = p_wei_global[10];
-        auto in_gemmk_gemmn_coord =
+        auto in_gemmk_gemmn_gemmkpack_coord =
-            make_dynamic_tensor_coordinate_v2(in_gemmk_gemmn_global_desc, idx);
+            make_dynamic_tensor_coordinate_v2(in_gemmk_gemmn_gemmkpack_global_desc, idx);
-        const auto in_gemmk_gemmn_coord_step = make_dynamic_tensor_coordinate_step_v2(
+        const auto in_gemmk_gemmn_gemmkpack_coord_step_0_0_1 =
-            in_gemmk_gemmn_global_desc, make_multi_index(1, 0));
+            make_dynamic_tensor_coordinate_step_v2(in_gemmk_gemmn_gemmkpack_global_desc,
+                                                   make_multi_index(0, 0, 1));
+        const auto in_gemmk_gemmn_gemmkpack_coord_step_0_1_0 =
+            make_dynamic_tensor_coordinate_step_v2(in_gemmk_gemmn_gemmkpack_global_desc,
+                                                   make_multi_index(0, 1, 0));
+        const auto in_gemmk_gemmn_gemmkpack_coord_step_1_0_0 =
+            make_dynamic_tensor_coordinate_step_v2(in_gemmk_gemmn_gemmkpack_global_desc,
+                                                   make_multi_index(1, 0, 0));
+        // move (0, 0, 1)
        for(index_t iter = 0; iter < niter; ++iter)
        {
-            move_dynamic_tensor_coordinate_v2(
+            move_dynamic_tensor_coordinate_v2(in_gemmk_gemmn_gemmkpack_global_desc,
-                in_gemmk_gemmn_global_desc, in_gemmk_gemmn_coord, in_gemmk_gemmn_coord_step);
+                                              in_gemmk_gemmn_gemmkpack_coord,
+                                              in_gemmk_gemmn_gemmkpack_coord_step_0_0_1);
            // write
            float value = 1;
@@ -337,19 +459,84 @@ struct DummyDynamicTransform_v2_2
                          AddressSpace::Global,
                          InMemoryDataOperation::Set,
                          1,
-                          1>(&value,
+                          1>(
-                             0,
+                &value,
-                             true,
+                0,
-                             1,
+                true,
-                             p_out_global,
+                1,
-                             in_gemmk_gemmn_coord.GetOffset(),
+                p_out_global,
-#if 0
+                in_gemmk_gemmn_gemmkpack_coord.GetOffset(),
-                             coordinate_has_valid_offset_assuming_visible_index_is_valid(
+#if 1
-                                 in_gemmk_gemmn_global_desc, in_gemmk_gemmn_coord),
+                coordinate_has_valid_offset_assuming_visible_index_is_valid(
+                    in_gemmk_gemmn_gemmkpack_global_desc, in_gemmk_gemmn_gemmkpack_coord),
 #else
-                             true,
+                true,
 #endif
-                             in_gemmk_gemmn_global_desc.GetElementSpaceSize());
+                in_gemmk_gemmn_gemmkpack_global_desc.GetElementSpaceSize());
+        }
+        // move (0, 1, 0)
+        for(index_t iter = 0; iter < niter; ++iter)
+        {
+            move_dynamic_tensor_coordinate_v2(in_gemmk_gemmn_gemmkpack_global_desc,
+                                              in_gemmk_gemmn_gemmkpack_coord,
+                                              in_gemmk_gemmn_gemmkpack_coord_step_0_1_0);
+            // write
+            float value = 1;
+            transfer_data<float,
+                          1,
+                          AddressSpace::Vgpr,
+                          AddressSpace::Global,
+                          InMemoryDataOperation::Set,
+                          1,
+                          1>(
+                &value,
+                0,
+                true,
+                1,
+                p_out_global,
+                in_gemmk_gemmn_gemmkpack_coord.GetOffset(),
+#if 1
+                coordinate_has_valid_offset_assuming_visible_index_is_valid(
+                    in_gemmk_gemmn_gemmkpack_global_desc, in_gemmk_gemmn_gemmkpack_coord),
+#else
+                true,
+#endif
+                in_gemmk_gemmn_gemmkpack_global_desc.GetElementSpaceSize());
+        }
+        // move (1, 0, 0)
+        for(index_t iter = 0; iter < niter; ++iter)
+        {
+            move_dynamic_tensor_coordinate_v2(in_gemmk_gemmn_gemmkpack_global_desc,
+                                              in_gemmk_gemmn_gemmkpack_coord,
+                                              in_gemmk_gemmn_gemmkpack_coord_step_1_0_0);
+            // write
+            float value = 1;
+            transfer_data<float,
+                          1,
+                          AddressSpace::Vgpr,
+                          AddressSpace::Global,
+                          InMemoryDataOperation::Set,
+                          1,
+                          1>(
+                &value,
+                0,
+                true,
+                1,
+                p_out_global,
+                in_gemmk_gemmn_gemmkpack_coord.GetOffset(),
+#if 1
+                coordinate_has_valid_offset_assuming_visible_index_is_valid(
+                    in_gemmk_gemmn_gemmkpack_global_desc, in_gemmk_gemmn_gemmkpack_coord),
+#else
+                true,
+#endif
+                in_gemmk_gemmn_gemmkpack_global_desc.GetElementSpaceSize());
        }
    }
 };

--- a/composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp
+++ b/composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp
@@ -371,7 +371,7 @@ struct DynamicMerge
    __host__ __device__ constexpr DynamicMerge(const LowerIndex& low_lengths)
        : low_lengths_{low_lengths},
-          low_lengths_scan_{container_reverse_exclusive_scan<index_t, NDimLow>(
+          low_lengths_scan_{container_reverse_exclusive_scan(
              low_lengths, math::multiplies<index_t>{}, index_t{1})},
          up_lengths_{make_multi_index(
              container_reduce(low_lengths, math::multiplies<index_t>(), index_t{1}))}
@@ -536,7 +536,7 @@ struct DynamicUnMerge
    __host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low,
                                                           const UpIdx& idx_up) const
    {
-        idx_low(Number<0>{}) = idx_up[Number<NDimUp>{}];
+        idx_low(Number<0>{}) = idx_up[Number<NDimUp - 1>{}];
        static_for<0, NDimUp - 1, 1>{}(
            [&](auto i) { idx_low(Number<0>{}) += idx_up[i] * up_lengths_scan_[i]; });

--- a/composable_kernel/include/tensor_description/dynamic_tensor_descriptor_v2.hpp
+++ b/composable_kernel/include/tensor_description/dynamic_tensor_descriptor_v2.hpp
@@ -445,7 +445,8 @@ make_dynamic_tensor_coordinate_v2(const TensorDesc& tensor_desc, const VisibleIn
    idx_hidden_pick_visible      = idx_visible;
    // calculate hidden index
-    static_for<ntransform - 1, -1, -1>{}([&tensor_desc, &idx_hidden](auto itran) {
+    static_for<ntransform, 0, -1>{}([&tensor_desc, &idx_hidden](auto itran_p1) {
+        auto itran              = itran_p1 - Number<1>{};
        const auto& tran        = tensor_desc.GetTransforms().At(itran);
        constexpr auto dims_low = TensorDesc::GetLowerDimensionIdss().At(itran);
        constexpr auto dims_up  = TensorDesc::GetUpperDimensionIdss().At(itran);

--- a/composable_kernel/include/utility/array_helper.hpp
+++ b/composable_kernel/include/utility/array_helper.hpp
@@ -112,11 +112,17 @@ container_reverse_exclusive_scan(const Array<TData, NSize>& x, Reduce f, TData i
    return y;
 }
-template <typename TData, index_t NSize, typename Reduce>
+// Here should use StaticallyIndexedArray<TData, NSize>, instead of Tuple<Xs...>,
-__host__ __device__ constexpr auto container_reverse_exclusive_scan(
+// although the former is the alias of the latter. This is because compiler cannot
-    const StaticallyIndexedArray<TData, NSize>& x, Reduce f, TData init)
+// infer the NSize if using StaticallyIndexedArray<TData, NSize>
+// TODO: how to fix this?
+template <typename... Xs, typename Reduce, typename TData>
+__host__ __device__ constexpr auto
+container_reverse_exclusive_scan(const Tuple<Xs...>& x, Reduce f, TData init)
 {
-    StaticallyIndexedArray<TData, NSize> y;
+    constexpr index_t NSize = sizeof...(Xs);
+    Tuple<Xs...> y;
    TData r = init;
@@ -130,11 +136,13 @@ __host__ __device__ constexpr auto container_reverse_exclusive_scan(
    return y;
 }
-template <typename TData, index_t NSize, typename Reduce>
+template <typename... Xs, typename Reduce, typename TData>
-__host__ __device__ constexpr auto container_reverse_inclusive_scan(
+__host__ __device__ constexpr auto
-    const StaticallyIndexedArray<TData, NSize>& x, Reduce f, TData init)
+container_reverse_inclusive_scan(const Tuple<Xs...>& x, Reduce f, TData init)
 {
-    StaticallyIndexedArray<TData, NSize> y;
+    constexpr index_t NSize = sizeof...(Xs);
+    Tuple<Xs...> y;
    TData r = init;

--- a/composable_kernel/include/utility/config.amd.hpp.in
+++ b/composable_kernel/include/utility/config.amd.hpp.in
@@ -9,7 +9,7 @@
 #define CK_UNSIGNED_INDEX_TYPE 0
 // multi index
-#define CK_USE_DYNAMICALLY_INDEXED_MULTI_INDEX 1
+#define CK_USE_DYNAMICALLY_INDEXED_MULTI_INDEX 0
 // device backend
 #define CK_DEVICE_BACKEND_AMD 1

--- a/composable_kernel/include/utility/print.hpp
+++ b/composable_kernel/include/utility/print.hpp
@@ -14,6 +14,7 @@ __host__ __device__ void print_array(const char* s, T a)
    using data_type         = decltype(a.At(Number<0>{}));
    constexpr index_t nsize = a.Size();
+#if 0
    if constexpr(is_same<data_type, uint32_t>{})
    {
        printf("%s size %u, {", s, nsize);
@@ -32,6 +33,11 @@ __host__ __device__ void print_array(const char* s, T a)
        static_for<0, nsize, 1>{}([&a](auto i) constexpr { printf("%d, ", bool{a[i]}); });
        printf("}\n");
    }
+#else
+    printf("%s size %d, {", s, nsize);
+    static_for<0, nsize, 1>{}([&a](auto i) constexpr { printf("%d, ", int32_t{a[i]}); });
+    printf("}\n");
+#endif
 }
 template <typename T>
@@ -40,6 +46,7 @@ __host__ __device__ void print_array_v2(const char* s, T a)
    using data_type         = decltype(a.At(Number<0>{}));
    constexpr index_t nsize = a.Size();
+#if 0
    if constexpr(is_same<data_type, uint32_t>{})
    {
        printf("%s size %u, {", s, nsize);
@@ -52,6 +59,11 @@ __host__ __device__ void print_array_v2(const char* s, T a)
        static_for<0, nsize, 1>{}([&a](auto i) constexpr { printf("[%d] %d, ", i.value, a[i]); });
        printf("}\n");
    }
+#else
+    printf("%s size %d, {", s, nsize);
+    static_for<0, nsize, 1>{}([&a](auto i) constexpr { printf("[%d] %d, ", i.value, a[i]); });
+    printf("}\n");
+#endif
 }
 } // namespace ck

--- a/driver/include/device_dummy_dynamic_transform_v2.hpp
+++ b/driver/include/device_dummy_dynamic_transform_v2.hpp
@@ -40,36 +40,89 @@ void device_dummy_dynamic_transform_v2(InDesc,
    const auto in_left_pads   = to_multi_index(InLeftPads{});
    const auto in_right_pads  = to_multi_index(InRightPads{});
-    const auto tensor_descs = map_convolution_into_gemm_v2(wei_kcyx_desc,
+    const auto tensor_descs = map_convolution_into_gemm_fwd_v4r4(wei_kcyx_desc,
-                                                           in_nchw_desc,
+                                                                 in_nchw_desc,
-                                                           out_nkhw_desc,
+                                                                 out_nkhw_desc,
-                                                           conv_strides,
+                                                                 conv_strides,
-                                                           conv_dilations,
+                                                                 conv_dilations,
-                                                           in_left_pads,
+                                                                 in_left_pads,
-                                                           in_right_pads);
+                                                                 in_right_pads);
-    const auto in_gemmk_gemmn_global_desc = tensor_descs.At(Number<0>{});
+    const auto in_gemmk_gemmn_gemmkpack_global_desc = tensor_descs.At(Number<0>{});
    // test on cpu
    {
-        auto in_gemmk_gemmn_coord =
+        auto in_gemmk_gemmn_gemmkpack_coord = make_dynamic_tensor_coordinate_v2(
-            make_dynamic_tensor_coordinate_v2(in_gemmk_gemmn_global_desc, make_multi_index(0, 0));
+            in_gemmk_gemmn_gemmkpack_global_desc, make_multi_index(0, 0, 0));
-        const auto in_gemmk_gemmn_coord_step = make_dynamic_tensor_coordinate_step_v2(
+        const auto in_gemmk_gemmn_gemmkpack_coord_step_0_0_1 =
-            in_gemmk_gemmn_global_desc, make_multi_index(1, 0));
+            make_dynamic_tensor_coordinate_step_v2(in_gemmk_gemmn_gemmkpack_global_desc,
+                                                   make_multi_index(0, 0, 1));
-        print_array("do_tansforms: ", in_gemmk_gemmn_coord_step.do_transforms_);
+        print_array_v2("do_tansforms 0 0 1: ",
+                       in_gemmk_gemmn_gemmkpack_coord_step_0_0_1.do_transforms_);
        for(index_t iter = 0; iter < 10; ++iter)
        {
            printf("iter %d\n", iter);
-            print_array("idx: ", in_gemmk_gemmn_coord.GetIndex());
+            print_array_v2("idx: ", in_gemmk_gemmn_gemmkpack_coord.GetIndex());
-            print_array("hidden idx: ", in_gemmk_gemmn_coord.GetHiddenIndex());
+            print_array_v2("hidden idx: ", in_gemmk_gemmn_gemmkpack_coord.GetHiddenIndex());
-            printf("offset: %d\n", in_gemmk_gemmn_coord.GetOffset());
+            printf("offset: %d\n", in_gemmk_gemmn_gemmkpack_coord.GetOffset());
            printf("\n");
-            move_dynamic_tensor_coordinate_v2(
+            move_dynamic_tensor_coordinate_v2(in_gemmk_gemmn_gemmkpack_global_desc,
-                in_gemmk_gemmn_global_desc, in_gemmk_gemmn_coord, in_gemmk_gemmn_coord_step);
+                                              in_gemmk_gemmn_gemmkpack_coord,
+                                              in_gemmk_gemmn_gemmkpack_coord_step_0_0_1);
+        }
+    }
+    {
+        auto in_gemmk_gemmn_gemmkpack_coord = make_dynamic_tensor_coordinate_v2(
+            in_gemmk_gemmn_gemmkpack_global_desc, make_multi_index(0, 0, 0));
+        const auto in_gemmk_gemmn_gemmkpack_coord_step_0_1_0 =
+            make_dynamic_tensor_coordinate_step_v2(in_gemmk_gemmn_gemmkpack_global_desc,
+                                                   make_multi_index(0, 1, 0));
+        print_array_v2("do_tansforms 0 1 0: ",
+                       in_gemmk_gemmn_gemmkpack_coord_step_0_1_0.do_transforms_);
+        for(index_t iter = 0; iter < 10; ++iter)
+        {
+            printf("iter %d\n", iter);
+            print_array_v2("idx: ", in_gemmk_gemmn_gemmkpack_coord.GetIndex());
+            print_array_v2("hidden idx: ", in_gemmk_gemmn_gemmkpack_coord.GetHiddenIndex());
+            printf("offset: %d\n", in_gemmk_gemmn_gemmkpack_coord.GetOffset());
+            printf("\n");
+            move_dynamic_tensor_coordinate_v2(in_gemmk_gemmn_gemmkpack_global_desc,
+                                              in_gemmk_gemmn_gemmkpack_coord,
+                                              in_gemmk_gemmn_gemmkpack_coord_step_0_1_0);
+        }
+    }
+    {
+        auto in_gemmk_gemmn_gemmkpack_coord = make_dynamic_tensor_coordinate_v2(
+            in_gemmk_gemmn_gemmkpack_global_desc, make_multi_index(0, 0, 0));
+        const auto in_gemmk_gemmn_gemmkpack_coord_step_1_0_0 =
+            make_dynamic_tensor_coordinate_step_v2(in_gemmk_gemmn_gemmkpack_global_desc,
+                                                   make_multi_index(1, 0, 0));
+        print_array_v2("do_tansforms 1 0 0: ",
+                       in_gemmk_gemmn_gemmkpack_coord_step_1_0_0.do_transforms_);
+        for(index_t iter = 0; iter < 10; ++iter)
+        {
+            printf("iter %d\n", iter);
+            print_array_v2("idx: ", in_gemmk_gemmn_gemmkpack_coord.GetIndex());
+            print_array_v2("hidden idx: ", in_gemmk_gemmn_gemmkpack_coord.GetHiddenIndex());
+            printf("offset: %d\n", in_gemmk_gemmn_gemmkpack_coord.GetOffset());
+            printf("\n");
+            move_dynamic_tensor_coordinate_v2(in_gemmk_gemmn_gemmkpack_global_desc,
+                                              in_gemmk_gemmn_gemmkpack_coord,
+                                              in_gemmk_gemmn_gemmkpack_coord_step_1_0_0);
        }
    }
@@ -123,19 +176,20 @@ void device_dummy_dynamic_transform_v2(InDesc,
                          in_left_pads,
                          in_right_pads);
 #else
-            launch_kernel(run_gridwise_operation<DummyDynamicTransform_v2_2<BlockSize>,
+            launch_kernel(
-                                                 index_t* const,
+                run_gridwise_operation<DummyDynamicTransform_v2_fwd_v4r4<BlockSize>,
-                                                 float* const,
+                                       index_t* const,
-                                                 float* const,
+                                       float* const,
-                                                 const decltype(in_gemmk_gemmn_global_desc)>,
+                                       float* const,
-                          dim3(GridSize),
+                                       const decltype(in_gemmk_gemmn_gemmkpack_global_desc)>,
-                          dim3(BlockSize),
+                dim3(GridSize),
-                          0,
+                dim3(BlockSize),
-                          0,
+                0,
-                          static_cast<index_t*>(wei_kcyx_device_buf.GetDeviceBuffer()),
+                0,
-                          static_cast<float*>(in_nchw_device_buf.GetDeviceBuffer()),
+                static_cast<index_t*>(wei_kcyx_device_buf.GetDeviceBuffer()),
-                          static_cast<float*>(out_nkhw_device_buf.GetDeviceBuffer()),
+                static_cast<float*>(in_nchw_device_buf.GetDeviceBuffer()),
-                          in_gemmk_gemmn_global_desc);
+                static_cast<float*>(out_nkhw_device_buf.GetDeviceBuffer()),
+                in_gemmk_gemmn_gemmkpack_global_desc);
 #endif
        }
    }

--- a/driver/src/conv_driver.cpp
+++ b/driver/src/conv_driver.cpp
@@ -549,7 +549,7 @@ int main(int argc, char* argv[])
 #endif
    }
-#if 1
+#if 0
    device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc,
                                                         in_nchw,
                                                         wei_kcyx_desc,
@@ -573,7 +573,7 @@ int main(int argc, char* argv[])
                                                         LeftPads{},
                                                         RightPads{},
                                                         nrepeat);
-#elif 1
+#elif 0
    device_dummy_static_transform(in_nchw_desc,
                                  in_nchw,
                                  wei_kcyx_desc,