gaoqiong / composable_kernel

Commit fd368ca6, authored Sep 13, 2021 by Jing Zhang

    seperate c2

parent 7802381d

Showing 6 changed files with 211 additions and 143 deletions
composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v3.hpp    +2 -1
composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v2.hpp     +73 -74
composable_kernel/include/tensor_operation/threadwise_gemm_dlops_v3.hpp   +70 -13
host/driver_offline/include/device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp    +6 -6
host/driver_offline/include/driver_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw_outpad.hpp    +59 -48
script/cmake-rocm.sh    +1 -1
composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v3.hpp

...
@@ -26,6 +26,7 @@ struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v3
    struct MatrixIndex
    {
        index_t k;
        index_t n;
        index_t h;
        index_t w;
    };
...
@@ -104,7 +105,7 @@ struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v3
        index_t h_thread_id = hw_thread_id / num_w_threads;
        index_t w_thread_id = hw_thread_id % num_w_threads;

        return MatrixIndex{k_thread_id, h_thread_id, w_thread_id};
        return MatrixIndex{k_thread_id, 1, h_thread_id, w_thread_id};
    }

    template <typename ABlockBuffer, typename BThreadBuffer, typename CThreadBuffer>
...
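The two hunks above widen MatrixIndex with an n field and make the thread-origin calculation fill it with a constant 1, while the division/modulo mapping from a flat thread id to (k, h, w) stays unchanged. A minimal standalone sketch of that mapping (num_hw_threads and num_w_threads are placeholder values here; in the kernel they come from the block's thread layout):

    #include <cstdio>

    using index_t = int;

    struct MatrixIndex
    {
        index_t k;
        index_t n; // added by this commit; filled with a constant 1 for now
        index_t h;
        index_t w;
    };

    // Hypothetical decomposition of a flat thread id into (k, h, w) tile
    // coordinates, mirroring the division/modulo in the hunk above.
    MatrixIndex make_thread_index(index_t thread_id, index_t num_hw_threads, index_t num_w_threads)
    {
        const index_t k_thread_id  = thread_id / num_hw_threads;
        const index_t hw_thread_id = thread_id % num_hw_threads;

        const index_t h_thread_id = hw_thread_id / num_w_threads;
        const index_t w_thread_id = hw_thread_id % num_w_threads;

        return MatrixIndex{k_thread_id, 1, h_thread_id, w_thread_id};
    }

    int main()
    {
        // e.g. 2 k-threads x (4 h-threads x 8 w-threads) = 64 threads
        const MatrixIndex idx = make_thread_index(42, 32, 8);
        std::printf("k=%d n=%d h=%d w=%d\n", idx.k, idx.n, idx.h, idx.w);
    }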
composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v2.hpp

...
@@ -352,73 +352,34 @@ struct GridwiseGemmDlops_km_kn_mn_v3
        index_t e0_block_data_begin = 0;

        do
        // do
        //{
        // LDS double buffer: preload data
        {
        // LDS double buffer: preload data
        {
            a_blockwise_copy.RunRead(a_e0_e1_k_e2_global_desc, a_global_buf, a_e0_e1_k_e2_global_step_hacks);

            b_threadwise_transfer.Run(b_e0_e1_n_ho_wo_e2_global_desc,
                                      b_global_buf,
                                      b_e0_e1_n_ho_wo_e2_thread_desc,
                                      make_tuple(I0, I0, I0, I0, I0, I0),
                                      b_thread_even_buf,
                                      b_e0_e1_n_ho_wo_e2_global_step_hacks);

            a_blockwise_copy.RunWrite(a_e0_e1_k_e2_block_desc, a_block_buf);
        }

        __syncthreads();

        if constexpr(HasMainKBlockLoop)
        {
            index_t e1_block_data_begin = 0;

            a_blockwise_copy.RunRead(a_e0_e1_k_e2_global_desc, a_global_buf, a_e0_e1_k_e2_global_step_hacks);

            // LDS double buffer: main body
            // use Do-While loop instead of For loop to simplify control flow
            do
            {
                // even iteration
                b_threadwise_transfer.MoveSrcSliceWindow(b_e0_e1_n_ho_wo_e2_global_desc,
                                                         b_thread_slice_copy_step);

                b_threadwise_transfer.Run(b_e0_e1_n_ho_wo_e2_global_desc,
                                          b_global_buf,
                                          b_e0_e1_n_ho_wo_e2_thread_desc,
                                          make_tuple(I0, I0, I0, I0, I0, I0),
                                          b_thread_even_buf,
                                          b_e0_e1_n_ho_wo_e2_global_step_hacks);

                b_threadwise_transfer.Run(b_e0_e1_n_ho_wo_e2_global_desc,
                                          b_global_buf,
                                          b_e0_e1_n_ho_wo_e2_thread_desc,
                                          make_tuple(I0, I0, I0, I0, I0, I0),
                                          b_thread_odd_buf,
                                          b_e0_e1_n_ho_wo_e2_global_step_hacks);

                // LDS double buffer: GEMM on current data
                // TODO: @Zhang Jing: blockwise gemm should be able to move slice window
                blockwise_gemm.Run(a_block_buf, b_thread_even_buf, c_thread_buf);

                blockwise_gemm.MoveABlockSliceWindow(make_tuple(EPerBlock, 0, 0));

                b_threadwise_transfer.MoveSrcSliceWindow(b_e0_e1_n_ho_wo_e2_global_desc,
                                                         b_thread_slice_copy_step);

                b_threadwise_transfer.Run(b_e0_e1_n_ho_wo_e2_global_desc,
                                          b_global_buf,
                                          b_e0_e1_n_ho_wo_e2_thread_desc,
                                          make_tuple(I0, I0, I0, I0, I0, I0),
                                          b_thread_even_buf,
                                          b_e0_e1_n_ho_wo_e2_global_step_hacks);

                // LDS double buffer: GEMM on current data
                blockwise_gemm.Run(a_block_buf, b_thread_odd_buf, c_thread_buf);

                blockwise_gemm.MoveABlockSliceWindow(make_tuple(EPerBlock, 0, 0));

                a_blockwise_copy.RunWrite(a_e0_e1_k_e2_block_desc, a_block_buf);
            }
            e1_block_data_begin += 2 * EPerBlock;

            __syncthreads();
        }
        while(e1_block_data_begin < E1 - 2 * EPerBlock);
    }

        if constexpr(HasMainKBlockLoop)
        {
            index_t e1_block_data_begin = 0;

        // LDS double buffer: tail
        if constexpr(HasDoubleTailKBlockLoop)
        // if has 2 iteration left
            // LDS double buffer: main body
            // use Do-While loop instead of For loop to simplify control flow
            do
            {
                // even iteration
                b_threadwise_transfer.MoveSrcSliceWindow(b_e0_e1_n_ho_wo_e2_global_desc,
                                                         b_thread_slice_copy_step);
...
@@ -429,32 +390,70 @@ struct GridwiseGemmDlops_km_kn_mn_v3
                                          b_thread_odd_buf,
                                          b_e0_e1_n_ho_wo_e2_global_step_hacks);

                // LDS double buffer: GEMM on 2nd-last data
                // LDS double buffer: GEMM on current data
                blockwise_gemm.Run(a_block_buf, b_thread_even_buf, c_thread_buf);

                blockwise_gemm.MoveABlockSliceWindow(make_tuple(EPerBlock, 0, 0));

                // LDS double buffer: GEMM on last data
                b_threadwise_transfer.MoveSrcSliceWindow(b_e0_e1_n_ho_wo_e2_global_desc,
                                                         b_thread_slice_copy_step);

                b_threadwise_transfer.Run(b_e0_e1_n_ho_wo_e2_global_desc,
                                          b_global_buf,
                                          b_e0_e1_n_ho_wo_e2_thread_desc,
                                          make_tuple(I0, I0, I0, I0, I0, I0),
                                          b_thread_even_buf,
                                          b_e0_e1_n_ho_wo_e2_global_step_hacks);

                // LDS double buffer: GEMM on current data
                blockwise_gemm.Run(a_block_buf, b_thread_odd_buf, c_thread_buf);
            }
            else
            // if has 1 iteration left
            {
                // LDS double buffer: GEMM on last data
                blockwise_gemm.Run(a_block_buf, b_thread_even_buf, c_thread_buf);
            }

            a_blockwise_copy.MoveSrcSliceWindow(a_e0_e1_k_e2_global_desc,
                                                a_block_slice_copy_step,
                                                AGlobalMoveSliceWindowStepHacks{});

            blockwise_gemm.MoveABlockSliceWindow(make_tuple(EPerBlock, 0, 0));

            e1_block_data_begin += 2 * EPerBlock;

            blockwise_gemm.MoveABlockSliceWindow(make_tuple(-(E1 - EPerBlock), 0, 0));
        }
        while(e1_block_data_begin < E1 - 2 * EPerBlock);
    }

        // LDS double buffer: tail
        if constexpr(HasDoubleTailKBlockLoop)
        // if has 2 iteration left
        {
            b_threadwise_transfer.MoveSrcSliceWindow(b_e0_e1_n_ho_wo_e2_global_desc,
                                                     b_thread_slice_copy_step);

            e0_block_data_begin += 1;

            b_threadwise_transfer.Run(b_e0_e1_n_ho_wo_e2_global_desc,
                                      b_global_buf,
                                      b_e0_e1_n_ho_wo_e2_thread_desc,
                                      make_tuple(I0, I0, I0, I0, I0, I0),
                                      b_thread_odd_buf,
                                      b_e0_e1_n_ho_wo_e2_global_step_hacks);

            // LDS double buffer: GEMM on 2nd-last data
            blockwise_gemm.Run(a_block_buf, b_thread_even_buf, c_thread_buf);

            blockwise_gemm.MoveABlockSliceWindow(make_tuple(EPerBlock, 0, 0));

            // LDS double buffer: GEMM on last data
            blockwise_gemm.Run(a_block_buf, b_thread_odd_buf, c_thread_buf);
        }
        else
        // if has 1 iteration left
        {
            // LDS double buffer: GEMM on last data
            blockwise_gemm.Run(a_block_buf, b_thread_even_buf, c_thread_buf);
        }

        // a_blockwise_copy.MoveSrcSliceWindow(a_e0_e1_k_e2_global_desc,
        //                                     a_block_slice_copy_step,
        //                                     AGlobalMoveSliceWindowStepHacks{});

        // blockwise_gemm.MoveABlockSliceWindow(make_tuple(-(E1 - EPerBlock), 0, 0));

        // b_threadwise_transfer.MoveSrcSliceWindow(b_e0_e1_n_ho_wo_e2_global_desc,
        //                                          b_thread_slice_copy_step);

        // e0_block_data_begin += 1;
        }
        while(e0_block_data_begin < E0);
        //
        // } while(e0_block_data_begin < E0);

        // output: register to global memory
        {
...
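The even/odd b_thread buffers above implement double buffering over the E1 reduction dimension: while the blockwise GEMM consumes one buffer, the threadwise transfer fills the other, and a do-while loop replaces a for loop to simplify control flow, with a two-iteration or one-iteration tail handled separately. A minimal host-side sketch of the same ping-pong structure (load and gemm are stand-ins for the kernel's transfer and GEMM classes, and this shows only the sequencing, not the actual memory-latency overlap; E1 and EPerBlock values are made up):

    #include <array>
    #include <cstdio>

    constexpr int E1 = 8;        // total reduction iterations (assumed even multiple of EPerBlock)
    constexpr int EPerBlock = 1; // iterations consumed per GEMM step

    // stand-in for b_threadwise_transfer.Run: fetch slice e into a buffer
    void load(std::array<float, 4>& buf, int e) { buf.fill(float(e)); }
    // stand-in for blockwise_gemm.Run: consume one buffer
    void gemm(const std::array<float, 4>& buf) { std::printf("gemm on e=%g\n", buf[0]); }

    int main()
    {
        std::array<float, 4> even_buf{}, odd_buf{};

        // preload first slice into the "even" buffer
        load(even_buf, 0);

        int e = 0;
        // main body: each trip consumes two slices, filling one buffer
        // while computing on the other (same shape as the kernel's do-while)
        do
        {
            load(odd_buf, e + EPerBlock);       // fetch next slice
            gemm(even_buf);                     // compute on current slice

            load(even_buf, e + 2 * EPerBlock);  // fetch the slice after that
            gemm(odd_buf);

            e += 2 * EPerBlock;
        } while(e < E1 - 2 * EPerBlock);

        // tail: two slices remain (the HasDoubleTailKBlockLoop case)
        load(odd_buf, e + EPerBlock);
        gemm(even_buf);
        gemm(odd_buf);
    }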
composable_kernel/include/tensor_operation/threadwise_gemm_dlops_v3.hpp

...
@@ -64,41 +64,98 @@ struct ThreadwiseGemmDlops_km_kn_mn_v3
        constexpr auto K  = AThreadDesc_E1_K_E2{}.GetLength(I1);
        constexpr auto E2 = AThreadDesc_E1_K_E2{}.GetLength(I2);

        constexpr auto H = BThreadDesc_E1_N_Ho_Wo_E2{}.GetLength(I2);
        constexpr auto W = BThreadDesc_E1_N_Ho_Wo_E2{}.GetLength(I3);

        constexpr auto Ho = BThreadDesc_E1_N_Ho_Wo_E2{}.GetLength(I2);
        constexpr auto Wo = BThreadDesc_E1_N_Ho_Wo_E2{}.GetLength(I3);

        constexpr auto a_origin_idx = to_multi_index(AOriginIdx{});
        constexpr auto b_origin_idx = to_multi_index(BOriginIdx{});
        constexpr auto c_origin_idx = to_multi_index(COriginIdx{});

        constexpr index_t Vec = 2;

        static_for<0, K, 1>{}([&](auto k) {
            static_for<0, H, 1>{}([&](auto h) {
                static_for<0, W, 1>{}([&](auto w) {
                    static_for<0, E1, 1>{}([&](auto e) {
            static_for<0, E1, 1>{}([&](auto e) {
                static_for<0, Ho, Vec>{}([&](auto h) {
                    static_for<0, Wo, Vec>{}([&](auto w) {
                        vector_type<FloatA, E2> a_vec;
                        vector_type<FloatB, E2> b_vec;
                        vector_type<FloatB, E2> b0_vec;
                        vector_type<FloatB, E2> b1_vec;
                        vector_type<FloatB, E2> b2_vec;
                        vector_type<FloatB, E2> b3_vec;

                        static_for<0, E2, 1>{}([&](auto v) {
                            constexpr index_t a_offset = AThreadDesc_E1_K_E2{}.CalculateOffset(
                                a_origin_idx + make_tuple(e, k, v));

                            constexpr index_t b_offset =
                            constexpr index_t b0_offset = BThreadDesc_E1_N_Ho_Wo_E2{}.CalculateOffset(
                                b_origin_idx + make_tuple(e, 0, h, w, v));

                            constexpr index_t b1_offset = BThreadDesc_E1_N_Ho_Wo_E2{}.CalculateOffset(
                                b_origin_idx + make_tuple(e, 0, h, w + 1, v));

                            constexpr index_t b2_offset = BThreadDesc_E1_N_Ho_Wo_E2{}.CalculateOffset(
                                b_origin_idx + make_tuple(e, 0, h + 1, w, v));

                            constexpr index_t b3_offset = BThreadDesc_E1_N_Ho_Wo_E2{}.CalculateOffset(
                                b_origin_idx + make_tuple(e, 0, h + 1, w + 1, v));

                            a_vec.template AsType<FloatA>()(v) = a_buf[Number<a_offset>{}];
                            b_vec.template AsType<FloatB>()(v) = b_buf[Number<b_offset>{}];
                            b0_vec.template AsType<FloatB>()(v) = b_buf[Number<b0_offset>{}];
                            b1_vec.template AsType<FloatB>()(v) = b_buf[Number<b1_offset>{}];
                            b2_vec.template AsType<FloatB>()(v) = b_buf[Number<b2_offset>{}];
                            b3_vec.template AsType<FloatB>()(v) = b_buf[Number<b3_offset>{}];
                        });

                        using a_vector_t = typename vector_type<FloatA, E2>::type;
                        using b_vector_t = typename vector_type<FloatB, E2>::type;

                        constexpr index_t c_offset = CThreadDesc_K_N_Ho_Wo{}.CalculateOffset(
                        constexpr index_t c0_offset = CThreadDesc_K_N_Ho_Wo{}.CalculateOffset(
                            c_origin_idx + make_tuple(k, 0, h, w));

                        inner_product<a_vector_t, b_vector_t, FloatC>(
                            a_vec.template AsType<a_vector_t>()[I0],
                            b_vec.template AsType<b_vector_t>()[I0],
                            c_buf(Number<c_offset>{}));

                        constexpr index_t c1_offset = CThreadDesc_K_N_Ho_Wo{}.CalculateOffset(
                            c_origin_idx + make_tuple(k, 0, h, w + 1));

                        constexpr index_t c2_offset = CThreadDesc_K_N_Ho_Wo{}.CalculateOffset(
                            c_origin_idx + make_tuple(k, 0, h + 1, w));

                        constexpr index_t c3_offset = CThreadDesc_K_N_Ho_Wo{}.CalculateOffset(
                            c_origin_idx + make_tuple(k, 0, h + 1, w + 1));

                        amd_assembly_outer_product_1x4(a_vec.template AsType<a_vector_t>()[I0],
                                                       b0_vec.template AsType<b_vector_t>()[I0],
                                                       b1_vec.template AsType<b_vector_t>()[I0],
                                                       b2_vec.template AsType<b_vector_t>()[I0],
                                                       b3_vec.template AsType<b_vector_t>()[I0],
                                                       c_buf(Number<c0_offset>{}),
                                                       c_buf(Number<c1_offset>{}),
                                                       c_buf(Number<c2_offset>{}),
                                                       c_buf(Number<c3_offset>{}));

                        // inner_product<a_vector_t, b_vector_t, FloatC>(
                        //     a_vec.template AsType<a_vector_t>()[I0],
                        //     b0_vec.template AsType<b_vector_t>()[I0],
                        //     c_buf(Number<c0_offset>{}));

                        // inner_product<a_vector_t, b_vector_t, FloatC>(
                        //     a_vec.template AsType<a_vector_t>()[I0],
                        //     b1_vec.template AsType<b_vector_t>()[I0],
                        //     c_buf(Number<c1_offset>{}));

                        // inner_product<a_vector_t, b_vector_t, FloatC>(
                        //     a_vec.template AsType<a_vector_t>()[I0],
                        //     b2_vec.template AsType<b_vector_t>()[I0],
                        //     c_buf(Number<c2_offset>{}));

                        // inner_product<a_vector_t, b_vector_t, FloatC>(
                        //     a_vec.template AsType<a_vector_t>()[I0],
                        //     b3_vec.template AsType<b_vector_t>()[I0],
                        //     c_buf(Number<c3_offset>{}));
                    });
                });
            });
...
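The rewrite above steps h and w by Vec = 2, gathers the four B vectors of a 2x2 (h, w) output tile (b0..b3 at offsets (h, w), (h, w+1), (h+1, w), (h+1, w+1)), and accumulates them in one amd_assembly_outer_product_1x4 call instead of four separate inner_product calls, as the commented-out fallbacks show. A scalar reference for the intended arithmetic (my own stand-in; the real routine is a packed-vector AMD inline-assembly path):

    #include <cstdio>

    // Reference semantics of one 1x4 outer-product step: one A vector against
    // the four B vectors of a 2x2 (h, w) output tile, accumulating into c0..c3.
    // E2 is the per-element reduction (vector) length.
    template <int E2>
    void outer_product_1x4_ref(const float (&a)[E2],
                               const float (&b0)[E2], const float (&b1)[E2],
                               const float (&b2)[E2], const float (&b3)[E2],
                               float& c0, float& c1, float& c2, float& c3)
    {
        for(int v = 0; v < E2; ++v)
        {
            c0 += a[v] * b0[v]; // (h,     w)
            c1 += a[v] * b1[v]; // (h,     w + 1)
            c2 += a[v] * b2[v]; // (h + 1, w)
            c3 += a[v] * b3[v]; // (h + 1, w + 1)
        }
    }

    int main()
    {
        float a[4]  = {1, 2, 3, 4};
        float b0[4] = {1, 0, 0, 0}, b1[4] = {0, 1, 0, 0};
        float b2[4] = {0, 0, 1, 0}, b3[4] = {0, 0, 0, 1};
        float c0 = 0, c1 = 0, c2 = 0, c3 = 0;

        outer_product_1x4_ref(a, b0, b1, b2, b3, c0, c1, c2, c3);
        std::printf("%g %g %g %g\n", c0, c1, c2, c3); // prints: 1 2 3 4
    }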
host/driver_offline/include/device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp

...
@@ -49,7 +49,7 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw(
    const auto Y = wei_k_c_y_x_lengths[I2];
    const auto X = wei_k_c_y_x_lengths[I3];

#if 0
#if 1
    const auto C0 = C / Number<InWeiVectorSize>{};
    const auto C1 = Number<InWeiVectorSize>{};
...
@@ -105,17 +105,17 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw(
    constexpr index_t HoPerBlock = 8;
    constexpr index_t WoPerBlock = 32;

    constexpr index_t E1        = 4 * 9;
    constexpr index_t E2        = 4;
    constexpr index_t EPerBlock = 4;

    constexpr index_t E1        = 2 * 9;
    constexpr index_t E2        = 8;
    constexpr index_t EPerBlock = 2;

    constexpr index_t KPerThread  = KPerBlock;
    constexpr index_t HoPerThread = 2;
    constexpr index_t WoPerThread = 2;
    constexpr index_t EPerThread  = 1;

    using ABlockTransferThreadSliceLengths_E0_E1_K_E2   = Sequence<1, 9, 1, E2>;
    using ABlockTransferThreadClusterLengths_E0_E1_K_E2 = Sequence<1, 4, 16, 1>;

    using ABlockTransferThreadSliceLengths_E0_E1_K_E2   = Sequence<1, 9, 1, 8>;
    using ABlockTransferThreadClusterLengths_E0_E1_K_E2 = Sequence<1, EPerBlock, 16, 1>;

    constexpr index_t ABlockTransferSrcScalarPerVector_E2 = E2;
    constexpr index_t ABlockTransferDstScalarPerVector_E2 = E2;
...
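The retuned parameters above trade a longer reduction (E1 = 4*9 with E2 = 4) for a wider vector dimension (E1 = 2*9, E2 = 8, EPerBlock = 2), and the outpad driver below now asserts E1 % EPerBlock == 0 and E2 == C1. A compile-time restatement of those invariants (the Y = X = 3 reading of the factor 9 is my assumption):

    using index_t = int;

    // Retuned values from this commit
    constexpr index_t E1        = 2 * 9; // 2 C0-slices x (Y * X), assuming a 3x3 filter
    constexpr index_t E2        = 8;     // vector (C1) length
    constexpr index_t EPerBlock = 2;

    // Invariants the gridwise kernel relies on
    static_assert(E1 % EPerBlock == 0, "E1 must be a multiple of EPerBlock");
    static_assert(E1 % 9 == 0, "E1 must cover whole Y*X filter windows (Y = X = 3 assumed)");

    int main() {}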
host/driver_offline/include/driver_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw_outpad.hpp

...
@@ -93,23 +93,29 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outpad
        std::cerr << "InRightPadH = " << InRightPadH << " InRightPadW = " << InRightPadW
                  << std::endl;

        const auto E  = C0 * Y * X * C1;
        const auto E0 = E / (E1 * E2);

        const auto E = C0 * Y * X;

        // static_assert(E % E1 == 0, "");
        static_assert(E2 == C1, "");

        const auto E0 = E / E1;

        // weight tensor
        const auto a_e_k_grid_desc = transform_tensor_descriptor(
            make_naive_tensor_descriptor_packed(make_tuple(K, C0 * Y * X * C1)),
        const auto a_e0_k_e2_grid_desc = transform_tensor_descriptor(
            make_naive_tensor_descriptor_packed(make_tuple(K, C0 * Y * X, E2)),
            make_tuple(make_pass_through_transform(K), make_pass_through_transform(C0 * Y * X * C1)),
            make_tuple(Sequence<0>{}, Sequence<1>{}),
            make_tuple(Sequence<1>{}, Sequence<0>{}));
            make_tuple(make_pass_through_transform(K),
                       make_pass_through_transform(C0 * Y * X),
                       make_pass_through_transform(E2)),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
            make_tuple(Sequence<1>{}, Sequence<0>{}, Sequence<2>{}));

        const auto a_e0_e1_k_e2_grid_desc = transform_tensor_descriptor(
            a_e_k_grid_desc,
            make_tuple(make_unmerge_transform(make_tuple(E0, E1, E2)), make_pass_through_transform(K)),
            a_e0_k_e2_grid_desc,
            make_tuple(make_unmerge_transform(make_tuple(E0, E1)),
                       make_pass_through_transform(K),
                       make_pass_through_transform(E2)),
            make_tuple(Sequence<0>{}, Sequence<1>{}),
            make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
            make_tuple(Sequence<0, 1>{}, Sequence<2>{}, Sequence<3>{}));

        // input tensor
        const auto in_n_c0_hip_wip_c1_global_desc = transform_tensor_descriptor(
...
@@ -118,7 +124,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outpad
            make_pass_through_transform(C0),
            make_pad_transform(Hi, InLeftPadH, InRightPadH),
            make_pad_transform(Wi, InLeftPadW, InRightPadW),
            make_pass_through_transform(C1)),
            make_pass_through_transform(E2)),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}));
...
@@ -129,28 +135,32 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outpad
            make_pass_through_transform(C0),
            make_embed_transform(make_tuple(Y, Hop), make_tuple(ConvDilationH, ConvStrideH)),
            make_embed_transform(make_tuple(X, Wop), make_tuple(ConvDilationW, ConvStrideW)),
            make_pass_through_transform(C1)),
            make_pass_through_transform(E2)),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}, Sequence<6>{}));

        const auto b_e_n_ho_wo_grid_desc = transform_tensor_descriptor(
        const auto b_e0_n_ho_wo_e2_grid_desc = transform_tensor_descriptor(
            in_n_c0_y_ho_x_wo_c1_global_desc,
            make_tuple(make_merge_transform(make_tuple(C0, Y, X, C1)),
            make_tuple(make_merge_transform(make_tuple(C0, Y, X)),
                       make_pass_through_transform(N),
                       make_pass_through_transform(Hop),
                       make_pass_through_transform(Wop)),
            make_tuple(Sequence<1, 2, 4, 6>{}, Sequence<0>{}, Sequence<3>{}, Sequence<5>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
                       make_pass_through_transform(Wop),
                       make_pass_through_transform(E2)),
            make_tuple(Sequence<1, 2, 4>{}, Sequence<0>{}, Sequence<3>{}, Sequence<5>{}, Sequence<6>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}));

        const auto b_e0_e1_n_ho_wo_e2_grid_desc = transform_tensor_descriptor(
            b_e_n_ho_wo_grid_desc,
            make_tuple(make_unmerge_transform(make_tuple(E0, E1, E2)),
            b_e0_n_ho_wo_e2_grid_desc,
            make_tuple(make_unmerge_transform(make_tuple(E0, E1)),
                       make_pass_through_transform(N),
                       make_pass_through_transform(Hop),
                       make_pass_through_transform(Wop)),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
            make_tuple(Sequence<0, 1, 5>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}));
                       make_pass_through_transform(Wop),
                       make_pass_through_transform(E2)),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
            make_tuple(Sequence<0, 1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}, Sequence<5>{}));

        // output tensor
        const auto c_k_n_hop_wop_grid_desc = transform_tensor_descriptor(
...
@@ -165,40 +175,41 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outpad
        std::cerr << "Hop = " << Hop << " Wop = " << Wop << std::endl;

        if(!((K % KPerBlock) == 0 && (Hop % HoPerBlock) == 0 && (Wop % WoPerBlock) == 0 &&
             (E % EPerBlock) == 0))
             (E1 % EPerBlock) == 0))
        {
            throw std::runtime_error("wrong! GEMM size no divisible");
        }

        // hack to control index calculation when iterating over a_k_m_global tensor
        constexpr auto a_e0_e1_k_e2_global_step_hacks =
            make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0>{}),
                       make_tuple(Sequence<0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0>{}));

        constexpr auto a_e0_e1_k_e2_global_move_slice_window_step_hack = Sequence<0, 0, 0, 0, 0>{};

            make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0>{}),
                       make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0>{},
                                  Sequence<0, 0, 0, 0, 0, 0, 0>{}));

        constexpr auto a_e0_e1_k_e2_global_move_slice_window_step_hack =
            Sequence<0, 0, 0, 0, 0, 0, 0>{};

        constexpr auto b_e0_e1_n_ho_wo_e2_global_step_hacks = make_tuple(
            make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0>{},
                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0>{},
                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0>{}),
            make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0>{},
                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0>{},
                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0>{}));

        make_tuple(
            make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}),
            make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}));

        constexpr auto b_e0_e1_n_ho_wo_e2_global_move_slice_window_step_hack =
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0>{};
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0>{};

        // hack to control index calculation when iterating over c_m0_m1_n0_n1_global tensor
        // hack for NKHW format
...
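The reworked descriptors above stop folding C1 into the reduction dimension: E becomes C0 * Y * X, E2 == C1 is asserted, and E is unmerged into (E0, E1) while E2 passes through as the innermost dimension. A small sketch of the packed offset arithmetic this layout implies for an (e0, e1, k, e2) coordinate (the concrete sizes and the row-major (K, E, E2) ordering are my assumptions; the real descriptors also thread the step-hack sequences through):

    #include <cassert>

    using index_t = int;

    // Packed a_e0_e1_k_e2 layout: E is unmerged into (E0, E1) and E2 == C1
    // stays a separate fastest-varying dimension, as in this commit.
    struct AE0E1KE2Desc
    {
        index_t E0, E1, K, E2;

        index_t offset(index_t e0, index_t e1, index_t k, index_t e2) const
        {
            // The naive descriptor is (K, C0 * Y * X, E2), so K is outermost
            // and (e0, e1) together index the unmerged E = C0 * Y * X axis.
            const index_t e = e0 * E1 + e1;       // re-merge (e0, e1) -> e
            return (k * (E0 * E1) + e) * E2 + e2; // row-major (K, E, E2)
        }
    };

    int main()
    {
        // e.g. C0 = 4, Y = X = 3 -> E = 36, E1 = 18 -> E0 = 2, with E2 = C1 = 8
        const AE0E1KE2Desc desc{2, 18, 16, 8};
        assert(desc.offset(0, 0, 0, 0) == 0);
        assert(desc.offset(0, 0, 0, 1) == 1);      // e2 is fastest-varying
        assert(desc.offset(0, 1, 0, 0) == 8);      // one E2 block per e1 step
        assert(desc.offset(1, 0, 0, 0) == 18 * 8); // e0 strides over E1 * E2
    }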
script/cmake-rocm.sh

...
@@ -11,7 +11,7 @@ cmake
-D HALF_INCLUDE_DIR="/root/workspace/external/half/include" \
-D BUILD_DEV=OFF \
-D CMAKE_BUILD_TYPE=Release \
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX1030 -O3 --amdgpu-target=gfx1030 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -gline-tables-only -save-temps=$PWD" \
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX1030 -O3 --amdgpu-target=gfx1030 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -gline-tables-only -save-temps=$PWD" \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-D CMAKE_PREFIX_PATH=/opt/rocm \
-D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
...