tuning

7e87e0b3 · Jing Zhang · f87dddae · 7e87e0b3 · 7e87e0b3 · 7e87e0b3
Commit 7e87e0b3 authored Sep 15, 2021 by Jing Zhang
4 changed files
--- a/composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v2.hpp
+++ b/composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v2.hpp
@@ -114,7 +114,7 @@ template <index_t BlockSize,
          index_t KPerBlock,
          index_t HoPerBlock,
          index_t WoPerBlock,
-          index_t EPerBlock,
+          index_t E1PerBlock,
          index_t KPerThread,
          index_t HoPerThread,
          index_t WoPerThread,
@@ -185,7 +185,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3
        auto c_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            p_c_global, c_k_n_ho_wo_global_desc.GetElementSpaceSize());

-        static_assert(E1 % EPerBlock == 0, "");
+        static_assert(E1 % E1PerBlock == 0, "");

        // const auto E = a_e0_e1_k_e2_global_desc.GetLength(I0);
        // const auto K = a_e0_e1_k_e2_global_desc.GetLength(I1);
@@ -229,10 +229,9 @@ struct GridwiseGemmDlops_km_kn_mn_v3
            make_tuple(Number<I1>{}, Number<E1>{}, Number<KPerBlock>{}, Number<E2>{}),
            max_lds_align);

-        // B matrix in LDS memory, dst of blockwise copy
-        //   be careful of LDS alignment
+        // B matrix in thread, dst of blockwise copy
        constexpr auto b_e1_n_ho_wo_e2_block_desc =
-            make_naive_tensor_descriptor_packed(make_tuple(Number<EPerBlock>{},
+            make_naive_tensor_descriptor_packed(make_tuple(Number<E1PerBlock>{},
                                                           Number<1>{},
                                                           Number<HoPerBlock>{},
                                                           Number<WoPerBlock>{},
@@ -244,7 +243,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3
            Number<KPerThread>{}, Number<1>{}, Number<HoPerThread>{}, Number<WoPerThread>{}));

        constexpr auto a_e1_k_e2_block_desc = make_naive_tensor_descriptor_aligned(
-            make_tuple(Number<EPerBlock>{}, Number<KPerBlock>{}, Number<E2>{}), max_lds_align);
+            make_tuple(Number<E1PerBlock>{}, Number<KPerBlock>{}, Number<E2>{}), max_lds_align);

        auto blockwise_gemm =
            BlockwiseGemmDlops_km_kn_m0m1n0n1_v3<BlockSize,
@@ -295,16 +294,17 @@ struct GridwiseGemmDlops_km_kn_mn_v3
                                            1,
                                            1,
                                            AThreadTransferSrcResetCoordinateAfterRun,
-                                            true>(a_e0_e1_k_e2_global_desc,
-                                                  make_multi_index(0, 0, k_block_data_on_global, 0),
-                                                  a_e0_e1_k_e2_block_desc,
-                                                  make_multi_index(0, 0, 0, 0));
+                                            false>(
+                a_e0_e1_k_e2_global_desc,
+                make_multi_index(0, 0, k_block_data_on_global, 0),
+                a_e0_e1_k_e2_block_desc,
+                make_multi_index(0, 0, 0, 0));

        constexpr auto a_block_slice_copy_step = make_multi_index(I1, 0, 0, 0);

        constexpr auto b_e0_e1_n_ho_wo_e2_thread_desc =
            make_naive_tensor_descriptor_packed(make_tuple(I1,
-                                                           Number<EPerBlock>{},
+                                                           Number<E1PerBlock>{},
                                                           Number<1>{},
                                                           Number<HoPerThread>{},
                                                           Number<WoPerThread>{},
@@ -315,7 +315,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3
            FloatAB,
            decltype(b_e0_e1_n_ho_wo_e2_global_desc),
            decltype(b_e0_e1_n_ho_wo_e2_thread_desc),
-            Sequence<I1, EPerBlock, 1, HoPerThread, WoPerThread, E2>,
+            Sequence<I1, E1PerBlock, 1, HoPerThread, WoPerThread, E2>,
            BBlockTransferSrcAccessOrder,
            BBlockTransferSrcVectorDim,
            BBlockTransferSrcScalarPerVector,
@@ -344,7 +344,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3
                                    Sequence<KPerThread, 1, HoPerThread, WoPerThread>>{}
            .Run(c_k_n_ho_wo_thread_desc, make_tuple(I0, I0, I0, I0), c_thread_buf, FloatAcc{0});

-        constexpr auto b_thread_slice_copy_step = make_multi_index(0, EPerBlock, 0, 0, 0, 0);
+        constexpr auto b_thread_slice_copy_step = make_multi_index(0, E1PerBlock, 0, 0, 0, 0);

        // hack to control index calculation when iterating over A and B matrix for threadwise copy
        constexpr auto a_e0_e1_k_e2_global_step_hacks       = AGlobalStepHacks{};
@@ -407,7 +407,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3
                        // LDS double buffer: GEMM on current data
                        blockwise_gemm.Run(a_block_buf, b_thread_even_buf, c_thread_buf);

-                        blockwise_gemm.MoveABlockSliceWindow(make_tuple(EPerBlock, 0, 0));
+                        blockwise_gemm.MoveABlockSliceWindow(make_tuple(E1PerBlock, 0, 0));

                        b_threadwise_transfer.MoveSrcSliceWindow(b_e0_e1_n_ho_wo_e2_global_desc,
                                                                 b_thread_slice_copy_step,
@@ -423,11 +423,11 @@ struct GridwiseGemmDlops_km_kn_mn_v3
                        // LDS double buffer: GEMM on current data
                        blockwise_gemm.Run(a_block_buf, b_thread_odd_buf, c_thread_buf);

-                        blockwise_gemm.MoveABlockSliceWindow(make_tuple(EPerBlock, 0, 0));
+                        blockwise_gemm.MoveABlockSliceWindow(make_tuple(E1PerBlock, 0, 0));

-                        e1_block_data_begin += 2 * EPerBlock;
+                        e1_block_data_begin += 2 * E1PerBlock;

-                    } while(e1_block_data_begin < E1 - 2 * EPerBlock);
+                    } while(e1_block_data_begin < E1 - 2 * E1PerBlock);
                }

                // LDS double buffer: tail
@@ -447,7 +447,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3
                    // LDS double buffer: GEMM on 2nd-last data
                    blockwise_gemm.Run(a_block_buf, b_thread_even_buf, c_thread_buf);

-                    blockwise_gemm.MoveABlockSliceWindow(make_tuple(EPerBlock, 0, 0));
+                    blockwise_gemm.MoveABlockSliceWindow(make_tuple(E1PerBlock, 0, 0));

                    // LDS double buffer: GEMM on last data
                    blockwise_gemm.Run(a_block_buf, b_thread_odd_buf, c_thread_buf);
@@ -462,7 +462,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3
                                                    a_block_slice_copy_step,
                                                    AGlobalMoveSliceWindowStepHacks{});

-                blockwise_gemm.MoveABlockSliceWindow(make_tuple(-(E1 - EPerBlock), 0, 0));
+                blockwise_gemm.MoveABlockSliceWindow(make_tuple(-(E1 - E1PerBlock), 0, 0));

                b_threadwise_transfer.MoveSrcSliceWindow(b_e0_e1_n_ho_wo_e2_global_desc,
                                                         b_thread_slice_copy_step,
@@ -514,7 +514,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3
                    // LDS double buffer: GEMM on current data
                    blockwise_gemm.Run(a_block_buf, b_thread_even_buf, c_thread_buf);

-                    blockwise_gemm.MoveABlockSliceWindow(make_tuple(EPerBlock, 0, 0));
+                    blockwise_gemm.MoveABlockSliceWindow(make_tuple(E1PerBlock, 0, 0));

                    b_threadwise_transfer.MoveSrcSliceWindow(b_e0_e1_n_ho_wo_e2_global_desc,
                                                             b_thread_slice_copy_step,
@@ -530,11 +530,11 @@ struct GridwiseGemmDlops_km_kn_mn_v3
                    // LDS double buffer: GEMM on current data
                    blockwise_gemm.Run(a_block_buf, b_thread_odd_buf, c_thread_buf);

-                    blockwise_gemm.MoveABlockSliceWindow(make_tuple(EPerBlock, 0, 0));
+                    blockwise_gemm.MoveABlockSliceWindow(make_tuple(E1PerBlock, 0, 0));

-                    e1_block_data_begin += 2 * EPerBlock;
+                    e1_block_data_begin += 2 * E1PerBlock;

-                } while(e1_block_data_begin < E1 - 2 * EPerBlock);
+                } while(e1_block_data_begin < E1 - 2 * E1PerBlock);
            }

            // LDS double buffer: tail
@@ -554,7 +554,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3
                // LDS double buffer: GEMM on 2nd-last data
                blockwise_gemm.Run(a_block_buf, b_thread_even_buf, c_thread_buf);

-                blockwise_gemm.MoveABlockSliceWindow(make_tuple(EPerBlock, 0, 0));
+                blockwise_gemm.MoveABlockSliceWindow(make_tuple(E1PerBlock, 0, 0));

                // LDS double buffer: GEMM on last data
                blockwise_gemm.Run(a_block_buf, b_thread_odd_buf, c_thread_buf);

--- a/host/driver_offline/include/device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp
+++ b/host/driver_offline/include/device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp
@@ -48,7 +48,7 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw(
    const auto Y = wei_k_c_y_x_lengths[I2];
    const auto X = wei_k_c_y_x_lengths[I3];

-    constexpr auto InWeiVectorSize = 4;
+    constexpr auto InWeiVectorSize = 8;

 #if 1
    const auto C0 = C / Number<InWeiVectorSize>{};
@@ -106,9 +106,9 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw(
    constexpr index_t HoPerBlock = 8;
    constexpr index_t WoPerBlock = 32;

-    constexpr index_t E1        = 4 * 9;
+    constexpr index_t E1        = 2 * 9;
    constexpr index_t E2        = C1;
-    constexpr index_t EPerBlock = 4;
+    constexpr index_t EPerBlock = 2;

    constexpr index_t KPerThread  = KPerBlock;
    constexpr index_t HoPerThread = 2;

--- a/host/driver_offline/include/driver_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp
+++ b/host/driver_offline/include/driver_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp
@@ -10,8 +10,8 @@ template <ck::index_t BlockSize,
          typename FloatAB,
          typename FloatAcc,
          typename FloatC,
-          ck::index_t E1,
-          ck::index_t E2,
+          ck::index_t E1_,
+          ck::index_t E2_,
          ck::index_t KPerBlock,
          ck::index_t HoPerBlock,
          ck::index_t WoPerBlock,
@@ -95,12 +95,15 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp

        const auto E = C0 * Y * X;

+        constexpr auto E1 = Number<E1_>{};
+        constexpr auto E2 = Number<E2_>{};
+
        static_assert(E2 == C1, "");

        const auto E0 = E / E1;

        // weight tensor
-        const auto a_e0_k_e2_grid_desc = transform_tensor_descriptor(
+        const auto a_e_k_e2_grid_desc = transform_tensor_descriptor(
            make_naive_tensor_descriptor_packed(make_tuple(K, C0 * Y * X, E2)),
            make_tuple(make_pass_through_transform(K),
                       make_pass_through_transform(C0 * Y * X),
@@ -109,7 +112,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
            make_tuple(Sequence<1>{}, Sequence<0>{}, Sequence<2>{}));

        const auto a_e0_e1_k_e2_grid_desc =
-            transform_tensor_descriptor(a_e0_k_e2_grid_desc,
+            transform_tensor_descriptor(a_e_k_e2_grid_desc,
                                        make_tuple(make_unmerge_transform(make_tuple(E0, E1)),
                                                   make_pass_through_transform(K),
                                                   make_pass_through_transform(E2)),
@@ -139,7 +142,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
            make_tuple(
                Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}, Sequence<6>{}));

-        const auto b_e0_n_ho_wo_e2_grid_desc = transform_tensor_descriptor(
+        const auto b_e_n_ho_wo_e2_grid_desc = transform_tensor_descriptor(
            in_n_c0_y_ho_x_wo_c1_global_desc,
            make_tuple(make_merge_transform(make_tuple(C0, Y, X)),
                       make_pass_through_transform(N),
@@ -151,7 +154,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}));

        const auto b_e0_e1_n_ho_wo_e2_grid_desc = transform_tensor_descriptor(
-            b_e0_n_ho_wo_e2_grid_desc,
+            b_e_n_ho_wo_e2_grid_desc,
            make_tuple(make_unmerge_transform(make_tuple(E0, E1)),
                       make_pass_through_transform(N),
                       make_pass_through_transform(Hop),
@@ -199,13 +202,13 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}),
+                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}),
            make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}));
+                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}));

        constexpr auto b_e0_e1_n_ho_wo_e2_global_move_slice_window_step_hack =
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0>{};
@@ -245,17 +248,17 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
            ABlockTransferThreadSliceLengths_E0_E1_K_E2,
            ABlockTransferThreadClusterLengths_E0_E1_K_E2,
            Sequence<2, 0, 1, 3>,
-            Sequence<2, 0, 1, 3>,
+            Sequence<0, 1, 2, 3>,
            3,
            ABlockTransferSrcScalarPerVector_E2,
            ABlockTransferDstScalarPerVector_E2,
            false, // don't move back src coordinate after threadwise copy
-            Sequence<0, 2, 3, 4, 1, 5>,
+            Sequence<0, 1, 2, 3, 4, 5>,
            5,
            BThreadTransferSrcScalarPerVector_E2,
            false, // don't move back src coordinate after threadwise copy, which will be fused with
                   // MoveSrcSliceWindow() to save addr computation
-            Sequence<2, 3, 1, 0>,
+            Sequence<0, 1, 2, 3>,
            0,
            CThreadTransferDstScalarPerVector_K,
            decltype(a_e0_e1_k_e2_global_step_hacks),
@@ -286,9 +289,9 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
        using CBlockIdToBlockClusterAdaptor_K_N_Ho_Wo =
            decltype(c_blockid_to_k_n_ho_wo_block_cluster_adaptor);

-#if CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE
        float ave_time = 0;

+#if CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE
        if(has_main_k_block_loop && has_double_tail_k_block_loop)
        {
            const auto kernel =
@@ -393,8 +396,6 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
                                              c_k_n_hop_wop_grid_desc,
                                              c_blockid_to_k_n_ho_wo_block_cluster_adaptor);
        }
-
-        return ave_time;
 #elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER
        DeviceMem a_e0_e1_k_e2_grid_desc_dev_buf(sizeof(AGridDesc_E0_E1_K_E2));
        DeviceMem b_e0_e1_n_ho_wo_e2_grid_desc_dev_buf(sizeof(BGridDesc_E0_E1_N_Ho_Wo_E2));
@@ -408,8 +409,6 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
        c_blockid_to_k_n_ho_wo_block_cluster_adaptor_dev_buf.ToDevice(
            &c_blockid_to_k_n_ho_wo_block_cluster_adaptor);

-        float ave_time = 0;
-
        if(has_main_k_block_loop && has_double_tail_k_block_loop)
        {
            const auto kernel =
@@ -534,9 +533,8 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
                cast_pointer_to_constant_address_space(
                    c_blockid_to_k_n_ho_wo_block_cluster_adaptor_dev_buf.GetDeviceBuffer()));
        }
-
-        return ave_time;
 #endif
+        return ave_time;
    }
 };
 #endif
--- a/host/driver_offline/src/conv_fwd_driver_offline.cpp
+++ b/host/driver_offline/src/conv_fwd_driver_offline.cpp
@@ -20,12 +20,12 @@
 #include "device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp"
 #include "device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp"

-#define USE_DYNAMIC_MODE 1
+#define USE_DYNAMIC_MODE 0
 #define USE_CONV_FWD_V4R4_NCHW 0
 #define USE_CONV_FWD_V4R4R2_NHWC 0
 #define USE_CONV_FWD_V6R1_NCHW 0
-#define USE_CONV_FWD_V5R1_NHWC 1
-#define USE_CONV_FWD_V5R1_NCHWC 0
+#define USE_CONV_FWD_V5R1_NHWC 0
+#define USE_CONV_FWD_V5R1_NCHWC 1
 #define USE_CONV_FWD_V4R4R2_XDL_NCHW 0
 #define USE_CONV_FWD_V4R4R4_XDL_NHWC 0

@@ -105,16 +105,16 @@ int main(int argc, char* argv[])
    const bool do_log             = std::stoi(argv[5]);
    const int nrepeat             = std::stoi(argv[6]);

-    constexpr auto N  = Number<128>{};
-    constexpr auto C  = Number<192>{};
-    constexpr auto Hi = Number<71>{};
-    constexpr auto Wi = Number<71>{};
-    constexpr auto K  = Number<256>{};
+    constexpr auto N  = Number<1>{};
+    constexpr auto C  = Number<16>{};
+    constexpr auto Hi = Number<1080>{};
+    constexpr auto Wi = Number<1920>{};
+    constexpr auto K  = Number<16>{};
    constexpr auto Y  = Number<3>{};
    constexpr auto X  = Number<3>{};

-    constexpr auto conv_stride_h   = I2;
-    constexpr auto conv_stride_w   = I2;
+    constexpr auto conv_stride_h   = I1;
+    constexpr auto conv_stride_w   = I1;
    constexpr auto conv_dilation_h = I1;
    constexpr auto conv_dilation_w = I1;
    constexpr auto in_left_pad_h   = I1;