opt

b505370e · Jing Zhang · ed966e7f · b505370e · b505370e · b505370e
Commit b505370e authored Sep 14, 2021 by Jing Zhang
4 changed files
--- a/composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v2.hpp
+++ b/composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v2.hpp
@@ -265,6 +265,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3
        const index_t ho_block_data_on_global = ho_block_work_id * HoPerBlock;
        const index_t wo_block_data_on_global = wo_block_work_id * WoPerBlock;
+        const index_t n_thread_data_on_global = 0;
        const index_t ho_thread_data_on_global =
            ho_block_data_on_global + ho_thread_id * HoPerThread;
        const index_t wo_thread_data_on_global =
@@ -317,7 +318,12 @@ struct GridwiseGemmDlops_km_kn_mn_v3
            BBlockTransferSrcScalarPerVector,
            BThreadTransferSrcResetCoordinateAfterRun,
            true>(b_e0_e1_n_ho_wo_e2_global_desc,
-                  make_multi_index(0, 0, 0, ho_thread_data_on_global, wo_thread_data_on_global, 0));
+                  make_multi_index(0,
+                                   0,
+                                   n_thread_data_on_global,
+                                   ho_thread_data_on_global,
+                                   wo_thread_data_on_global,
+                                   0));
        auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
            p_shared_block, a_e0_e1_k_e2_block_desc.GetElementSpaceSize());
@@ -473,10 +479,11 @@ struct GridwiseGemmDlops_km_kn_mn_v3
                                               CThreadTransferDstScalarPerVector,
                                               CGlobalMemoryDataOperation,
                                               1,
-                                               true>(
+                                               true>(c_k_n_ho_wo_global_desc,
-                c_k_n_ho_wo_global_desc,
+                                                     make_multi_index(k_thread_data_on_global,
-                make_multi_index(
+                                                                      n_thread_data_on_global,
-                    k_thread_data_on_global, 0, ho_thread_data_on_global, wo_thread_data_on_global))
+                                                                      ho_thread_data_on_global,
+                                                                      wo_thread_data_on_global))
                .Run(c_k_n_ho_wo_thread_desc,
                     make_tuple(I0, I0, I0, I0),
                     c_thread_buf,

--- a/host/driver_offline/include/device_convolution_forward_implicit_gemm_v5r1_dlops_nhwc_kyxc_nhwk.hpp
+++ b/host/driver_offline/include/device_convolution_forward_implicit_gemm_v5r1_dlops_nhwc_kyxc_nhwk.hpp
@@ -103,7 +103,7 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nhwc_kyxc_nhwk(
    constexpr index_t EPerThread  = 1;
    using ABlockTransferThreadSliceLengths_E0_E1_K_E2   = Sequence<1, 9, 1, E2>;
-    using ABlockTransferThreadClusterLengths_E0_E1_K_E2 = Sequence<1, EPerBlock, 16, 1>;
+    using ABlockTransferThreadClusterLengths_E0_E1_K_E2 = Sequence<1, 2, 16, 1>;
    constexpr index_t ABlockTransferSrcScalarPerVector_E2 = E2;
    constexpr index_t ABlockTransferDstScalarPerVector_E2 = E2;

--- a/host/driver_offline/include/driver_convolution_forward_implicit_gemm_v5r1_dlops_nhwc_kyxc_nhwk.hpp
+++ b/host/driver_offline/include/driver_convolution_forward_implicit_gemm_v5r1_dlops_nhwc_kyxc_nhwk.hpp
@@ -10,8 +10,8 @@ template <ck::index_t BlockSize,
          typename FloatAB,
          typename FloatAcc,
          typename FloatC,
-          ck::index_t E1,
+          ck::index_t E1_,
-          ck::index_t E2,
+          ck::index_t E2_,
          ck::index_t KPerBlock,
          ck::index_t HoPerBlock,
          ck::index_t WoPerBlock,
@@ -90,6 +90,9 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nhwc_kyxc_nhwk_outp
        std::cerr << "InRightPadH = " << InRightPadH << " InRightPadW = " << InRightPadW
                  << std::endl;
+        constexpr auto E1 = Number<E1_>{};
+        constexpr auto E2 = Number<E2_>{};
        const auto C0 = C / E2;
        const auto E  = Y * X * C0;
@@ -113,24 +116,27 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nhwc_kyxc_nhwk_outp
                                        make_tuple(Sequence<0, 1>{}, Sequence<2>{}, Sequence<3>{}));
        // input tensor
-        const auto in_n_hip_wip_c_global_desc = transform_tensor_descriptor(
+        const auto in_n_hip_wip_c0_e2_global_desc = transform_tensor_descriptor(
-            in_n_hi_wi_c_global_desc,
+            make_naive_tensor_descriptor_packed(make_tuple(N, Hi, Wi, C0, E2)),
            make_tuple(make_pass_through_transform(N),
                       make_pad_transform(Hi, InLeftPadH, InRightPadH),
                       make_pad_transform(Wi, InLeftPadW, InRightPadW),
-                       make_pass_through_transform(C)),
+                       make_pass_through_transform(C0),
-            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+                       make_pass_through_transform(E2)),
-            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}));
        const auto in_n_y_ho_x_wo_c0_e2_global_desc = transform_tensor_descriptor(
-            in_n_hip_wip_c_global_desc,
+            in_n_hip_wip_c0_e2_global_desc,
            make_tuple(
                make_pass_through_transform(N),
                make_embed_transform(make_tuple(Y, Hop), make_tuple(ConvDilationH, ConvStrideH)),
                make_embed_transform(make_tuple(X, Wop), make_tuple(ConvDilationW, ConvStrideW)),
-                make_unmerge_transform(make_tuple(C0, E2))),
+                make_pass_through_transform(C0),
-            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+                make_pass_through_transform(E2)),
-            make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5, 6>{}));
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
+            make_tuple(
+                Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}, Sequence<6>{}));
        const auto b_e_n_ho_wo_e2_grid_desc = transform_tensor_descriptor(
            in_n_y_ho_x_wo_c0_e2_global_desc,
@@ -187,21 +193,21 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nhwc_kyxc_nhwk_outp
            Sequence<0, 0, 0, 0, 0, 0, 0>{};
        constexpr auto b_e0_e1_n_ho_wo_e2_global_step_hacks = make_tuple(
-            make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
+            make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
+                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
+                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
+                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
+                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}),
+                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}),
-            make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
+            make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
+                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
+                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
+                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
+                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}));
+                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}));
        constexpr auto b_e0_e1_n_ho_wo_e2_global_move_slice_window_step_hack =
-            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0>{};
+            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0>{};
        // hack to control index calculation when iterating over c_m0_m1_n0_n1_global tensor
        // hack for NKHW format

--- a/host/driver_offline/src/conv_fwd_driver_offline.cpp
+++ b/host/driver_offline/src/conv_fwd_driver_offline.cpp
@@ -24,8 +24,8 @@
 #define USE_CONV_FWD_V4R4_NCHW 0
 #define USE_CONV_FWD_V4R4R2_NHWC 0
 #define USE_CONV_FWD_V6R1_NCHW 0
-#define USE_CONV_FWD_V5R1_NHWC 0
+#define USE_CONV_FWD_V5R1_NHWC 1
-#define USE_CONV_FWD_V5R1_NCHWC 1
+#define USE_CONV_FWD_V5R1_NCHWC 0
 #define USE_CONV_FWD_V4R4R2_XDL_NCHW 0
 #define USE_CONV_FWD_V4R4R4_XDL_NHWC 0