debug

fb7b4609 · Jing Zhang · 90276e6b
Commit fb7b4609 authored Sep 10, 2021 by Jing Zhang
3 changed files
--- a/composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v2.hpp
+++ b/composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v2.hpp
@@ -181,6 +181,8 @@ struct GridwiseGemmDlops_km_kn_mn_v3
        auto c_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            p_c_global, c_k_n_ho_wo_global_desc.GetElementSpaceSize());
+        static_assert(E1 % EPerBlock == 0, "");
        // const auto E = a_e0_e1_k_global_desc.GetLength(I0);
        // const auto K = a_e0_e1_k_global_desc.GetLength(I1);

--- a/host/driver_offline/include/device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp
+++ b/host/driver_offline/include/device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp
@@ -106,17 +106,17 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw(
    constexpr index_t WoPerBlock = 8;
    constexpr index_t E1        = 16;
-    constexpr index_t EPerBlock = 16;
+    constexpr index_t EPerBlock = 8;
    constexpr index_t KPerThread  = KPerBlock;
    constexpr index_t HoPerThread = 1;
    constexpr index_t WoPerThread = 1;
    constexpr index_t EPerThread  = EPerBlock;
-    using ABlockTransferThreadSliceLengths_E_K   = Sequence<1, 4, 1>;
+    using ABlockTransferThreadSliceLengths_E0_E1_K   = Sequence<1, 4, 1>;
-    using ABlockTransferThreadClusterLengths_E_K = Sequence<1, 4, 16>;
+    using ABlockTransferThreadClusterLengths_E0_E1_K = Sequence<1, 4, 16>;
-    constexpr index_t ABlockTransferSrcScalarPerVector_E = 4;
+    constexpr index_t ABlockTransferSrcScalarPerVector_E = 1;
    constexpr index_t ABlockTransferDstScalarPerVector_K = 1;
    constexpr index_t BThreadTransferSrcScalarPerVector_E = 1;
@@ -139,8 +139,8 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw(
            HoPerThread,
            WoPerThread,
            EPerThread,
-            ABlockTransferThreadSliceLengths_E_K,
+            ABlockTransferThreadSliceLengths_E0_E1_K,
-            ABlockTransferThreadClusterLengths_E_K,
+            ABlockTransferThreadClusterLengths_E0_E1_K,
            ABlockTransferSrcScalarPerVector_E,
            ABlockTransferDstScalarPerVector_K,
            BThreadTransferSrcScalarPerVector_E,

--- a/host/driver_offline/include/driver_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw_outpad.hpp
+++ b/host/driver_offline/include/driver_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw_outpad.hpp
@@ -256,6 +256,10 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
        const bool has_double_tail_k_block_loop = (E1 / EPerBlock) % 2 == 0;
+        std::cerr << "has_main_k_block_loop = " << has_main_k_block_loop
+                  << " has_double_tail_k_block_loop = " << has_double_tail_k_block_loop
+                  << std::endl;
        const auto c_blockid_to_k_n_ho_wo_block_cluster_adaptor =
            make_single_stage_tensor_adaptor(make_tuple(make_merge_transform(make_tuple(I0, I0))),
                                             make_tuple(Sequence<0, 1>{}),