"git@developer.sourcefind.cn:gaoqiong/composable_kernel.git" did not exist on "05d382183f431d6166c814015527f473814480b6"
Commit 415b4cbd authored by root

debugging input load

parent 91ef99a7
@@ -76,7 +76,18 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
     __device__ static MatrixIndex GetBeginOfThreadMatrixC(index_t thread_id)
     {
-        return MatrixIndex{1, 8, 8};
+        constexpr index_t H = BlockMatrixB{}.GetLength(Number<2>{});
+        constexpr index_t W = BlockMatrixB{}.GetLength(Number<3>{});
+
+        constexpr auto num_w_threads = W / WPerThread;
+        constexpr auto num_h_threads = H / HPerThread;
+
+        index_t k_thread_id = thread_id / (num_w_threads * num_h_threads);
+        index_t h_thread_id = thread_id / num_w_threads;
+        index_t w_thread_id = thread_id % num_w_threads;
+
+        return MatrixIndex{k_thread_id, h_thread_id, w_thread_id};
     }

     template <typename SrcDesc,
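The hard-coded MatrixIndex{1, 8, 8} is replaced by a real decomposition of the flat thread id into (k, h, w) block-tile coordinates. Note that h_thread_id = thread_id / num_w_threads only lands in range while there is a single k group, which the static_assert(KPerBlock == KPerThread, "") in the gridwise kernel below enforces for this configuration. A minimal host-side sketch of the same arithmetic, under the assumed 8 x 8 thread grid implied by HPerBlock = WPerBlock = 8 and HPerThread = WPerThread = 1 in the device file:

#include <cstdio>

// Host-side sketch of the (k, h, w) split in GetBeginOfThreadMatrixC.
// num_h_threads/num_w_threads are assumed values (8 x 8 = 64 threads,
// one k group), mirroring HPerBlock/WPerBlock = 8 with 1 x 1 thread tiles.
int main()
{
    const int num_h_threads = 8;
    const int num_w_threads = 8;

    const int samples[] = {0, 1, 7, 8, 63};
    for(int tid : samples)
    {
        int k = tid / (num_w_threads * num_h_threads); // 0 for all 64 threads
        int h = tid / num_w_threads;                   // row within the block tile
        int w = tid % num_w_threads;                   // column within the block tile
        std::printf("tid %2d -> (k=%d, h=%d, w=%d)\n", tid, k, h, w);
    }
}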
@@ -127,10 +138,13 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
             make_tuple(Number<CYXPerThreadLoop>{}, Number<KPerThread>{}));

         constexpr auto b_thread_mtx = make_dynamic_naive_tensor_descriptor_packed_v2(
-            // make_tuple(Number<CYXPerThreadLoop>{}, Number<1>{}, Number<1>{}, Number<1>{}));
+            // make_tuple(Number<CYXPerThreadLoop>{}, Number<1>{}, Number<HPerThread>{},
+            // Number<WPerThread>{}));
             make_tuple(Number<CYXPerThreadLoop>{}, Number<1>{}));

         constexpr auto c_thread_mtx = make_dynamic_naive_tensor_descriptor_packed_v2(
+            // make_tuple(Number<KPerThread>{}, Number<1>{},
+            // Number<HPerThread>{}, Number<WPerThread>{}));
             make_tuple(Number<KPerThread>{}, Number<1>{}));

         FloatA p_a_thread[a_thread_mtx.GetElementSpaceSize()];
@@ -147,15 +161,14 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
         // loop over k
         for(index_t cyx_begin = 0; cyx_begin < CYXPerBlock; cyx_begin += CYXPerThreadLoop)
         {
-#if 0
+#if 1
             a_thread_copy.Run(p_a_block + a_block_mtx.CalculateOffset(make_tuple(cyx_begin, 0)) +
                                   mMyThreadOffsetA,
-                              p_a_thread + a_thread_mtx.CalculateOffset(make_tuple(0, 0)));
+                              p_a_thread);
 #else
             for(index_t i = 0; i < a_thread_mtx.GetElementSpaceSize(); i++)
                 p_a_thread[i] = 1;
 #endif

             threadwise_gemm.Run(p_a_thread, p_b_thread + cyx_begin, p_c_thread);
         }
     }
...
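Flipping #if 0 to #if 1 re-enables the real LDS-to-register copy of the A fragment in place of the all-ones fill, and drops the redundant a_thread_mtx.CalculateOffset(make_tuple(0, 0)) from the destination (that offset is zero for a packed descriptor). The general shape of this isolation toggle, with load_real as a hypothetical stand-in for a_thread_copy.Run:

#include <cstddef>

// Generic form of the toggle above: route either the real load or a known
// constant into the thread buffer, so downstream math can be checked
// against a predictable input. load_real() is a hypothetical stand-in.
template <typename T, std::size_t N>
void fill_known(T (&buf)[N], T value)
{
    for(std::size_t i = 0; i < N; ++i)
        buf[i] = value; // mirrors: p_a_thread[i] = 1;
}

int main()
{
    float p_a_thread[4];
#if 0
    load_real(p_a_thread); // hypothetical real copy path
#else
    fill_known(p_a_thread, 1.0f); // constant path from the diff
#endif
}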
@@ -94,33 +94,33 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
         // divide block work by [M, N]
 #if 1
         const auto m_block_work_num = K / Number<KPerBlock>{};
-        const auto nhw_block_work_num = (N * H * W) / (Number<HPerBlock>{} * Number<WPerBlock>{});
+        const auto hw_block_work_num = (N * H * W) / (Number<HPerBlock>{} * Number<WPerBlock>{});

-        const index_t k_block_work_id = get_block_1d_id() / nhw_block_work_num;
-        const index_t nhw_block_work_id = get_block_1d_id() - k_block_work_id * nhw_block_work_num;
+        const index_t k_block_work_id = get_block_1d_id() / hw_block_work_num;
+        const index_t hw_block_work_id = get_block_1d_id() - k_block_work_id * hw_block_work_num;

         constexpr auto h_num_threads = HPerBlock / HPerThread;
         constexpr auto w_num_threads = WPerBlock / WPerThread;

         static_assert(KPerBlock == KPerThread, "");

-        const auto h_thread_id = get_thread_local_1d_id() / h_num_threads;
+        const auto h_thread_id = get_thread_local_1d_id() / w_num_threads;
         const auto w_thread_id = get_thread_local_1d_id() % w_num_threads;
 #else
         // Hack: this force result into SGPR
         const index_t m_block_work_num = __builtin_amdgcn_readfirstlane(K / KPerBlock);
-        const index_t nhw_block_work_num = __builtin_amdgcn_readfirstlane(N / HWPerBlock);
+        const index_t hw_block_work_num = __builtin_amdgcn_readfirstlane(N / HWPerBlock);

         const index_t k_block_work_id =
-            __builtin_amdgcn_readfirstlane(get_block_1d_id() / nhw_block_work_num);
-        const index_t nhw_block_work_id = get_block_1d_id() - k_block_work_id * nhw_block_work_num;
+            __builtin_amdgcn_readfirstlane(get_block_1d_id() / hw_block_work_num);
+        const index_t hw_block_work_id = get_block_1d_id() - k_block_work_id * hw_block_work_num;
 #endif

         const index_t m_block_data_on_global = k_block_work_id * KPerBlock;
-        const index_t h_block_data_on_global = nhw_block_work_id * HPerBlock;
-        const index_t w_block_data_on_global = nhw_block_work_id * WPerBlock;
+        const index_t h_block_data_on_global = hw_block_work_id * HPerBlock;
+        const index_t w_block_data_on_global = hw_block_work_id * WPerBlock;

         // lds max alignment
         constexpr auto max_lds_align =
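Two changes here: the nhw_* work variables are renamed hw_* (each block covers one HPerBlock x WPerBlock patch of the N * H * W space), and h_thread_id now divides by w_num_threads instead of h_num_threads. Recovering a row index from a flat thread id laid out over an h x w grid requires dividing by the row width w_num_threads; the old divisor only worked by coincidence on square grids like the 8 x 8 one below. A tiny sketch with hypothetical non-square sizes makes the difference visible:

#include <cstdio>

// Row/column recovery from a flat thread id over an h x w thread grid.
// Sizes are hypothetical (2 rows x 4 columns) to expose the old bug.
int main()
{
    const int h_num_threads = 2;
    const int w_num_threads = 4;

    for(int tid = 0; tid < h_num_threads * w_num_threads; ++tid)
    {
        int h_new = tid / w_num_threads; // fixed: 0,0,0,0,1,1,1,1 (in range)
        int h_old = tid / h_num_threads; // old:   0,0,1,1,2,2,3,3 (rows 2,3 do not exist)
        int w     = tid % w_num_threads;
        std::printf("tid %d -> old h=%d, new h=%d, w=%d\n", tid, h_old, h_new, w);
    }
}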
@@ -166,7 +166,6 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
             a_cyx_k_block_desc,
             make_multi_index(0, 0));

-#if 1
         constexpr auto b_cyx_n_h_w_thread_desc =
             make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(
                 Number<CYXPerThread>{}, Number<1>{}, Number<HPerThread>{}, Number<WPerThread>{}));
@@ -191,7 +190,6 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
             make_multi_index(
                 0, 0, h_block_data_on_global + h_thread_id, w_block_data_on_global + w_thread_id));
-#endif

         // c_thread_mtx definition: this is a mess
         // TODO:: more elegent way of defining c_thread_mtx
         constexpr auto c_k_n_h_w_thread_desc =
@@ -264,20 +262,20 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
             a_blockwise_copy.RunWrite(a_cyx_k_block_desc, p_a_block_double);

-#if 0
+#if 1
             __syncthreads();

+            //blockwise_gemm.Run(p_a_block_double, p_b_thread_double, p_c_thread);
+
             index_t sum = 0;
             for(index_t i = 0; i < b_cyx_n_h_w_thread_desc.GetElementSpaceSize(); i++)
-                sum += p_b_thread[i];
+                sum += p_b_thread_double[i];

-            p_c_thread[0] = get_thread_local_1d_id() * 10000 + sum;
+            p_c_thread[0] += p_b_thread_double[0] + p_b_thread_double[1] + p_b_thread_double[2];
+            p_c_thread[0] += p_b_thread_double[3] + p_b_thread_double[4] + p_b_thread_double[5];
+            p_c_thread[0] += p_b_thread_double[6] + p_b_thread_double[7] + p_b_thread_double[8];
 #endif
         }

-#if 1
+#if 0
         if constexpr(HasMainKBlockLoop)
         {
             Float* p_a_block_even = p_a_block_double;
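This block is the "debugging input load" of the commit message: the blockwise GEMM call stays commented out, and each thread instead folds the first nine elements of its B buffer into p_c_thread[0] (the sum over the full buffer is computed but never stored). Nine matches CYXPerThread = 3 * 3 from the device file below, so the output tensor becomes a direct readback of the loaded input. A small sketch of the host-side expectation this enables, with a hypothetical constant fill:

#include <cstdio>

// Sketch of the readback trick: if the input is filled with a known
// constant, every p_c_thread[0] the kernel writes should equal
// CYXPerThread * constant. Values here are assumptions for illustration.
int main()
{
    const int CYXPerThread = 3 * 3; // matches the device file in this commit
    const float fill       = 1.0f;  // hypothetical input fill value

    float expected = CYXPerThread * fill;
    std::printf("each c[0] should read back %.1f if the load is correct\n", expected);
}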
@@ -398,8 +396,9 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
         {
             // define input tensor descriptor for threadwise copy
             // thread input tensor, src of threadwise copy
-            constexpr auto c_k_n_h_w_thread_desc = make_dynamic_naive_tensor_descriptor_packed_v2(
-                make_tuple(Number<KPerThread>{}, Number<1>{}, Number<1>{}, Number<1>{}));
+            constexpr auto c_k_n_h_w_thread_desc =
+                make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(
+                    Number<KPerThread>{}, Number<1>{}, Number<HPerThread>{}, Number<WPerThread>{}));

             // calculate origin of thread input tensor on global memory
             // blockwise GEMM c matrix starting index
@@ -414,8 +413,10 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
                 n_block_data_on_global + c_thread_mtx_on_block.col;
 #endif

             const index_t m_thread_data_on_global = m_block_data_on_global;
-            const index_t h_thread_data_on_global = h_block_data_on_global + h_thread_id;
-            const index_t w_thread_data_on_global = w_block_data_on_global + w_thread_id;
+            const index_t h_thread_data_on_global =
+                h_block_data_on_global + h_thread_id * HPerThread;
+            const index_t w_thread_data_on_global =
+                w_block_data_on_global + w_thread_id * WPerThread;

             // hack to control index calculation when iterating over c_k_n_h_w_global tensor
             constexpr auto c_k_n_h_w_global_tensor_iterator_hacks = CGlobalIteratorHacks{};
...
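The change in the last hunk above scales the thread id by the per-thread tile size when locating each thread's output origin in the global tensor. With HPerThread = WPerThread = 1 (the current configuration) the multiplication is a no-op, but for wider per-thread tiles the old expression would make neighbouring threads write overlapping regions. A sketch under an assumed WPerThread = 2:

#include <cstdio>

// Why the thread origin must be scaled by the per-thread tile size.
// Hypothetical sizes: each thread owns a 2-wide strip (WPerThread = 2).
int main()
{
    const int WPerThread             = 2;
    const int w_block_data_on_global = 0; // assume the first block

    for(int w_thread_id = 0; w_thread_id < 4; ++w_thread_id)
    {
        int old_origin = w_block_data_on_global + w_thread_id;              // strips overlap
        int new_origin = w_block_data_on_global + w_thread_id * WPerThread; // strips disjoint
        std::printf("thread %d: old=%d new=%d\n", w_thread_id, old_origin, new_origin);
    }
}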
@@ -73,20 +73,20 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(InDesc
     constexpr index_t KPerBlock = 16;
     constexpr index_t HPerBlock = 8;
     constexpr index_t WPerBlock = 8;
-    constexpr index_t CYXPerBlock = 4;
+    constexpr index_t CYXPerBlock = 4 * 3 * 3;

     constexpr index_t KPerThread = 16;
     constexpr index_t HPerThread = 1;
     constexpr index_t WPerThread = 1;
-    constexpr index_t CYXPerThread = 4;
+    constexpr index_t CYXPerThread = 3 * 3;

-    using GemmABlockTransferThreadSliceLengths_GemmK_GemmM = Sequence<1, 1>;
+    using GemmABlockTransferThreadSliceLengths_GemmK_GemmM = Sequence<9, 1>;
     using GemmABlockTransferThreadClusterLengths_GemmK_GemmM = Sequence<4, 16>;

     constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK = 1;
     constexpr index_t GemmABlockTransferDstScalarPerVector_GemmM = 1;

-    using GemmBBlockTransferThreadSliceLengths_GemmK_GemmN = Sequence<4, 1>;
+    using GemmBBlockTransferThreadSliceLengths_GemmK_GemmN = Sequence<36, 1>;
     using GemmBBlockTransferThreadClusterLengths_GemmK_GemmN = Sequence<1, 64>;

     constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN = 1;
...
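The new constants spell out a 3 x 3 filter: CYXPerBlock = 4 * 3 * 3 = 36 and CYXPerThread = 3 * 3 = 9, and the transfer shapes are updated to stay consistent, since thread slice lengths times thread cluster lengths must reproduce the block tile. For GemmA, Sequence<9, 1> times Sequence<4, 16> gives [36, 16] = [CYXPerBlock, KPerBlock]; for GemmB, Sequence<36, 1> times Sequence<1, 64> gives [36, 64], where 64 = HPerBlock * WPerBlock. The same check as a compile-time sketch (all names are local restatements of the constants in this hunk):

// Compile-time consistency check for the new transfer shapes.
constexpr int KPerBlock   = 16;
constexpr int HPerBlock   = 8;
constexpr int WPerBlock   = 8;
constexpr int CYXPerBlock = 4 * 3 * 3; // C * Y * X = 36

// GemmA: thread slice [9, 1] x thread cluster [4, 16] = block tile [36, 16]
static_assert(9 * 4 == CYXPerBlock && 1 * 16 == KPerBlock, "GemmA tile mismatch");

// GemmB: thread slice [36, 1] x thread cluster [1, 64] = block tile [36, 64]
static_assert(36 * 1 == CYXPerBlock && 1 * 64 == HPerBlock * WPerBlock, "GemmB tile mismatch");

int main() {} // nothing to run; the static_asserts are the check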