Commit 7733dd88 authored by Chao Liu

use readfirstlane to force result into SGPR to reduce VGPR usage

parent 3b3cfae5
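
For context, a minimal HIP sketch of the idiom this commit applies (the kernel and names below are illustrative, not repo code): when a value is identical in every lane of a wavefront, __builtin_amdgcn_readfirstlane returns the first active lane's copy, and the compiler can keep the result in a scalar register (SGPR) instead of one vector register (VGPR) per lane.

#include <hip/hip_runtime.h>

__global__ void scale_rows(const float* in, float* out, int row_length)
{
    // blockIdx.x and the kernel argument row_length are the same in every lane,
    // so the product is wave-uniform and safe to pin to an SGPR
    const int row_offset =
        __builtin_amdgcn_readfirstlane(static_cast<int>(blockIdx.x) * row_length);

    out[row_offset + threadIdx.x] = 2.0f * in[row_offset + threadIdx.x];
}
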
......@@ -423,17 +423,6 @@ struct DynamicMerge
LowIdx::Size() == NDimLow && UpIdx::Size() == 1,
"wrong! inconsistent # of dimension");
#if 0
// I only want to do this check if idx_diff_up is known at compile-time
if(idx_diff_up[Number<0>{}] == 0)
{
static_for<0, NDimLow, 1>{}([&idx_diff_low](auto i){
idx_diff_low(i) = 0;
});
return;
}
#endif
// CalculateLowerIndex(idx_diff_low_const) has multiple integer divisions.
// However,
// 1) If idx_diff_up is known at compile-time, then idx_diff_low_const
......@@ -449,7 +438,19 @@ struct DynamicMerge
// computed at
// run-time each time this function is called, and can be very expensive.
LowerIndex idx_diff_low_const;
#if !CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
CalculateLowerIndex(idx_diff_low_const, idx_diff_up);
#else
index_t tmp = idx_diff_up[Number<0>{}];
static_for<0, NDimLow - 1, 1>{}([&](auto i) {
idx_diff_low_const(i) = tmp / low_lengths_scan_[i];
tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
});
// Hack: this forces the result into an SGPR. Need to make sure the result is thread-invariant
idx_diff_low_const(Number<NDimLow - 1>{}) = __builtin_amdgcn_readfirstlane(tmp);
#endif
// do carry check on each low dimension in reversed order
// do not need to check the first dimension
......
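
To make the scan in the hunk above concrete, here is a small host-side illustration (hypothetical values, not repo code), assuming low_lengths_scan_ holds the stride of each lower dimension except the last:

#include <array>
#include <cassert>

int main()
{
    constexpr int NDimLow = 3;
    // lower lengths {4, 3, 5} -> strides {15, 5} for all dimensions but the last
    const std::array<int, NDimLow - 1> low_lengths_scan = {15, 5};

    int tmp = 37; // idx_diff_up[0]
    std::array<int, NDimLow> idx_diff_low{};

    for(int i = 0; i < NDimLow - 1; ++i)
    {
        idx_diff_low[i] = tmp / low_lengths_scan[i];
        tmp -= idx_diff_low[i] * low_lengths_scan[i];
    }
    // the remainder is the value the hack pins to an SGPR
    idx_diff_low[NDimLow - 1] = tmp;

    // 37 = 2 * 15 + 1 * 5 + 2
    assert(idx_diff_low[0] == 2 && idx_diff_low[1] == 1 && idx_diff_low[2] == 2);
    return 0;
}
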
......@@ -121,9 +121,9 @@ struct BlockwiseDynamicTensorSliceTransfer_v1r1
ThreadwiseTransfer threadwise_transfer_;
};
// this version is very likely to have scratch memory issue, due to:
// this version tends to have scratch memory issues, due to:
// 1. ThreadwiseDynamicTensorSliceTransfer_v1r1 keeps reference to tensor descriptor
// 2. threadwise_dynamic_tensor_slice_transfer_v1r1 constructs new tensor coordinate
// 2. ThreadwiseDynamicTensorSliceTransfer_v1r1::Run() constructs new tensor coordinate
template <index_t BlockSize,
typename BlockSrcData,
typename BlockDstData,
......@@ -289,7 +289,7 @@ struct BlockwiseDynamicTensorSliceTransfer_v2r1
// this version does the following things to avoid the scratch memory issue
// 1. ThreadwiseDynamicTensorSliceTransfer_v1r2 does not keep reference to tensor descriptor
// 2. threadwise_dynamic_tensor_slice_transfer_v1r2 does not construct new tensor coordinate
// 2. ThreadwiseDynamicTensorSliceTransfer_v1r2::Run() does not construct new tensor coordinate
template <index_t BlockSize,
typename BlockSrcData,
typename BlockDstData,
......@@ -465,7 +465,7 @@ struct BlockwiseDynamicTensorSliceTransfer_v2r2
// this version does the following things to avoid the scratch memory issue
// 1. BlockwiseDynamicTensorSliceTransfer_v2r3 doesn't allocate thread buffer (array) as member
// 2. ThreadwiseDynamicTensorSliceTransfer_v1r2 does not keep reference to tensor descriptor
// 3. threadwise_dynamic_tensor_slice_transfer_v1r2 does not construct new tensor coordinate
// 3. ThreadwiseDynamicTensorSliceTransfer_v1r2::Run() does not construct new tensor coordinate
template <index_t BlockSize,
typename BlockSrcData,
typename BlockDstData,
......@@ -485,7 +485,9 @@ template <index_t BlockSize,
AddressSpace DstAddressSpace,
InMemoryDataOperation DstInMemOp,
index_t SrcDataStride,
index_t DstDataStride>
index_t DstDataStride,
index_t ThreadTransferMoveBackSrcCoord = true,
index_t ThreadTransferMoveBackDstCoord = true>
struct BlockwiseDynamicTensorSliceTransfer_v2r3
{
static constexpr index_t nDim =
......@@ -607,20 +609,25 @@ struct BlockwiseDynamicTensorSliceTransfer_v2r3
AddressSpace::Vgpr,
InMemoryDataOperation::Set,
SrcDataStride,
1>;
using ThreadwiseWrite = ThreadwiseDynamicTensorSliceTransfer_v1r2<decltype(thread_buffer_desc_),
BlockDstDesc,
ThreadSliceLengths,
DstDimAccessOrder,
DstVectorWriteDim,
1,
DstDataPerWrite,
AddressSpace::Vgpr,
DstAddressSpace,
DstInMemOp,
1,
DstDataStride>;
1,
ThreadTransferMoveBackSrcCoord,
true>;
using ThreadwiseWrite =
ThreadwiseDynamicTensorSliceTransfer_v1r2<decltype(thread_buffer_desc_),
BlockDstDesc,
ThreadSliceLengths,
DstDimAccessOrder,
DstVectorWriteDim,
1,
DstDataPerWrite,
AddressSpace::Vgpr,
DstAddressSpace,
DstInMemOp,
1,
DstDataStride,
true,
ThreadTransferMoveBackDstCoord>;
ThreadwiseRead threadwise_read_;
ThreadwiseWrite threadwise_write_;
......
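
A reduced, hypothetical sketch of the template plumbing added above (the real classes take many more parameters, and declare these as index_t rather than bool): the new defaulted parameters on the blockwise transfer are simply forwarded into the threadwise read/write type aliases, so existing call sites compile unchanged while a caller such as the gridwise GEMM below can opt out of the coordinate move-back on one side.

template <bool MoveBackCoord = true>
struct ThreadwiseTransferSketch
{
    static constexpr bool move_back = MoveBackCoord;
};

template <bool MoveBackSrcCoord = true, bool MoveBackDstCoord = true>
struct BlockwiseTransferSketch
{
    using ThreadwiseRead  = ThreadwiseTransferSketch<MoveBackSrcCoord>;
    using ThreadwiseWrite = ThreadwiseTransferSketch<MoveBackDstCoord>;
};

// existing users keep both defaults...
static_assert(BlockwiseTransferSketch<>::ThreadwiseRead::move_back, "default keeps move-back");
// ...while a caller can disable the src-side move-back explicitly
static_assert(!BlockwiseTransferSketch<false, true>::ThreadwiseRead::move_back,
              "src-side move-back disabled");
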
......@@ -459,11 +459,24 @@ struct GridwiseDynamicGemm_km_kn_mn_v1r2
const index_t N = b_k_n_global_desc.GetLength(I1);
// divide block work by [M, N]
#if 0
const index_t m_block_work_num = M / MPerBlock;
const index_t n_block_work_num = N / NPerBlock;
#else
// Hack: this forces the result into an SGPR
const index_t m_block_work_num = __builtin_amdgcn_readfirstlane(M / MPerBlock);
const index_t n_block_work_num = __builtin_amdgcn_readfirstlane(N / NPerBlock);
#endif
#if 0
const index_t m_block_work_id = get_block_1d_id() / n_block_work_num;
const index_t n_block_work_id = get_block_1d_id() - m_block_work_id * n_block_work_num;
#else
// Hack: this forces the result into an SGPR
const index_t m_block_work_id =
__builtin_amdgcn_readfirstlane(get_block_1d_id() / n_block_work_num);
const index_t n_block_work_id = get_block_1d_id() - m_block_work_id * n_block_work_num;
#endif
const index_t m_block_data_on_global = m_block_work_id * MPerBlock;
const index_t n_block_data_on_global = n_block_work_id * NPerBlock;
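
A small host-side illustration of the block-work decomposition above (hypothetical sizes, not repo code): every input to these divisions comes from get_block_1d_id() or the problem sizes, which are identical across all lanes of a wavefront, which is why pinning the results to SGPRs with readfirstlane is safe here.

#include <cassert>

int main()
{
    const int M = 256, N = 512, MPerBlock = 128, NPerBlock = 128;

    const int m_block_work_num = M / MPerBlock; // 2 tile rows
    const int n_block_work_num = N / NPerBlock; // 4 tile columns

    const int block_id        = 5; // stand-in for get_block_1d_id()
    const int m_block_work_id = block_id / n_block_work_num;                   // 1
    const int n_block_work_id = block_id - m_block_work_id * n_block_work_num; // 1

    // block 5 works on the tile at (row 1, column 1)
    assert(m_block_work_id == 1 && n_block_work_id == 1);
    return 0;
}
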
......@@ -505,10 +518,13 @@ struct GridwiseDynamicGemm_km_kn_mn_v1r2
AddressSpace::Lds,
InMemoryDataOperation::Set,
1,
1>(a_k_m_global_desc,
make_multi_index(0, m_block_data_on_global),
a_k_m_block_desc,
make_multi_index(0, 0));
1,
true,
true>(
a_k_m_global_desc,
make_multi_index(0, m_block_data_on_global),
a_k_m_block_desc,
make_multi_index(0, 0));
// B matrix blockwise copy
auto b_block_copy =
......@@ -531,10 +547,17 @@ struct GridwiseDynamicGemm_km_kn_mn_v1r2
AddressSpace::Lds,
InMemoryDataOperation::Set,
1,
1>(b_k_n_global_desc,
make_multi_index(0, n_block_data_on_global),
b_k_n_block_desc,
make_multi_index(0, 0));
1,
#if 0
true,
#else
false,
#endif
true>(
b_k_n_global_desc,
make_multi_index(0, n_block_data_on_global),
b_k_n_block_desc,
make_multi_index(0, 0));
// GEMM definition
// c_mtx += transpose(a_mtx) * b_mtx
......@@ -599,7 +622,14 @@ struct GridwiseDynamicGemm_km_kn_mn_v1r2
threadwise_matrix_set_zero(c_m0m1_n0n1_thread_mtx_desc, p_c_thread);
constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock, 0);
#if 0
constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock, 0);
#else
// HACK: fuse the threadwise copy's coordinate move-back with the src slice-window move
constexpr auto b_block_slice_copy_step =
b_block_copy.threadwise_read_.GetCoordinateStepBack() + make_multi_index(KPerBlock, 0);
#endif
// LDS double buffer: preload data into LDS
{
......
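
To illustrate the fusion in the hunk above, a simplified, hypothetical sketch (the real multi-index arithmetic lives in the tensor-coordinate helpers): instead of first moving the threadwise coordinate back to the slice-window origin and then advancing the window by KPerBlock, the two steps are summed once and applied as a single coordinate move per iteration.

#include <array>
#include <cassert>

using MultiIndex = std::array<int, 2>;

MultiIndex add(const MultiIndex& a, const MultiIndex& b) { return {a[0] + b[0], a[1] + b[1]}; }

int main()
{
    const MultiIndex step_back       = {-3, -7}; // undo the in-window traversal of the last Run()
    const MultiIndex slice_copy_step = {8, 0};   // advance the window by KPerBlock = 8 along K

    MultiIndex coord = {3, 7}; // coordinate left behind after the traversal

    // unfused: coord = add(coord, step_back); coord = add(coord, slice_copy_step);
    // fused:   one move with the precomputed combined step
    coord = add(coord, add(step_back, slice_copy_step));

    assert(coord[0] == 8 && coord[1] == 0);
    return 0;
}
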
......@@ -74,6 +74,14 @@
#define CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_INPUT_SKIP_OUT_OF_BOUND_CHECK 0
#endif
// hack: has underlying assumptions that need to be satisfied, otherwise it's a bug
// hack for forcing the register allocator to keep idx_diff_low_const in an SGPR. idx_diff_low_const
// must be thread-invariant, otherwise it's a bug
// TODO: separate index calculation into "compile-time", "global", "block", "wave", "thread"
#ifndef CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
#define CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE 0
#endif
// workaround: put all workaround here
// workaround for unnecessary VGPR <--> AGPR data movement when using mfma LLVM intrinsic
#ifndef CK_WORKAROUND_SWDEV_229564
......