pipline only one for v1r1

9de99bdb · letaoqin · 936cae25 · 9de99bdb · 9de99bdb
Commit 9de99bdb authored Nov 20, 2023 by letaoqin
2 changed files
--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_infer_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_infer_xdl_cshuffle.hpp
@@ -689,10 +689,7 @@ struct GridwiseMultiHeadFlashAttentionInfer_Xdl_CShuffle
        const auto Q_k = a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2);
        // gridwise GEMM pipeline
        // Only supports LoopScheduler::Default
-        const auto gridwise_gemm_pipeline = GridwiseGemmPipeline_v1r1<
+        const auto gridwise_gemm_pipeline = GridwiseGemmPipeline_v1r1<1>{};
-            NumGemmKPrefetchStage>{}; /*GridwiseGemmPipeline_Selector<PipelineVer,
-                NumGemmKPrefetchStage,
-                LoopScheduler::Default>();*/
        const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane(
            (a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2)) /
@@ -956,7 +953,7 @@ struct GridwiseMultiHeadFlashAttentionInfer_Xdl_CShuffle
                acc_thread_buf,
                num_k_block_main_loop,
                p_shared,
-                gemm1_k_block_outer_index == 0 && Q_k <= 64);
+                gemm1_k_block_outer_index == 0 || Q_k > 64);
            // do MNK padding or upper triangular masking
            if constexpr(MaskOutUpperTriangle || PadN)

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1r1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1r1.hpp
@@ -115,265 +115,4 @@ struct GridwiseGemmPipeline_v1r1<1>
    }
 };
-// 2-stage prefetch
-template <>
-struct GridwiseGemmPipeline_v1r1<2>
-{
-    static constexpr auto I0 = Number<0>{};
-    static constexpr auto I1 = Number<1>{};
-    __host__ __device__ static constexpr bool IsSupported(index_t num_loop)
-    {
-        // TODO: improve applicability
-        return num_loop % 2 == 0;
-    }
-    __host__ __device__ static constexpr bool CalculateHasMainLoop(index_t num_loop)
-    {
-        return (num_loop / 2) > 1;
-    }
-    template <bool HasMainLoop,
-              typename AGridDesc,
-              typename ABlockDesc,
-              typename ABlockTransfer,
-              typename AGridBuffer,
-              typename ABlockBuffer,
-              typename ABlockTransferStep,
-              typename BGridDesc,
-              typename BBlockDesc,
-              typename BBlockTransfer,
-              typename BGridBuffer,
-              typename BBlockBuffer,
-              typename BBlockTransferStep,
-              typename BlockwiseGemm,
-              typename CThreadBuffer>
-    static __device__ void Run(const AGridDesc& a_grid_desc,
-                               const ABlockDesc& a_block_desc,
-                               ABlockTransfer& a_blockwise_copy,
-                               const AGridBuffer& a_grid_buf,
-                               ABlockBuffer& a_block_buf,
-                               const ABlockTransferStep& a_block_copy_step,
-                               const BGridDesc& b_grid_desc,
-                               const BBlockDesc& b_block_desc,
-                               BBlockTransfer& b_blockwise_copy,
-                               const BGridBuffer& b_grid_buf,
-                               BBlockBuffer& b_block_buf,
-                               const BBlockTransferStep& b_block_copy_step,
-                               const BlockwiseGemm& blockwise_gemm,
-                               CThreadBuffer& c_thread_buf,
-                               index_t num_loop)
-    {
-        // preload data into LDS
-        {
-            // Read 0
-            a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, I0);
-            b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf, I0);
-            // Move
-            a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
-            b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
-            // Read 1
-            a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, I1);
-            b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf, I1);
-        }
-        // Initialize C
-        c_thread_buf.Clear();
-        // main body
-        if constexpr(HasMainLoop)
-        {
-            index_t i = 0;
-            do
-            {
-                // Move
-                a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
-                b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
-                // Write i
-                a_blockwise_copy.RunWrite(a_block_desc, a_block_buf, I0);
-                b_blockwise_copy.RunWrite(b_block_desc, b_block_buf, I0);
-                // Read i+2
-                a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, I0);
-                b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf, I0);
-                // Sync
-                block_sync_lds();
-                // Gemm i
-                blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
-                // Sync
-                block_sync_lds();
-                // Move
-                a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
-                b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
-                // Write i+1
-                a_blockwise_copy.RunWrite(a_block_desc, a_block_buf, I1);
-                b_blockwise_copy.RunWrite(b_block_desc, b_block_buf, I1);
-                // Read i+3
-                a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, I1);
-                b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf, I1);
-                // Sync
-                block_sync_lds();
-                // Gemm i+1
-                blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
-                // Sync
-                block_sync_lds();
-                i += 2;
-            } while(i < (num_loop - 2));
-        }
-        // tail
-        {
-            // Write num_loop - 2
-            a_blockwise_copy.RunWrite(a_block_desc, a_block_buf, I0);
-            b_blockwise_copy.RunWrite(b_block_desc, b_block_buf, I0);
-            // Sync
-            block_sync_lds();
-            // Gemm num_loop - 2
-            blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
-            // Sync
-            block_sync_lds();
-            // Write num_loop - 1
-            a_blockwise_copy.RunWrite(a_block_desc, a_block_buf, I1);
-            b_blockwise_copy.RunWrite(b_block_desc, b_block_buf, I1);
-            // Sync
-            block_sync_lds();
-            // Gemm num_loop - 1
-            blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
-        }
-    }
-};
-template <index_t NumPrefetch>
-struct GridwiseGemmPipelineInterwave_v1r1;
-template <>
-struct GridwiseGemmPipelineInterwave_v1r1<1>
-{
-    __host__ __device__ static constexpr bool IsSupported(index_t /* num_loop */) { return true; }
-    __host__ __device__ static constexpr bool CalculateHasMainLoop(index_t num_loop)
-    {
-        return num_loop > 1;
-    }
-    template <bool HasMainLoop,
-              typename AGridDesc,
-              typename ABlockDesc,
-              typename ABlockTransfer,
-              typename AGridBuffer,
-              typename ABlockBuffer,
-              typename ABlockTransferStep,
-              typename BGridDesc,
-              typename BBlockDesc,
-              typename BBlockTransfer,
-              typename BGridBuffer,
-              typename BBlockBuffer,
-              typename BBlockTransferStep,
-              typename BlockwiseGemm,
-              typename CThreadBuffer>
-    static __device__ void Run(const AGridDesc& a_grid_desc,
-                               const ABlockDesc& a_block_desc,
-                               ABlockTransfer& a_blockwise_copy,
-                               const AGridBuffer& a_grid_buf,
-                               ABlockBuffer& a_block_buf,
-                               const ABlockTransferStep& a_block_copy_step,
-                               const BGridDesc& b_grid_desc,
-                               const BBlockDesc& b_block_desc,
-                               BBlockTransfer& b_blockwise_copy,
-                               const BGridBuffer& b_grid_buf,
-                               BBlockBuffer& b_block_buf,
-                               const BBlockTransferStep& b_block_copy_step,
-                               const BlockwiseGemm& blockwise_gemm,
-                               CThreadBuffer& c_thread_buf,
-                               index_t num_loop)
-    {
-        // preload data into LDS
-        a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
-        b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
-        a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
-        b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
-        // Initialize C
-        c_thread_buf.Clear();
-        a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
-        b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);
-        // main body
-        if constexpr(HasMainLoop)
-        {
-            index_t i = 0;
-            do
-            {
-                a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
-                block_sync_lds();
-                b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
-                blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
-                // block_sync_lds(); // moved into blockwise_gemm
-                a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
-                b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
-                a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
-                b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);
-                ++i;
-            } while(i < (num_loop - 1));
-        }
-        // tail
-        {
-            block_sync_lds();
-            blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
-        }
-    }
-};
-// Note: 2 stage prefetch not optimized for inter-wave loop scheduler
-template <>
-struct GridwiseGemmPipelineInterwave_v1r1<2> : public GridwiseGemmPipeline_v1r1<2>
-{
-};
-// TODO: deprecate as GridwiseGemmPipeline_Selector covers the functionality
-template <index_t NumPrefetch, LoopScheduler LoopSched>
-constexpr auto GridwiseGemmPipeline_v1r1_Selector()
-{
-    if constexpr(LoopSched == LoopScheduler::Default)
-    {
-        return GridwiseGemmPipeline_v1r1<NumPrefetch>{};
-    }
-    else if constexpr(LoopSched == LoopScheduler::Interwave)
-    {
-        return GridwiseGemmPipelineInterwave_v1r1<NumPrefetch>{};
-    }
-}
 } // namespace ck