Add A/B not use LDS pipeline

6e2c6159 · aska-0096 · 9e1091cd · 6e2c6159 · 6e2c6159
Commit 6e2c6159 authored Apr 27, 2023 by aska-0096
Showing with 101 additions and 7 deletions

example/01_gemm/gemm_wmma_fp16.cpp example/01_gemm/gemm_wmma_fp16.cpp +7 -7

include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp ...k/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp +94 -0

No files found.
--- a/example/01_gemm/gemm_wmma_fp16.cpp
+++ b/example/01_gemm/gemm_wmma_fp16.cpp
@@ -36,23 +36,23 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmWmma_CShuffle
           CElementOp,    
           GemmDefault, 
           1,           // Prefetch stage
-           128,         // BlockSize
+           256,         // BlockSize
           128,         // MPerBlock
-           128,         // NPerBlock
+           256,         // NPerBlock
           64,          // KPerBlock
           8,           // K1
           16,          // MPerWmma
           16,          // NPerWmma
-           8,           // M-Repeat // M-PerWmma / M-Repeat = M-Wave
+           4,           // M-Repeat // M-PerWmma / M-Repeat = M-Wave
-           2,           // N-Repeat // N-PerWmma / N-Repeat = N-Wave
+           4,           // N-Repeat // N-PerWmma / N-Repeat = N-Wave
-           S<4, 32, 1>,     
+           S<4, 64, 1>,     
           S<1, 0, 2>,     
           S<1, 0, 2>,              
           2,              
           8,              
           8,      
           true,     
-           S<4, 32, 1>,     
+           S<4, 64, 1>,     
           S<1, 0, 2>,     
           S<1, 0, 2>,             
           2,              
@@ -61,7 +61,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmWmma_CShuffle
           true,           
           1,           // C shuffle (M Repeat) Per store
           1,           // C shuffle (N Repeat) Per store
-           S<1, 16, 1,  8>,               
+           S<1, 32, 1,  8>,               
           8>;
 // clang-format on

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp
@@ -462,6 +462,100 @@ struct GridwiseGemmPipeline_v1<1, true, false>
 template <>
 struct GridwiseGemmPipeline_v1<1, false, false>
 {
+    static constexpr auto I0 = Number<0>{};
+    static constexpr auto I1 = Number<1>{};
+    __host__ __device__ static constexpr bool IsSupported(index_t /* num_loop */) { return true; }
+    __host__ __device__ static constexpr bool CalculateHasMainLoop(index_t num_loop)
+    {
+        return num_loop > 1;
+    }
+    template <bool HasMainLoop,
+              typename AGridDesc,
+              typename ABlockDesc,
+              typename ABlockTransfer,
+              typename AGridBuffer,
+              typename ABlockBuffer,
+              typename ABlockTransferStep,
+              typename BGridDesc,
+              typename BBlockDesc,
+              typename BBlockTransfer,
+              typename BGridBuffer,
+              typename BBlockBuffer,
+              typename BBlockTransferStep,
+              typename BlockwiseGemm,
+              typename CThreadBuffer>
+    __device__ static void Run(const AGridDesc& a_grid_desc,
+                               const ABlockDesc& a_block_desc,
+                               ABlockTransfer& a_blockwise_copy,
+                               const AGridBuffer& a_grid_buf,
+                               ABlockBuffer& a_block_buf,
+                               const ABlockTransferStep& a_block_copy_step,
+                               const BGridDesc& b_grid_desc,
+                               const BBlockDesc& b_block_desc,
+                               BBlockTransfer& b_blockwise_copy,
+                               const BGridBuffer& b_grid_buf,
+                               BBlockBuffer& b_block_buf,
+                               const BBlockTransferStep& b_block_copy_step,
+                               const BlockwiseGemm& blockwise_gemm,
+                               CThreadBuffer& c_thread_buf,
+                               index_t num_loop)
+    {
+        constexpr auto b_block_origin_idx = make_tuple(I0, I0, I0, I0, I0, I0);
+        constexpr auto a_block_origin_idx = make_tuple(I0, I0, I0, I0, I0, I0);
+        auto b_block_buf_switch           = b_block_buf;
+        auto a_block_buf_switch           = a_block_buf;
+        // preload data into LDS
+        a_blockwise_copy.Run(
+            a_grid_desc, a_grid_buf, a_block_desc, a_block_origin_idx, a_block_buf);
+        b_blockwise_copy.Run(
+            b_grid_desc, b_grid_buf, b_block_desc, b_block_origin_idx, b_block_buf);
+        a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+        b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+        // Initialize C
+        c_thread_buf.Clear();
+        // main body
+        if constexpr(HasMainLoop)
+        {
+            index_t i = 0;
+            do
+            {
+                a_blockwise_copy.Run(
+                    a_grid_desc, a_grid_buf, a_block_desc, a_block_origin_idx, a_block_buf_switch);
+                b_blockwise_copy.Run(
+                    b_grid_desc, b_grid_buf, b_block_desc, b_block_origin_idx, b_block_buf_switch);
+                block_sync_lds();
+                blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
+                block_sync_lds();
+                a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+                b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+                a_block_buf = a_block_buf_switch;
+                b_block_buf = b_block_buf_switch;
+                ++i;
+            } while(i < (num_loop - 1));
+        }
+        // tail
+        {
+            block_sync_lds();
+            blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
+            block_sync_lds();
+        }
+    }
 };
 template <index_t NumPrefetch>