Fix the perf issue so that performance is better than the compute pipeline

e5068768 · ThomasNing · bbea596d · e5068768 · e5068768
Commit e5068768 authored Jan 25, 2025 by ThomasNing
Showing with 7 additions and 7 deletions

example/ck_tile/03_gemm/gemm_basic.cpp +3 -3

include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp +4 -4

No files found.
--- a/example/ck_tile/03_gemm/gemm_basic.cpp
+++ b/example/ck_tile/03_gemm/gemm_basic.cpp
@@ -29,8 +29,8 @@ float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s)
    constexpr int kBlockPerCu = 1;

    // This part comes from the Codegen
-    constexpr ck_tile::index_t M_Tile = 128;
-    constexpr ck_tile::index_t N_Tile = 128;
+    constexpr ck_tile::index_t M_Tile = 256;
+    constexpr ck_tile::index_t N_Tile = 256;
    constexpr ck_tile::index_t K_Tile = 32;

    constexpr ck_tile::index_t M_Warp = 2;
@@ -69,7 +69,7 @@ float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s)
            ck_tile::Default2DEpilogueProblem<AccDataType, CDataType, kPadM, kPadN>>>;

    using CodegenGemmTraits =
-        ck_tile::TileGemmTraits<kPadM, kPadN, kPadK, ALayout, BLayout, CLayout, true, 3>;
+        ck_tile::TileGemmTraits<kPadM, kPadN, kPadK, ALayout, BLayout, CLayout, true, 2>;
    using CodegenPipelineProblem = ck_tile::
        GemmPipelineProblem<ADataType, BDataType, AccDataType, CodegenGemmShape, CodegenGemmTraits>;
    using CodegenGemmPipeline =

--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp
@@ -149,8 +149,8 @@ struct GemmPipelineAGmemBGmemCRegV1
                                        const BDramBlockWindowTmp& b_dram_block_window_tmp,
                                        const BElementFunction& b_element_func,
                                        index_t num_loop,
-                                        void* p_smem_0,
-                                        void* p_smem_1) 
+                                        void* __restrict__ p_smem_0,
+                                        void* __restrict__ p_smem_1) 
    {
        static_assert(
            std::is_same_v<ADataType, remove_cvref_t<typename ADramBlockWindowTmp::DataType>> &&
@@ -363,8 +363,8 @@ struct GemmPipelineAGmemBGmemCRegV1
    CK_TILE_DEVICE static auto run(const ADramBlockWindowTmp& a_dram_block_window_tmp,
                                   const BDramBlockWindowTmp& b_dram_block_window_tmp,
                                   index_t num_loop,
-                                   void* p_smem_0,
-                                   void* p_smem_1)
+                                   void* __restrict__ p_smem_0,
+                                   void* __restrict__ p_smem_1)
    {
        return run(
            a_dram_block_window_tmp,