merge from public repo

09d4c3a4 · illsilin · 171ed358 · 8e4c3fb1 · 09d4c3a4 · 09d4c3a4
Commit 09d4c3a4 authored Oct 01, 2024 by illsilin
20 changed files
--- a/example/ck_tile/01_fmha/fmha_fwd.cpp
+++ b/example/ck_tile/01_fmha/fmha_fwd.cpp
@@ -552,16 +552,33 @@ bool run(const ck_tile::ArgParser& arg_parser)
    }
 #endif
-    auto get_lengths = [&](bool permute,
+    struct
-                           ck_tile::index_t b /*batch*/,
+    {
-                           ck_tile::index_t h /*nhead*/,
+        auto operator()(bool permute,
-                           ck_tile::index_t s /*seqlen*/,
+                        ck_tile::index_t b /*batch*/,
-                           ck_tile::index_t d /*hdim*/) {
+                        ck_tile::index_t h /*nhead*/,
-        if(permute)
+                        ck_tile::index_t s /*seqlen*/,
-            return std::array<ck_tile::index_t, 4>{b, h, s, d};
+                        ck_tile::index_t d /*hdim*/)
-        else
+        {
-            return std::array<ck_tile::index_t, 4>{b, s, h, d};
+            if(permute)
-    };
+                return std::array<ck_tile::index_t, 4>{b, h, s, d};
+            else
+                return std::array<ck_tile::index_t, 4>{b, s, h, d};
+        }
+        auto operator()(bool permute,
+                        ck_tile::index_t ns /*num_splits*/,
+                        ck_tile::index_t b /*batch*/,
+                        ck_tile::index_t h /*nhead*/,
+                        ck_tile::index_t s /*seqlen*/,
+                        ck_tile::index_t d /*hdim*/)
+        {
+            if(permute)
+                return std::array<ck_tile::index_t, 5>{ns, b, h, s, d};
+            else
+                return std::array<ck_tile::index_t, 5>{ns, b, s, h, d};
+        }
+    } get_lengths;
    bool is_v_rowmajor = vlayout == std::string("r");
@@ -617,7 +634,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
            : std::array<ck_tile::index_t, 4>{1, 1, 1, 1});
    ck_tile::HostTensor<OaccDataType> o_acc_host(
        1 < num_splits || use_kvcache
-            ? std::array<ck_tile::index_t, 5>{num_splits, batch, nhead, max_seqlen_q, hdim_v}
+            ? get_lengths(o_perm, num_splits, shape_batch, nhead, shape_seqlen_q, hdim_v)
            : std::array<ck_tile::index_t, 5>{1, 1, 1, 1, 1});
    // batch mode of lse data layout is [batch, nhead, seqlen_q]
@@ -854,7 +871,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
        }();
        const ck_tile::index_t stride_bias    = (i_perm ? shape_seqlen_k : 1 * shape_seqlen_k);
        const ck_tile::index_t stride_randval = (max_seqlen_k);
-        const ck_tile::index_t stride_o_acc   = hdim_v;
+        const ck_tile::index_t stride_o_acc   = (o_perm ? hdim_v : nhead * hdim_v);
        const ck_tile::index_t stride_o       = (o_perm ? hdim_v : nhead * hdim_v);
        // setup nhead_stride_* arguments
        const ck_tile::index_t nhead_stride_q = (i_perm ? shape_seqlen_q * hdim_q : hdim_q);
@@ -881,7 +898,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
        const ck_tile::index_t nhead_stride_randval = (shape_seqlen_q * max_seqlen_k);
        const ck_tile::index_t nhead_stride_lse     = shape_seqlen_q;
        const ck_tile::index_t nhead_stride_lse_acc = shape_seqlen_q;
-        const ck_tile::index_t nhead_stride_o_acc   = (max_seqlen_q * hdim_v);
+        const ck_tile::index_t nhead_stride_o_acc   = (o_perm ? shape_seqlen_q * hdim_v : hdim_v);
        const ck_tile::index_t nhead_stride_o       = (o_perm ? shape_seqlen_q * hdim_v : hdim_v);
        // setup batch_stride_* arguments
        const ck_tile::index_t batch_stride_q = (nhead * shape_seqlen_q * hdim_q);
@@ -897,12 +914,12 @@ bool run(const ck_tile::ArgParser& arg_parser)
        const ck_tile::index_t batch_stride_randval = (nhead * shape_seqlen_q * max_seqlen_k);
        const ck_tile::index_t batch_stride_lse     = (nhead * shape_seqlen_q);
        const ck_tile::index_t batch_stride_lse_acc = (nhead * shape_seqlen_q);
-        const ck_tile::index_t batch_stride_o_acc   = (nhead * max_seqlen_q * hdim_v);
+        const ck_tile::index_t batch_stride_o_acc   = (nhead * shape_seqlen_q * hdim_v);
        const ck_tile::index_t batch_stride_o       = (nhead * shape_seqlen_q * hdim_v);
        const ck_tile::index_t batch_stride_block_table = (max_num_page_blocks / batch);
        // setup split_stride_* arguments (only used in split-kv kernel)
        const ck_tile::index_t split_stride_lse_acc = (shape_batch * nhead * shape_seqlen_q);
-        const ck_tile::index_t split_stride_o_acc   = (batch * nhead * max_seqlen_q * hdim_v);
+        const ck_tile::index_t split_stride_o_acc = (shape_batch * nhead * shape_seqlen_q * hdim_v);
        args.q_ptr = q_buf.GetDeviceBuffer();
        args.k_ptr = k_buf.GetDeviceBuffer();

--- a/example/ck_tile/01_fmha/fmha_fwd.hpp
+++ b/example/ck_tile/01_fmha/fmha_fwd.hpp
@@ -398,10 +398,8 @@ auto fmha_fwd_splitkv_create_kargs_and_grids(fmha_fwd_splitkv_args args)
                                     args.nhead_stride_bias,
                                     args.nhead_stride_lse_acc,
                                     args.nhead_stride_o_acc,
-                                     args.batch_stride_k,
+                                     args.batch_stride_k, // only used for paged-kvcache
-                                     args.batch_stride_v,
+                                     args.batch_stride_v, // only used for paged-kvcache
-                                     args.batch_stride_lse_acc,
-                                     args.batch_stride_o_acc,
                                     args.split_stride_lse_acc,
                                     args.split_stride_o_acc,
                                     args.window_size_left,
@@ -475,7 +473,6 @@ auto fmha_fwd_splitkv_combine_create_kargs_and_grids(fmha_fwd_splitkv_args args)
                                     args.lse_ptr,
                                     args.o_ptr,
                                     args.batch,
-                                     args.max_seqlen_q,
                                     args.seqstart_q_ptr,
                                     args.hdim_v,
                                     args.num_splits,
@@ -486,7 +483,6 @@ auto fmha_fwd_splitkv_combine_create_kargs_and_grids(fmha_fwd_splitkv_args args)
                                     args.nhead_stride_o_acc,
                                     args.nhead_stride_lse,
                                     args.nhead_stride_o,
-                                     args.batch_stride_o_acc,
                                     args.split_stride_lse_acc,
                                     args.split_stride_o_acc);
        }
@@ -497,7 +493,6 @@ auto fmha_fwd_splitkv_combine_create_kargs_and_grids(fmha_fwd_splitkv_args args)
                                     args.lse_ptr,
                                     args.o_ptr,
                                     args.batch,
-                                     args.max_seqlen_q,
                                     args.seqlen_q,
                                     args.hdim_v,
                                     args.num_splits,

--- a/example/ck_tile/03_gemm/CMakeLists.txt
+++ b/example/ck_tile/03_gemm/CMakeLists.txt
+# Do not force CMAKE_BUILD_TYPE here: the build type is chosen by the top-level configuration
+# (e.g. script/cmake-ck-dev.sh); pinning it to Debug would make this benchmark example unoptimized.
+# Not using add_example_executable(): the target is excluded from "make all/install/check".
+add_executable(tile_example_gemm_basic EXCLUDE_FROM_ALL gemm_basic.cpp)
\ No newline at end of file
--- a/example/ck_tile/03_gemm/README.md
+++ b/example/ck_tile/03_gemm/README.md
+# GEMM Matrix Multiplication
+This folder contains an example of GEMM using the ck_tile tile-programming implementation. Currently, it only supports the basic features of the CK Tile GEMM, but it creates placeholders for future support of different GEMM pipelines and different GEMM modules. In the near future, we will gradually migrate all the GEMM features from the old CK to CK Tile.
+## build
+```
+# in the root of ck_tile
+mkdir build && cd build
+sh ../script/cmake-ck-dev.sh  ../ <arch>  # replace <arch> with your GPU architecture, e.g. gfx90a, gfx942...
+make tile_example_gemm_basic -j
+```
+This will result in an executable `build/bin/tile_example_gemm_basic`
+## example
+```
+args:
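+          -b    batch size (default:1)
+          -m    m dimension (default:1024)
+          -n    n dimension (default:2048)
+          -k    k dimension (default:64)
+   -stride_a    Tensor A stride (default:0)
+   -stride_b    Tensor B stride (default:0)
+   -stride_c    Tensor C stride (default:0)
+          -v    0. No validation, 1. Validation on CPU, 2. Validation on GPU (default:2)
+          -e    Absolute error tolerance (default:1e-5)
+       -prec    data type (default:fp16; the example currently instantiates fp16 only)
+     -warmup    number of iterations before benchmark the kernel (default:10)
+     -repeat    number of iterations to benchmark the kernel (default:100)
+      -timer    gpu:gpu timer, cpu:cpu timer (default:gpu)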
+```
--- a/example/ck_tile/03_gemm/gemm_basic.cpp
+++ b/example/ck_tile/03_gemm/gemm_basic.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+#include "gemm_basic.hpp"
+#include <hip/hip_runtime.h>
+#include <cstring>
+#include <iostream>
+#include <ostream>
+#include <string>
+#include <tuple>
+// Registers every command-line option of the GEMM example together with its default value,
+// then parses argv. Returns a tuple of (parse succeeded, populated parser).
+auto create_args(int argc, char* argv[])
+{
+    ck_tile::ArgParser arg_parser;
+    arg_parser
+        // problem size
+        .insert("b", "1", "batch size")
+        .insert("m", "1024", "m dimension")
+        .insert("n", "2048", "n dimension")
+        .insert("k", "64", "k dimension")
+        // leading dimensions; 0 means "derive from the layout"
+        .insert("stride_a", "0", "Tensor A stride")
+        .insert("stride_b", "0", "Tensor B stride")
+        .insert("stride_c", "0", "Tensor C stride")
+        // validation and data type
+        .insert("v", "2", "0. No validation, 1. Validation on CPU, 2. Validation on GPU")
+        .insert("e", "1e-5", "Absolute error tolerance")
+        .insert("prec", "fp16", "data type. fp16/bf16/fp8/bf8")
+        // benchmarking
+        .insert("warmup", "10", "number of iterations before benchmark the kernel")
+        .insert("repeat", "100", "number of iterations to benchmark the kernel")
+        .insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer");
+    const bool parsed_ok = arg_parser.parse(argc, argv);
+    return std::make_tuple(parsed_ok, arg_parser);
+}
+// Assembles the ck_tile GEMM kernel type for the requested layouts, pipeline and tile shape,
+// builds its kernel arguments from `args` and launches it on stream config `s`.
+// Returns the value reported by ck_tile::launch_kernel (printed by the caller as time in ms).
+template <typename LayoutA,
+          typename LayoutB,
+          typename LayoutC,
+          typename PipelineProblem,
+          typename GemmPipeline,
+          typename GemmShape>
+float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s)
+{
+    // The padding flags and kBlockPerCu should eventually come from the Codegen part.
+    constexpr bool kPadA      = true;
+    constexpr bool kPadB      = true;
+    constexpr int kBlockPerCu = 1;
+
+    using TilePartitioner = ck_tile::GemmTilePartitioner<GemmShape>;
+    using EpilogueProblem =
+        ck_tile::Default2DEpilogueProblem<AccDataType, CDataType, kPadA, kPadB>;
+    using GemmEpilogue = ck_tile::Default2DEpilogue<EpilogueProblem>;
+    // ToDo: Will add the codegen part to test different pipeline policies in GEMM.
+    // Now we only use the BlockGemmASmemBSmemCRegV1DefaultPolicy.
+    using Kernel =
+        ck_tile::GemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue, LayoutA, LayoutB, LayoutC>;
+
+    auto kernel_args = Kernel::MakeKargs(args.p_a,
+                                         args.p_b,
+                                         args.p_c,
+                                         args.epsilon,
+                                         args.M,
+                                         args.N,
+                                         args.K,
+                                         args.stride_A,
+                                         args.stride_B,
+                                         args.stride_C);
+
+    // kbatch is forwarded as the third grid-size argument.
+    const dim3 grid_dim = Kernel::GridSize(args.M, args.N, args.kbatch);
+    // Must stay constexpr: block_dim.x is used as a template argument below.
+    constexpr dim3 block_dim = Kernel::BlockSize();
+
+    return ck_tile::launch_kernel(
+        s,
+        ck_tile::make_kernel<block_dim.x, kBlockPerCu>(Kernel{}, grid_dim, block_dim, 0, kernel_args));
+}
+// Runs the tile GEMM on the given device buffers using the problem description taken from the
+// command line, then prints the measured time and the resulting memory throughput.
+// Returns the time reported by gemm_calc, or -1 when the "prec" argument does not name DataType.
+template <typename DataType,
+          typename LayoutA,
+          typename LayoutB,
+          typename LayoutC,
+          typename PipelineProblem,
+          typename GemmPipeline,
+          typename GemmShape>
+float invoke_gemm(ck_tile::DeviceMem& a_buf,
+                  ck_tile::DeviceMem& b_buf,
+                  ck_tile::DeviceMem& c_buf,
+                  const ck_tile::ArgParser& arg_parser)
+{
+    const std::string data_type = arg_parser.get_str("prec");
+    if(data_type != DataTypeTraits<DataType>::name)
+    {
+        std::cerr << "Data type mismatch: expected " << DataTypeTraits<DataType>::name << ", got "
+                  << data_type << std::endl;
+        return -1; // signal the mismatch to the caller
+    }
+
+    const float epsilon               = arg_parser.get_float("e");
+    const ck_tile::index_t batch_size = arg_parser.get_int("b");
+    const ck_tile::index_t M          = arg_parser.get_int("m");
+    const ck_tile::index_t N          = arg_parser.get_int("n");
+    const ck_tile::index_t K          = arg_parser.get_int("k");
+    const ck_tile::index_t stride_a   = arg_parser.get_int("stride_a");
+    const ck_tile::index_t stride_b   = arg_parser.get_int("stride_b");
+    const ck_tile::index_t stride_c   = arg_parser.get_int("stride_c");
+
+    using RowMajor    = ck_tile::tensor_layout::gemm::RowMajor;
+    using ColumnMajor = ck_tile::tensor_layout::gemm::ColumnMajor;
+    constexpr bool a_is_col_major = std::is_same_v<LayoutA, ColumnMajor>;
+    constexpr bool b_is_row_major = std::is_same_v<LayoutB, RowMajor>;
+    constexpr bool c_is_col_major = std::is_same_v<LayoutC, ColumnMajor>;
+
+    gemm_basic_args args;
+    args.p_a     = a_buf.GetDeviceBuffer();
+    args.p_b     = b_buf.GetDeviceBuffer();
+    args.p_c     = c_buf.GetDeviceBuffer();
+    args.epsilon = epsilon;
+    args.kbatch  = batch_size;
+    args.M       = M;
+    args.N       = N;
+    args.K       = K;
+    // A stride of 0 on the command line means "not given": fall back to the dimension implied
+    // by the tensor's layout. Any non-zero value is used verbatim.
+    args.stride_A = (stride_a != 0) ? stride_a : (a_is_col_major ? M : K);
+    args.stride_B = (stride_b != 0) ? stride_b : (b_is_row_major ? N : K);
+    args.stride_C = (stride_c != 0) ? stride_c : (c_is_col_major ? M : N);
+
+    const ck_tile::stream_config stream_cfg{nullptr, true};
+    float ave_time =
+        gemm_calc<LayoutA, LayoutB, LayoutC, PipelineProblem, GemmPipeline, GemmShape>(args,
+                                                                                       stream_cfg);
+
+    // Bytes moved: A and B are read once, C is written once.
+    std::size_t num_byte =
+        sizeof(ADataType) * M * K + sizeof(BDataType) * N * K + sizeof(CDataType) * M * N;
+    float gb_per_sec = num_byte / 1.E6 / ave_time;
+
+    std::cout << "The overall perfomance of the GEMM with "
+              << "[" << data_type << "]"
+              << "batch size: " << batch_size << ". m:" << M << ", n:" << N << ", k:" << K
+              << " is: \n";
+    std::cout << "Running time: " << ave_time << "ms, Throughput " << gb_per_sec << "GB/s \n"
+              << std::flush;
+    return ave_time;
+}
+// Entry point of the basic GEMM example.
+// Parses the command line, fills A and B with random values, runs the tile GEMM kernel and,
+// depending on "-v", validates the result against the CPU reference (1) or the GPU reference (2).
+// Returns 0 on success, -1 on argument/precision errors and 1 when a validation fails.
+int main(int argc, char* argv[])
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+        return -1;
+    ck_tile::index_t M = arg_parser.get_int("m");
+    ck_tile::index_t N = arg_parser.get_int("n");
+    ck_tile::index_t K = arg_parser.get_int("k");
+    // The Matrix Multiplication goes with Matrix A (M, K), Matrix B (N, K) = Matrix C (M, N).
+    using matrix_a_layout = ck_tile::tensor_layout::gemm::RowMajor;
+    using matrix_b_layout = ck_tile::tensor_layout::gemm::ColumnMajor;
+    using matrix_c_layout = ck_tile::tensor_layout::gemm::RowMajor;
+    // host verify
+    std::vector<int> a_dimensions =
+        (std::is_same_v<matrix_a_layout, ck_tile::tensor_layout::gemm::RowMajor>)
+            ? std::vector<int>{M, K}
+            : std::vector<int>{K, M};
+    std::vector<int> b_dimensions =
+        (std::is_same_v<matrix_b_layout, ck_tile::tensor_layout::gemm::ColumnMajor>)
+            ? std::vector<int>{N, K}
+            : std::vector<int>{K, N};
+    std::vector<int> c_dimensions =
+        (std::is_same_v<matrix_c_layout, ck_tile::tensor_layout::gemm::RowMajor>)
+            ? std::vector<int>{M, N}
+            : std::vector<int>{N, M};
+    ck_tile::HostTensor<ADataType> a_host(a_dimensions);
+    ck_tile::HostTensor<BDataType> b_host(b_dimensions);
+    ck_tile::HostTensor<CDataType> c_host_ref(c_dimensions);
+    ck_tile::HostTensor<CDataType> c_host_dev(c_dimensions);
+    ck_tile::FillUniformDistribution<ADataType>{-5.f, 5.f}(a_host);
+    ck_tile::FillUniformDistribution<BDataType>{-5.f, 5.f}(b_host);
+    ck_tile::DeviceMem a_buf(a_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem b_buf(b_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem c_buf(c_host_dev.get_element_space_size_in_bytes());
+    a_buf.ToDevice(a_host.data());
+    b_buf.ToDevice(b_host.data());
+    // The kPadA, kPadB, kPadC & kBlockPerCu should also come from the Codegen part.
+    constexpr bool kPadA = true;
+    constexpr bool kPadB = true;
+    constexpr bool kPadC = true;
+    // This part comes from the Codegen
+    constexpr ck_tile::index_t M_Tile = 128;
+    constexpr ck_tile::index_t N_Tile = 128;
+    constexpr ck_tile::index_t K_Tile = 32;
+    constexpr ck_tile::index_t M_Warp = 2;
+    constexpr ck_tile::index_t N_Warp = 2;
+    constexpr ck_tile::index_t K_Warp = 1;
+    constexpr ck_tile::index_t M_Warp_Tile = 32;
+    constexpr ck_tile::index_t N_Warp_Tile = 32;
+    constexpr ck_tile::index_t K_Warp_Tile = 8;
+    using CodegenGemmShape =
+        ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
+                               ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
+                               ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>;
+    using CodegenPipelineProblem = ck_tile::BlockGemmPipelineProblem<ADataType,
+                                                                     BDataType,
+                                                                     AccDataType,
+                                                                     CodegenGemmShape,
+                                                                     kPadA,
+                                                                     kPadB,
+                                                                     kPadC>;
+    using CodegenGemmPipeline = ck_tile::BlockGemmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem>;
+    const float ave_time = invoke_gemm<ck_tile::half_t,
+                                       matrix_a_layout,
+                                       matrix_b_layout,
+                                       matrix_c_layout,
+                                       CodegenPipelineProblem,
+                                       CodegenGemmPipeline,
+                                       CodegenGemmShape>(a_buf, b_buf, c_buf, arg_parser);
+    // invoke_gemm returns -1 when "-prec" does not match the instantiated data type; no kernel
+    // was launched in that case, so there is nothing to verify.
+    if(ave_time < 0)
+        return -1;
+    c_buf.FromDevice(c_host_dev.data());
+    bool pass_cpu = true;
+    if(arg_parser.get_int("v") == 1)
+    {
+        // ToDo: Will Add the Element Op (bias) verification in the future.
+        ck_tile::reference_gemm<ADataType,
+                                BDataType,
+                                AccDataType,
+                                CDataType,
+                                matrix_a_layout,
+                                matrix_b_layout,
+                                matrix_c_layout>(a_host, b_host, c_host_ref);
+        pass_cpu = ck_tile::check_err(c_host_dev, c_host_ref);
+        std::cout << "The CPU veification result is:" << (pass_cpu ? "correct" : "fail")
+                  << std::flush;
+    }
+    bool pass_gpu = true;
+    if(arg_parser.get_int("v") == 2)
+    {
+        ck_tile::index_t stride_a = arg_parser.get_int("stride_a");
+        ck_tile::index_t stride_b = arg_parser.get_int("stride_b");
+        ck_tile::index_t stride_c = arg_parser.get_int("stride_c");
+        // A stride of 0 means "not given": derive it from the layout, as invoke_gemm does.
+        if(stride_a == 0)
+        {
+            if constexpr(std::is_same_v<matrix_a_layout, ck_tile::tensor_layout::gemm::ColumnMajor>)
+            {
+                stride_a = M;
+            }
+            else
+            {
+                stride_a = K;
+            }
+        }
+        if(stride_b == 0)
+        {
+            if constexpr(std::is_same_v<matrix_b_layout, ck_tile::tensor_layout::gemm::RowMajor>)
+            {
+                stride_b = N;
+            }
+            else
+            {
+                stride_b = K;
+            }
+        }
+        if(stride_c == 0)
+        {
+            if constexpr(std::is_same_v<matrix_c_layout, ck_tile::tensor_layout::gemm::ColumnMajor>)
+            {
+                stride_c = M;
+            }
+            else
+            {
+                stride_c = N;
+            }
+        }
+        ck_tile::HostTensor<CDataType> c_host_gpu_ref(c_dimensions);
+        ck_tile::DeviceMem c_gpu_buf(c_host_gpu_ref.get_element_space_size_in_bytes());
+        ck_tile::reference_gemm_gpu<ADataType, BDataType, AccDataType, CDataType>(
+            a_buf, b_buf, c_gpu_buf, M, N, K, stride_a, stride_b, stride_c);
+        // Read back the reference result from the buffer the reference kernel wrote to.
+        // Reading c_buf here would compare the kernel output with itself and always pass.
+        c_gpu_buf.FromDevice(c_host_gpu_ref.data());
+        pass_gpu = ck_tile::check_err(c_host_dev, c_host_gpu_ref);
+        std::cout << "The GPU veification result is: " << (pass_gpu ? "correct" : "fail")
+                  << std::flush;
+    }
+    std::cout << std::endl << std::flush;
+    // Only one of the two checks runs per invocation; the other flag keeps its default "true".
+    return !(pass_cpu && pass_gpu);
+}
--- a/example/ck_tile/03_gemm/gemm_basic.hpp
+++ b/example/ck_tile/03_gemm/gemm_basic.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include "ck_tile/core.hpp"
+#include "ck_tile/host/kernel_launch.hpp"
+#include "ck_tile/ops/epilogue.hpp"
+#include "ck_tile/ops/gemm.hpp"
+#include "ck_tile/host.hpp"
+#include <string>
+template <typename DataType>
+struct GemmBasicTypeConfig; // primary template is declared only; supported element types specialize it
+template <>
+struct GemmBasicTypeConfig<ck_tile::half_t> // fp16 configuration of the example
+{
+    using ADataType   = ck_tile::half_t; // element type of matrix A
+    using BDataType   = ck_tile::half_t; // element type of matrix B
+    using AccDataType = float;           // accumulator type handed to the pipeline/epilogue problems
+    using CDataType   = ck_tile::half_t; // element type of matrix C (converted from AccDataType)
+    // ToDo: Add more bias config to support different categories of GEMM.
+};
+template <typename T>
+struct DataTypeTraits; // maps an element type to the string accepted by the "-prec" option; declared only
+template <>
+struct DataTypeTraits<float>
+{
+    static constexpr const char* name = "fp32"; // compared against "-prec" in invoke_gemm
+};
+template <>
+struct DataTypeTraits<double>
+{
+    static constexpr const char* name = "fp64";
+};
+template <>
+struct DataTypeTraits<ck_tile::half_t>
+{
+    static constexpr const char* name = "fp16"; // the only precision main() currently instantiates
+};
+using Types = GemmBasicTypeConfig<ck_tile::half_t>;
+// Specific type aliases for easy access
+using ADataType   = Types::ADataType;
+using BDataType   = Types::BDataType;
+using AccDataType = Types::AccDataType;
+using CDataType   = Types::CDataType;
+struct gemm_basic_args // host-side argument bundle that gemm_calc forwards to Kernel::MakeKargs / GridSize
+{
+    const void* p_a; // input matrix A; invoke_gemm sets it from DeviceMem::GetDeviceBuffer()
+    const void* p_b; // input matrix B; set the same way
+    void* p_c;       // output matrix C; set the same way
+    float epsilon;   // value of the "-e" option (absolute error tolerance), passed through to MakeKargs
+    ck_tile::index_t kbatch; // value of the "-b" option; used as the third GridSize argument
+    ck_tile::index_t M; // value of the "-m" option
+    ck_tile::index_t N; // value of the "-n" option
+    ck_tile::index_t K; // value of the "-k" option
+    ck_tile::index_t stride_A; // "-stride_a", or a layout-derived default when that option is 0
+    ck_tile::index_t stride_B; // "-stride_b", or a layout-derived default when that option is 0
+    ck_tile::index_t stride_C; // "-stride_c", or a layout-derived default when that option is 0
+};
+// host API
+float gemm_calc(gemm_basic_args args, const ck_tile::stream_config& s); // NOTE(review): gemm_basic.cpp defines gemm_calc as a function template taking const gemm_basic_args&; this by-value, non-template declaration has no matching definition there - confirm whether it is still needed
--- a/example/ck_tile/03_gemm/script/run_full_test.sh
+++ b/example/ck_tile/03_gemm/script/run_full_test.sh
+#!/bin/bash 
+#
+# in order to run this script you'd first need to build the tile_example_gemm executables in ../build/bin/
+#
+# run the script as "./run_full_test.sh <tag for your test environment> <branch name> <host name> <gpu_arch>"
+# input arguments: 
+# environment tag  : a string describing the specifics of your test environment
+# branch name      : name of the branch in git repo (git status | grep -e 'On branch')
+# host name        : $hostname
+# gpu architecture: e.g., gfx90a, or gfx942, etc.
+# get the command line arguments:
+export env_type=$1 # 1st argument: environment tag
+echo 'Environment type: ' $env_type
+export branch=$2 # 2nd argument: branch name
+echo 'Branch name: ' $branch
+export host_name=$3 # 3rd argument: host name
+echo 'Host name: ' $host_name
+export GPU_arch=$4 # 4th argument: GPU architecture
+echo 'GPU_arch: ' $GPU_arch
+# run verification tests
+example/ck_tile/03_gemm/script/smoke_test.sh # relative path: presumably expects the repository root as working directory - confirm
+# We do not have a performance benchmark for gemm yet. Will add it in the future.
\ No newline at end of file
--- a/example/ck_tile/03_gemm/script/smoke_test.sh
+++ b/example/ck_tile/03_gemm/script/smoke_test.sh
+#!/bin/bash
+EXE="$(find . -name tile_example_gemm_basic -type f | head -n 1)"
+KNAME=1
+export CK_WARMUP=0
+export CK_REPEAT=1
+COMMON_ARGS='-v=2 -warmup=0 -repeat=1'
+run_fp16_tests() {
+    for batch in 1 2; do
+        for m in 128 1024; do
+            for n in 128 2048; do
+                for k in 32 64; do
+                    $EXE -b=$batch -m=$m -n=$n -k=$k -stride_a=0 -stride_b=0 -stride_c=0 -e=1e-5 -prec=fp16 $COMMON_ARGS
+                    if [ $? -eq 0 ]; then
+                        echo "Success: Test with batch=$batch, m=$m, n=$n, k=$k executed successfully."
+                    else
+                        echo "Error: Test with batch=$batch, m=$m, n=$n, k=$k failed to execute properly."
+                        # Optionally, exit or break if you need to halt further execution
+                        # exit 1
+                    fi
+                done
+            done
+        done
+    done
+}
+set -x
+run_fp16_tests
+set +x
\ No newline at end of file
--- a/example/ck_tile/04_img2col/CMakeLists.txt
+++ b/example/ck_tile/04_img2col/CMakeLists.txt
+# not using add_example_executable() to add this target, since we don't want this to have
+# to be included in "make all/install/check"
+add_executable(tile_example_img2col EXCLUDE_FROM_ALL image_to_column.cpp)
--- a/example/ck_tile/04_img2col/README.md
+++ b/example/ck_tile/04_img2col/README.md
+# Image to Column
+This folder contains an example of Image to Column using the ck_tile tile-programming implementation.
+## build
+```
+# in the root of ck_tile
+mkdir build && cd build
+sh ../script/cmake-ck-dev.sh  ../ <arch>  # replace <arch> with your GPU architecture, e.g. gfx90a, gfx942...
+make tile_example_img2col -j
+```
+This will result in an executable `build/bin/tile_example_img2col`
--- a/example/ck_tile/04_img2col/image_to_column.cpp
+++ b/example/ck_tile/04_img2col/image_to_column.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+#include <algorithm>
+#include <cstring>
+#include "ck_tile/host.hpp"
+#include "image_to_column.hpp"
+// Host API implementation
+// 2D specialization of the host API: builds the fp16 ImageToColumn kernel, launches it with the
+// given stream config and returns the value reported by ck_tile::launch_kernel.
+// Any data type other than "fp16" launches nothing and returns 0.
+template <>
+float image_to_column(const image_to_column_traits& traits,
+                      const image_to_column_args<2>& args,
+                      const ck_tile::stream_config& stream_conf)
+{
+    if(traits.data_type.compare("fp16") != 0)
+    {
+        return 0;
+    }
+
+    constexpr ck_tile::index_t NDimSpatial = 2;
+    constexpr ck_tile::index_t VectorSize  = 8;
+    constexpr ck_tile::index_t kBlockPerCu = 2;
+
+    using InDataType  = ck_tile::half_t;
+    using OutDataType = ck_tile::half_t;
+
+    using thread_tile = ck_tile::sequence<8, 8>;
+    using warp_tile   = ck_tile::sequence<64, 64>;
+    using block_tile  = ck_tile::sequence<128, 128>;
+    using Shape       = ck_tile::TileImageToColumnShape<thread_tile, warp_tile, block_tile>;
+
+    using PipelineProblem = ck_tile::BlockImageToColumnProblem<InDataType,
+                                                               OutDataType,
+                                                               Shape,
+                                                               NDimSpatial,
+                                                               VectorSize,
+                                                               VectorSize>;
+    using Kernel = ck_tile::ImageToColumn<PipelineProblem>;
+
+    auto kernel_args = Kernel::MakeKargs(args.p_in,
+                                         args.p_out,
+                                         args.G,
+                                         args.N,
+                                         args.C,
+                                         args.input_spatial_lengths,
+                                         args.filter_spatial_lengths,
+                                         args.output_spatial_lengths,
+                                         args.image_g_n_c_wis_strides,
+                                         args.gemm_g_m_k_strides,
+                                         args.conv_filter_strides,
+                                         args.conv_filter_dilations,
+                                         args.input_left_pads,
+                                         args.input_right_pads);
+
+    // Grid covers (N * Ho * Wo) x (Y * X * C) x G.
+    const dim3 grid_dim = Kernel::GridSize(
+        args.N * args.output_spatial_lengths[0] * args.output_spatial_lengths[1],
+        args.filter_spatial_lengths[0] * args.filter_spatial_lengths[1] * args.C,
+        args.G);
+    // Must stay constexpr: block_dim.x is used as a template argument below.
+    constexpr dim3 block_dim = Kernel::BlockSize();
+
+    return ck_tile::launch_kernel(
+        stream_conf,
+        ck_tile::make_kernel<block_dim.x, kBlockPerCu>(Kernel{}, grid_dim, block_dim, 0, kernel_args));
+}
+// Entry point of the image-to-column example: parses the command line, initialises the input
+// image, runs the device kernel and optionally compares its output with the host reference.
+// Returns 0 when verification passes (or is disabled), 1 when it fails, and EXIT_FAILURE on
+// invalid arguments.
+int main(int argc, char* argv[])
+{
+    constexpr ck_tile::index_t NDimSpatial = 2;
+
+    ExecutionConfig config;
+    ck_tile::conv::ConvParam conv_params = DefaultConvParams;
+    if(!parse_cmd_args(argc, argv, config, conv_params))
+    {
+        return EXIT_FAILURE;
+    }
+    if(conv_params.num_dim_spatial_ != NDimSpatial)
+    {
+        std::cerr << "unsupported # of spatial dimensions" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    using InDataType  = ck_tile::half_t;
+    using OutDataType = ck_tile::half_t;
+    using ImLayout    = ck_tile::tensor_layout::convolution::NHWGC;
+
+    const auto G = conv_params.G_;
+    const auto N = conv_params.N_;
+    const auto C = conv_params.C_;
+
+    // Products of the first NDimSpatial output / filter lengths.
+    const auto& out_lengths    = conv_params.output_spatial_lengths_;
+    const auto& filter_lengths = conv_params.filter_spatial_lengths_;
+    const auto HoWo            = std::accumulate(
+        out_lengths.begin(), std::next(out_lengths.begin(), NDimSpatial), 1, std::multiplies<>());
+    const auto YX = std::accumulate(filter_lengths.begin(),
+                                    std::next(filter_lengths.begin(), NDimSpatial),
+                                    1,
+                                    std::multiplies<>());
+    const ck_tile::long_index_t NHoWo = N * HoWo;
+    const ck_tile::long_index_t CYX   = C * YX;
+
+    const auto in_desc =
+        ck_tile::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<ImLayout>(conv_params);
+    const auto out_desc = ck_tile::HostTensorDescriptor({G, NHoWo, CYX});
+
+    // host verify
+    ck_tile::HostTensor<InDataType> in(in_desc);
+    ck_tile::HostTensor<OutDataType> out_device(out_desc);
+    ck_tile::HostTensor<OutDataType> out_host(out_desc);
+
+    // init_method: 0 leaves the input untouched, 1 uses integer values, anything else decimals.
+    if(config.init_method == 1)
+    {
+        ck_tile::FillUniformDistributionIntegerValue<InDataType>{-5.f, 5.f}(in);
+    }
+    else if(config.init_method != 0)
+    {
+        ck_tile::FillUniformDistribution<InDataType>{-0.5, 0.5}(in);
+    }
+
+    ck_tile::DeviceMem in_device_buf(in.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem out_device_buf(out_device.get_element_space_size_in_bytes());
+    in_device_buf.ToDevice(in.data());
+
+    image_to_column_traits traits{"fp16"};
+    image_to_column_args<NDimSpatial> args{
+        in_device_buf.GetDeviceBuffer(),
+        out_device_buf.GetDeviceBuffer(),
+        G,
+        N,
+        C,
+        ck_tile::to_array<ck_tile::long_index_t, NDimSpatial>(conv_params.input_spatial_lengths_),
+        ck_tile::to_array<ck_tile::long_index_t, NDimSpatial>(conv_params.filter_spatial_lengths_),
+        ck_tile::to_array<ck_tile::long_index_t, NDimSpatial>(conv_params.output_spatial_lengths_),
+        ck_tile::to_array<ck_tile::long_index_t, NDimSpatial + 3>(in_desc.get_strides()),
+        ck_tile::to_array<ck_tile::long_index_t, 3>(out_desc.get_strides()),
+        ck_tile::to_array<ck_tile::long_index_t, NDimSpatial>(conv_params.conv_filter_strides_),
+        ck_tile::to_array<ck_tile::long_index_t, NDimSpatial>(conv_params.conv_filter_dilations_),
+        ck_tile::to_array<ck_tile::long_index_t, NDimSpatial>(conv_params.input_left_pads_),
+        ck_tile::to_array<ck_tile::long_index_t, NDimSpatial>(conv_params.input_right_pads_)};
+
+    const ck_tile::stream_config stream_cfg{nullptr, config.time_kernel};
+    float ave_time = image_to_column(traits, args, stream_cfg);
+
+    // Every output element is written once and corresponds to one element read.
+    std::size_t num_btype = G * NHoWo * CYX * (sizeof(OutDataType) + sizeof(InDataType));
+    float gb_per_sec      = num_btype / 1.E6 / ave_time;
+    std::cout << "Perf: " << ave_time << " ms, " << gb_per_sec << " GB/s" << std::endl;
+
+    bool pass = true;
+    if(config.do_verification)
+    {
+        // reference
+        ck_tile::reference_im2col<InDataType, OutDataType, NDimSpatial>(in, out_host, conv_params);
+        out_device_buf.FromDevice(out_device.data());
+        pass = ck_tile::check_err(out_device, out_host);
+        std::cout << "valid:" << (pass ? "y" : "n") << std::endl;
+    }
+    return !pass;
+}
--- a/example/ck_tile/04_img2col/image_to_column.hpp
+++ b/example/ck_tile/04_img2col/image_to_column.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include "ck_tile/core.hpp"
+#include "ck_tile/host/kernel_launch.hpp"
+#include "ck_tile/ops/image_to_column.hpp"
+#include <string>
+// Brace-initialized ck_tile::conv::ConvParam literal with a fixed example problem.
+// Going by the argument list documented in parse_cmd_args, the five leading scalars are
+// num_dim_spatial_, G_, N_, K_, C_ (here 2, 2, 32, 32, 32).
+// NOTE(review): the six two-element lists presumably are the filter/input spatial lengths,
+// filter strides, dilations and left/right pads in ConvParam's constructor order - confirm
+// against the ck_tile::conv::ConvParam definition.
+#define DefaultConvParams                                                    \
+    ck_tile::conv::ConvParam                                                 \
+    {                                                                        \
+        2, 2, 32, 32, 32, {4, 4}, {64, 64}, {1, 1}, {1, 1}, {0, 0}, { 0, 0 } \
+    }
+// Run-time switches of the example; the defaults below apply when no arguments are given.
+struct ExecutionConfig final
+{
+    // arg1: verification (0=no, 1=yes) - compare the device output with the host reference
+    bool do_verification = true;
+    // arg2: initialization (0=no init, 1=integer value, 2=decimal value)
+    int init_method      = 1;
+    // arg3: time kernel (0=no, 1=yes)
+    bool time_kernel     = false;
+};
+// Writes the command-line usage to stderr: the three execution-config arguments followed by
+// the convolution-parameter help text supplied by ck_tile.
+inline void print_help_msg()
+{
+    std::cerr << "arg1: verification (0=no, 1=yes)\n";
+    std::cerr << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n";
+    std::cerr << "arg3: time kernel (0=no, 1=yes)\n";
+    // The conv-parameter part of the usage text comes from the library; flush afterwards.
+    std::cerr << ck_tile::conv::get_conv_param_parser_helper_msg() << std::endl;
+}
+// Fills `config` (and, when given, `conv_params`) from the command line.
+//
+// Accepted argument shapes (argc includes the program name):
+//  - argc == 1: `config` is reset to its defaults, `conv_params` is left untouched;
+//  - argc == 4: verification flag, init method and time-kernel flag only;
+//  - argc > 9 with (argc - 9) divisible by 3: the three execution arguments followed by the
+//    convolution description, which is handed to ck_tile::conv::parse_conv_param.
+// Any other argc prints the help text and returns false; otherwise true is returned.
+//
+// std::stoi exceptions (non-numeric or out-of-range arguments) are not caught here.
+// NOTE(review): argv[4] is read as num_dim_spatial and index 4 is also passed to
+// parse_conv_param as its start index - confirm that the parser expects to start there.
+inline bool parse_cmd_args(int argc,
+                           char* argv[],
+                           ExecutionConfig& config,
+                           ck_tile::conv::ConvParam& conv_params)
+{
+    // do_verification, init_method, time_kernel
+    constexpr int exec_arg_count = 3;
+    // leading ConvParam arguments: num_dim_spatial_, G_, N_, K_, C_
+    constexpr int conv_leading_arg_count = 5;
+    constexpr int argc_exec_only         = 1 + exec_arg_count;
+    constexpr int argc_conv_floor        = argc_exec_only + conv_leading_arg_count;
+
+    // No arguments at all: fall back to the defaults.
+    if(argc == 1)
+    {
+        config = ExecutionConfig{};
+        return true;
+    }
+
+    const bool exec_only = (argc == argc_exec_only);
+    const bool exec_and_conv =
+        (argc > argc_conv_floor) && ((argc - argc_conv_floor) % 3 == 0);
+    if(!(exec_only || exec_and_conv))
+    {
+        print_help_msg();
+        return false;
+    }
+
+    // Both accepted shapes start with the same three execution arguments.
+    config.do_verification = std::stoi(argv[1]);
+    config.init_method     = std::stoi(argv[2]);
+    config.time_kernel     = std::stoi(argv[3]);
+
+    if(exec_and_conv)
+    {
+        const ck_tile::index_t num_dim_spatial = std::stoi(argv[4]);
+        conv_params = ck_tile::conv::parse_conv_param(num_dim_spatial, argc_exec_only, argv);
+    }
+    return true;
+}
+// Traits passed to the image_to_column host API alongside the problem arguments.
+struct image_to_column_traits
+{
+    // Name of the data type as a string; the example driver passes "fp16".
+    // NOTE(review): presumably selects the kernel instantiation - confirm in the
+    // image_to_column definition.
+    std::string data_type;
+};
+// Problem description handed to image_to_column() for an NDimSpatial-dimensional image.
+// The example driver fills it by aggregate initialization, so the member order matters.
+template <ck_tile::index_t NDimSpatial>
+struct image_to_column_args
+{
+    // Input image buffer (the driver passes a device buffer pointer).
+    const void* p_in;
+    // Output buffer receiving the column (GEMM-shaped) data.
+    void* p_out;
+    // G, N and C of the convolution problem.
+    const ck_tile::long_index_t G;
+    const ck_tile::long_index_t N;
+    const ck_tile::long_index_t C;
+    // Per-spatial-dimension lengths of the input, the filter window and the output.
+    const ck_tile::array<ck_tile::long_index_t, NDimSpatial> input_spatial_lengths;
+    const ck_tile::array<ck_tile::long_index_t, NDimSpatial> filter_spatial_lengths;
+    const ck_tile::array<ck_tile::long_index_t, NDimSpatial> output_spatial_lengths;
+    // Strides of the input tensor: G, N, C plus one entry per spatial dimension.
+    const ck_tile::array<ck_tile::long_index_t, NDimSpatial + 3> image_g_n_c_wis_strides;
+    // Strides of the 3-D output tensor (the driver builds it as {G, NHoWo, CYX}).
+    const ck_tile::array<ck_tile::long_index_t, 3> gemm_g_m_k_strides;
+    // Convolution window strides, dilations and input padding per spatial dimension.
+    const ck_tile::array<ck_tile::long_index_t, NDimSpatial> conv_filter_strides;
+    const ck_tile::array<ck_tile::long_index_t, NDimSpatial> conv_filter_dilations;
+    const ck_tile::array<ck_tile::long_index_t, NDimSpatial> input_left_pads;
+    const ck_tile::array<ck_tile::long_index_t, NDimSpatial> input_right_pads;
+};
+// host API
+// Declaration only; the definition lives in another translation unit.
+// Takes the traits, the problem arguments and the stream configuration. The example driver
+// reports the returned float as the average kernel time in milliseconds.
+template <ck_tile::index_t NDimSpatial>
+float image_to_column(const image_to_column_traits&,
+                      const image_to_column_args<NDimSpatial>&,
+                      const ck_tile::stream_config&);
--- a/example/ck_tile/CMakeLists.txt
+++ b/example/ck_tile/CMakeLists.txt
@@ -4,3 +4,5 @@ include_directories(AFTER
 add_subdirectory(01_fmha)
 add_subdirectory(02_layernorm2d)
+add_subdirectory(03_gemm)
+add_subdirectory(04_img2col)
--- a/include/ck/filesystem.hpp
+++ b/include/ck/filesystem.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+#ifndef GUARD_CK_FILESYSTEM_HPP_
+#define GUARD_CK_FILESYSTEM_HPP_
+#include <string>
+#include <string_view>
+// Feature detection: sets CK_HAS_FILESYSTEM (<filesystem> usable) and CK_HAS_FILESYSTEM_TS
+// (<experimental/filesystem> usable) to 0 or 1. Both may end up 1 at the same time.
+// clang-format off
+#if defined(CPPCHECK)
+  // Static-analysis runs: claim both variants are available.
+  #define CK_HAS_FILESYSTEM 1
+  #define CK_HAS_FILESYSTEM_TS 1
+#elif defined(_WIN32)
+  // Windows: decide from the MSVC version number alone.
+  #if _MSC_VER >= 1920
+    #define CK_HAS_FILESYSTEM 1
+    #define CK_HAS_FILESYSTEM_TS 0
+  #elif _MSC_VER >= 1900
+    #define CK_HAS_FILESYSTEM 0
+    #define CK_HAS_FILESYSTEM_TS 1
+  #else
+    #define CK_HAS_FILESYSTEM 0
+    #define CK_HAS_FILESYSTEM_TS 0
+  #endif
+#elif defined(__has_include)
+  // Elsewhere: probe each header and additionally require a matching language level.
+  #if __has_include(<filesystem>) && __cplusplus >= 201703L
+    #define CK_HAS_FILESYSTEM 1
+  #else
+    #define CK_HAS_FILESYSTEM 0
+  #endif
+  #if __has_include(<experimental/filesystem>) && __cplusplus >= 201103L
+    #define CK_HAS_FILESYSTEM_TS 1
+  #else
+    #define CK_HAS_FILESYSTEM_TS 0
+  #endif
+#else
+  // No way to probe for the headers: report neither variant.
+  #define CK_HAS_FILESYSTEM 0
+  #define CK_HAS_FILESYSTEM_TS 0
+#endif
+// clang-format on
+// Pull in the preferred implementation: <filesystem> wins over the TS header when both are
+// available; having neither is a hard compile error.
+#if CK_HAS_FILESYSTEM
+#include <filesystem>
+#elif CK_HAS_FILESYSTEM_TS
+#include <experimental/filesystem>
+#else
+#error "No filesystem include available"
+#endif
+namespace CK {
+// CK::fs names whichever filesystem namespace was included above.
+#if CK_HAS_FILESYSTEM
+namespace fs = ::std::filesystem;
+#elif CK_HAS_FILESYSTEM_TS
+namespace fs = ::std::experimental::filesystem;
+#endif
+} // namespace CK
+// Global-scope concatenation helper: returns `s` followed by the string form of `path`.
+inline std::string operator+(const std::string_view s, const CK::fs::path& path)
+{
+    std::string joined = path.string();
+    joined.insert(0, s); // put the prefix in front of the path text
+    return joined;
+}
+// Global-scope concatenation helper: returns the string form of `path` followed by `s`.
+inline std::string operator+(const CK::fs::path& path, const std::string_view s)
+{
+    std::string joined = path.string();
+    joined.append(s); // tack the suffix onto the path text
+    return joined;
+}
+// Shorthand for the `perms::all` enumerator of the selected filesystem library. The expansion
+// uses the unqualified `fs` alias, so it only resolves where CK::fs is visible under that name.
+#define FS_ENUM_PERMS_ALL fs::perms::all
+// CK::weakly_canonical: canonicalizes `path`.
+//  - When CK_HAS_FILESYSTEM_TS is set, a Linux-only fallback built on realpath() is used;
+//    other platforms hit the #error below.
+//  - Otherwise the call is forwarded to fs::weakly_canonical.
+// NOTE(review): CK_HAS_FILESYSTEM_TS can be 1 while CK_HAS_FILESYSTEM is 1 as well, in which
+// case the realpath() fallback is chosen even though fs is std::filesystem - confirm this is
+// intended.
+#if CK_HAS_FILESYSTEM_TS
+#ifdef __linux__
+#include <linux/limits.h>
+#include <stdlib.h> // realpath
+namespace CK {
+// Resolves `path` with realpath(); relative paths are first made absolute against the
+// current working directory. If realpath() fails (for example because the path does not
+// exist), the input is returned unchanged.
+inline fs::path weakly_canonical(const fs::path& path)
+{
+    // realpath() needs a caller-provided buffer of at least PATH_MAX bytes.
+    std::string result(PATH_MAX, '\0');
+    std::string p{path.is_relative() ? (fs::current_path() / path).string() : path.string()};
+    char* retval = realpath(p.c_str(), &result[0]);
+    if(retval == nullptr)
+    {
+        return path;
+    }
+    // Build the path from the NUL-terminated text only: constructing it from `result`
+    // itself would keep the trailing '\0' padding of the PATH_MAX-sized buffer.
+    return fs::path{result.c_str()};
+}
+} // namespace CK
+#else
+#error "Not implmeneted!"
+#endif
+#else
+namespace CK {
+// The standard library provides the operation directly.
+inline fs::path weakly_canonical(const fs::path& path) { return fs::weakly_canonical(path); }
+} // namespace CK
+#endif
+namespace CK {
+// Platform-specific file-name affixes used by the make_*_name helpers.
+// Declared `inline constexpr` so that every translation unit including this header shares one
+// definition instead of getting its own internal-linkage copy.
+#ifdef _WIN32
+inline constexpr std::string_view executable_postfix{".exe"};
+inline constexpr std::string_view library_prefix{""};
+inline constexpr std::string_view dynamic_library_postfix{".dll"};
+inline constexpr std::string_view static_library_postfix{".lib"};
+inline constexpr std::string_view object_file_postfix{".obj"};
+#else
+inline constexpr std::string_view executable_postfix{""};
+inline constexpr std::string_view library_prefix{"lib"};
+inline constexpr std::string_view dynamic_library_postfix{".so"};
+inline constexpr std::string_view static_library_postfix{".a"};
+inline constexpr std::string_view object_file_postfix{".o"};
+#endif
+// Returns `path` with executable_postfix appended to its file name; the parent directory is
+// kept as is.
+inline fs::path make_executable_name(const fs::path& path)
+{
+    const auto file_name = path.filename() + executable_postfix;
+    return path.parent_path() / file_name;
+}
+// Returns `path` with its file name wrapped as library_prefix + name +
+// dynamic_library_postfix; the parent directory is kept as is.
+inline fs::path make_dynamic_library_name(const fs::path& path)
+{
+    const auto file_name = library_prefix + path.filename() + dynamic_library_postfix;
+    return path.parent_path() / file_name;
+}
+// Returns `path` with object_file_postfix appended to its file name; the parent directory is
+// kept as is.
+inline fs::path make_object_file_name(const fs::path& path)
+{
+    const auto file_name = path.filename() + object_file_postfix;
+    return path.parent_path() / file_name;
+}
+// Returns `path` with its file name wrapped as library_prefix + name +
+// static_library_postfix; the parent directory is kept as is.
+inline fs::path make_static_library_name(const fs::path& path)
+{
+    const auto file_name = library_prefix + path.filename() + static_library_postfix;
+    return path.parent_path() / file_name;
+}
+// Hash functor for fs::path that forwards to fs::hash_value, so paths can be used as keys
+// in hashed containers.
+struct FsPathHash
+{
+    std::size_t operator()(const fs::path& path) const
+    {
+        const std::size_t digest = fs::hash_value(path);
+        return digest;
+    }
+};
+} // namespace CK
+#endif // GUARD_CK_FILESYSTEM_HPP_
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops.hpp
@@ -406,7 +406,7 @@ struct BlockwiseGemmXdlops_pipeline_v4
    }
    template <>
-    __device__ static constexpr auto TailScheduler<1>()
+    __device__ constexpr auto TailScheduler<1>()
    {
        // schedule
        constexpr auto num_ds_read_inst =
@@ -433,7 +433,7 @@ struct BlockwiseGemmXdlops_pipeline_v4
    }
    template <>
-    __device__ static constexpr auto TailScheduler<2>()
+    __device__ constexpr auto TailScheduler<2>()
    {
        // schedule
        constexpr auto num_ds_read_inst =

--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v3.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v3.hpp
@@ -446,7 +446,9 @@ struct BlockwiseGemmXdlops_pipeline_v3<BlockGemmPipelineScheduler::Intrawave,
                    });
                });
            });
-            __builtin_amdgcn_sched_barrier(0);
+            // Let's leak last MFMA block to epilogue region, cover the potential lds-shuffle
+            // latency
+            // __builtin_amdgcn_sched_barrier(0);
        }
    }

--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_smfmac_xdlops.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_smfmac_xdlops.hpp
--- a/include/ck/tensor_operation/gpu/device/impl/device_avgpool2d_bwd_nhwc_nhwc.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_avgpool2d_bwd_nhwc_nhwc.hpp
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3.hpp
@@ -171,6 +171,16 @@ struct DeviceGemmMultiD_Xdl_CShuffle_V3 : public DeviceGemmMultipleDSplitK<ALayo
                    Argument arg_ = arg;
+                    const auto a_grid_desc_ak0_m_ak1 = GridwiseGemm::MakeAGridDescriptor_AK0_M_AK1(
+                        arg_.M, arg_.MPadded, arg_.K, arg_.KPadded, arg_.StrideA, arg_.AK0);
+                    const auto b_grid_desc_bk0_n_bk1 = GridwiseGemm::MakeBGridDescriptor_BK0_N_BK1(
+                        arg_.K, arg_.KPadded, arg_.N, arg_.NPadded, arg_.StrideB, arg_.BK0);
+                    auto size_a_buffer =
+                        a_grid_desc_ak0_m_ak1.GetElementSpaceSize() * sizeof(ADataType);
+                    auto size_b_buffer =
+                        b_grid_desc_bk0_n_bk1.GetElementSpaceSize() * sizeof(BDataType);
                    const auto ds_grid_desc_m_n = GridwiseGemm::MakeDsGridDescriptor_M_N(
                        arg_.M, arg_.MPadded, arg_.N, arg_.NPadded, arg_.StrideDs);
@@ -179,11 +189,7 @@ struct DeviceGemmMultiD_Xdl_CShuffle_V3 : public DeviceGemmMultipleDSplitK<ALayo
                        DsSize[i] = ds_grid_desc_m_n[i].GetElementSpaceSize() * sizeof(DDataType);
                    });
                    ck::utility::RotatingMemWrapperMultiD<Argument, DsDataType> rotating_mem(
-                        arg_,
+                        arg_, stream_config.rotating_count, size_a_buffer, size_b_buffer, DsSize);
-                        stream_config.rotating_count,
-                        arg_.M * arg_.K * sizeof(ADataType),
-                        arg_.K * arg_.N * sizeof(BDataType),
-                        DsSize);
                    rotating_mem.Print();
                    auto run_flush_cache = [&]() {

--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp
@@ -155,11 +155,19 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2<ALayout,
                if(stream_config.flush_cache)
                {
                    Argument arg_ = arg;
+                    const auto a_grid_desc_ak0_m_ak1 = GridwiseGemm::MakeAGridDescriptor_AK0_M_AK1(
+                        arg_.M, arg_.MPadded, arg_.K, arg_.KPadded, arg_.StrideA, arg_.AK0);
+                    const auto b_grid_desc_bk0_n_bk1 = GridwiseGemm::MakeBGridDescriptor_BK0_N_BK1(
+                        arg_.K, arg_.KPadded, arg_.N, arg_.NPadded, arg_.StrideB, arg_.BK0);
+                    auto size_a_buffer =
+                        a_grid_desc_ak0_m_ak1.GetElementSpaceSize() * sizeof(ADataType);
+                    auto size_b_buffer =
+                        b_grid_desc_bk0_n_bk1.GetElementSpaceSize() * sizeof(BDataType);
                    ck::utility::RotatingMemWrapper<Argument> rotating_mem(
-                        arg_,
+                        arg_, stream_config.rotating_count, size_a_buffer, size_b_buffer);
-                        stream_config.rotating_count,
-                        arg_.M * arg_.K * sizeof(ADataType),
-                        arg_.K * arg_.N * sizeof(BDataType));
                    rotating_mem.Print();
                    auto run_flush_cache = [&]() {