Commit c0f698d5 authored by carlushuang

Add test for threadwise transfer. Currently static_ford in threadwise transfer cannot support large MC*KC tile sizes.
parent e6ee6594
#ifndef CK_BLOCKWISE_GEMM_AVX2_HPP
#define CK_BLOCKWISE_GEMM_AVX2_HPP
#include "common_header.hpp"
#include "multi_index_transform_helper.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "threadwise_gemm_avx2.hpp"
namespace ck {
namespace cpu {
template <typename FloatA,
typename FloatB,
typename FloatC,
typename AccDataType,
typename ABlockDesc,
typename BBlockDesc,
typename CBlockDesc,
typename ABlockSliceLengths,
typename BBlockSliceLengths,
typename CBlockSliceLengths,
typename AThreadSliceLength,
typename BThreadSliceLength,
ck::index_t AThreadLoopOverDim, // dim of the thread slice to loop over within the block
// slice; 1D is enough for now
ck::index_t BThreadLoopOverDim,
ck::index_t KPerBlock,
typename ThreadwiseGemm_Dispatch,
typename ThreadMNAccessOrder // how we access GEMM M/N to utilize the micro kernel
>
struct BlockwiseGemmAvx2_MxN
{
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
static constexpr auto I2 = Number<2>{};
static constexpr auto I3 = Number<3>{};
static constexpr auto I4 = Number<4>{};
static constexpr auto I5 = Number<5>{};
static constexpr auto I6 = Number<6>{};
static constexpr auto I7 = Number<7>{};
static constexpr index_t nDimA = ABlockDesc::GetNumOfDimension();
static constexpr index_t nDimB = BBlockDesc::GetNumOfDimension();
static constexpr index_t nDimC = CBlockDesc::GetNumOfDimension();
using IndexA = MultiIndex<nDimA>;
using IndexB = MultiIndex<nDimB>;
using IndexC = MultiIndex<nDimC>;
using ACoord = decltype(make_tensor_coordinate(ABlockDesc{}, IndexA{}));
using BCoord = decltype(make_tensor_coordinate(BBlockDesc{}, IndexB{}));
using CCoord = decltype(make_tensor_coordinate(CBlockDesc{}, IndexC{}));
#if 0
constexpr BlockwiseGemmAvx2_MxN(const ABlockDesc & a_block_desc, const IndexA& a_thread_origin,
const BBlockDesc & b_block_desc, const IndexB& b_thread_origin)
: a_thread_coord_(make_tensor_coordinate(a_block_desc, a_thread_origin)),
b_thread_coord_(make_tensor_coordinate(b_block_desc, b_thread_origin)),
{
}
#endif
template <typename TensorDesc>
static constexpr auto GetLeadingElement(const TensorDesc& desc)
{
// if using this function, make sure desc is known at compile time;
// otherwise it is not efficient to calculate the leading dimension here
if constexpr(TensorDesc::GetNumOfDimension() == 1)
{
return 1;
}
else
{
constexpr auto last_dims =
typename uniform_sequence_gen<TensorDesc::GetNumOfDimension() - 1, 0>::type{};
constexpr auto lead_dims = decltype(last_dims)::PushFront(Number<1>{});
return desc.CalculateOffset(lead_dims);
}
}
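// Worked example (a sketch, assuming a packed row-major 2D descriptor of shape [M, K]):
// lead_dims becomes Sequence<1, 0>, so CalculateOffset returns the offset of element
// (1, 0), i.e. K elements. Run() below multiplies this by sizeof(FloatA/B/C) to obtain
// the byte strides (lda/ldb/ldc) handed to the micro kernel.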
template <typename ABlockBuffer, typename BBlockBuffer, typename CBlockBuffer>
void Run(const ABlockDesc& a_block_desc,
const ABlockBuffer& a_block_buf,
const IndexA& a_origin,
const BBlockDesc& b_block_desc,
const BBlockBuffer& b_block_buf,
const IndexB& b_origin,
const CBlockDesc& c_block_desc,
CBlockBuffer& c_block_buf,
const IndexC& c_origin) const
{
constexpr auto m_n_block_length =
ck::Sequence<ABlockSliceLengths::At(AThreadLoopOverDim),
BBlockSliceLengths::At(BThreadLoopOverDim)>{};
constexpr auto m_n_thread_length =
ck::Sequence<AThreadSliceLength::At(AThreadLoopOverDim),
BThreadSliceLength::At(BThreadLoopOverDim)>{};
constexpr auto m_n_access_length = m_n_block_length / m_n_thread_length;
constexpr auto ordered_m_n_access_length =
container_reorder_given_new2old(m_n_access_length, ThreadMNAccessOrder{});
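// Example (a sketch, assuming the block tile divides the micro-kernel tile evenly):
// a 96 x 128 (M x N) block tile with a 6 x 16 micro kernel gives
// m_n_access_length = Sequence<96 / 6, 128 / 16> = Sequence<16, 8>, and the static_ford
// below visits those 16 * 8 micro-tiles in ThreadMNAccessOrder, invoking
// ThreadwiseGemm_Dispatch::Run once per tile with edge sizes clipped by
// current_mr / current_nr.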
constexpr auto a_block_idx_zeros =
typename uniform_sequence_gen<nDimA, 0>::type{}; // starting point of the block
constexpr auto b_block_idx_zeros = typename uniform_sequence_gen<nDimB, 0>::type{};
constexpr auto lda = GetLeadingElement(a_block_desc) * sizeof(FloatA);
constexpr auto ldb = GetLeadingElement(b_block_desc) * sizeof(FloatB);
constexpr auto ldc = GetLeadingElement(c_block_desc) * sizeof(FloatC);
ck::cpu::ThreadwiseGemmParam param;
param.Kr = KPerBlock;
param.lda = lda;
param.ldb = ldb;
param.ldc = ldc;
param.alpha = 1.0f; // TODO
static_ford<decltype(ordered_m_n_access_length)>{}([&](auto ordered_idx) {
constexpr auto origin_m_n_idx = ordered_idx.ReorderGivenOld2New(ThreadMNAccessOrder{});
constexpr auto current_m_idx =
origin_m_n_idx.At(0) * AThreadSliceLength::At(AThreadLoopOverDim);
constexpr auto current_n_idx =
origin_m_n_idx.At(1) * BThreadSliceLength::At(BThreadLoopOverDim);
constexpr auto current_mr =
ck::math::min(m_n_block_length.At(0) - current_m_idx, m_n_thread_length.At(0));
constexpr auto current_nr =
ck::math::min(m_n_block_length.At(1) - current_n_idx, m_n_thread_length.At(1));
constexpr auto a_block_idx =
a_block_idx_zeros.Modify(AThreadLoopOverDim, current_m_idx);
constexpr auto a_block_coord =
make_tensor_coordinate(a_block_desc, to_multi_index(a_origin + a_block_idx));
constexpr auto b_block_idx =
b_block_idx_zeros.Modify(BThreadLoopOverDim, current_n_idx);
constexpr auto b_block_coord =
make_tensor_coordinate(b_block_desc, to_multi_index(b_origin + b_block_idx));
constexpr auto c_block_coord =
make_tensor_coordinate(c_block_desc, to_multi_index(c_origin + origin_m_n_idx));
param.p_a = &a_block_buf.p_data_[a_block_coord.GetOffset()];
param.p_b = &b_block_buf.p_data_[b_block_coord.GetOffset()];
param.p_c = &c_block_buf.p_data_[c_block_coord.GetOffset()];
ThreadwiseGemm_Dispatch::Run(&param, current_mr, current_nr);
});
}
};
} // namespace cpu
} // namespace ck
#endif
#ifndef CONVOLUTION_FORWARD_SPECIALIZATION_CPU
#define CONVOLUTION_FORWARD_SPECIALIZATION_CPU
namespace ck {
namespace tensor_operation {
namespace cpu {
namespace device {
enum ConvolutionForwardSpecialization_t
{
Default,
Filter1x1Pad0,
Filter1x1Stride1Pad0,
OddC,
};
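// For example, Filter1x1Stride1Pad0 asserts Y = X = 1, stride 1 and zero padding, in which
// case the NHWC input can be read directly as the GEMM A matrix of shape [N*Ho*Wo, C]
// (see the corresponding branch of GetInputTensorDescriptor in the AVX2 device op below).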
} // namespace device
} // namespace cpu
} // namespace tensor_operation
} // namespace ck
#endif
#ifndef DEVICE_BASE_CPU_HPP
#define DEVICE_BASE_CPU_HPP
#include <string>
namespace ck {
namespace tensor_operation {
namespace cpu {
namespace device {
struct BaseArgument
{
BaseArgument() = default;
BaseArgument(const BaseArgument&) = default;
BaseArgument& operator=(const BaseArgument&) = default;
virtual ~BaseArgument() {}
};
struct BaseInvoker
{
BaseInvoker() = default;
BaseInvoker(const BaseInvoker&) = default;
BaseInvoker& operator=(const BaseInvoker&) = default;
virtual float Run(const BaseArgument*, int = 1) = 0;
virtual ~BaseInvoker() {}
};
struct BaseOperator
{
BaseOperator() = default;
BaseOperator(const BaseOperator&) = default;
BaseOperator& operator=(const BaseOperator&) = default;
virtual bool IsSupportedArgument(const BaseArgument*) = 0;
virtual std::string GetTypeString() const = 0;
virtual ~BaseOperator() {}
};
} // namespace device
} // namespace cpu
} // namespace tensor_operation
} // namespace ck
#endif
#ifndef DEVICE_CONV_FWD_CPU_HPP
#define DEVICE_CONV_FWD_CPU_HPP
#include <iostream>
#include "device_base_cpu.hpp"
namespace ck {
namespace tensor_operation {
namespace cpu {
namespace device {
template <typename InElementwiseOperation,
typename WeiElementwiseOperation,
typename OutElementwiseOperation>
struct DeviceConvFwd : public BaseOperator
{
virtual std::unique_ptr<BaseArgument>
MakeArgumentPointer(const void* p_in,
const void* p_wei,
void* p_out,
ck::index_t N,
ck::index_t K,
ck::index_t C,
std::vector<ck::index_t> input_spatial_lengths,
std::vector<ck::index_t> filter_spatial_lengths,
std::vector<ck::index_t> output_spatial_lengths,
std::vector<ck::index_t> conv_filter_strides,
std::vector<ck::index_t> conv_filter_dilations,
std::vector<ck::index_t> input_left_pads,
std::vector<ck::index_t> input_right_pads,
InElementwiseOperation in_element_op,
WeiElementwiseOperation wei_element_op,
OutElementwiseOperation out_element_op) = 0;
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};
template <typename InElementwiseOperation,
typename WeiElementwiseOperation,
typename OutElementwiseOperation>
using DeviceConvFwdPtr = std::unique_ptr<
DeviceConvFwd<InElementwiseOperation, WeiElementwiseOperation, OutElementwiseOperation>>;
} // namespace device
} // namespace cpu
} // namespace tensor_operation
} // namespace ck
#endif
#ifndef DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_HPP
#define DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_HPP
#include <iostream>
#include <sstream>
#include <numeric>
#include "device.hpp"
#include "device_base_cpu.hpp"
#include "device_conv_fwd_cpu.hpp"
#include "convolution_forward_specialization_cpu.hpp"
#include "common_header.hpp"
#include "../../gpu/device/tensor_layout.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "gridwise_gemm_avx2.hpp"
namespace ck {
namespace tensor_operation {
namespace cpu {
namespace device {
// out[N, Ho, Wo, K] = in[N, Hi, Wi, C] * wei[K, Y, X, C]
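// Implicit-GEMM view used throughout this file (a sketch; see GetGemmM / GetGemmK below):
//   GemmM = N * Ho * Wo   -> rows of A and C (one row per output pixel)
//   GemmN = K             -> columns of B and C (one column per output channel)
//   GemmK = C * Y * X     -> reduction dimension (input channels times filter window)
// e.g. N = 1, Ho = Wo = 28, K = 64, C = 32, Y = X = 3 gives a 784 x 64 GEMM with GemmK = 288.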
template <typename InDataType,
typename WeiDataType,
typename OutDataType,
typename AccDataType,
typename InElementwiseOperation,
typename WeiElementwiseOperation,
typename OutElementwiseOperation,
ConvolutionForwardSpecialization_t ConvForwardSpecialization,
ck::index_t NumDimSpatial,
ck::index_t MPerBlock, // "block" means a data tile sized to fit in cache (L1/L2/L3)
ck::index_t NPerBlock,
ck::index_t KPerBlock,
typename ThreadwiseGemm_Dispatch>
// bool IsGemmMPadded,
// bool IsGemmNPadded,
// bool IsGemmKPadded>
struct DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
: public DeviceConvFwd<InElementwiseOperation, WeiElementwiseOperation, OutElementwiseOperation>
{
using DeviceOp = DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K;
using ADataType = InDataType;
using BDataType = WeiDataType;
using CDataType = OutDataType;
using AElementwiseOperation = InElementwiseOperation;
using BElementwiseOperation = WeiElementwiseOperation;
using CElementwiseOperation = OutElementwiseOperation;
// TODO make A/B datatype different
using ABDataType = InDataType;
static constexpr index_t NDimSpatial = NumDimSpatial;
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
static constexpr auto I2 = Number<2>{};
static constexpr auto I3 = Number<3>{};
static auto GetWeightTensorDescriptor(ck::index_t gemm_k, ck::index_t gemm_n)
{
const auto wei_gemm_n_k_grid_desc =
make_naive_tensor_descriptor_packed(make_tuple(gemm_n, gemm_k));
const auto wei_gemm_n0_k_n1_grid_desc = transform_tensor_descriptor(
wei_gemm_n_k_grid_desc,
ck::make_tuple(ck::make_unmerge_transform(
ck::make_tuple(wei_gemm_n_k_grid_desc.GetLength(I0) /
ThreadwiseGemm_Dispatch::MatrixBMinVectorSize,
ThreadwiseGemm_Dispatch::MatrixBMinVectorSize)),
ck::make_pass_through_transform(wei_gemm_n_k_grid_desc.GetLength(I1))),
ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}),
ck::make_tuple(ck::Sequence<0, 2>{}, ck::Sequence<1>{}));
return wei_gemm_n0_k_n1_grid_desc;
}
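// Example (a sketch, assuming MatrixBMinVectorSize is 8, i.e. one AVX2 float vector):
// with GemmN = 64 and GemmK = 288, the [64, 288] weight matrix is viewed as an
// [8, 288, 8] (N0, K, N1) descriptor, so eight consecutive output channels form the
// innermost, vector-friendly dimension consumed by the micro kernel.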
static auto GetOutputTensorDescriptor(ck::index_t gemm_m, ck::index_t gemm_n)
{
const auto out_gemm_m_n_grid_desc =
make_naive_tensor_descriptor_packed(make_tuple(gemm_m, gemm_n));
return out_gemm_m_n_grid_desc;
}
template <ck::index_t NDim, typename std::enable_if<NDim == 1, bool>::type = false>
static auto GetInputTensorDescriptor(ck::index_t N,
ck::index_t C,
ck::index_t gemm_m,
ck::index_t gemm_k,
const std::vector<ck::index_t>& input_spatial_lengths,
const std::vector<ck::index_t>& filter_spatial_lengths,
const std::vector<ck::index_t>& output_spatial_lengths,
const std::vector<ck::index_t>& conv_filter_strides,
const std::vector<ck::index_t>& conv_filter_dilations,
const std::vector<ck::index_t>& input_left_pads,
const std::vector<ck::index_t>& input_right_pads)
{
const index_t Wi = input_spatial_lengths[0];
const index_t Wo = output_spatial_lengths[0];
const index_t ConvStrideW = conv_filter_strides[0];
if constexpr(ConvForwardSpecialization ==
ConvolutionForwardSpecialization_t::Filter1x1Stride1Pad0)
{
const auto in_gemm_m_k_grid_desc =
make_naive_tensor_descriptor_packed(make_tuple(gemm_m, gemm_k));
return in_gemm_m_k_grid_desc;
}
else if constexpr(ConvForwardSpecialization ==
ConvolutionForwardSpecialization_t::Filter1x1Pad0)
{
const auto in_n_wi_c_grid_desc =
make_naive_tensor_descriptor_packed(make_tuple(N, Wi, C));
const auto in_n_wo_c_grid_desc = transform_tensor_descriptor(
in_n_wi_c_grid_desc,
make_tuple(make_pass_through_transform(N),
make_embed_transform(make_tuple(Wo), make_tuple(ConvStrideW)),
make_pass_through_transform(C)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
const auto in_gemm_m_k_grid_desc = transform_tensor_descriptor(
in_n_wo_c_grid_desc,
make_tuple(make_merge_transform(make_tuple(N, Wo)), make_pass_through_transform(C)),
make_tuple(Sequence<0, 1>{}, Sequence<2>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
return in_gemm_m_k_grid_desc;
}
else
{
const index_t X = filter_spatial_lengths[0];
const index_t ConvDilationW = conv_filter_dilations[0];
const index_t InLeftPadW = input_left_pads[0];
const index_t InRightPadW = input_right_pads[0];
const auto in_n_wi_c_grid_desc =
make_naive_tensor_descriptor_packed(make_tuple(N, Wi, C));
const auto in_n_wip_c_grid_desc = transform_tensor_descriptor(
in_n_wi_c_grid_desc,
make_tuple(make_pass_through_transform(N),
make_pad_transform(Wi, InLeftPadW, InRightPadW),
make_pass_through_transform(C)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
const auto in_n_x_wo_c_grid_desc = transform_tensor_descriptor(
in_n_wip_c_grid_desc,
make_tuple(
make_pass_through_transform(N),
make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)),
make_pass_through_transform(C)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}));
const auto in_gemm_m_k_grid_desc =
transform_tensor_descriptor(in_n_x_wo_c_grid_desc,
make_tuple(make_merge_transform(make_tuple(N, Wo)),
make_merge_transform(make_tuple(X, C))),
make_tuple(Sequence<0, 2>{}, Sequence<1, 3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
return in_gemm_m_k_grid_desc;
}
}
template <ck::index_t NDim, typename std::enable_if<NDim == 2, bool>::type = false>
static auto GetInputTensorDescriptor(ck::index_t N,
ck::index_t C,
ck::index_t gemm_m,
ck::index_t gemm_k,
const std::vector<ck::index_t>& input_spatial_lengths,
const std::vector<ck::index_t>& filter_spatial_lengths,
const std::vector<ck::index_t>& output_spatial_lengths,
const std::vector<ck::index_t>& conv_filter_strides,
const std::vector<ck::index_t>& conv_filter_dilations,
const std::vector<ck::index_t>& input_left_pads,
const std::vector<ck::index_t>& input_right_pads)
{
const index_t Hi = input_spatial_lengths[0];
const index_t Wi = input_spatial_lengths[1];
const index_t Ho = output_spatial_lengths[0];
const index_t Wo = output_spatial_lengths[1];
const index_t ConvStrideH = conv_filter_strides[0];
const index_t ConvStrideW = conv_filter_strides[1];
if constexpr(ConvForwardSpecialization ==
ConvolutionForwardSpecialization_t::Filter1x1Stride1Pad0)
{
const auto in_gemm_m_k_grid_desc =
make_naive_tensor_descriptor_packed(make_tuple(gemm_m, gemm_k));
return in_gemm_m_k_grid_desc;
}
else if constexpr(ConvForwardSpecialization ==
ConvolutionForwardSpecialization_t::Filter1x1Pad0)
{
const auto in_n_hi_wi_c_grid_desc =
make_naive_tensor_descriptor_packed(make_tuple(N, Hi, Wi, C));
const auto in_n_ho_wo_c_grid_desc = transform_tensor_descriptor(
in_n_hi_wi_c_grid_desc,
make_tuple(make_pass_through_transform(N),
make_embed_transform(make_tuple(Ho), make_tuple(ConvStrideH)),
make_embed_transform(make_tuple(Wo), make_tuple(ConvStrideW)),
make_pass_through_transform(C)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
const auto in_gemm_m_k_grid_desc =
transform_tensor_descriptor(in_n_ho_wo_c_grid_desc,
make_tuple(make_merge_transform(make_tuple(N, Ho, Wo)),
make_pass_through_transform(C)),
make_tuple(Sequence<0, 1, 2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
return in_gemm_m_k_grid_desc;
}
else
{
const index_t Y = filter_spatial_lengths[0];
const index_t X = filter_spatial_lengths[1];
const index_t ConvDilationH = conv_filter_dilations[0];
const index_t ConvDilationW = conv_filter_dilations[1];
const index_t InLeftPadH = input_left_pads[0];
const index_t InLeftPadW = input_left_pads[1];
const index_t InRightPadH = input_right_pads[0];
const index_t InRightPadW = input_right_pads[1];
const auto in_n_hi_wi_c_grid_desc =
make_naive_tensor_descriptor_packed(make_tuple(N, Hi, Wi, C));
const auto in_n_hip_wip_c_grid_desc = transform_tensor_descriptor(
in_n_hi_wi_c_grid_desc,
make_tuple(make_pass_through_transform(N),
make_pad_transform(Hi, InLeftPadH, InRightPadH),
make_pad_transform(Wi, InLeftPadW, InRightPadW),
make_pass_through_transform(C)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
const auto in_n_y_ho_x_wo_c_grid_desc = transform_tensor_descriptor(
in_n_hip_wip_c_grid_desc,
make_tuple(
make_pass_through_transform(N),
make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)),
make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)),
make_pass_through_transform(C)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}));
const auto in_gemm_m_k_grid_desc =
transform_tensor_descriptor(in_n_y_ho_x_wo_c_grid_desc,
make_tuple(make_merge_transform(make_tuple(N, Ho, Wo)),
make_merge_transform(make_tuple(Y, X, C))),
make_tuple(Sequence<0, 2, 4>{}, Sequence<1, 3, 5>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
return in_gemm_m_k_grid_desc;
}
}
template <ck::index_t NDim, typename std::enable_if<NDim == 3, bool>::type = false>
static auto GetInputTensorDescriptor(ck::index_t N,
ck::index_t C,
ck::index_t gemm_m,
ck::index_t gemm_k,
const std::vector<ck::index_t>& input_spatial_lengths,
const std::vector<ck::index_t>& filter_spatial_lengths,
const std::vector<ck::index_t>& output_spatial_lengths,
const std::vector<ck::index_t>& conv_filter_strides,
const std::vector<ck::index_t>& conv_filter_dilations,
const std::vector<ck::index_t>& input_left_pads,
const std::vector<ck::index_t>& input_right_pads)
{
const index_t Di = input_spatial_lengths[0];
const index_t Hi = input_spatial_lengths[1];
const index_t Wi = input_spatial_lengths[2];
const index_t Do = output_spatial_lengths[0];
const index_t Ho = output_spatial_lengths[1];
const index_t Wo = output_spatial_lengths[2];
const index_t ConvStrideD = conv_filter_strides[0];
const index_t ConvStrideH = conv_filter_strides[1];
const index_t ConvStrideW = conv_filter_strides[2];
if constexpr(ConvForwardSpecialization ==
ConvolutionForwardSpecialization_t::Filter1x1Stride1Pad0)
{
const auto in_gemm_m_k_grid_desc =
make_naive_tensor_descriptor_packed(make_tuple(gemm_m, gemm_k));
return in_gemm_m_k_grid_desc;
}
else if constexpr(ConvForwardSpecialization ==
ConvolutionForwardSpecialization_t::Filter1x1Pad0)
{
const auto in_n_di_hi_wi_c_grid_desc =
make_naive_tensor_descriptor_packed(make_tuple(N, Di, Hi, Wi, C));
const auto in_n_do_ho_wo_c_grid_desc = transform_tensor_descriptor(
in_n_di_hi_wi_c_grid_desc,
make_tuple(make_pass_through_transform(N),
make_embed_transform(make_tuple(Do), make_tuple(ConvStrideD)),
make_embed_transform(make_tuple(Ho), make_tuple(ConvStrideH)),
make_embed_transform(make_tuple(Wo), make_tuple(ConvStrideW)),
make_pass_through_transform(C)),
make_tuple(
Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
make_tuple(
Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}));
const auto in_gemm_m_k_grid_desc = transform_tensor_descriptor(
in_n_do_ho_wo_c_grid_desc,
make_tuple(make_merge_transform(make_tuple(N, Do, Ho, Wo)),
make_pass_through_transform(C)),
make_tuple(Sequence<0, 1, 2, 3>{}, Sequence<4>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
return in_gemm_m_k_grid_desc;
}
else
{
const index_t Z = filter_spatial_lengths[0];
const index_t Y = filter_spatial_lengths[1];
const index_t X = filter_spatial_lengths[2];
const index_t ConvDilationD = conv_filter_dilations[0];
const index_t ConvDilationH = conv_filter_dilations[1];
const index_t ConvDilationW = conv_filter_dilations[2];
const index_t InLeftPadD = input_left_pads[0];
const index_t InLeftPadH = input_left_pads[1];
const index_t InLeftPadW = input_left_pads[2];
const index_t InRightPadD = input_right_pads[0];
const index_t InRightPadH = input_right_pads[1];
const index_t InRightPadW = input_right_pads[2];
const auto in_n_di_hi_wi_c_grid_desc =
make_naive_tensor_descriptor_packed(make_tuple(N, Di, Hi, Wi, C));
const auto in_n_dip_hip_wip_c_grid_desc = transform_tensor_descriptor(
in_n_di_hi_wi_c_grid_desc,
make_tuple(make_pass_through_transform(N),
make_pad_transform(Di, InLeftPadD, InRightPadD),
make_pad_transform(Hi, InLeftPadH, InRightPadH),
make_pad_transform(Wi, InLeftPadW, InRightPadW),
make_pass_through_transform(C)),
make_tuple(
Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
make_tuple(
Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}));
const auto in_n_z_do_y_ho_x_wo_c_grid_desc = transform_tensor_descriptor(
in_n_dip_hip_wip_c_grid_desc,
make_tuple(
make_pass_through_transform(N),
make_embed_transform(make_tuple(Z, Do), make_tuple(ConvDilationD, ConvStrideD)),
make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)),
make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)),
make_pass_through_transform(C)),
make_tuple(
Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
make_tuple(Sequence<0>{},
Sequence<1, 2>{},
Sequence<3, 4>{},
Sequence<5, 6>{},
Sequence<7>{}));
const auto in_gemm_m_k_grid_desc = transform_tensor_descriptor(
in_n_z_do_y_ho_x_wo_c_grid_desc,
make_tuple(make_merge_transform(make_tuple(N, Do, Ho, Wo)),
make_merge_transform(make_tuple(Z, Y, X, C))),
make_tuple(Sequence<0, 2, 4, 6>{}, Sequence<1, 3, 5, 7>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
return in_gemm_m_k_grid_desc;
}
}
static index_t GetGemmM(ck::index_t N, const std::vector<ck::index_t>& output_spatial_lengths)
{
return N * std::accumulate(std::begin(output_spatial_lengths),
std::end(output_spatial_lengths),
1,
std::multiplies<ck::index_t>());
}
static index_t GetGemmK(ck::index_t C, const std::vector<ck::index_t>& filter_spatial_lengths)
{
return C * std::accumulate(std::begin(filter_spatial_lengths),
std::end(filter_spatial_lengths),
1,
std::multiplies<ck::index_t>());
}
static auto MakeABCGridDescriptor(ck::index_t N,
ck::index_t K,
ck::index_t C,
std::vector<ck::index_t> input_spatial_lengths,
std::vector<ck::index_t> filter_spatial_lengths,
std::vector<ck::index_t> output_spatial_lengths,
std::vector<ck::index_t> conv_filter_strides,
std::vector<ck::index_t> conv_filter_dilations,
std::vector<ck::index_t> input_left_pads,
std::vector<ck::index_t> input_right_pads)
{
using namespace ck;
const index_t GemmM = GetGemmM(N, output_spatial_lengths);
const index_t GemmN = K;
const index_t GemmK = GetGemmK(C, filter_spatial_lengths);
// A:
const auto in_gemm_m_k_grid_desc =
GetInputTensorDescriptor<NumDimSpatial>(N,
C,
GemmM,
GemmK,
input_spatial_lengths,
filter_spatial_lengths,
output_spatial_lengths,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads);
// B:
const auto wei_gemm_n0_k_n1_grid_desc = GetWeightTensorDescriptor(GemmK, GemmN);
// C:
const auto out_gemm_m_n_grid_desc = GetOutputTensorDescriptor(GemmM, GemmN);
return make_tuple(
in_gemm_m_k_grid_desc, wei_gemm_n0_k_n1_grid_desc, out_gemm_m_n_grid_desc);
}
template <ck::index_t NDim, typename std::enable_if<NDim == 1, bool>::type = false>
static auto GetABCGridDesc()
{
return MakeABCGridDescriptor(1, 1, 1, {1}, {1}, {1}, {1}, {1}, {1}, {1});
}
template <ck::index_t NDim, typename std::enable_if<NDim == 2, bool>::type = false>
static auto GetABCGridDesc()
{
return MakeABCGridDescriptor(
1, 1, 1, {1, 1}, {1, 1}, {1, 1}, {1, 1}, {1, 1}, {1, 1}, {1, 1});
}
template <ck::index_t NDim, typename std::enable_if<NDim == 3, bool>::type = false>
static auto GetABCGridDesc()
{
return MakeABCGridDescriptor(
1, 1, 1, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1});
}
using ABCGridDescs = decltype(GetABCGridDesc<NumDimSpatial>());
using AGridDesc = remove_cvref_t<decltype(ABCGridDescs{}[I0])>;
using BGridDesc = remove_cvref_t<decltype(ABCGridDescs{}[I1])>;
using CGridDesc = remove_cvref_t<decltype(ABCGridDescs{}[I2])>;
static constexpr bool UseCLocalBuffer = true;
using GridwiseGemm =
ck::cpu::GridwiseGemmAvx2_MxN<InDataType, // InDataType,
WeiDataType, // WeiDataType,
OutDataType, // OutDataType,
AccDataType, // AccDataType,
AGridDesc, // AGridDesc,
BGridDesc, // BGridDesc,
CGridDesc, // CGridDesc,
AElementwiseOperation, // AElementwiseOperation,
BElementwiseOperation, // BElementwiseOperation,
CElementwiseOperation, // CElementwiseOperation,
MPerBlock, // MPerBlock,
NPerBlock, // NPerBlock,
KPerBlock, // KPerBlock,
ThreadwiseGemm_Dispatch, // ThreadwiseGemm_Dispatch,
ck::Sequence<0, 1, 2>, // BlockMNKAccessOrder,
ck::Sequence<0, 1>, // ThreadMNAccessOrder
UseCLocalBuffer // UseCLocalBuffer
>;
// Argument
struct Argument : public BaseArgument
{
Argument(const InDataType* p_in_grid,
const WeiDataType* p_wei_grid,
OutDataType* p_out_grid,
ck::index_t N,
ck::index_t K,
ck::index_t C,
std::vector<ck::index_t> input_spatial_lengths,
std::vector<ck::index_t> filter_spatial_lengths,
std::vector<ck::index_t> output_spatial_lengths,
std::vector<ck::index_t> conv_filter_strides,
std::vector<ck::index_t> conv_filter_dilations,
std::vector<ck::index_t> input_left_pads,
std::vector<ck::index_t> input_right_pads,
InElementwiseOperation in_element_op,
WeiElementwiseOperation wei_element_op,
OutElementwiseOperation out_element_op)
: p_a_grid_{p_in_grid},
p_b_grid_{p_wei_grid},
p_c_grid_{p_out_grid},
a_grid_desc_{},
b_grid_desc_{},
c_grid_desc_{},
a_element_op_{in_element_op},
b_element_op_{wei_element_op},
c_element_op_{out_element_op},
Conv_N_{N},
Conv_K_{K},
Conv_C_{C},
filter_spatial_lengths_{filter_spatial_lengths},
conv_filter_strides_{conv_filter_strides},
input_left_pads_{input_left_pads},
input_right_pads_{input_right_pads}
{
const auto descs = DeviceOp::MakeABCGridDescriptor(N,
K,
C,
input_spatial_lengths,
filter_spatial_lengths,
output_spatial_lengths,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads);
a_grid_desc_ = descs[I0];
b_grid_desc_ = descs[I1];
c_grid_desc_ = descs[I2];
}
// private:
const ADataType* p_a_grid_;
const BDataType* p_b_grid_;
CDataType* p_c_grid_;
AGridDesc a_grid_desc_;
BGridDesc b_grid_desc_;
CGridDesc c_grid_desc_;
AElementwiseOperation a_element_op_;
BElementwiseOperation b_element_op_;
CElementwiseOperation c_element_op_;
// for checking IsSupportedArgument()
index_t Conv_N_;
index_t Conv_K_;
index_t Conv_C_;
std::vector<index_t> filter_spatial_lengths_;
std::vector<index_t> conv_filter_strides_;
std::vector<index_t> input_left_pads_;
std::vector<index_t> input_right_pads_;
};
// Invoker
struct Invoker : public BaseInvoker
{
using Argument = DeviceOp::Argument;
float Run(const Argument& arg, int nrepeat = 1)
{
if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_, arg.b_grid_desc_, arg.c_grid_desc_))
{
throw std::runtime_error("wrong! GridwiseGemmAvx2_MxN has invalid setting");
}
const auto kernel = ck::cpu::kernel_gemm_avx_mxn<GridwiseGemm,
InDataType,
WeiDataType,
OutDataType,
AGridDesc,
BGridDesc,
CGridDesc,
AElementwiseOperation,
BElementwiseOperation,
CElementwiseOperation>;
float ave_time = launch_and_time_cpu_kernel(kernel,
nrepeat,
arg.p_a_grid_,
arg.p_b_grid_,
arg.p_c_grid_,
arg.a_grid_desc_,
arg.b_grid_desc_,
arg.c_grid_desc_,
arg.a_element_op_,
arg.b_element_op_,
arg.c_element_op_);
// TODO: this is for benchmark purposes; after timing, clear the C buffer and compute
// the result once more so the final output is correct
memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize() * sizeof(OutDataType));
launch_cpu_kernel(kernel,
arg.p_a_grid_,
arg.p_b_grid_,
arg.p_c_grid_,
arg.a_grid_desc_,
arg.b_grid_desc_,
arg.c_grid_desc_,
arg.a_element_op_,
arg.b_element_op_,
arg.c_element_op_);
return ave_time;
}
float Run(const BaseArgument* p_arg, int nrepeat = 1) override
{
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
}
};
static constexpr bool IsValidCompilationParameter()
{
// TODO: properly implement this check
return true;
}
static bool IsSupportedArgument(const Argument& arg)
{
if constexpr(ConvForwardSpecialization ==
ConvolutionForwardSpecialization_t::Filter1x1Stride1Pad0)
{
// check if it's 1x1, stride=1 conv
if(!(arg.filter_spatial_lengths_[0] == 1 && arg.filter_spatial_lengths_[1] == 1 &&
arg.conv_filter_strides_[0] == 1 && arg.conv_filter_strides_[1] == 1 &&
arg.input_left_pads_[0] == 0 && arg.input_left_pads_[1] == 0 &&
arg.input_right_pads_[0] == 0 && arg.input_right_pads_[1] == 0))
{
return false;
}
}
else if constexpr(ConvForwardSpecialization ==
ConvolutionForwardSpecialization_t::Filter1x1Pad0)
{
// check if it's 1x1 conv
if(!(arg.filter_spatial_lengths_[0] == 1 && arg.filter_spatial_lengths_[1] == 1 &&
arg.input_left_pads_[0] == 0 && arg.input_left_pads_[1] == 0 &&
arg.input_right_pads_[0] == 0 && arg.input_right_pads_[1] == 0))
{
return false;
}
}
// Gridwise GEMM size
return GridwiseGemm::CheckValidity(arg.a_grid_desc_, arg.b_grid_desc_, arg.c_grid_desc_);
}
bool IsSupportedArgument(const BaseArgument* p_arg) override
{
return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
}
static auto MakeArgument(const InDataType* p_in_grid,
const WeiDataType* p_wei_grid,
OutDataType* p_out_grid,
ck::index_t N,
ck::index_t K,
ck::index_t C,
std::vector<ck::index_t> input_spatial_lengths,
std::vector<ck::index_t> filter_spatial_lengths,
std::vector<ck::index_t> output_spatial_lengths,
std::vector<ck::index_t> conv_filter_strides,
std::vector<ck::index_t> conv_filter_dilations,
std::vector<ck::index_t> input_left_pads,
std::vector<ck::index_t> input_right_pads,
InElementwiseOperation in_element_op,
WeiElementwiseOperation wei_element_op,
OutElementwiseOperation out_element_op)
{
return Argument{p_in_grid,
p_wei_grid,
p_out_grid,
N,
K,
C,
input_spatial_lengths,
filter_spatial_lengths,
output_spatial_lengths,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
in_element_op,
wei_element_op,
out_element_op};
}
static auto MakeInvoker() { return Invoker{}; }
std::unique_ptr<BaseArgument>
MakeArgumentPointer(const void* p_in_grid,
const void* p_wei_grid,
void* p_out_grid,
ck::index_t N,
ck::index_t K,
ck::index_t C,
std::vector<ck::index_t> input_spatial_lengths,
std::vector<ck::index_t> filter_spatial_lengths,
std::vector<ck::index_t> output_spatial_lengths,
std::vector<ck::index_t> conv_filter_strides,
std::vector<ck::index_t> conv_filter_dilations,
std::vector<ck::index_t> input_left_pads,
std::vector<ck::index_t> input_right_pads,
InElementwiseOperation in_element_op,
WeiElementwiseOperation wei_element_op,
OutElementwiseOperation out_element_op) override
{
return std::make_unique<Argument>(static_cast<const InDataType*>(p_in_grid),
static_cast<const WeiDataType*>(p_wei_grid),
static_cast<OutDataType*>(p_out_grid),
N,
K,
C,
input_spatial_lengths,
filter_spatial_lengths,
output_spatial_lengths,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
in_element_op,
wei_element_op,
out_element_op);
}
std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
{
return std::make_unique<Invoker>(Invoker{});
}
std::string GetTypeString() const override
{
auto str = std::stringstream();
// clang-format off
str << "DeviceConv" << std::to_string(NumDimSpatial)
<< "DFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K"
<< "<"
<< MPerBlock << ", "
<< NPerBlock << ", "
<< KPerBlock
<< ">";
// clang-format on
return str.str();
}
};
} // namespace device
} // namespace cpu
} // namespace tensor_operation
} // namespace ck
#endif
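// Usage sketch for the device op above (hedged: "Dispatch" is a placeholder for a
// threadwise-GEMM dispatch type, e.g. one built around ThreadwiseGemmAvx2_MxN_6x16,
// whose exact template arguments are not shown here; the 256/128/64 tile sizes and the
// p_in/p_wei/p_out pointers are illustrative only):
//
//   using PassThrough = ck::tensor_operation::cpu::element_wise::PassThrough;
//   namespace dev     = ck::tensor_operation::cpu::device;
//   using ConvOp = dev::DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<
//       float, float, float, float,                       // In/Wei/Out/Acc data types
//       PassThrough, PassThrough, PassThrough,            // elementwise operations
//       dev::ConvolutionForwardSpecialization_t::Default,
//       2,                                                // NumDimSpatial
//       256, 128, 64,                                     // MPerBlock, NPerBlock, KPerBlock
//       Dispatch>;
//
//   auto arg     = ConvOp::MakeArgument(p_in, p_wei, p_out, N, K, C,
//                                       in_lengths, filter_lengths, out_lengths,
//                                       strides, dilations, left_pads, right_pads,
//                                       PassThrough{}, PassThrough{}, PassThrough{});
//   auto invoker = ConvOp::MakeInvoker();
//   float avg_ms = invoker.Run(arg, /*nrepeat=*/10);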
#pragma once
#include "data_type_cpu.hpp"
namespace ck {
namespace tensor_operation {
namespace cpu {
namespace element_wise {
using float8_t = ck::cpu::float8_t;
using float4_t = ck::cpu::float4_t;
struct PassThrough
{
void operator()(float& y, const float& x) const { y = x; }
void operator()(float4_t& y, const float4_t& x) const { y = x; }
void operator()(float8_t& y, const float8_t& x) const { y = x; }
};
struct Add
{
void operator()(float& y, const float& x0, const float& x1) const { y = x0 + x1; }
void operator()(float4_t& y, const float4_t& x0, const float4_t& x1) const
{
y = _mm_add_ps(x0, x1);
}
void operator()(float8_t& y, const float8_t& x0, const float8_t& x1) const
{
y = _mm256_add_ps(x0, x1);
}
};
struct AlphaBetaAdd
{
AlphaBetaAdd(float alpha, float beta) : alpha_(alpha), beta_(beta) {}
void operator()(float& y, const float& x0, const float& x1) const
{
y = alpha_ * x0 + beta_ * x1;
}
void operator()(float4_t& y, const float4_t& x0, const float4_t& x1) const
{
y = _mm_add_ps(_mm_mul_ps(x0, _mm_set1_ps(alpha_)), _mm_mul_ps(x1, _mm_set1_ps(beta_)));
}
void operator()(float8_t& y, const float8_t& x0, const float8_t& x1) const
{
y = _mm256_add_ps(_mm256_mul_ps(x0, _mm256_set1_ps(alpha_)),
_mm256_mul_ps(x1, _mm256_set1_ps(beta_)));
}
float alpha_;
float beta_;
};
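// Usage sketch: AlphaBetaAdd computes y = alpha * x0 + beta * x1 lane-wise, e.g.
//   AlphaBetaAdd op(1.0f, 0.5f);
//   float8_t y;
//   op(y, c_vec, d_vec);   // each of the 8 lanes: y[i] = 1.0f * c_vec[i] + 0.5f * d_vec[i]
// (c_vec / d_vec are illustrative __m256 values.)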
struct AddRelu
{
void operator()(float& y, const float& x0, const float& x1) const
{
const float a = x0 + x1;
y = a > 0 ? a : 0;
}
void operator()(float4_t& y, const float4_t& x0, const float4_t& x1) const
{
y = _mm_max_ps(_mm_add_ps(x0, x1), _mm_setzero_ps());
}
void operator()(float8_t& y, const float8_t& x0, const float8_t& x1) const
{
y = _mm256_max_ps(_mm256_add_ps(x0, x1), _mm256_setzero_ps());
}
};
#if 0
struct AddHardswish
{
void operator()(float& y, const float& x0, const float& x1) const
{
float a = x0 + x1;
float b = a + float{3};
float c = (b > 0) * (b > float{6} ? float{6} : b) * a * float{0.166667};
y = c;
}
void
operator()(half_t& y, const half_t& x0, const half_t& x1) const
{
float a = x0 + x1;
float b = a + float{3};
float c = (b > 0) * (b > float{6} ? float{6} : b) * a * float{0.166667};
y = c;
}
};
#endif
struct AddReluAdd
{
void operator()(float& y, const float& x0, const float& x1, const float& x2) const
{
float a = x0 + x1;
float b = a > 0 ? a : 0;
float c = b + x2;
y = c;
}
void operator()(float4_t& y, const float4_t& x0, const float4_t& x1, const float4_t& x2) const
{
float4_t a = _mm_add_ps(x0, x1);
float4_t b = _mm_max_ps(a, _mm_setzero_ps());
y = _mm_add_ps(b, x2);
}
void operator()(float8_t& y, const float8_t& x0, const float8_t& x1, const float8_t& x2) const
{
float8_t a = _mm256_add_ps(x0, x1);
float8_t b = _mm256_max_ps(a, _mm256_setzero_ps());
y = _mm256_add_ps(b, x2);
}
};
#if 0
struct AddHardswishAdd
{
void
operator()(float& y, const float& x0, const float& x1, const float& x2) const
{
float a = x0 + x1;
float b = a + float{3};
float c = (b > 0) * (b > float{6} ? float{6} : b) * a * float{0.166667};
float d = c + x2;
y = d;
}
void
operator()(half_t& y, const half_t& x0, const half_t& x1, const half_t& x2) const
{
float a = x0 + x1;
float b = a + float{3};
float c = (b > 0) * (b > float{6} ? float{6} : b) * a * float{0.166667};
float d = c + x2;
y = d;
}
};
#endif
#if 0
struct RequantReluRequant
{
// FIXME: We just need one scale for Relu / Leaky Relu / PRelu
RequantReluRequant(float scaleGemm, float scaleRelu)
: scaleGemm_(scaleGemm), scaleRelu_(scaleRelu)
{
}
void operator()(int8_t& y, const int& x) const
{
float gemm_requant = scaleGemm_ * static_cast<float>(x);
float relu = gemm_requant > 0 ? gemm_requant : 0;
float relu_requant = scaleRelu_ * relu;
y = static_cast<int8_t>(relu_requant > 127 ? 127
: relu_requant < -128 ? -128 : relu_requant);
}
// for reference_gemm
void operator()(float& y, const float& x) const
{
float gemm_requant = scaleGemm_ * x;
float relu = gemm_requant > 0 ? gemm_requant : 0;
float relu_requant = scaleRelu_ * relu;
y = static_cast<float>(relu_requant > 127 ? 127
: relu_requant < -128 ? -128 : relu_requant);
}
float scaleGemm_;
float scaleRelu_;
};
#endif
// Unary operators are usually called element-wisely before/after the reduction is executed on the
// elements. They are needed for easy implementation of reduction types of AVG, NRM1, NRM2
template <typename Y, typename X, bool HasDividing = false>
struct UnaryIdentic;
template <>
struct UnaryIdentic<float, float, false>
{
UnaryIdentic(const int32_t divider = 1) { (void)divider; };
void operator()(float& y, const float& x) const { y = x; };
};
template <>
struct UnaryIdentic<float, float, true>
{
UnaryIdentic(const int32_t divider = 1) { divider_ = divider; };
void operator()(float& y, const float& x) const { y = x / type_convert<float>(divider_); };
int32_t divider_ = 1;
};
template <>
struct UnaryIdentic<float4_t, float4_t, false>
{
UnaryIdentic(const int32_t divider = 1) { (void)divider; };
void operator()(float4_t& y, const float4_t& x) const { y = x; };
};
template <>
struct UnaryIdentic<float4_t, float4_t, true>
{
UnaryIdentic(const int32_t divider = 1) { divider_ = divider; };
void operator()(float4_t& y, const float4_t& x) const
{
y = _mm_div_ps(x, _mm_set1_ps(static_cast<float>(divider_)));
};
int32_t divider_ = 1;
};
template <>
struct UnaryIdentic<float8_t, float8_t, false>
{
UnaryIdentic(const int32_t divider = 1) { (void)divider; };
void operator()(float8_t& y, const float8_t& x) const { y = x; };
};
template <>
struct UnaryIdentic<float8_t, float8_t, true>
{
UnaryIdentic(const int32_t divider = 1) { divider_ = divider; };
void operator()(float8_t& y, const float8_t& x) const
{
y = _mm256_div_ps(x, _mm256_set1_ps(static_cast<float>(divider_)));
};
int32_t divider_ = 1;
};
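// Usage sketch (an AVG post-op, assuming 'sum' already holds the sum of n values):
//   UnaryIdentic<float, float, true> avg_op(n);
//   float avg;
//   avg_op(avg, sum);                      // avg = sum / n
//   UnaryIdentic<float8_t, float8_t, true> avg_op8(n);
//   float8_t avg8;
//   avg_op8(avg8, sum8);                   // per-lane division via _mm256_div_ps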
template <typename Y, typename X, bool HasDividing = false>
struct UnarySquare;
template <>
struct UnarySquare<float, float, false>
{
UnarySquare(const int32_t divider = 1) { (void)divider; };
void operator()(float& y, const float& x) const { y = x * x; };
};
template <>
struct UnarySquare<float, float, true>
{
UnarySquare(const int32_t divider = 1) { divider_ = divider; };
void operator()(float& y, const float& x) const { y = x * x / type_convert<float>(divider_); };
int32_t divider_ = 1;
};
template <>
struct UnarySquare<float4_t, float4_t, false>
{
UnarySquare(const int32_t divider = 1) { (void)divider; };
void operator()(float4_t& y, const float4_t& x) const { y = _mm_mul_ps(x, x); };
};
template <>
struct UnarySquare<float4_t, float4_t, true>
{
UnarySquare(const int32_t divider = 1) { divider_ = divider; };
void operator()(float4_t& y, const float4_t& x) const
{
y = _mm_div_ps(_mm_mul_ps(x, x), _mm_set1_ps(static_cast<float>(divider_)));
};
int32_t divider_ = 1;
};
template <>
struct UnarySquare<float8_t, float8_t, false>
{
UnarySquare(const int32_t divider = 1) { (void)divider; };
void operator()(float8_t& y, const float8_t& x) const { y = _mm256_mul_ps(x, x); };
};
template <>
struct UnarySquare<float8_t, float8_t, true>
{
UnarySquare(const int32_t divider = 1) { divider_ = divider; };
void operator()(float8_t& y, const float8_t& x) const
{
y = _mm256_div_ps(_mm256_mul_ps(x, x), _mm256_set1_ps(static_cast<float>(divider_)));
};
int32_t divider_ = 1;
};
template <typename Y, typename X>
struct UnaryAbs;
template <>
struct UnaryAbs<float, float>
{
UnaryAbs(const int32_t divider = 1) { (void)divider; };
void operator()(float& y, const float& x) const { y = std::abs(x); };
};
template <>
struct UnaryAbs<float4_t, float4_t>
{
UnaryAbs(const int32_t divider = 1) { (void)divider; };
void operator()(float4_t& y, const float4_t& x) const
{
__m128 Mask = _mm_castsi128_ps(_mm_set1_epi32(~0x80000000));
y = _mm_and_ps(Mask, x);
};
};
template <>
struct UnaryAbs<float8_t, float8_t>
{
UnaryAbs(const int32_t divider = 1) { (void)divider; };
void operator()(float8_t& y, const float8_t& x) const
{
__m256 Mask = _mm256_castsi256_ps(_mm256_set1_epi32(~0x80000000));
y = _mm256_and_ps(Mask, x);
};
};
template <typename Y, typename X>
struct UnarySqrt;
template <>
struct UnarySqrt<float, float>
{
void operator()(float& y, const float& x) const { y = sqrtf(x); };
};
template <>
struct UnarySqrt<float4_t, float4_t>
{
void operator()(float4_t& y, const float4_t& x) const { y = _mm_sqrt_ps(x); };
};
template <>
struct UnarySqrt<float8_t, float8_t>
{
void operator()(float8_t& y, const float8_t& x) const { y = _mm256_sqrt_ps(x); };
};
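// Composition sketch (scalar case): NRM2 over x[0..n) can be built from these unary ops:
//   UnarySquare<float, float> sq;
//   UnarySqrt<float, float> root;
//   float acc = 0.0f, t = 0.0f;
//   for(int i = 0; i < n; ++i) { sq(t, x[i]); acc += t; }
//   root(acc, acc);                        // acc = sqrt(sum of squares)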
} // namespace element_wise
} // namespace cpu
} // namespace tensor_operation
} // namespace ck
#ifndef CK_GRIDWISE_GEMM_AVX2_HPP
#define CK_GRIDWISE_GEMM_AVX2_HPP
#include "common_header.hpp"
#include "multi_index_transform_helper.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "blockwise_gemm_avx2.hpp"
#include "threadwise_tensor_slice_transfer_avx2.hpp"
#include "dynamic_buffer_cpu.hpp"
namespace ck {
namespace cpu {
template <typename GridwiseGemm,
typename FloatA,
typename FloatB,
typename FloatC,
typename AGridDesc,
typename BGridDesc,
typename CGridDesc,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename CElementwiseOperation>
void kernel_gemm_avx_mxn(const FloatA* __restrict__ p_a_grid,
const FloatB* __restrict__ p_b_grid,
FloatC* __restrict__ p_c_grid,
const AGridDesc& a_grid_desc,
const BGridDesc& b_grid_desc,
const CGridDesc& c_grid_desc,
const AElementwiseOperation& a_element_op,
const BElementwiseOperation& b_element_op,
const CElementwiseOperation& c_element_op)
{
GridwiseGemm::Run(p_a_grid,
p_b_grid,
p_c_grid,
a_grid_desc,
b_grid_desc,
c_grid_desc,
a_element_op,
b_element_op,
c_element_op);
}
template <typename FloatA,
typename FloatB,
typename FloatC,
typename AccDataType,
typename AGridDesc,
typename BGridDesc,
typename CGridDesc,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename CElementwiseOperation,
ck::index_t MPerBlock, // "block" means a data tile sized to fit in cache (L1/L2/L3)
ck::index_t NPerBlock,
ck::index_t KPerBlock,
typename ThreadwiseGemm_Dispatch,
typename BlockMNKAccessOrder, // how we access GEMM M/N/K to better fit in cache
typename ThreadMNAccessOrder, // how we access GEMM M/N to utilize the micro kernel
bool UseCLocalBuffer // if true, allocate a local C buffer, accumulate into it in the
// kernel, then copy it back to the C grid; if false, write to C directly
>
struct GridwiseGemmAvx2_MxN
{
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
static constexpr auto I2 = Number<2>{};
static constexpr auto I3 = Number<3>{};
static constexpr auto I4 = Number<4>{};
static constexpr auto I5 = Number<5>{};
static constexpr auto I6 = Number<6>{};
static constexpr auto I7 = Number<7>{};
// static constexpr auto Avx2RegisterVector = 8; // 8 floats
static constexpr index_t MemAlignmentByte = 32; // 256bit
static constexpr auto GetABlockDescriptor()
{
if constexpr(std::is_same<typename ThreadwiseGemm_Dispatch::MatrixALayout,
ck::tensor_layout::gemm::RowMajor>::value)
{
// A : M, K
constexpr auto a_block_desc_m_k =
make_naive_tensor_descriptor_packed(make_tuple(MPerBlock, KPerBlock));
return a_block_desc_m_k;
}
else
{
// A : K, M
constexpr auto a_block_desc_k_m = make_naive_tensor_descriptor_packed(
make_tuple(KPerBlock,
math::integer_least_multiple(
MPerBlock, ThreadwiseGemm_Dispatch::MatrixAMinVectorSize)));
return a_block_desc_k_m;
}
}
static constexpr auto GetBBlockDescriptor()
{
if constexpr(std::is_same<typename ThreadwiseGemm_Dispatch::MatrixBLayout,
ck::tensor_layout::gemm::RowMajor>::value)
{
// B : K, N
constexpr auto b_block_desc_k_n = make_naive_tensor_descriptor_packed(
make_tuple(KPerBlock,
math::integer_least_multiple(
NPerBlock, ThreadwiseGemm_Dispatch::MatrixBMinVectorSize)));
return b_block_desc_k_n;
}
else
{
// B : N/8, K, N8
constexpr auto b_block_desc_n0_k_n1 = make_naive_tensor_descriptor_packed(make_tuple(
math::integer_divide_ceil(NPerBlock, ThreadwiseGemm_Dispatch::MatrixBMinVectorSize),
KPerBlock,
ThreadwiseGemm_Dispatch::MatrixBMinVectorSize));
return b_block_desc_n0_k_n1;
}
}
static constexpr auto GetABlockSliceLength()
{
if constexpr(std::is_same<typename ThreadwiseGemm_Dispatch::MatrixALayout,
ck::tensor_layout::gemm::RowMajor>::value)
{
// A : M, K
return ck::Sequence<MPerBlock, KPerBlock>{};
}
else
{
// A : K, M
return ck::Sequence<KPerBlock, MPerBlock>{};
}
}
static constexpr auto GetBBlockSliceLength()
{
if constexpr(std::is_same<typename ThreadwiseGemm_Dispatch::MatrixBLayout,
ck::tensor_layout::gemm::RowMajor>::value)
{
// B : K, N
return ck::Sequence<KPerBlock, NPerBlock>{};
}
else
{
// B : N/8, K, N8
return ck::Sequence<NPerBlock / ThreadwiseGemm_Dispatch::MatrixBMinVectorSize,
KPerBlock,
ThreadwiseGemm_Dispatch::MatrixBMinVectorSize>{};
}
}
static constexpr auto GetABlockDimAccessOrder() { return ck::Sequence<0, 1>{}; }
static constexpr auto GetBBlockDimAccessOrder()
{
if constexpr(std::is_same<typename ThreadwiseGemm_Dispatch::MatrixBLayout,
ck::tensor_layout::gemm::RowMajor>::value)
{
// B : K, N
return ck::Sequence<0, 1>{};
}
else
{
// B : N/8, K, N8
return ck::Sequence<0, 1, 2>{};
}
}
static constexpr auto GetABlockMoveFwdStep()
{
if constexpr(std::is_same<typename ThreadwiseGemm_Dispatch::MatrixALayout,
ck::tensor_layout::gemm::RowMajor>::value)
{
// A : M, K
return ck::make_multi_index(0, KPerBlock);
}
else
{
// A : K, M
return ck::make_multi_index(KPerBlock, 0);
}
}
static constexpr auto GetBBlockMoveFwdStep()
{
if constexpr(std::is_same<typename ThreadwiseGemm_Dispatch::MatrixBLayout,
ck::tensor_layout::gemm::RowMajor>::value)
{
// B : K, N
return ck::make_multi_index(KPerBlock, 0);
}
else
{
// B : N/8, K, N8
return ck::make_multi_index(0, KPerBlock, 0);
}
}
#if 0
static constexpr auto GetAThreadDescriptor()
{
if constexpr (std::is_same<typename ThreadwiseGemm_Dispatch::MatrixALayout, ck::tensor_layout::gemm::RowMajor>::value){
// A : M, K
constexpr auto a_thread_desc_m_k = make_naive_tensor_descriptor_packed(make_tuple(ThreadwiseGemm_Dispatch::ThreadMaxMr, KPerBlock));
return a_thread_desc_m_k;
} else {
// A : K, M
constexpr auto a_thread_desc_k_m = make_naive_tensor_descriptor_packed(make_tuple(KPerBlock, ThreadwiseGemm_Dispatch::ThreadMaxMr));
return a_thread_desc_k_m;
}
}
static constexpr auto GetBThreadDescriptor()
{
if constexpr (std::is_same<typename ThreadwiseGemm_Dispatch::MatrixBLayout, ck::tensor_layout::gemm::RowMajor>::value){
// B : K, N
constexpr auto b_thread_desc_k_n = make_naive_tensor_descriptor_packed(make_tuple(KPerBlock, ThreadwiseGemm_Dispatch::ThreadMaxNr));
return b_thread_desc_k_n;
} else {
// B : N/8, K, N8
constexpr auto b_thread_desc_n_k_n8 = make_naive_tensor_descriptor_packed(make_tuple(math::integer_divide_ceil(ThreadwiseGemm_Dispatch::ThreadMaxNr, ThreadwiseGemm_Dispatch::MatrixBMinVectorSize), KPerBlock, ThreadwiseGemm_Dispatch::MatrixBMinVectorSize));
return b_thread_desc_n_k_n8;
}
}
#endif
static constexpr auto GetAThreadSliceLength()
{
if constexpr(std::is_same<typename ThreadwiseGemm_Dispatch::MatrixALayout,
ck::tensor_layout::gemm::RowMajor>::value)
{
// A : M, K
return ck::Sequence<ThreadwiseGemm_Dispatch::ThreadMaxMr, KPerBlock>{};
}
else
{
// A : K, M
return ck::Sequence<KPerBlock, ThreadwiseGemm_Dispatch::ThreadMaxMr>{};
}
}
static constexpr auto GetBThreadSliceLength()
{
if constexpr(std::is_same<typename ThreadwiseGemm_Dispatch::MatrixBLayout,
ck::tensor_layout::gemm::RowMajor>::value)
{
// B : K, N
return ck::Sequence<KPerBlock, ThreadwiseGemm_Dispatch::ThreadMaxNr>{};
}
else
{
// B : N/8, K, N8
return ck::Sequence<ThreadwiseGemm_Dispatch::ThreadMaxNr /
ThreadwiseGemm_Dispatch::MatrixBMinVectorSize,
KPerBlock,
ThreadwiseGemm_Dispatch::MatrixBMinVectorSize>{};
}
}
static constexpr auto GetAThreadMoveFwdStep()
{
if constexpr(std::is_same<typename ThreadwiseGemm_Dispatch::MatrixALayout,
ck::tensor_layout::gemm::RowMajor>::value)
{
// A : M, K
return ck::make_multi_index(ThreadwiseGemm_Dispatch::ThreadMaxMr, 0);
}
else
{
// A : K, M
return ck::make_multi_index(0, ThreadwiseGemm_Dispatch::ThreadMaxMr);
}
}
static constexpr auto GetBThreadMoveFwdStep()
{
if constexpr(std::is_same<typename ThreadwiseGemm_Dispatch::MatrixBLayout,
ck::tensor_layout::gemm::RowMajor>::value)
{
// B : K, N
return ck::make_multi_index(0, ThreadwiseGemm_Dispatch::ThreadMaxNr);
}
else
{
// B : N/8, K, N8
return ck::make_multi_index(
ThreadwiseGemm_Dispatch::ThreadMaxNr / ThreadwiseGemm_Dispatch::MatrixBMinVectorSize, 0, 0);
}
}
static constexpr ck::index_t GetAThreadLoopOverDim()
{
if constexpr(std::is_same<typename ThreadwiseGemm_Dispatch::MatrixALayout,
ck::tensor_layout::gemm::RowMajor>::value)
{
// A : M, K
return 0;
}
else
{
// A : K, M
return 1;
}
}
static constexpr ck::index_t GetBThreadLoopOverDim()
{
if constexpr(std::is_same<typename ThreadwiseGemm_Dispatch::MatrixBLayout,
ck::tensor_layout::gemm::RowMajor>::value)
{
// B : K, N
return 1;
}
else
{
// B : N/8, K, N8
return 0;
}
}
static constexpr auto GetCBlockDescriptor()
{
if constexpr(UseCLocalBuffer)
{
return make_naive_tensor_descriptor_packed(make_tuple(MPerBlock, NPerBlock));
}
else
{
return make_naive_tensor_descriptor_packed(make_tuple(MPerBlock, NPerBlock)); // TODO:
}
}
static constexpr auto GetCBlockSliceLength() { return ck::Sequence<MPerBlock, NPerBlock>{}; }
static constexpr bool CheckValidity(const AGridDesc& a_grid_desc,
const BGridDesc& b_grid_desc,
const CGridDesc& c_grid_desc)
{
#if 0
const auto M = a_grid_desc_k0_m_k1.GetLength(I1);
const auto N = b_grid_desc_k0_n_k1.GetLength(I1);
const auto K0 = a_grid_desc_k0_m_k1.GetLength(I0);
if(!(M == c_grid_desc_m_n.GetLength(I0) && N == c_grid_desc_m_n.GetLength(I1) &&
K0 == b_grid_desc_k0_n_k1.GetLength(I0) && K1 == a_grid_desc_k0_m_k1.GetLength(I2) &&
K1 == b_grid_desc_k0_n_k1.GetLength(I2)))
return false;
if(!(M % MPerBlock == 0 && N % NPerBlock == 0 && K0 % K0PerBlock == 0))
return false;
// check NumPrefetch
if constexpr(NumPrefetch == 1)
{
// 1-stage prefetch always supported
}
else if constexpr(NumPrefetch == 2)
{
// 2-stage prefetch currently only support even number of K0 loop
// TODO: add support for odd number of K0 loop
if(!((K0 / K0PerBlock) % 2 == 0))
{
return false;
}
}
else
{
return false;
}
// check M01, N01
constexpr auto M1 = Number<MPerBlock>{};
constexpr auto N1 = Number<NPerBlock>{};
const auto M0 = M / M1;
const auto N0 = N / N1;
if(!(M0 % M01 == 0 && N0 % N01 == 0))
return false;
#endif
// TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc)
return true;
}
static void Run(const FloatA* __restrict__ p_a_grid,
const FloatB* __restrict__ p_b_grid,
FloatC* __restrict__ p_c_grid,
const AGridDesc& a_grid_desc,
const BGridDesc& b_grid_desc,
const CGridDesc& c_grid_desc,
const AElementwiseOperation& a_element_op,
const BElementwiseOperation& b_element_op,
const CElementwiseOperation& c_element_op)
{
ck::index_t m_per_block = MPerBlock;
ck::index_t n_per_block = NPerBlock;
ck::index_t k_per_block = KPerBlock;
// TODO: when MPerBlock/NPerBlock/KPerBlock are all zero, pick the block sizes at runtime
const auto M = a_grid_desc.GetLength(I0);
const auto N = b_grid_desc.GetLength(I1);
const auto K = b_grid_desc.GetLength(I0);
const ck::index_t grid_m = math::integer_divide_ceil(M, m_per_block);
const ck::index_t grid_n = math::integer_divide_ceil(N, n_per_block);
const ck::index_t grid_size = grid_m * grid_n;
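// Example (a sketch): M = 1000, N = 512 with m_per_block = 256, n_per_block = 128 gives
// grid_m = 4, grid_n = 4 and grid_size = 16; each gid below maps to the tile origin
// (i_mc, i_nc) = ((gid / grid_n) * m_per_block, (gid % grid_n) * n_per_block), and the
// edge tiles are clipped via mc_size / nc_size.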
constexpr auto a_block_desc = GetABlockDescriptor();
constexpr auto a_block_slice_length = GetABlockSliceLength();
constexpr auto a_block_copy_dim = decltype(a_block_slice_length)::Size();
constexpr auto a_dim_access_order = GetABlockDimAccessOrder();
constexpr auto a_block_move_step = GetABlockMoveFwdStep();
constexpr auto a_thread_slice_length = GetAThreadSliceLength();
constexpr auto a_thread_loop_over_dim = GetAThreadLoopOverDim();
constexpr auto b_block_desc = GetBBlockDescriptor();
constexpr auto b_block_slice_length = GetBBlockSliceLength();
constexpr auto b_block_copy_dim = decltype(b_block_slice_length)::Size();
constexpr auto b_dim_access_order = GetBBlockDimAccessOrder();
constexpr auto b_block_move_step = GetBBlockMoveFwdStep();
constexpr auto b_thread_slice_length = GetBThreadSliceLength();
constexpr auto b_thread_loop_over_dim = GetBThreadLoopOverDim();
constexpr auto c_block_desc = GetCBlockDescriptor();
constexpr auto c_block_slice_length = GetCBlockSliceLength();
constexpr auto c_block_move_step = ck::make_multi_index(0, NPerBlock);
auto a_threadwise_copy = ck::cpu::ThreadwiseTensorSliceTransferAvx2<
FloatA, // SrcData
FloatA, // DstData
decltype(a_grid_desc), // SrcDesc
decltype(a_block_desc), // DstDesc
AElementwiseOperation, // ElementwiseOperation
decltype(a_block_slice_length), // SliceLengths
decltype(a_dim_access_order), // DimAccessOrder
1, // VectorDim
1, // ScalarPerVector
ck::InMemoryDataOperationEnum_t::Set, // InMemoryDataOperationEnum_t
false, // SrcResetCoordinateAfterRun
true // DstResetCoordinateAfterRun
>(a_grid_desc,
ck::make_zero_multi_index<a_block_copy_dim>(),
a_block_desc,
ck::make_zero_multi_index<a_block_copy_dim>(),
AElementwiseOperation{});
auto b_threadwise_copy = ck::cpu::ThreadwiseTensorSliceTransferAvx2<
FloatB, // SrcData
FloatB, // DstData
decltype(b_grid_desc), // SrcDesc
decltype(b_block_desc), // DstDesc
BElementwiseOperation, // ElementwiseOperation
decltype(b_block_slice_length), // SliceLengths
decltype(b_dim_access_order), // DimAccessOrder
1, // VectorDim
1, // ScalarPerVector
ck::InMemoryDataOperationEnum_t::Set, // InMemoryDataOperationEnum_t
false, // SrcResetCoordinateAfterRun
true // DstResetCoordinateAfterRun
>(b_grid_desc,
ck::make_zero_multi_index<b_block_copy_dim>(),
b_block_desc,
ck::make_zero_multi_index<b_block_copy_dim>(),
BElementwiseOperation{});
auto c_threadwise_copy = ck::cpu::ThreadwiseTensorSliceTransferAvx2<
FloatC, // SrcData
FloatC, // DstData
decltype(c_block_desc), // SrcDesc
decltype(c_grid_desc), // DstDesc
CElementwiseOperation, // ElementwiseOperation
ck::Sequence<MPerBlock, NPerBlock>, // SliceLengths
ck::Sequence<0, 1>, // DimAccessOrder
1, // VectorDim
1, // ScalarPerVector
ck::InMemoryDataOperationEnum_t::Set, // InMemoryDataOperationEnum_t
true, // SrcResetCoordinateAfterRun
false // DstResetCoordinateAfterRun
>(c_block_desc,
ck::make_zero_multi_index<2>(),
c_grid_desc,
ck::make_zero_multi_index<2>(),
CElementwiseOperation{});
DeviceAlignedMemCPU a_block_mem(MPerBlock * KPerBlock * sizeof(FloatA), MemAlignmentByte);
DeviceAlignedMemCPU b_block_mem(KPerBlock * NPerBlock * sizeof(FloatB), MemAlignmentByte);
DeviceAlignedMemCPU c_block_mem(MPerBlock * NPerBlock * sizeof(FloatC), MemAlignmentByte);
auto a_grid_buf = ck::cpu::make_dynamic_buffer<ck::AddressSpaceEnum_t::Global>(
reinterpret_cast<const FloatA*>(p_a_grid), a_grid_desc.GetElementSpaceSize());
auto b_grid_buf = ck::cpu::make_dynamic_buffer<ck::AddressSpaceEnum_t::Global>(
reinterpret_cast<const FloatB*>(p_b_grid), b_grid_desc.GetElementSpaceSize());
auto c_grid_buf = ck::cpu::make_dynamic_buffer<ck::AddressSpaceEnum_t::Global>(
reinterpret_cast<FloatC*>(p_c_grid), c_grid_desc.GetElementSpaceSize());
auto a_block_buf = ck::cpu::make_dynamic_buffer<ck::AddressSpaceEnum_t::Global>(
reinterpret_cast<FloatA*>(a_block_mem.mpDeviceBuf),
a_block_mem.mMemSize / sizeof(FloatA));
auto b_block_buf = ck::cpu::make_dynamic_buffer<ck::AddressSpaceEnum_t::Global>(
reinterpret_cast<FloatB*>(b_block_mem.mpDeviceBuf),
b_block_mem.mMemSize / sizeof(FloatB));
auto c_block_buf = ck::cpu::make_dynamic_buffer<ck::AddressSpaceEnum_t::Global>(
reinterpret_cast<FloatC*>(c_block_mem.mpDeviceBuf),
c_block_mem.mMemSize / sizeof(FloatC));
auto blockwise_gemm =
BlockwiseGemmAvx2_MxN<FloatA, // FloatA,
FloatB, // FloatB,
FloatC, // FloatC,
AccDataType, // AccDataType,
decltype(a_block_desc), // ABlockDesc,
decltype(b_block_desc), // BBlockDesc,
decltype(c_block_desc), // CBlockDesc,
decltype(a_block_slice_length), // ABlockSliceLengths,
decltype(b_block_slice_length), // BBlockSliceLengths,
decltype(c_block_slice_length), // CBlockSliceLengths,
decltype(a_thread_slice_length), // AThreadSliceLength,
decltype(b_thread_slice_length), // BThreadSliceLength,
a_thread_loop_over_dim, // AThreadLoopOverDim: dim of the thread slice to loop
// over within the block slice; 1D is enough for now
b_thread_loop_over_dim, // BThreadLoopOverDim,
KPerBlock, // KPerBlock,
ThreadwiseGemm_Dispatch, // ThreadwiseGemm_Dispatch,
ThreadMNAccessOrder>{}; // ThreadMNAccessOrder: how we access
// GEMM M/N to utilize the micro kernel
// TODO: openmp aware ordering
if constexpr(std::is_same<BlockMNKAccessOrder, ck::Sequence<0, 1, 2>>::value)
{
#pragma omp parallel for
for(ck::index_t gid = 0; gid < grid_size; gid++)
{
ck::index_t i_mc = (gid / grid_n) * m_per_block;
ck::index_t i_nc = (gid % grid_n) * n_per_block;
ck::index_t mc_size = ck::math::min(M - i_mc, m_per_block);
ck::index_t nc_size = ck::math::min(N - i_nc, n_per_block);
// pack_b
b_threadwise_copy.RunGeneric(b_grid_desc, b_grid_buf, b_block_desc, b_block_buf);
b_threadwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_move_step);
if(i_nc == 0)
{
// pack_a
a_threadwise_copy.RunGeneric(
a_grid_desc, a_grid_buf, a_block_desc, a_block_buf);
a_threadwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_move_step);
}
for(ck::index_t i_kc = 0; i_kc < K; i_kc += k_per_block)
{
ck::index_t kc_size = ck::math::min(K - i_kc, k_per_block);
blockwise_gemm.Run(a_block_desc,
a_block_buf,
make_zero_multi_index<a_block_copy_dim>(),
b_block_desc,
b_block_buf,
make_zero_multi_index<b_block_copy_dim>(),
c_block_desc,
c_block_buf,
make_zero_multi_index<2>());
}
if constexpr(UseCLocalBuffer)
{
c_threadwise_copy.RunGeneric(
c_block_desc, c_block_buf, c_grid_desc, c_grid_buf);
c_threadwise_copy.MoveDstSliceWindow(c_grid_desc, c_block_move_step);
}
}
}
}
};
} // namespace cpu
} // namespace ck
#endif
...@@ -5,13 +5,23 @@
#include <immintrin.h>
#endif
#include "common_header.hpp"
#include "../../gpu/device/tensor_layout.hpp"
#include "math.hpp"
#include "threadwise_param.hpp"
namespace ck {
namespace cpu {
#if 0
struct ThreadWiseGemmAvx2_Base {
virtual void Run(ThreadwiseGemmParam* param) = 0;
virtual index_t GetMr() const = 0;
virtual index_t GetMaxMr() const = 0;
virtual index_t GetNr() const = 0;
virtual index_t GetMaxNr() const = 0;
};
#endif
template <typename FloatA,
          typename FloatB,
          typename FloatC,
...@@ -22,17 +32,17 @@ template <typename FloatA,
          bool NonTemporalStore>
struct ThreadwiseGemmAvx2_MxN_6x16
{
    using MatrixALayout = ALayout;
    using MatrixBLayout = BLayout;
    static constexpr auto ThreadMaxMr = 6;
    static constexpr auto ThreadMaxNr = 16;
    static constexpr auto ThreadMr = Mr;
    static constexpr auto ThreadNr = Nr;
    static constexpr auto ThreadNTStore = NonTemporalStore;

    static_assert(Mr <= 6 && Mr >= 1 && (Nr == 8 || Nr == 16), "wrong! Mr x Nr not valid");

    static void Run(ThreadwiseGemmParam* param)
    {
        /* 6x16 ukernel
         *
...@@ -563,18 +573,18 @@ template <typename FloatA,
          bool NonTemporalStore>
struct ThreadwiseGemmAvx2_MxN_4x24
{
    using MatrixALayout = ALayout;
    using MatrixBLayout = BLayout;
    static constexpr auto ThreadMaxMr = 4;
    static constexpr auto ThreadMaxNr = 24;
    static constexpr auto ThreadMr = Mr;
    static constexpr auto ThreadNr = Nr;
    static constexpr auto ThreadNTStore = NonTemporalStore;

    static_assert(Mr <= 4 && Mr >= 1 && (Nr == 8 || Nr == 16 || Nr == 24),
                  "wrong! Mr x Nr not valid");

    static void Run(ThreadwiseGemmParam* param)
    {
        /* 4x24 ukernel
         *
...@@ -820,18 +830,18 @@ struct ThreadwiseGemmAvx2_MxN_4x24
        ".if (m_Mr > 2)\n lea (%%rbx, %%rdi, 1), %%rcx\n .endif\n"
        ".if (m_Mr > 3)\n lea (%%rcx, %%rdi, 1), %%rdx\n .endif\n"
        // " vaddps (%%rax), %%ymm0, %%ymm0 \n"
        // ".if (m_Nr > 8)\n vaddps 32(%%rax), %%ymm1, %%ymm1 \n .endif\n"
        // ".if (m_Nr >16)\n vaddps 64(%%rax), %%ymm2, %%ymm2 \n .endif\n"
        // ".if (m_Mr > 1) \n vaddps (%%rbx), %%ymm3, %%ymm3 \n .endif\n"
        // ".if (m_Mr > 1) && (m_Nr > 8)\n vaddps 32(%%rbx), %%ymm4, %%ymm4 \n .endif\n"
        // ".if (m_Mr > 1) && (m_Nr >16)\n vaddps 64(%%rbx), %%ymm5, %%ymm5 \n .endif\n"
        // ".if (m_Mr > 2) \n vaddps (%%rcx), %%ymm6, %%ymm6 \n .endif\n"
        // ".if (m_Mr > 2) && (m_Nr > 8)\n vaddps 32(%%rcx), %%ymm7, %%ymm7 \n .endif\n"
        // ".if (m_Mr > 2) && (m_Nr >16)\n vaddps 64(%%rcx), %%ymm8, %%ymm8 \n .endif\n"
        // ".if (m_Mr > 3) \n vaddps (%%rdx), %%ymm9, %%ymm9 \n .endif\n"
        // ".if (m_Mr > 3) && (m_Nr > 8)\n vaddps 32(%%rdx), %%ymm10, %%ymm10\n .endif\n"
        // ".if (m_Mr > 3) && (m_Nr >16)\n vaddps 64(%%rdx), %%ymm11, %%ymm11\n .endif\n"
        ".if m_NTStore == 0\n"
        " vmovups %%ymm0, (%%rax) \n"
...@@ -1090,6 +1100,304 @@ struct ThreadwiseGemmAvx2_MxN_4x24
    }
};
typedef void (*pThreadwiseGemmAvx2Run)(ThreadwiseGemmParam*);
template <typename FloatA,
typename FloatB,
typename FloatC,
typename ALayout, // default is k*m, trans->m*k
typename BLayout, // default is n/8*k*n8, trans->k*n
bool NonTemporalStore>
struct ThreadwiseGemmAvx2_MxN_6x16_Dispatch
{
using MatrixALayout = ALayout;
using MatrixBLayout = BLayout;
static constexpr auto ThreadMaxMr = 6;
static constexpr auto ThreadMaxNr = 16;
// static constexpr auto ThreadMr = Mr;
// static constexpr auto ThreadNr = Nr;
static constexpr auto ThreadNTStore = NonTemporalStore;
static constexpr auto MatrixAMinVectorSize =
std::is_same<ck::tensor_layout::gemm::RowMajor, ALayout>::value ? 1 : 8;
static constexpr auto MatrixBMinVectorSize =
std::is_same<ck::tensor_layout::gemm::RowMajor, BLayout>::value ? 8 : 8;
using ThreadwiseGemm_6x16_t = ThreadwiseGemmAvx2_MxN_6x16<FloatA,
FloatB,
FloatC,
6,
16,
ALayout,
BLayout,
NonTemporalStore>;
using ThreadwiseGemm_6x8_t = ThreadwiseGemmAvx2_MxN_6x16<FloatA,
FloatB,
FloatC,
6,
8,
ALayout,
BLayout,
NonTemporalStore>;
using ThreadwiseGemm_5x16_t = ThreadwiseGemmAvx2_MxN_6x16<FloatA,
FloatB,
FloatC,
5,
16,
ALayout,
BLayout,
NonTemporalStore>;
using ThreadwiseGemm_5x8_t = ThreadwiseGemmAvx2_MxN_6x16<FloatA,
FloatB,
FloatC,
5,
8,
ALayout,
BLayout,
NonTemporalStore>;
using ThreadwiseGemm_4x16_t = ThreadwiseGemmAvx2_MxN_6x16<FloatA,
FloatB,
FloatC,
4,
16,
ALayout,
BLayout,
NonTemporalStore>;
using ThreadwiseGemm_4x8_t = ThreadwiseGemmAvx2_MxN_6x16<FloatA,
FloatB,
FloatC,
4,
8,
ALayout,
BLayout,
NonTemporalStore>;
using ThreadwiseGemm_3x16_t = ThreadwiseGemmAvx2_MxN_6x16<FloatA,
FloatB,
FloatC,
3,
16,
ALayout,
BLayout,
NonTemporalStore>;
using ThreadwiseGemm_3x8_t = ThreadwiseGemmAvx2_MxN_6x16<FloatA,
FloatB,
FloatC,
3,
8,
ALayout,
BLayout,
NonTemporalStore>;
using ThreadwiseGemm_2x16_t = ThreadwiseGemmAvx2_MxN_6x16<FloatA,
FloatB,
FloatC,
2,
16,
ALayout,
BLayout,
NonTemporalStore>;
using ThreadwiseGemm_2x8_t = ThreadwiseGemmAvx2_MxN_6x16<FloatA,
FloatB,
FloatC,
2,
8,
ALayout,
BLayout,
NonTemporalStore>;
using ThreadwiseGemm_1x16_t = ThreadwiseGemmAvx2_MxN_6x16<FloatA,
FloatB,
FloatC,
1,
16,
ALayout,
BLayout,
NonTemporalStore>;
using ThreadwiseGemm_1x8_t = ThreadwiseGemmAvx2_MxN_6x16<FloatA,
FloatB,
FloatC,
1,
8,
ALayout,
BLayout,
NonTemporalStore>;
static constexpr pThreadwiseGemmAvx2Run dispatch_table[6][2] = {
{
ThreadwiseGemm_6x16_t::Run,
ThreadwiseGemm_6x8_t::Run,
},
{
ThreadwiseGemm_5x16_t::Run,
ThreadwiseGemm_5x8_t::Run,
},
{
ThreadwiseGemm_4x16_t::Run,
ThreadwiseGemm_4x8_t::Run,
},
{
ThreadwiseGemm_3x16_t::Run,
ThreadwiseGemm_3x8_t::Run,
},
{
ThreadwiseGemm_2x16_t::Run,
ThreadwiseGemm_2x8_t::Run,
},
{
ThreadwiseGemm_1x16_t::Run,
ThreadwiseGemm_1x8_t::Run,
},
};
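    // Note: rows of dispatch_table are ordered Mr = 6,5,...,1 and columns Nr = 16,8, so Run()
    // is presumably called with table indices (row 0 = largest Mr, column 0 = largest Nr) rather
    // than the raw mr/nr tile sizes, which would read past the table.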
static void Run(ThreadwiseGemmParam* param, index_t mr, index_t nr)
{
return dispatch_table[mr][nr](param);
}
};
template <typename FloatA,
typename FloatB,
typename FloatC,
typename ALayout, // default is k*m, trans->m*k
typename BLayout, // default is n/8*k*n8, trans->k*n
bool NonTemporalStore>
struct ThreadwiseGemmAvx2_MxN_4x24_Dispatch
{
using MatrixALayout = ALayout;
using MatrixBLayout = BLayout;
static constexpr auto ThreadMaxMr = 4;
static constexpr auto ThreadMaxNr = 24;
// static constexpr auto ThreadMr = Mr;
// static constexpr auto ThreadNr = Nr;
static constexpr auto ThreadNTStore = NonTemporalStore;
static constexpr auto MatrixAMinVectorSize =
std::is_same<ck::tensor_layout::gemm::RowMajor, ALayout>::value ? 1 : 8;
static constexpr auto MatrixBMinVectorSize =
std::is_same<ck::tensor_layout::gemm::RowMajor, BLayout>::value ? 8 : 8;
using ThreadwiseGemm_4x24_t = ThreadwiseGemmAvx2_MxN_4x24<FloatA,
FloatB,
FloatC,
4,
24,
ALayout,
BLayout,
NonTemporalStore>;
using ThreadwiseGemm_4x16_t = ThreadwiseGemmAvx2_MxN_4x24<FloatA,
FloatB,
FloatC,
4,
16,
ALayout,
BLayout,
NonTemporalStore>;
using ThreadwiseGemm_4x8_t = ThreadwiseGemmAvx2_MxN_4x24<FloatA,
FloatB,
FloatC,
4,
8,
ALayout,
BLayout,
NonTemporalStore>;
using ThreadwiseGemm_3x24_t = ThreadwiseGemmAvx2_MxN_4x24<FloatA,
FloatB,
FloatC,
3,
24,
ALayout,
BLayout,
NonTemporalStore>;
using ThreadwiseGemm_3x16_t = ThreadwiseGemmAvx2_MxN_4x24<FloatA,
FloatB,
FloatC,
3,
16,
ALayout,
BLayout,
NonTemporalStore>;
using ThreadwiseGemm_3x8_t = ThreadwiseGemmAvx2_MxN_4x24<FloatA,
FloatB,
FloatC,
3,
8,
ALayout,
BLayout,
NonTemporalStore>;
using ThreadwiseGemm_2x24_t = ThreadwiseGemmAvx2_MxN_4x24<FloatA,
FloatB,
FloatC,
2,
24,
ALayout,
BLayout,
NonTemporalStore>;
using ThreadwiseGemm_2x16_t = ThreadwiseGemmAvx2_MxN_4x24<FloatA,
FloatB,
FloatC,
2,
16,
ALayout,
BLayout,
NonTemporalStore>;
using ThreadwiseGemm_2x8_t = ThreadwiseGemmAvx2_MxN_4x24<FloatA,
FloatB,
FloatC,
2,
8,
ALayout,
BLayout,
NonTemporalStore>;
using ThreadwiseGemm_1x24_t = ThreadwiseGemmAvx2_MxN_4x24<FloatA,
FloatB,
FloatC,
1,
24,
ALayout,
BLayout,
NonTemporalStore>;
using ThreadwiseGemm_1x16_t = ThreadwiseGemmAvx2_MxN_4x24<FloatA,
FloatB,
FloatC,
1,
16,
ALayout,
BLayout,
NonTemporalStore>;
using ThreadwiseGemm_1x8_t = ThreadwiseGemmAvx2_MxN_4x24<FloatA,
FloatB,
FloatC,
1,
8,
ALayout,
BLayout,
NonTemporalStore>;
static constexpr pThreadwiseGemmAvx2Run dispatch_table[4][3] = {
{
ThreadwiseGemm_4x24_t::Run,
ThreadwiseGemm_4x16_t::Run,
ThreadwiseGemm_4x8_t::Run,
},
{
ThreadwiseGemm_3x24_t::Run,
ThreadwiseGemm_3x16_t::Run,
ThreadwiseGemm_3x8_t::Run,
},
{
ThreadwiseGemm_2x24_t::Run,
ThreadwiseGemm_2x16_t::Run,
ThreadwiseGemm_2x8_t::Run,
},
{
ThreadwiseGemm_1x24_t::Run,
ThreadwiseGemm_1x16_t::Run,
ThreadwiseGemm_1x8_t::Run,
},
};
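    // Note: same convention as the 6x16 dispatch above; rows are Mr = 4..1 and columns are
    // Nr = 24, 16, 8, so mr/nr are presumably table indices rather than raw tile sizes.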
static void Run(ThreadwiseGemmParam* param, index_t mr, index_t nr)
{
return dispatch_table[mr][nr](param);
}
};
} // namespace cpu
} // namespace ck
#endif
#ifndef CK_THREADWISE_TENSOR_SLICE_TRANSFER_AVX2_HPP
#define CK_THREADWISE_TENSOR_SLICE_TRANSFER_AVX2_HPP
#include "common_header.hpp"
#include "data_type_cpu.hpp"
#include "../../gpu/thread/threadwise_tensor_slice_transfer.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "tensor_space_filling_curve.hpp"
#include "dynamic_buffer_cpu.hpp"
#include <immintrin.h>
namespace ck {
namespace cpu {
// Assume:
//   1. src_desc and dst_desc are not known at compile-time
//   2. src_slice_origin and dst_slice_origin are not known at compile-time
//   3. a __m256 register always holds 8 contiguous dwords, so if the fast-changing dim is a
//      complex (merged) dimension, reconsider the layout (e.g. NCHW is not a good fit for
//      non-1x1 filters)
//   4. RunGeneric() can handle any case (by not using ymm), but performance is not guaranteed
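// A minimal usage sketch (hypothetical 2d src_desc/dst_desc, buffers and a PassThrough op, for
// illustration only; it mirrors how the blockwise gemm above builds its a/b/c copies):
//
//   auto copy = ThreadwiseTensorSliceTransferAvx2<
//       float, float,                             // SrcData, DstData
//       decltype(src_desc), decltype(dst_desc),   // SrcDesc, DstDesc
//       PassThrough,                              // ElementwiseOperation
//       ck::Sequence<MPerBlock, KPerBlock>,       // SliceLengths
//       ck::Sequence<0, 1>,                       // DimAccessOrder
//       1, 1,                                     // VectorDim, ScalarPerVector
//       InMemoryDataOperationEnum_t::Set,
//       false, true>(                             // Src/DstResetCoordinateAfterRun
//       src_desc, make_zero_multi_index<2>(), dst_desc, make_zero_multi_index<2>(), PassThrough{});
//
//   copy.RunGeneric(src_desc, src_buf, dst_desc, dst_buf);             // copy one tile
//   copy.MoveSrcSliceWindow(src_desc, make_multi_index(0, KPerBlock)); // slide to the next tile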
template <typename SrcData,
typename DstData,
typename SrcDesc,
typename DstDesc,
typename ElementwiseOperation,
typename SliceLengths,
typename DimAccessOrder,
index_t VectorDim,
          index_t ScalarPerVector, // src and dst must use the same vector size, i.e. both use
                                   // the same avx/float register width
InMemoryDataOperationEnum_t DstInMemOp,
bool SrcResetCoordinateAfterRun,
bool DstResetCoordinateAfterRun>
struct ThreadwiseTensorSliceTransferAvx2
{
static constexpr index_t nDim = SliceLengths::Size();
using Index = MultiIndex<nDim>;
using SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{}));
using DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{}));
static constexpr auto I0 = Number<0>{};
constexpr ThreadwiseTensorSliceTransferAvx2(const SrcDesc& src_desc,
const Index& src_slice_origin,
const DstDesc& dst_desc,
const Index& dst_slice_origin,
const ElementwiseOperation& element_op)
: src_coord_(make_tensor_coordinate(src_desc, src_slice_origin)),
dst_coord_(make_tensor_coordinate(dst_desc, dst_slice_origin)),
element_op_(element_op)
{
static_assert(SliceLengths::At(Number<VectorDim>{}) % ScalarPerVector == 0,
"wrong! cannot evenly divide");
}
void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx)
{
        // On GPU this function is used to set the per-thread index based on threadIdx.x.
        // On CPU there is no need to call it.
src_coord_ = make_tensor_coordinate(src_desc, src_slice_origin_idx);
}
void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx)
{
dst_coord_ = make_tensor_coordinate(dst_desc, dst_slice_origin_idx);
}
template <typename SrcBuffer, typename DstBuffer>
void RunGeneric(const SrcDesc& src_desc,
const SrcBuffer& src_buf,
const DstDesc& dst_desc,
DstBuffer& dst_buf)
{
// scalar per access on each dim
// TODO: don't use lambda_scalar_per_access
constexpr auto scalar_per_access = generate_sequence(
ck::detail::lambda_scalar_per_access<VectorDim, ScalarPerVector>{}, Number<nDim>{});
using SpaceFillingCurve = SpaceFillingCurve<SliceLengths,
DimAccessOrder,
remove_cv_t<decltype(scalar_per_access)>>;
// loop over space-filling curve
constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess();
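        // each access moves one ScalarPerVector-wide vector, so num_access iterations cover the
        // whole SliceLengths tile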
// std::cout<<"num_access:"<<num_access<<std::endl;
static_for<0, num_access, 1>{}([&](auto idx_1d) {
using src_vector_type = ck::cpu::vector_type_maker_t<SrcData, ScalarPerVector>;
using src_vector_t = typename src_vector_type::type;
using dst_vector_type = ck::cpu::vector_type_maker_t<DstData, ScalarPerVector>;
using dst_vector_t = typename dst_vector_type::type;
const bool is_src_valid =
coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc, src_coord_);
// printf("[%s] ", is_src_valid ? "y":"n");
// print_multi_index(src_coord_.GetIndex());
// printf("----");
// print_multi_index(src_coord_.GetHiddenIndex());
// printf(":%d", src_coord_.GetOffset());
// printf("\n");
// copy data from src_buf into src_vector_container
auto src_vector_container = src_vector_type{
src_buf.template Get<src_vector_t>(src_coord_.GetOffset(), is_src_valid)};
auto dst_vector_container = dst_vector_type{};
// apply pointwise operation
// static_for<0, ScalarPerVector, 1>{}([&](auto i) {
// element_op_(dst_vector_container.template AsType<DstData>()(i),
// src_vector_container.template AsType<SrcData>()[i]);
// });
element_op_(dst_vector_container.template AsType<dst_vector_t>(),
src_vector_container.template AsType<src_vector_t>());
const bool is_dst_valid =
coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_);
// printf(" -> ");
// print_multi_index(dst_coord_.GetIndex());
// printf(":%d", dst_coord_.GetOffset());
// printf(", src:0x%x, dst:0x%x",
// *reinterpret_cast<uint32_t*>(&src_vector_container.template AsType<src_vector_t>()),
// *reinterpret_cast<uint32_t*>(&dst_vector_container.template
// AsType<dst_vector_t>()));
// printf("\n");
// copy data from dst_vector into dst_buf
dst_buf.template Update<DstInMemOp, dst_vector_t>(
dst_coord_.GetOffset(),
is_dst_valid,
dst_vector_container.template AsType<dst_vector_t>());
// move coordinate
if constexpr(idx_1d.value != num_access - 1)
{
constexpr auto forward_step = SpaceFillingCurve::GetForwardStep(idx_1d);
move_tensor_coordinate(
src_desc, src_coord_, make_tensor_coordinate_step(src_desc, forward_step));
move_tensor_coordinate(
dst_desc, dst_coord_, make_tensor_coordinate_step(dst_desc, forward_step));
}
});
// move coordinate back to slice origin (or not)
if constexpr(SrcResetCoordinateAfterRun)
{
const auto src_reset_step =
make_tensor_coordinate_step(src_desc, GetCoordinateResetStep());
move_tensor_coordinate(src_desc, src_coord_, src_reset_step);
}
if constexpr(DstResetCoordinateAfterRun)
{
const auto dst_reset_step =
make_tensor_coordinate_step(dst_desc, GetCoordinateResetStep());
move_tensor_coordinate(dst_desc, dst_coord_, dst_reset_step);
}
}
static constexpr auto GetCoordinateResetStep()
{
constexpr auto scalar_per_access = generate_sequence(
detail::lambda_scalar_per_access<VectorDim, ScalarPerVector>{}, Number<nDim>{});
using SpaceFillingCurve = SpaceFillingCurve<SliceLengths,
DimAccessOrder,
remove_cv_t<decltype(scalar_per_access)>>;
constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess();
if constexpr(num_access == 0)
{
return typename SpaceFillingCurve::Index{};
}
else
{
constexpr auto reset_step =
SpaceFillingCurve::GetStepBetween(Number<num_access - 1>{}, Number<0>{});
return reset_step;
}
}
    // src_slice_origin_step_idx needs to be known at compile time, for performance reasons
void MoveSrcSliceWindow(const SrcDesc& src_desc, const Index& src_slice_origin_step_idx)
{
        // if the src coord was not reset by RunGeneric(), adjust the step here
        const auto adjusted_step_idx = SrcResetCoordinateAfterRun
                                           ? src_slice_origin_step_idx
                                           : src_slice_origin_step_idx + GetCoordinateResetStep();
        // printf(" GetCoordinateResetStep:");
        // print_multi_index(GetCoordinateResetStep());
        // printf(" adjusted_step_idx:");
        // print_multi_index(adjusted_step_idx);
        // is it OK to construct a new step every time?
        const auto adjusted_step = make_tensor_coordinate_step(src_desc, adjusted_step_idx);
        // printf(" adjusted_step:");
        // print_multi_index(adjusted_step.GetIndexDiff());
        // printf("\n");
move_tensor_coordinate(src_desc, src_coord_, adjusted_step);
}
    // dst_slice_origin_step_idx needs to be known at compile time, for performance reasons
void MoveDstSliceWindow(const DstDesc& dst_desc, const Index& dst_slice_origin_step_idx)
{
        // if the dst coord was not reset by RunGeneric(), adjust the step here
const auto adjusted_step_idx = DstResetCoordinateAfterRun
? dst_slice_origin_step_idx
: dst_slice_origin_step_idx + GetCoordinateResetStep();
// is it OK to construct a new step every time?
const auto adjusted_step = make_tensor_coordinate_step(dst_desc, adjusted_step_idx);
move_tensor_coordinate(dst_desc, dst_coord_, adjusted_step);
}
private:
SrcCoord src_coord_;
DstCoord dst_coord_;
const ElementwiseOperation element_op_;
};
} // namespace cpu
} // namespace ck
#endif
...@@ -34,7 +34,7 @@
#include "is_known_at_compile_time.hpp"
#include "transpose_vectors.hpp"
#include "inner_product.hpp"
// #include "element_wise_operation.hpp"
#include "debug.hpp"
// TODO: remove this
...
#pragma once
#include <immintrin.h>
namespace ck {
namespace cpu {
// vector_type
template <typename T, index_t N>
struct vector_type;
// Caution: DO NOT REMOVE
// intentionally have only declaration but no definition to cause compilation failure when trying to
// instantiate this template. The purpose is to catch user's mistake when trying to make "vector of
// vectors"
template <typename T, index_t V, index_t N>
struct vector_type<T __attribute__((ext_vector_type(V))), N>;
// Caution: DO NOT REMOVE
// intentionally have only declaration but no definition to cause compilation failure when trying to
// instantiate this template. The purpose is to catch user's mistake when trying to make "vector of
// vectors"
template <typename T, index_t V, index_t N>
struct vector_type<vector_type<T, V>, N>;
// vector_type_maker
// This is the right way to handle "vector of vectors": making a bigger vector instead
template <typename T, index_t N>
struct vector_type_maker
{
using type = vector_type<T, N>;
};
template <typename T, index_t N>
using vector_type_maker_t = typename vector_type_maker<T, N>::type;
template <typename T, index_t N>
constexpr auto make_vector_type(Number<N>)
{
return typename vector_type_maker<T, N>::type{};
}
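// e.g. vector_type_maker_t<float, 8> is vector_type<float, 8>, whose ::type is __m256 (see the
// specialization below); a nested request such as vector_type_maker<vector_type<float, 4>, 2>
// is rejected at instantiation time by the declared-but-undefined specializations above.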
template <>
struct vector_type<float, 1>
{
using d1_t = float;
// scalar (no SIMD register)
using type = float;
type data_;
vector_type() : data_{0} {}
// vector_type(float x) : data_{x} {}
vector_type(type v) : data_{v} {}
vector_type(const float* mem) : data_{*mem} {}
template <typename X>
constexpr const auto& AsType() const
{
static_assert(std::is_same<X, type>::value, "wrong!");
return data_;
}
template <typename X>
constexpr auto& AsType()
{
static_assert(std::is_same<X, type>::value, "wrong!");
return data_;
}
constexpr void Load(const float* mem) { data_ = *mem; }
constexpr void Store(float* mem) const { *mem = data_; }
};
template <>
struct vector_type<float, 4>
{
using d1_t = float;
// SSE
using type = __m128;
type data_;
vector_type() : data_{_mm_setzero_ps()} {}
vector_type(float x) : data_{_mm_set1_ps(x)} {}
vector_type(type v) : data_{v} {}
vector_type(const float* mem) : data_{_mm_loadu_ps(mem)} {}
template <typename X>
constexpr const auto& AsType() const
{
static_assert(std::is_same<X, type>::value, "wrong!");
return data_;
}
template <typename X>
constexpr auto& AsType()
{
static_assert(std::is_same<X, type>::value, "wrong!");
return data_;
}
constexpr void Load(const float* mem) { data_ = _mm_loadu_ps(mem); }
constexpr void Store(float* mem) const { _mm_storeu_ps(mem, data_); }
};
template <>
struct vector_type<float, 8>
{
using d1_t = float;
// AVX
using type = __m256;
type data_;
vector_type() : data_{_mm256_setzero_ps()} {}
vector_type(float x) : data_{_mm256_set1_ps(x)} {}
vector_type(type v) : data_{v} {}
vector_type(const float* mem) : data_{_mm256_loadu_ps(mem)} {}
template <typename X>
constexpr const auto& AsType() const
{
static_assert(std::is_same<X, type>::value, "wrong!");
return data_;
}
template <typename X>
constexpr auto& AsType()
{
static_assert(std::is_same<X, type>::value, "wrong!");
return data_;
}
constexpr void Load(const float* mem) { data_ = _mm256_loadu_ps(mem); }
constexpr void Store(float* mem) const { _mm256_storeu_ps(mem, data_); }
};
template <typename T>
struct to_vector_type
{
using type = T;
};
template <>
struct to_vector_type<__m128>
{
using type = vector_type<float, 4>;
};
template <>
struct to_vector_type<__m256>
{
using type = vector_type<float, 8>;
};
template <typename Tv, typename Tp>
inline void load_vector(Tv& v, const Tp* mem)
{
v = *reinterpret_cast<const Tv*>(mem);
}
template <>
inline void load_vector(__m128& v, const float* mem)
{
v = _mm_loadu_ps(mem);
}
template <>
inline void load_vector(__m256& v, const float* mem)
{
v = _mm256_loadu_ps(mem);
}
template <typename Tv, typename Tp>
inline void store_vector(const Tv& v, Tp* mem)
{
*reinterpret_cast<Tv*>(mem) = v;
}
template <>
inline void store_vector(const __m128& v, float* mem)
{
_mm_storeu_ps(mem, v);
}
template <>
inline void store_vector(const __m256& v, float* mem)
{
_mm256_storeu_ps(mem, v);
}
template <typename Tv, typename Tx>
inline void set_vector(Tv& v, const Tx x)
{
    v = static_cast<Tv>(x);
}
template <>
inline void set_vector(__m128& v, const float x)
{
v = _mm_set1_ps(x);
}
template <>
inline void set_vector(__m256& v, const float x)
{
v = _mm256_set1_ps(x);
}
template <typename Tv>
inline void clear_vector(Tv& v)
{
v = static_cast<Tv>(0);
}
template <>
inline void clear_vector(__m128& v)
{
v = _mm_setzero_ps();
}
template <>
inline void clear_vector(__m256& v)
{
v = _mm256_setzero_ps();
}
using float4_t = typename vector_type<float, 4>::type;
using float8_t = typename vector_type<float, 8>::type;
// scalar_type
template <typename TV>
struct scalar_type;
// is_scalar_type
template <typename TV>
struct is_scalar_type
{
static constexpr bool value = (scalar_type<remove_cvref_t<TV>>::vector_size == 1);
};
// has_same_scalar_type
template <typename X, typename Y>
using has_same_scalar_type = is_same<typename scalar_type<remove_cvref_t<X>>::type,
typename scalar_type<remove_cvref_t<Y>>::type>;
template <typename T, index_t N>
struct scalar_type<vector_type<T, N>>
{
using type = T;
static constexpr index_t vector_size = N;
};
template <>
struct scalar_type<float4_t>
{
using type = float;
static constexpr index_t vector_size = 4;
};
template <>
struct scalar_type<float8_t>
{
using type = float;
static constexpr index_t vector_size = 8;
};
//
template <>
struct scalar_type<float>
{
using type = float;
static constexpr index_t vector_size = 1;
};
} // namespace cpu
} // namespace ck
#ifndef CK_BUFFER_CPU_HPP
#define CK_BUFFER_CPU_HPP
#include "config.hpp"
#include "enable_if.hpp"
#include "data_type_cpu.hpp"
namespace ck {
namespace cpu {
template <AddressSpaceEnum_t BufferAddressSpace,
typename T,
typename ElementSpaceSize,
bool InvalidElementUseNumericalZeroValue>
struct DynamicBuffer
{
using type = T;
static_assert(BufferAddressSpace ==
AddressSpaceEnum_t::Global); // only valid for global address space on cpu
T* p_data_;
ElementSpaceSize element_space_size_;
T invalid_element_value_ = T{0};
constexpr DynamicBuffer(T* p_data, ElementSpaceSize element_space_size)
: p_data_{p_data}, element_space_size_{element_space_size}
{
}
constexpr DynamicBuffer(T* p_data, ElementSpaceSize element_space_size, T invalid_element_value)
: p_data_{p_data},
element_space_size_{element_space_size},
invalid_element_value_{invalid_element_value}
{
}
static constexpr AddressSpaceEnum_t GetAddressSpace() { return BufferAddressSpace; }
constexpr const T& operator[](index_t i) const { return p_data_[i]; }
constexpr T& operator()(index_t i) { return p_data_[i]; }
// X should be data_type::type, not directly data_type
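    // e.g. for T = float, pass X = float8_t (__m256) or float4_t (__m128), not
    // vector_type<float, 8>: load_vector/store_vector below operate on the raw register type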
template <typename X,
typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
typename scalar_type<remove_cvref_t<T>>::type>::value,
bool>::type = false>
constexpr auto Get(index_t i, bool is_valid_element) const
{
if constexpr(InvalidElementUseNumericalZeroValue)
{
X v;
if(is_valid_element)
load_vector(v, &p_data_[i]);
else
clear_vector(v);
return v;
}
else
{
X v;
if(is_valid_element)
load_vector(v, &p_data_[i]);
else
set_vector(v, invalid_element_value_);
return v;
}
}
template <InMemoryDataOperationEnum_t Op,
typename X,
typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
typename scalar_type<remove_cvref_t<T>>::type>::value,
bool>::type = false>
void Update(index_t i, bool is_valid_element, const X& x)
{
if constexpr(Op == InMemoryDataOperationEnum_t::Set)
{
this->template Set<X>(i, is_valid_element, x);
}
else if constexpr(Op == InMemoryDataOperationEnum_t::Add)
{
auto tmp = this->template Get<X>(i, is_valid_element);
this->template Set<X>(i, is_valid_element, x + tmp);
}
}
template <typename X,
typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
typename scalar_type<remove_cvref_t<T>>::type>::value,
bool>::type = false>
void Set(index_t i, bool is_valid_element, const X& x)
{
// X contains multiple T
constexpr index_t scalar_per_t_vector = scalar_type<remove_cvref_t<T>>::vector_size;
constexpr index_t scalar_per_x_vector = scalar_type<remove_cvref_t<X>>::vector_size;
static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
"wrong! X need to be multiple T");
if(is_valid_element)
{
store_vector(x, &p_data_[i]);
}
}
static constexpr bool IsStaticBuffer() { return false; }
static constexpr bool IsDynamicBuffer() { return true; }
};
template <AddressSpaceEnum_t BufferAddressSpace, typename T, typename ElementSpaceSize>
constexpr auto make_dynamic_buffer(T* p, ElementSpaceSize element_space_size)
{
return DynamicBuffer<BufferAddressSpace, T, ElementSpaceSize, true>{p, element_space_size};
}
template <
AddressSpaceEnum_t BufferAddressSpace,
typename T,
typename ElementSpaceSize,
typename X,
typename enable_if<is_same<remove_cvref_t<T>, remove_cvref_t<X>>::value, bool>::type = false>
constexpr auto
make_dynamic_buffer(T* p, ElementSpaceSize element_space_size, X invalid_element_value)
{
return DynamicBuffer<BufferAddressSpace, T, ElementSpaceSize, false>{
p, element_space_size, invalid_element_value};
}
} // namespace cpu
} // namespace ck
#endif
add_subdirectory(src/host_tensor)
add_subdirectory(src/tensor_operation_instance/gpu)
add_subdirectory(src/tensor_operation_instance/cpu)
\ No newline at end of file
...@@ -29,6 +29,8 @@ struct DeviceAlignedMemCPU
    DeviceAlignedMemCPU(std::size_t mem_size, std::size_t alignment);
    void* GetDeviceBuffer();
    std::size_t GetBufferSize();
void ToDevice(const void* p);
void FromDevice(void* p);
    void SetZero();
    ~DeviceAlignedMemCPU();
...@@ -108,4 +110,27 @@ float launch_and_time_kernel(
    return timer.GetElapsedTime() / nrepeat;
}
template <typename... Args, typename F>
void launch_cpu_kernel(F kernel, Args... args)
{
kernel(args...);
}
template <typename... Args, typename F>
float launch_and_time_cpu_kernel(F kernel, int nrepeat, Args... args)
{
WallTimer timer;
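    // untimed warm-up call, so first-touch/cache effects do not skew the averaged time below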
kernel(args...);
timer.Start();
for(int i = 0; i < nrepeat; i++)
{
kernel(args...);
}
timer.End();
return timer.GetElapsedTime() / nrepeat;
}
#endif
...@@ -45,6 +45,10 @@ void* DeviceAlignedMemCPU::GetDeviceBuffer() { return mpDeviceBuf; }
std::size_t DeviceAlignedMemCPU::GetBufferSize() { return mMemSize; }
void DeviceAlignedMemCPU::ToDevice(const void* p) { memcpy(mpDeviceBuf, p, mMemSize); }
void DeviceAlignedMemCPU::FromDevice(void* p) { memcpy(p, mpDeviceBuf, mMemSize); }
void DeviceAlignedMemCPU::SetZero() { memset(mpDeviceBuf, 0, mMemSize); }
DeviceAlignedMemCPU::~DeviceAlignedMemCPU() { free((reinterpret_cast<void**>(mpDeviceBuf))[-1]); }
...
include_directories(BEFORE
${PROJECT_SOURCE_DIR}/include/ck
${PROJECT_SOURCE_DIR}/include/ck/utility
${PROJECT_SOURCE_DIR}/include/ck/tensor_description
${PROJECT_SOURCE_DIR}/include/ck/tensor
${PROJECT_SOURCE_DIR}/include/ck/problem_transform
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/cpu/device
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/cpu/grid
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/cpu/block
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/cpu/thread
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/cpu/element
${PROJECT_SOURCE_DIR}/library/include/ck/library/host_tensor
${PROJECT_SOURCE_DIR}/library/include/ck/library/tensor_operation_instance
${PROJECT_SOURCE_DIR}/external/include/half
)
function(add_instance_library INSTANCE_NAME)
message("adding instance ${INSTANCE_NAME}")
add_library(${INSTANCE_NAME} SHARED ${ARGN})
target_compile_features(${INSTANCE_NAME} PUBLIC)
set_target_properties(${INSTANCE_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
endfunction(add_instance_library INSTANCE_NAME)
add_subdirectory(conv2d_fwd)
# device_conv2d_fwd_cpu_instance
set(DEVICE_CONV2D_FWD_CPU_INSTANCE_SOURCE
device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_instance.cpp
)
add_library(device_conv2d_fwd_cpu_instance SHARED ${DEVICE_CONV2D_FWD_CPU_INSTANCE_SOURCE})
target_compile_features(device_conv2d_fwd_cpu_instance PUBLIC)
set_target_properties(device_conv2d_fwd_cpu_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
install(TARGETS device_conv2d_fwd_cpu_instance LIBRARY DESTINATION lib)
clang_tidy_check(device_conv2d_fwd_cpu_instance)
#include <stdlib.h>
#include "convolution_forward_specialization_cpu.hpp"
#include "config.hpp"
#include "device_convnd_fwd_avx2_nhwc_kyxc_nhwk.hpp"
#include "element_wise_operation_cpu.hpp"
#include "device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace cpu {
namespace device {
namespace device_conv2d_fwd_avx2_instance {
using InType = float;
using WeiType = float;
using OutType = float;
using AccType = float;
using InLayout = ck::tensor_layout::gemm::RowMajor; // NHWC
using WeiLayout = ck::tensor_layout::gemm::ColumnMajor; // KYXC
static constexpr bool NonTemporalStore = false;
using PassThrough = ck::tensor_operation::cpu::element_wise::PassThrough;
using ThreadwiseGemmAvx2_MxN_4x24_Dispatch =
ck::cpu::ThreadwiseGemmAvx2_MxN_4x24_Dispatch<InType,
WeiType,
OutType,
InLayout,
WeiLayout,
NonTemporalStore>;
static constexpr auto ConvFwdDefault =
ck::tensor_operation::cpu::device::ConvolutionForwardSpecialization_t::Default;
static constexpr auto ConvFwd1x1P0 =
ck::tensor_operation::cpu::device::ConvolutionForwardSpecialization_t::Filter1x1Pad0;
static constexpr auto ConvFwd1x1S1P0 =
ck::tensor_operation::cpu::device::ConvolutionForwardSpecialization_t::Filter1x1Stride1Pad0;
using device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_f32_instances = std::tuple<
//#################################################################|InDataType|WeiDataType|OutDataType|AccDataType|InElementwiseOp|WeiElementwiseOp|OutElementwiseOp|ConvForwardSp|NumDimSpatial|MPerBlock|NPerBlock|KPerBlock|ThreadwiseGemm_Dispatch
DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<
float,
float,
float,
float,
PassThrough,
PassThrough,
PassThrough,
ConvFwdDefault,
2,
256,
128,
64,
ThreadwiseGemmAvx2_MxN_4x24_Dispatch>,
DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<
float,
float,
float,
float,
PassThrough,
PassThrough,
PassThrough,
ConvFwdDefault,
2,
512,
256,
128,
ThreadwiseGemmAvx2_MxN_4x24_Dispatch>,
DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<
float,
float,
float,
float,
PassThrough,
PassThrough,
PassThrough,
ConvFwdDefault,
2,
1024,
144,
128,
ThreadwiseGemmAvx2_MxN_4x24_Dispatch>>;
void add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk(
std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>& instances)
{
ck::tensor_operation::device::add_device_operation_instances(
instances, device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_f32_instances{});
}
} // namespace device_conv2d_fwd_avx2_instance
} // namespace device
} // namespace cpu
} // namespace tensor_operation
} // namespace ck
...@@ -32,6 +32,7 @@ set(PROFILER_SOURCE
    src/profile_conv_fwd_bias_relu.cpp
    src/profile_conv_fwd_bias_relu_add.cpp
    src/profile_conv_fwd_bias_relu_atomic_add.cpp
src/profile_conv_fwd_cpu.cpp
    src/profile_convnd_bwd_data.cpp
    src/profile_reduce.cpp
    src/profile_grouped_gemm.cpp
...@@ -51,6 +52,7 @@ target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_instance)
target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_instance)
target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_add_instance)
target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_atomic_add_instance)
target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_cpu_instance)
target_link_libraries(ckProfiler PRIVATE device_convnd_bwd_data_instance)
target_link_libraries(ckProfiler PRIVATE device_reduce_instance)
target_link_libraries(ckProfiler PRIVATE device_reduce_instance)
...
#pragma once
#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "tensor_layout.hpp"
#include "device_tensor.hpp"
#include "device_convnd_fwd_avx2_nhwc_kyxc_nhwk.hpp"
#include "element_wise_operation_cpu.hpp"
#include "reference_conv_fwd.hpp"
namespace ck {
namespace tensor_operation {
namespace cpu {
namespace device {
namespace device_conv2d_fwd_avx2_instance {
void add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk(
std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>& instances);
} // namespace device_conv2d_fwd_avx2_instance
} // namespace device
} // namespace cpu
} // namespace tensor_operation
} // namespace ck
namespace ck {
namespace profiler {
#define AVX2_DATA_ALIGNMENT 32 // assumed: 32-byte alignment to match AVX2 __m256 loads/stores
template <int NDimSpatial,
typename InDataType,
typename WeiDataType,
typename OutDataType,
typename InLayout,
typename WeiLayout,
typename OutLayout>
void profile_conv_cpu_fwd_impl(int do_verification,
int init_method,
bool do_log,
int nrepeat,
ck::index_t N,
ck::index_t K,
ck::index_t C,
std::vector<ck::index_t> input_spatial_lengths,
std::vector<ck::index_t> filter_spatial_lengths,
std::vector<ck::index_t> output_spatial_lengths,
std::vector<ck::index_t> conv_filter_strides,
std::vector<ck::index_t> conv_filter_dilations,
std::vector<ck::index_t> input_left_pads,
std::vector<ck::index_t> input_right_pads)
{
const ck::index_t Y = filter_spatial_lengths[0];
const ck::index_t X = filter_spatial_lengths[1];
const ck::index_t Hi = input_spatial_lengths[0];
const ck::index_t Wi = input_spatial_lengths[1];
const ck::index_t Ho = output_spatial_lengths[0];
const ck::index_t Wo = output_spatial_lengths[1];
auto f_host_tensor_descriptor =
[](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W, auto layout) {
if constexpr(is_same<decltype(layout), ck::tensor_layout::convolution::NCHW>::value ||
is_same<decltype(layout), ck::tensor_layout::convolution::KCYX>::value ||
is_same<decltype(layout), ck::tensor_layout::convolution::NKHW>::value)
{
return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
std::vector<std::size_t>({C_ * H * W, H * W, W, 1}));
}
else if constexpr(is_same<decltype(layout), tensor_layout::convolution::NHWC>::value ||
is_same<decltype(layout), tensor_layout::convolution::KYXC>::value ||
is_same<decltype(layout), tensor_layout::convolution::NHWK>::value)
{
return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
std::vector<std::size_t>({C_ * H * W, 1, W * C_, C_}));
}
};
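    // Note: the tensor is always described with dims ordered (N, C, H, W); only the strides
    // change per layout, e.g. NHWC gets strides {C*H*W, 1, W*C, C}.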
Tensor<InDataType> in_n_c_hi_wi(f_host_tensor_descriptor(N, C, Hi, Wi, InLayout{}));
Tensor<WeiDataType> wei_k_c_y_x(f_host_tensor_descriptor(K, C, Y, X, WeiLayout{}));
Tensor<OutDataType> out_n_k_ho_wo_host_result(
f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));
Tensor<OutDataType> out_n_k_ho_wo_device_result(
f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));
std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl;
std::cout << "wei_k_c_y_x: " << wei_k_c_y_x.mDesc << std::endl;
std::cout << "out_n_k_ho_wo: " << out_n_k_ho_wo_host_result.mDesc << std::endl;
switch(init_method)
{
case 0: break;
case 1:
in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
break;
default:
in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
}
using InElementOp = ck::tensor_operation::cpu::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::cpu::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::cpu::element_wise::PassThrough;
const auto in_element_op = InElementOp{};
const auto wei_element_op = WeiElementOp{};
const auto out_element_op = OutElementOp{};
if(do_verification)
{
using ReferenceConvFwdInstance = ck::tensor_operation::host::ReferenceConvFwd<InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp>;
auto ref_conv = ReferenceConvFwdInstance{};
auto ref_invoker = ref_conv.MakeInvoker();
auto ref_argument = ref_conv.MakeArgument(in_n_c_hi_wi,
wei_k_c_y_x,
out_n_k_ho_wo_host_result,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
in_element_op,
wei_element_op,
out_element_op);
ref_invoker.Run(ref_argument);
}
DeviceAlignedMemCPU in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpace(),
AVX2_DATA_ALIGNMENT);
DeviceAlignedMemCPU wei_device_buf(sizeof(WeiDataType) * wei_k_c_y_x.mDesc.GetElementSpace(),
AVX2_DATA_ALIGNMENT);
DeviceAlignedMemCPU out_device_buf(sizeof(OutDataType) *
out_n_k_ho_wo_device_result.mDesc.GetElementSpace(),
AVX2_DATA_ALIGNMENT);
in_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
wei_device_buf.ToDevice(wei_k_c_y_x.mData.data());
using PassThrough = ck::tensor_operation::cpu::element_wise::PassThrough;
using DeviceConvFwdNoOpPtr =
ck::tensor_operation::device::DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>;
// add device Conv instances
std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
    ck::tensor_operation::cpu::device::device_conv2d_fwd_avx2_instance::
        add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk(conv_ptrs);
if(conv_ptrs.size() <= 0)
{
throw std::runtime_error("wrong! no device Conv instance found");
}
std::string best_conv_name;
float best_ave_time = 0;
float best_gflops = 0;
float best_gb_per_sec = 0;
// profile device Conv instances
for(auto& conv_ptr : conv_ptrs)
{
auto argument_ptr = conv_ptr->MakeArgumentPointer(
static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
N,
K,
C,
input_spatial_lengths,
filter_spatial_lengths,
output_spatial_lengths,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
in_element_op,
wei_element_op,
out_element_op);
auto invoker_ptr = conv_ptr->MakeInvokerPointer();
if(conv_ptr->IsSupportedArgument(argument_ptr.get()))
{
std::string conv_name = conv_ptr->GetTypeString();
float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
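            // ave_time is in ms, so flop / (1e6 * ave_time) = flop / (1e9 * seconds) = GFLOPS,
            // and likewise bytes / (1e6 * ave_time) gives GB/s below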
std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
std::size_t num_btype = sizeof(InDataType) * (N * C * Hi * Wi) +
sizeof(WeiDataType) * (K * C * Y * X) +
sizeof(OutDataType) * (N * K * Ho * Wo);
float gflops = static_cast<float>(flop) / 1.E6 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << gflops << " GFlops, " << gb_per_sec
<< " GB/s, " << conv_name << std::endl;
if(gflops > best_gflops)
{
best_conv_name = conv_name;
best_gflops = gflops;
best_ave_time = ave_time;
best_gb_per_sec = gb_per_sec;
}
if(do_verification)
{
memcpy(out_n_k_ho_wo_device_result.mData.data(),
out_device_buf.mpDeviceBuf,
out_device_buf.mMemSize);
check_error(out_n_k_ho_wo_host_result, out_n_k_ho_wo_device_result);
if(do_log)
{
LogRangeAsType<float>(std::cout << "in : ", in_n_c_hi_wi.mData, ",")
<< std::endl;
LogRangeAsType<float>(std::cout << "wei: ", wei_k_c_y_x.mData, ",")
<< std::endl;
LogRangeAsType<float>(
std::cout << "out_host : ", out_n_k_ho_wo_host_result.mData, ",")
<< std::endl;
LogRangeAsType<float>(
std::cout << "out_device: ", out_n_k_ho_wo_device_result.mData, ",")
<< std::endl;
}
}
}
}
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gflops << " GFlops, "
<< best_gb_per_sec << " GB/s, " << best_conv_name << std::endl;
}
} // namespace profiler
} // namespace ck