Commit c0f698d5 authored by carlushuang

add test for threadwise transfer. Currently static_ford in threadwise transfer cannot support large MC*KC tile sizes.
parent e6ee6594
#ifndef CK_BLOCKWISE_GEMM_AVX2_HPP
#define CK_BLOCKWISE_GEMM_AVX2_HPP
#include "common_header.hpp"
#include "multi_index_transform_helper.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "threadwise_gemm_avx2.hpp"
namespace ck {
namespace cpu {
template <typename FloatA,
typename FloatB,
typename FloatC,
typename AccDataType,
typename ABlockDesc,
typename BBlockDesc,
typename CBlockDesc,
typename ABlockSliceLengths,
typename BBlockSliceLengths,
typename CBlockSliceLengths,
typename AThreadSliceLength,
typename BThreadSliceLength,
ck::index_t AThreadLoopOverDim, // thread slice loops over the block slice; 1D is enough for now
ck::index_t BThreadLoopOverDim,
ck::index_t KPerBlock,
typename ThreadwiseGemm_Dispatch,
typename ThreadMNAccessOrder // how we access gemm MN to utilize the micro kernel
>
struct BlockwiseGemmAvx2_MxN
{
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
static constexpr auto I2 = Number<2>{};
static constexpr auto I3 = Number<3>{};
static constexpr auto I4 = Number<4>{};
static constexpr auto I5 = Number<5>{};
static constexpr auto I6 = Number<6>{};
static constexpr auto I7 = Number<7>{};
static constexpr index_t nDimA = ABlockDesc::GetNumOfDimension();
static constexpr index_t nDimB = BBlockDesc::GetNumOfDimension();
static constexpr index_t nDimC = CBlockDesc::GetNumOfDimension();
using IndexA = MultiIndex<nDimA>;
using IndexB = MultiIndex<nDimB>;
using IndexC = MultiIndex<nDimC>;
using ACoord = decltype(make_tensor_coordinate(ABlockDesc{}, IndexA{}));
using BCoord = decltype(make_tensor_coordinate(BBlockDesc{}, IndexB{}));
using CCoord = decltype(make_tensor_coordinate(CBlockDesc{}, IndexC{}));
#if 0
constexpr BlockwiseGemmAvx2_MxN(const ABlockDesc & a_block_desc, const IndexA& a_thread_origin,
const BBlockDesc & b_block_desc, const IndexB& b_thread_origin)
: a_thread_coord_(make_tensor_coordinate(a_block_desc, a_thread_origin)),
b_thread_coord_(make_tensor_coordinate(b_block_desc, b_thread_origin)),
{
}
#endif
template <typename TensorDesc>
static constexpr auto GetLeadingElement(const TensorDesc& desc)
{
// if using this function, make sure desc is known at compile time;
// otherwise it is not efficient to calculate the leading dimension here
if constexpr(TensorDesc::GetNumOfDimension() == 1)
{
return 1;
}
else
{
constexpr auto last_dims =
typename uniform_sequence_gen<TensorDesc::GetNumOfDimension() - 1, 0>::type{};
constexpr auto lead_dims = decltype(last_dims)::PushFront(Number<1>{});
return desc.CalculateOffset(lead_dims);
}
}
template <typename ABlockBuffer, typename BBlockBuffer, typename CBlockBuffer>
void Run(const ABlockDesc& a_block_desc,
const ABlockBuffer& a_block_buf,
const IndexA& a_origin,
const BBlockDesc& b_block_desc,
const BBlockBuffer& b_block_buf,
const IndexB& b_origin,
const CBlockDesc& c_block_desc,
CBlockBuffer& c_block_buf,
const IndexC& c_origin) const
{
constexpr auto m_n_block_length =
ck::Sequence<ABlockSliceLengths::At(AThreadLoopOverDim),
BBlockSliceLengths::At(BThreadLoopOverDim)>{};
constexpr auto m_n_thread_length =
ck::Sequence<AThreadSliceLength::At(AThreadLoopOverDim),
BThreadSliceLength::At(BThreadLoopOverDim)>{};
constexpr auto m_n_access_length = m_n_block_length / m_n_thread_length;
constexpr auto ordered_m_n_access_length =
container_reorder_given_new2old(m_n_access_length, ThreadMNAccessOrder{});
constexpr auto a_block_idx_zeros =
typename uniform_sequence_gen<nDimA, 0>::type{}; // starting point of the block
constexpr auto b_block_idx_zeros = typename uniform_sequence_gen<nDimB, 0>::type{};
constexpr auto lda = GetLeadingElement(a_block_desc) * sizeof(FloatA);
constexpr auto ldb = GetLeadingElement(b_block_desc) * sizeof(FloatB);
constexpr auto ldc = GetLeadingElement(c_block_desc) * sizeof(FloatC);
ck::cpu::ThreadwiseGemmParam param;
param.Kr = KPerBlock;
param.lda = lda;
param.ldb = ldb;
param.ldc = ldc;
param.alpha = 1.0f; // TODO
static_ford<decltype(ordered_m_n_access_length)>{}([&](auto ordered_idx) {
constexpr auto origin_m_n_idx = ordered_idx.ReorderGivenOld2New(ThreadMNAccessOrder{});
constexpr auto current_m_idx =
origin_m_n_idx.At(0) * AThreadSliceLength::At(AThreadLoopOverDim);
constexpr auto current_n_idx =
origin_m_n_idx.At(1) * BThreadSliceLength::At(BThreadLoopOverDim);
constexpr auto current_mr =
ck::math::min(m_n_block_length.At(0) - current_m_idx, m_n_thread_length.At(0));
constexpr auto current_nr =
ck::math::min(m_n_block_length.At(1) - current_n_idx, m_n_thread_length.At(1));
constexpr auto a_block_idx =
a_block_idx_zeros.Modify(AThreadLoopOverDim, current_m_idx);
const auto a_block_coord =
make_tensor_coordinate(a_block_desc, to_multi_index(a_origin + a_block_idx));
constexpr auto b_block_idx =
b_block_idx_zeros.Modify(BThreadLoopOverDim, current_n_idx);
const auto b_block_coord =
make_tensor_coordinate(b_block_desc, to_multi_index(b_origin + b_block_idx));
const auto c_block_coord =
make_tensor_coordinate(c_block_desc, to_multi_index(c_origin + origin_m_n_idx));
param.p_a = &a_block_buf.p_data_[a_block_coord.GetOffset()];
param.p_b = &b_block_buf.p_data_[b_block_coord.GetOffset()];
param.p_c = &c_block_buf.p_data_[c_block_coord.GetOffset()];
ThreadwiseGemm_Dispatch::Run(&param, current_mr, current_nr);
});
}
};
} // namespace cpu
} // namespace ck
#endif
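For reference, Run() above walks the M x N block tile in ThreadMNAccessOrder and hands each micro-tile to ThreadwiseGemm_Dispatch, clamping the tail tiles with min(). A standalone sketch of that tiling logic in plain C++ (sizes here are hypothetical; this is not the CK dispatch itself):

// Illustration only: split an M x N block tile into mr x nr micro-tiles, clamping
// the tail tiles with min(), as the static_ford loop in Run() does.
#include <algorithm>
#include <cstdio>
int main()
{
    const int M = 256, N = 128; // hypothetical block slice lengths
    const int mr = 4, nr = 24;  // hypothetical micro-kernel tile (cf. the 4x24 dispatch later in this commit)
    for(int m = 0; m < M; m += mr)
    {
        for(int n = 0; n < N; n += nr)
        {
            const int cur_mr = std::min(M - m, mr);
            const int cur_nr = std::min(N - n, nr);
            // ThreadwiseGemm_Dispatch::Run(&param, cur_mr, cur_nr) would be invoked here
            std::printf("micro-tile at (%d,%d), size %dx%d\n", m, n, cur_mr, cur_nr);
        }
    }
    return 0;
}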
#ifndef CONVOLUTION_FORWARD_SPECIALIZATION_CPU
#define CONVOLUTION_FORWARD_SPECIALIZATION_CPU
namespace ck {
namespace tensor_operation {
namespace cpu {
namespace device {
enum ConvolutionForwardSpecialization_t
{
Default,
Filter1x1Pad0,
Filter1x1Stride1Pad0,
OddC,
};
} // namespace device
} // namespace cpu
} // namespace tensor_operation
} // namespace ck
#endif
#ifndef DEVICE_BASE_CPU_HPP
#define DEVICE_BASE_CPU_HPP
#include <string>
namespace ck {
namespace tensor_operation {
namespace cpu {
namespace device {
struct BaseArgument
{
BaseArgument() = default;
BaseArgument(const BaseArgument&) = default;
BaseArgument& operator=(const BaseArgument&) = default;
virtual ~BaseArgument() {}
};
struct BaseInvoker
{
BaseInvoker() = default;
BaseInvoker(const BaseInvoker&) = default;
BaseInvoker& operator=(const BaseInvoker&) = default;
virtual float Run(const BaseArgument*, int = 1) = 0;
virtual ~BaseInvoker() {}
};
struct BaseOperator
{
BaseOperator() = default;
BaseOperator(const BaseOperator&) = default;
BaseOperator& operator=(const BaseOperator&) = default;
virtual bool IsSupportedArgument(const BaseArgument*) = 0;
virtual std::string GetTypeString() const = 0;
virtual ~BaseOperator() {}
};
} // namespace device
} // namespace cpu
} // namespace tensor_operation
} // namespace ck
#endif
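A minimal sketch of how a concrete CPU operator plugs into these interfaces; the names below (MyArgument, MyInvoker) are hypothetical and only illustrate the ownership and call pattern, assuming device_base_cpu.hpp is included:

// Hypothetical skeleton: a trivial copy "operator" expressed through the base classes.
struct MyArgument final : public ck::tensor_operation::cpu::device::BaseArgument
{
    const float* p_in  = nullptr;
    float* p_out       = nullptr;
    ck::index_t length = 0;
};
struct MyInvoker final : public ck::tensor_operation::cpu::device::BaseInvoker
{
    float Run(const ck::tensor_operation::cpu::device::BaseArgument* p_arg, int = 1) override
    {
        const auto* arg = dynamic_cast<const MyArgument*>(p_arg);
        for(ck::index_t i = 0; i < arg->length; ++i)
            arg->p_out[i] = arg->p_in[i];
        return 0.0f; // a real invoker would return the measured time in ms
    }
};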
#ifndef DEVICE_CONV_FWD_CPU_HPP
#define DEVICE_CONV_FWD_CPU_HPP
#include <iostream>
#include "device_base_cpu.hpp"
namespace ck {
namespace tensor_operation {
namespace cpu {
namespace device {
template <typename InElementwiseOperation,
typename WeiElementwiseOperation,
typename OutElementwiseOperation>
struct DeviceConvFwd : public BaseOperator
{
virtual std::unique_ptr<BaseArgument>
MakeArgumentPointer(const void* p_in,
const void* p_wei,
void* p_out,
ck::index_t N,
ck::index_t K,
ck::index_t C,
std::vector<ck::index_t> input_spatial_lengths,
std::vector<ck::index_t> filter_spatial_lengths,
std::vector<ck::index_t> output_spatial_lengths,
std::vector<ck::index_t> conv_filter_strides,
std::vector<ck::index_t> conv_filter_dilations,
std::vector<ck::index_t> input_left_pads,
std::vector<ck::index_t> input_right_pads,
InElementwiseOperation in_element_op,
WeiElementwiseOperation wei_element_op,
OutElementwiseOperation out_element_op) = 0;
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};
template <typename InElementwiseOperation,
typename WeiElementwiseOperation,
typename OutElementwiseOperation>
using DeviceConvFwdPtr = std::unique_ptr<
DeviceConvFwd<InElementwiseOperation, WeiElementwiseOperation, OutElementwiseOperation>>;
} // namespace device
} // namespace cpu
} // namespace tensor_operation
} // namespace ck
#endif
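A sketch of the call sequence this interface expects (the CPU profiler added later in this commit follows the same pattern); conv_ptr is assumed to come from an instance factory such as the AVX2 one below, and the problem sizes are hypothetical:

// Hedged sketch: drive one DeviceConvFwd instance end to end.
template <typename ConvPtr, typename ElementOp>
float run_one_instance(ConvPtr& conv_ptr, const void* p_in, const void* p_wei, void* p_out,
                       const ElementOp& op)
{
    auto argument_ptr = conv_ptr->MakeArgumentPointer(p_in, p_wei, p_out,
                                                      /*N=*/1, /*K=*/64, /*C=*/32,
                                                      {28, 28}, {3, 3}, {28, 28},     // in/filter/out spatial
                                                      {1, 1}, {1, 1}, {1, 1}, {1, 1}, // strides/dilations/pads
                                                      op, op, op);
    if(!conv_ptr->IsSupportedArgument(argument_ptr.get()))
        return -1.0f; // this instance cannot handle the problem
    auto invoker_ptr = conv_ptr->MakeInvokerPointer();
    return invoker_ptr->Run(argument_ptr.get(), /*nrepeat=*/10); // average time in ms
}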
#pragma once
#include "data_type_cpu.hpp"
namespace ck {
namespace tensor_operation {
namespace cpu {
namespace element_wise {
using float8_t = ck::cpu::float8_t;
using float4_t = ck::cpu::float4_t;
struct PassThrough
{
void operator()(float& y, const float& x) const { y = x; }
void operator()(float4_t& y, const float4_t& x) const { y = x; }
void operator()(float8_t& y, const float8_t& x) const { y = x; }
};
struct Add
{
void operator()(float& y, const float& x0, const float& x1) const { y = x0 + x1; }
void operator()(float4_t& y, const float4_t& x0, const float4_t& x1) const
{
y = _mm_add_ps(x0, x1);
}
void operator()(float8_t& y, const float8_t& x0, const float8_t& x1) const
{
y = _mm256_add_ps(x0, x1);
}
};
struct AlphaBetaAdd
{
AlphaBetaAdd(float alpha, float beta) : alpha_(alpha), beta_(beta) {}
void operator()(float& y, const float& x0, const float& x1) const
{
y = alpha_ * x0 + beta_ * x1;
}
void operator()(float4_t& y, const float4_t& x0, const float4_t& x1) const
{
y = _mm_add_ps(_mm_mul_ps(x0, _mm_set1_ps(alpha_)), _mm_mul_ps(x1, _mm_set1_ps(beta_)));
}
void operator()(float8_t& y, const float8_t& x0, const float8_t& x1) const
{
y = _mm256_add_ps(_mm256_mul_ps(x0, _mm256_set1_ps(alpha_)),
_mm256_mul_ps(x1, _mm256_set1_ps(beta_)));
}
float alpha_;
float beta_;
};
struct AddRelu
{
void operator()(float& y, const float& x0, const float& x1) const
{
const float a = x0 + x1;
y = a > 0 ? a : 0;
}
void operator()(float4_t& y, const float4_t& x0, const float4_t& x1) const
{
y = _mm_max_ps(_mm_add_ps(x0, x1), _mm_setzero_ps());
}
void operator()(float8_t& y, const float8_t& x0, const float8_t& x1) const
{
y = _mm256_max_ps(_mm256_add_ps(x0, x1), _mm256_setzero_ps());
}
};
#if 0
struct AddHardswish
{
void operator()(float& y, const float& x0, const float& x1) const
{
float a = x0 + x1;
float b = a + float{3};
float c = (b > 0) * (b > float{6} ? float{6} : b) * a * float{0.166667};
y = c;
}
void
operator()(half_t& y, const half_t& x0, const half_t& x1) const
{
float a = x0 + x1;
float b = a + float{3};
float c = (b > 0) * (b > float{6} ? float{6} : b) * a * float{0.166667};
y = c;
}
};
#endif
struct AddReluAdd
{
void operator()(float& y, const float& x0, const float& x1, const float& x2) const
{
float a = x0 + x1;
float b = a > 0 ? a : 0;
float c = b + x2;
y = c;
}
void operator()(float4_t& y, const float4_t& x0, const float4_t& x1, const float4_t& x2) const
{
float4_t a = _mm_add_ps(x0, x1);
float4_t b = _mm_max_ps(a, _mm_setzero_ps());
y = _mm_add_ps(b, x2);
}
void operator()(float8_t& y, const float8_t& x0, const float8_t& x1, const float8_t& x2) const
{
float8_t a = _mm256_add_ps(x0, x1);
float8_t b = _mm256_max_ps(a, _mm256_setzero_ps());
y = _mm256_add_ps(b, x2);
}
};
#if 0
struct AddHardswishAdd
{
void
operator()(float& y, const float& x0, const float& x1, const float& x2) const
{
float a = x0 + x1;
float b = a + float{3};
float c = (b > 0) * (b > float{6} ? float{6} : b) * a * float{0.166667};
float d = c + x2;
y = d;
}
void
operator()(half_t& y, const half_t& x0, const half_t& x1, const half_t& x2) const
{
float a = x0 + x1;
float b = a + float{3};
float c = (b > 0) * (b > float{6} ? float{6} : b) * a * float{0.166667};
float d = c + x2;
y = d;
}
};
#endif
#if 0
struct RequantReluRequant
{
// FIXME: We just need one scale for Relu / Leaky Relu / PRelu
RequantReluRequant(float scaleGemm, float scaleRelu)
: scaleGemm_(scaleGemm), scaleRelu_(scaleRelu)
{
}
void operator()(int8_t& y, const int& x) const
{
float gemm_requant = scaleGemm_ * static_cast<float>(x);
float relu = gemm_requant > 0 ? gemm_requant : 0;
float relu_requant = scaleRelu_ * relu;
y = static_cast<int8_t>(relu_requant > 127 ? 127
: relu_requant < -128 ? -128 : relu_requant);
}
// for reference_gemm
void operator()(float& y, const float& x) const
{
float gemm_requant = scaleGemm_ * x;
float relu = gemm_requant > 0 ? gemm_requant : 0;
float relu_requant = scaleRelu_ * relu;
y = static_cast<float>(relu_requant > 127 ? 127
: relu_requant < -128 ? -128 : relu_requant);
}
float scaleGemm_;
float scaleRelu_;
};
#endif
// Unary operators are usually applied element-wise before/after the reduction is executed on the
// elements. They are needed for easy implementation of reduction types such as AVG, NRM1, NRM2
template <typename Y, typename X, bool HasDividing = false>
struct UnaryIdentic;
template <>
struct UnaryIdentic<float, float, false>
{
UnaryIdentic(const int32_t divider = 1) { (void)divider; };
void operator()(float& y, const float& x) const { y = x; };
};
template <>
struct UnaryIdentic<float, float, true>
{
UnaryIdentic(const int32_t divider = 1) { divider_ = divider; };
void operator()(float& y, const float& x) const { y = x / type_convert<float>(divider_); };
int32_t divider_ = 1;
};
template <>
struct UnaryIdentic<float4_t, float4_t, false>
{
UnaryIdentic(const int32_t divider = 1) { (void)divider; };
void operator()(float4_t& y, const float4_t& x) const { y = x; };
};
template <>
struct UnaryIdentic<float4_t, float4_t, true>
{
UnaryIdentic(const int32_t divider = 1) { divider_ = divider; };
void operator()(float4_t& y, const float4_t& x) const
{
y = _mm_div_ps(x, _mm_set1_ps(static_cast<float>(divider_)));
};
int32_t divider_ = 1;
};
template <>
struct UnaryIdentic<float8_t, float8_t, false>
{
UnaryIdentic(const int32_t divider = 1) { (void)divider; };
void operator()(float8_t& y, const float8_t& x) const { y = x; };
};
template <>
struct UnaryIdentic<float8_t, float8_t, true>
{
UnaryIdentic(const int32_t divider = 1) { divider_ = divider; };
void operator()(float8_t& y, const float8_t& x) const
{
y = _mm256_div_ps(x, _mm256_set1_ps(static_cast<float>(divider_)));
};
int32_t divider_ = 1;
};
template <typename Y, typename X, bool HasDividing = false>
struct UnarySquare;
template <>
struct UnarySquare<float, float, false>
{
UnarySquare(const int32_t divider = 1) { (void)divider; };
void operator()(float& y, const float& x) const { y = x * x; };
};
template <>
struct UnarySquare<float, float, true>
{
UnarySquare(const int32_t divider = 1) { divider_ = divider; };
void operator()(float& y, const float& x) const { y = x * x / type_convert<float>(divider_); };
int32_t divider_ = 1;
};
template <>
struct UnarySquare<float4_t, float4_t, false>
{
UnarySquare(const int32_t divider = 1) { (void)divider; };
void operator()(float4_t& y, const float4_t& x) const { y = _mm_mul_ps(x, x); };
};
template <>
struct UnarySquare<float4_t, float4_t, true>
{
UnarySquare(const int32_t divider = 1) { divider_ = divider; };
void operator()(float4_t& y, const float4_t& x) const
{
y = _mm_div_ps(_mm_mul_ps(x, x), _mm_set1_ps(static_cast<float>(divider_)));
};
int32_t divider_ = 1;
};
template <>
struct UnarySquare<float8_t, float8_t, false>
{
UnarySquare(const int32_t divider = 1) { (void)divider; };
void operator()(float8_t& y, const float8_t& x) const { y = _mm256_mul_ps(x, x); };
};
template <>
struct UnarySquare<float8_t, float8_t, true>
{
UnarySquare(const int32_t divider = 1) { divider_ = divider; };
void operator()(float8_t& y, const float8_t& x) const
{
y = _mm256_div_ps(_mm256_mul_ps(x, x), _mm256_set1_ps(static_cast<float>(divider_)));
};
int32_t divider_ = 1;
};
template <typename Y, typename X>
struct UnaryAbs;
template <>
struct UnaryAbs<float, float>
{
UnaryAbs(const int32_t divider = 1) { (void)divider; };
void operator()(float& y, const float& x) const { y = abs(x); };
};
template <>
struct UnaryAbs<float4_t, float4_t>
{
UnaryAbs(const int32_t divider = 1) { (void)divider; };
void operator()(float4_t& y, const float4_t& x) const
{
__m128 Mask = _mm_castsi128_ps(_mm_set1_epi32(~0x80000000));
y = _mm_and_ps(Mask, x);
};
};
template <>
struct UnaryAbs<float8_t, float8_t>
{
UnaryAbs(const int32_t divider = 1) { (void)divider; };
void operator()(float8_t& y, const float8_t& x) const
{
__m256 Mask = _mm256_castsi256_ps(_mm256_set1_epi32(~0x80000000));
y = _mm256_and_ps(Mask, x);
};
};
template <typename Y, typename X>
struct UnarySqrt;
template <>
struct UnarySqrt<float, float>
{
void operator()(float& y, const float& x) const { y = sqrtf(x); };
};
template <>
struct UnarySqrt<float4_t, float4_t>
{
void operator()(float4_t& y, const float4_t& x) const { y = _mm_sqrt_ps(x); };
};
template <>
struct UnarySqrt<float8_t, float8_t>
{
void operator()(float8_t& y, const float8_t& x) const { y = _mm256_sqrt_ps(x); };
};
} // namespace element_wise
} // namespace cpu
} // namespace tensor_operation
} // namespace ck
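As a usage note, these functors are meant to be applied per element or per AVX register. A minimal sketch (assuming this header and <immintrin.h> are included) applying AddRelu over two float arrays, 8 lanes per step with a scalar tail:

#include <immintrin.h>
inline void add_relu_f32(float* y, const float* x0, const float* x1, int n)
{
    ck::tensor_operation::cpu::element_wise::AddRelu op;
    int i = 0;
    for(; i + 8 <= n; i += 8) // vector body uses the float8_t overload
    {
        ck::cpu::float8_t r;
        op(r, _mm256_loadu_ps(x0 + i), _mm256_loadu_ps(x1 + i));
        _mm256_storeu_ps(y + i, r);
    }
    for(; i < n; ++i) // scalar tail uses the float overload
        op(y[i], x0[i], x1[i]);
}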
#ifndef CK_THREADWISE_TENSOR_SLICE_TRANSFER_AVX2_HPP
#define CK_THREADWISE_TENSOR_SLICE_TRANSFER_AVX2_HPP
#include "common_header.hpp"
#include "data_type_cpu.hpp"
#include "../../gpu/thread/threadwise_tensor_slice_transfer.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "tensor_space_filling_curve.hpp"
#include "dynamic_buffer_cpu.hpp"
#include <immintrin.h>
namespace ck {
namespace cpu {
// Assume:
// 1. src_desc and dst_desc are not known at compile-time
// 2. src_slice_origin and dst_slice_origin are not known at compile-time
// 3. always use a __m256 register to hold 8 contiguous dwords, so if the fast-changing
// dim is a complex dimension, better reconsider the layout (e.g. NCHW is not good if non-1x1)
// 4. RunGeneric() can handle any case (by not using ymm), but performance is not guaranteed
template <typename SrcData,
typename DstData,
typename SrcDesc,
typename DstDesc,
typename ElementwiseOperation,
typename SliceLengths,
typename DimAccessOrder,
index_t VectorDim,
index_t ScalarPerVector, // src/dst must use the same vector size, i.e. src/dst both need the
// same AVX/float register
InMemoryDataOperationEnum_t DstInMemOp,
bool SrcResetCoordinateAfterRun,
bool DstResetCoordinateAfterRun>
struct ThreadwiseTensorSliceTransferAvx2
{
static constexpr index_t nDim = SliceLengths::Size();
using Index = MultiIndex<nDim>;
using SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{}));
using DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{}));
static constexpr auto I0 = Number<0>{};
constexpr ThreadwiseTensorSliceTransferAvx2(const SrcDesc& src_desc,
const Index& src_slice_origin,
const DstDesc& dst_desc,
const Index& dst_slice_origin,
const ElementwiseOperation& element_op)
: src_coord_(make_tensor_coordinate(src_desc, src_slice_origin)),
dst_coord_(make_tensor_coordinate(dst_desc, dst_slice_origin)),
element_op_(element_op)
{
static_assert(SliceLengths::At(Number<VectorDim>{}) % ScalarPerVector == 0,
"wrong! cannot evenly divide");
}
void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx)
{
// On GPU this function is used to set the per-thread index based on threadIdx.x.
// On CPU there is no need to call it.
src_coord_ = make_tensor_coordinate(src_desc, src_slice_origin_idx);
}
void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx)
{
dst_coord_ = make_tensor_coordinate(dst_desc, dst_slice_origin_idx);
}
template <typename SrcBuffer, typename DstBuffer>
void RunGeneric(const SrcDesc& src_desc,
const SrcBuffer& src_buf,
const DstDesc& dst_desc,
DstBuffer& dst_buf)
{
// scalar per access on each dim
// TODO: don't use lambda_scalar_per_access
constexpr auto scalar_per_access = generate_sequence(
ck::detail::lambda_scalar_per_access<VectorDim, ScalarPerVector>{}, Number<nDim>{});
using SpaceFillingCurve = SpaceFillingCurve<SliceLengths,
DimAccessOrder,
remove_cv_t<decltype(scalar_per_access)>>;
// loop over space-filling curve
constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess();
// std::cout<<"num_access:"<<num_access<<std::endl;
static_for<0, num_access, 1>{}([&](auto idx_1d) {
using src_vector_type = ck::cpu::vector_type_maker_t<SrcData, ScalarPerVector>;
using src_vector_t = typename src_vector_type::type;
using dst_vector_type = ck::cpu::vector_type_maker_t<DstData, ScalarPerVector>;
using dst_vector_t = typename dst_vector_type::type;
const bool is_src_valid =
coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc, src_coord_);
// printf("[%s] ", is_src_valid ? "y":"n");
// print_multi_index(src_coord_.GetIndex());
// printf("----");
// print_multi_index(src_coord_.GetHiddenIndex());
// printf(":%d", src_coord_.GetOffset());
// printf("\n");
// copy data from src_buf into src_vector_container
auto src_vector_container = src_vector_type{
src_buf.template Get<src_vector_t>(src_coord_.GetOffset(), is_src_valid)};
auto dst_vector_container = dst_vector_type{};
// apply pointwise operation
// static_for<0, ScalarPerVector, 1>{}([&](auto i) {
// element_op_(dst_vector_container.template AsType<DstData>()(i),
// src_vector_container.template AsType<SrcData>()[i]);
// });
element_op_(dst_vector_container.template AsType<dst_vector_t>(),
src_vector_container.template AsType<src_vector_t>());
const bool is_dst_valid =
coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_);
// printf(" -> ");
// print_multi_index(dst_coord_.GetIndex());
// printf(":%d", dst_coord_.GetOffset());
// printf(", src:0x%x, dst:0x%x",
// *reinterpret_cast<uint32_t*>(&src_vector_container.template AsType<src_vector_t>()),
// *reinterpret_cast<uint32_t*>(&dst_vector_container.template
// AsType<dst_vector_t>()));
// printf("\n");
// copy data from dst_vector into dst_buf
dst_buf.template Update<DstInMemOp, dst_vector_t>(
dst_coord_.GetOffset(),
is_dst_valid,
dst_vector_container.template AsType<dst_vector_t>());
// move coordinate
if constexpr(idx_1d.value != num_access - 1)
{
constexpr auto forward_step = SpaceFillingCurve::GetForwardStep(idx_1d);
move_tensor_coordinate(
src_desc, src_coord_, make_tensor_coordinate_step(src_desc, forward_step));
move_tensor_coordinate(
dst_desc, dst_coord_, make_tensor_coordinate_step(dst_desc, forward_step));
}
});
// move coordinate back to slice origin (or not)
if constexpr(SrcResetCoordinateAfterRun)
{
const auto src_reset_step =
make_tensor_coordinate_step(src_desc, GetCoordinateResetStep());
move_tensor_coordinate(src_desc, src_coord_, src_reset_step);
}
if constexpr(DstResetCoordinateAfterRun)
{
const auto dst_reset_step =
make_tensor_coordinate_step(dst_desc, GetCoordinateResetStep());
move_tensor_coordinate(dst_desc, dst_coord_, dst_reset_step);
}
}
static constexpr auto GetCoordinateResetStep()
{
constexpr auto scalar_per_access = generate_sequence(
detail::lambda_scalar_per_access<VectorDim, ScalarPerVector>{}, Number<nDim>{});
using SpaceFillingCurve = SpaceFillingCurve<SliceLengths,
DimAccessOrder,
remove_cv_t<decltype(scalar_per_access)>>;
constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess();
if constexpr(num_access == 0)
{
return typename SpaceFillingCurve::Index{};
}
else
{
constexpr auto reset_step =
SpaceFillingCurve::GetStepBetween(Number<num_access - 1>{}, Number<0>{});
return reset_step;
}
}
// src_slice_origin_step_idx needs to be known at compile-time, for performance reasons
void MoveSrcSliceWindow(const SrcDesc& src_desc, const Index& src_slice_origin_step_idx)
{
// if the src coord was not reset by RunGeneric(), the step needs to be adjusted here
const auto adjusted_step_idx = SrcResetCoordinateAfterRun
? src_slice_origin_step_idx
: src_slice_origin_step_idx + GetCoordinateResetStep();
printf(" GetCoordinateResetStep:");
print_multi_index(GetCoordinateResetStep());
printf(" adjusted_step_idx:");
print_multi_index(adjusted_step_idx);
// is it OK to construct a new step every time?
const auto adjusted_step = make_tensor_coordinate_step(src_desc, adjusted_step_idx);
printf(" adjusted_step:");
print_multi_index(adjusted_step.GetIndexDiff());
printf("\n");
move_tensor_coordinate(src_desc, src_coord_, adjusted_step);
}
// dst_slice_origin_step_idx needs to be known at compile-time, for performance reasons
void MoveDstSliceWindow(const DstDesc& dst_desc, const Index& dst_slice_origin_step_idx)
{
// if the dst coord was not reset by RunGeneric(), the step needs to be adjusted here
const auto adjusted_step_idx = DstResetCoordinateAfterRun
? dst_slice_origin_step_idx
: dst_slice_origin_step_idx + GetCoordinateResetStep();
// is it OK to construct a new step every time?
const auto adjusted_step = make_tensor_coordinate_step(dst_desc, adjusted_step_idx);
move_tensor_coordinate(dst_desc, dst_coord_, adjusted_step);
}
private:
SrcCoord src_coord_;
DstCoord dst_coord_;
const ElementwiseOperation element_op_;
};
} // namespace cpu
} // namespace ck
#endif
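Relating this to the commit message: the static_for over the space-filling curve in RunGeneric() is fully unrolled at compile time, so the number of instantiated loop bodies grows with the slice size. A rough illustration with hypothetical numbers:

// Illustration only: for a hypothetical MC x KC = 256 x 128 fp32 tile moved 8 floats
// per access, the compiler must instantiate 4096 unrolled iterations.
constexpr int MC              = 256;
constexpr int KC              = 128;
constexpr int ScalarPerVector = 8;
constexpr int num_access      = (MC * KC) / ScalarPerVector;
static_assert(num_access == 4096, "large tiles imply very long compile-time unrolls");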
...@@ -34,7 +34,7 @@
#include "is_known_at_compile_time.hpp"
#include "transpose_vectors.hpp"
#include "inner_product.hpp"
// #include "element_wise_operation.hpp"
#include "debug.hpp"
// TODO: remove this
...
#pragma once
#include <immintrin.h>
namespace ck {
namespace cpu {
// vector_type
template <typename T, index_t N>
struct vector_type;
// Caution: DO NOT REMOVE
// intentionally have only declaration but no definition to cause compilation failure when trying to
// instantiate this template. The purpose is to catch user's mistake when trying to make "vector of
// vectors"
template <typename T, index_t V, index_t N>
struct vector_type<T __attribute__((ext_vector_type(V))), N>;
// Caution: DO NOT REMOVE
// intentionally have only declaration but no definition to cause compilation failure when trying to
// instantiate this template. The purpose is to catch user's mistake when trying to make "vector of
// vectors"
template <typename T, index_t V, index_t N>
struct vector_type<vector_type<T, V>, N>;
// vector_type_maker
// This is the right way to handle "vector of vectors": making a bigger vector instead
template <typename T, index_t N>
struct vector_type_maker
{
using type = vector_type<T, N>;
};
template <typename T, index_t N>
using vector_type_maker_t = typename vector_type_maker<T, N>::type;
template <typename T, index_t N>
constexpr auto make_vector_type(Number<N>)
{
return typename vector_type_maker<T, N>::type{};
}
template <>
struct vector_type<float, 1>
{
using d1_t = float;
// scalar
using type = float;
type data_;
vector_type() : data_{0} {}
// vector_type(float x) : data_{x} {}
vector_type(type v) : data_{v} {}
vector_type(const float* mem) : data_{*mem} {}
template <typename X>
constexpr const auto& AsType() const
{
static_assert(std::is_same<X, type>::value, "wrong!");
return data_;
}
template <typename X>
constexpr auto& AsType()
{
static_assert(std::is_same<X, type>::value, "wrong!");
return data_;
}
constexpr void Load(const float* mem) { data_ = *mem; }
constexpr void Store(float* mem) const { *mem = data_; }
};
template <>
struct vector_type<float, 4>
{
using d1_t = float;
// SSE
using type = __m128;
type data_;
vector_type() : data_{_mm_setzero_ps()} {}
vector_type(float x) : data_{_mm_set1_ps(x)} {}
vector_type(type v) : data_{v} {}
vector_type(const float* mem) : data_{_mm_loadu_ps(mem)} {}
template <typename X>
constexpr const auto& AsType() const
{
static_assert(std::is_same<X, type>::value, "wrong!");
return data_;
}
template <typename X>
constexpr auto& AsType()
{
static_assert(std::is_same<X, type>::value, "wrong!");
return data_;
}
constexpr void Load(const float* mem) { data_ = _mm_loadu_ps(mem); }
constexpr void Store(float* mem) const { _mm_storeu_ps(mem, data_); }
};
template <>
struct vector_type<float, 8>
{
using d1_t = float;
// AVX
using type = __m256;
type data_;
vector_type() : data_{_mm256_setzero_ps()} {}
vector_type(float x) : data_{_mm256_set1_ps(x)} {}
vector_type(type v) : data_{v} {}
vector_type(const float* mem) : data_{_mm256_loadu_ps(mem)} {}
template <typename X>
constexpr const auto& AsType() const
{
static_assert(std::is_same<X, type>::value, "wrong!");
return data_;
}
template <typename X>
constexpr auto& AsType()
{
static_assert(std::is_same<X, type>::value, "wrong!");
return data_;
}
constexpr void Load(const float* mem) { data_ = _mm256_loadu_ps(mem); }
constexpr void Store(float* mem) const { _mm256_storeu_ps(mem, data_); }
};
template <typename T>
struct to_vector_type
{
using type = T;
};
template <>
struct to_vector_type<__m128>
{
using type = vector_type<float, 4>;
};
template <>
struct to_vector_type<__m256>
{
using type = vector_type<float, 8>;
};
template <typename Tv, typename Tp>
inline void load_vector(Tv& v, const Tp* mem)
{
v = *reinterpret_cast<const Tv*>(mem);
}
template <>
inline void load_vector(__m128& v, const float* mem)
{
v = _mm_loadu_ps(mem);
}
template <>
inline void load_vector(__m256& v, const float* mem)
{
v = _mm256_loadu_ps(mem);
}
template <typename Tv, typename Tp>
inline void store_vector(const Tv& v, Tp* mem)
{
*reinterpret_cast<Tv*>(mem) = v;
}
template <>
inline void store_vector(const __m128& v, float* mem)
{
_mm_storeu_ps(mem, v);
}
template <>
inline void store_vector(const __m256& v, float* mem)
{
_mm256_storeu_ps(mem, v);
}
template <typename Tv, typename Tx>
inline void set_vector(Tv& v, const Tx x)
{
v = static_cast<const Tv>(x);
}
template <>
inline void set_vector(__m128& v, const float x)
{
v = _mm_set1_ps(x);
}
template <>
inline void set_vector(__m256& v, const float x)
{
v = _mm256_set1_ps(x);
}
template <typename Tv>
inline void clear_vector(Tv& v)
{
v = static_cast<Tv>(0);
}
template <>
inline void clear_vector(__m128& v)
{
v = _mm_setzero_ps();
}
template <>
inline void clear_vector(__m256& v)
{
v = _mm256_setzero_ps();
}
using float4_t = typename vector_type<float, 4>::type;
using float8_t = typename vector_type<float, 8>::type;
// scalar_type
template <typename TV>
struct scalar_type;
// is_scalar_type
template <typename TV>
struct is_scalar_type
{
static constexpr bool value = (scalar_type<remove_cvref_t<TV>>::vector_size == 1);
};
// has_same_scalar_type
template <typename X, typename Y>
using has_same_scalar_type = is_same<typename scalar_type<remove_cvref_t<X>>::type,
typename scalar_type<remove_cvref_t<Y>>::type>;
template <typename T, index_t N>
struct scalar_type<vector_type<T, N>>
{
using type = T;
static constexpr index_t vector_size = N;
};
template <>
struct scalar_type<float4_t>
{
using type = float;
static constexpr index_t vector_size = 4;
};
template <>
struct scalar_type<float8_t>
{
using type = float;
static constexpr index_t vector_size = 8;
};
//
template <>
struct scalar_type<float>
{
using type = float;
static constexpr index_t vector_size = 1;
};
} // namespace cpu
} // namespace ck
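A minimal sketch (assuming this header is included) of the vector_type wrappers above: load 8 floats, view them as the raw __m256 via AsType, and store them back:

#include <immintrin.h>
inline void scale8(float* dst, const float* src, float s)
{
    ck::cpu::vector_type<float, 8> v(src);                 // _mm256_loadu_ps under the hood
    ck::cpu::float8_t raw = v.AsType<ck::cpu::float8_t>(); // raw __m256 view
    v.AsType<ck::cpu::float8_t>() = _mm256_mul_ps(raw, _mm256_set1_ps(s));
    v.Store(dst);                                          // _mm256_storeu_ps under the hood
}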
#ifndef CK_BUFFER_CPU_HPP
#define CK_BUFFER_CPU_HPP
#include "config.hpp"
#include "enable_if.hpp"
#include "data_type_cpu.hpp"
namespace ck {
namespace cpu {
template <AddressSpaceEnum_t BufferAddressSpace,
typename T,
typename ElementSpaceSize,
bool InvalidElementUseNumericalZeroValue>
struct DynamicBuffer
{
using type = T;
static_assert(BufferAddressSpace ==
AddressSpaceEnum_t::Global); // only valid for global address space on cpu
T* p_data_;
ElementSpaceSize element_space_size_;
T invalid_element_value_ = T{0};
constexpr DynamicBuffer(T* p_data, ElementSpaceSize element_space_size)
: p_data_{p_data}, element_space_size_{element_space_size}
{
}
constexpr DynamicBuffer(T* p_data, ElementSpaceSize element_space_size, T invalid_element_value)
: p_data_{p_data},
element_space_size_{element_space_size},
invalid_element_value_{invalid_element_value}
{
}
static constexpr AddressSpaceEnum_t GetAddressSpace() { return BufferAddressSpace; }
constexpr const T& operator[](index_t i) const { return p_data_[i]; }
constexpr T& operator()(index_t i) { return p_data_[i]; }
// X should be data_type::type, not directly data_type
template <typename X,
typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
typename scalar_type<remove_cvref_t<T>>::type>::value,
bool>::type = false>
constexpr auto Get(index_t i, bool is_valid_element) const
{
if constexpr(InvalidElementUseNumericalZeroValue)
{
X v;
if(is_valid_element)
load_vector(v, &p_data_[i]);
else
clear_vector(v);
return v;
}
else
{
X v;
if(is_valid_element)
load_vector(v, &p_data_[i]);
else
set_vector(v, invalid_element_value_);
return v;
}
}
template <InMemoryDataOperationEnum_t Op,
typename X,
typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
typename scalar_type<remove_cvref_t<T>>::type>::value,
bool>::type = false>
void Update(index_t i, bool is_valid_element, const X& x)
{
if constexpr(Op == InMemoryDataOperationEnum_t::Set)
{
this->template Set<X>(i, is_valid_element, x);
}
else if constexpr(Op == InMemoryDataOperationEnum_t::Add)
{
auto tmp = this->template Get<X>(i, is_valid_element);
this->template Set<X>(i, is_valid_element, x + tmp);
}
}
template <typename X,
typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
typename scalar_type<remove_cvref_t<T>>::type>::value,
bool>::type = false>
void Set(index_t i, bool is_valid_element, const X& x)
{
// X contains multiple T
constexpr index_t scalar_per_t_vector = scalar_type<remove_cvref_t<T>>::vector_size;
constexpr index_t scalar_per_x_vector = scalar_type<remove_cvref_t<X>>::vector_size;
static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
"wrong! X need to be multiple T");
if(is_valid_element)
{
store_vector(x, &p_data_[i]);
}
}
static constexpr bool IsStaticBuffer() { return false; }
static constexpr bool IsDynamicBuffer() { return true; }
};
template <AddressSpaceEnum_t BufferAddressSpace, typename T, typename ElementSpaceSize>
constexpr auto make_dynamic_buffer(T* p, ElementSpaceSize element_space_size)
{
return DynamicBuffer<BufferAddressSpace, T, ElementSpaceSize, true>{p, element_space_size};
}
template <
AddressSpaceEnum_t BufferAddressSpace,
typename T,
typename ElementSpaceSize,
typename X,
typename enable_if<is_same<remove_cvref_t<T>, remove_cvref_t<X>>::value, bool>::type = false>
constexpr auto
make_dynamic_buffer(T* p, ElementSpaceSize element_space_size, X invalid_element_value)
{
return DynamicBuffer<BufferAddressSpace, T, ElementSpaceSize, false>{
p, element_space_size, invalid_element_value};
}
} // namespace cpu
} // namespace ck
#endif
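A minimal sketch (assuming this header is included) that wraps raw float arrays in DynamicBuffer and moves 8 floats per step through Get()/Update(); n is assumed to be a multiple of 8:

inline void copy_through_buffers(float* p_dst, const float* p_src, ck::index_t n)
{
    using namespace ck;
    using namespace ck::cpu;
    auto src_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(const_cast<float*>(p_src), n);
    auto dst_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(p_dst, n);
    for(index_t i = 0; i + 8 <= n; i += 8)
    {
        // invalid elements would read back as numerical zero for this buffer flavor
        auto v = src_buf.Get<float8_t>(i, /*is_valid_element=*/true);
        dst_buf.Update<InMemoryDataOperationEnum_t::Set, float8_t>(i, true, v);
    }
}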
add_subdirectory(src/host_tensor)
add_subdirectory(src/tensor_operation_instance/gpu)
add_subdirectory(src/tensor_operation_instance/cpu)
...@@ -29,6 +29,8 @@ struct DeviceAlignedMemCPU
DeviceAlignedMemCPU(std::size_t mem_size, std::size_t alignment);
void* GetDeviceBuffer();
std::size_t GetBufferSize();
void ToDevice(const void* p);
void FromDevice(void* p);
void SetZero();
~DeviceAlignedMemCPU();
...@@ -108,4 +110,27 @@ float launch_and_time_kernel(
return timer.GetElapsedTime() / nrepeat;
}
template <typename... Args, typename F>
void launch_cpu_kernel(F kernel, Args... args)
{
kernel(args...);
}
template <typename... Args, typename F>
float launch_and_time_cpu_kernel(F kernel, int nrepeat, Args... args)
{
WallTimer timer;
kernel(args...);
timer.Start();
for(int i = 0; i < nrepeat; i++)
{
kernel(args...);
}
timer.End();
return timer.GetElapsedTime() / nrepeat;
}
#endif
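A usage sketch for the CPU launch helpers above: the kernel is any callable, a warm-up call happens before timing, and the average time over nrepeat runs is returned.

// Hypothetical example: time a trivial scaling kernel with launch_and_time_cpu_kernel.
inline float time_scale_kernel(float* p, int n, int nrepeat)
{
    auto scale_kernel = [](float* data, int len) {
        for(int i = 0; i < len; ++i)
            data[i] *= 2.0f;
    };
    return launch_and_time_cpu_kernel(scale_kernel, nrepeat, p, n);
}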
...@@ -45,6 +45,10 @@ void* DeviceAlignedMemCPU::GetDeviceBuffer() { return mpDeviceBuf; }
std::size_t DeviceAlignedMemCPU::GetBufferSize() { return mMemSize; }
void DeviceAlignedMemCPU::ToDevice(const void* p) { memcpy(mpDeviceBuf, p, mMemSize); }
void DeviceAlignedMemCPU::FromDevice(void* p) { memcpy(p, mpDeviceBuf, mMemSize); }
void DeviceAlignedMemCPU::SetZero() { memset(mpDeviceBuf, 0, mMemSize); }
DeviceAlignedMemCPU::~DeviceAlignedMemCPU() { free((reinterpret_cast<void**>(mpDeviceBuf))[-1]); }
...
include_directories(BEFORE
${PROJECT_SOURCE_DIR}/include/ck
${PROJECT_SOURCE_DIR}/include/ck/utility
${PROJECT_SOURCE_DIR}/include/ck/tensor_description
${PROJECT_SOURCE_DIR}/include/ck/tensor
${PROJECT_SOURCE_DIR}/include/ck/problem_transform
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/cpu/device
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/cpu/grid
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/cpu/block
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/cpu/thread
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/cpu/element
${PROJECT_SOURCE_DIR}/library/include/ck/library/host_tensor
${PROJECT_SOURCE_DIR}/library/include/ck/library/tensor_operation_instance
${PROJECT_SOURCE_DIR}/external/include/half
)
function(add_instance_library INSTANCE_NAME)
message("adding instance ${INSTANCE_NAME}")
add_library(${INSTANCE_NAME} SHARED ${ARGN})
target_compile_features(${INSTANCE_NAME} PUBLIC)
set_target_properties(${INSTANCE_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
endfunction(add_instance_library INSTANCE_NAME)
add_subdirectory(conv2d_fwd)
# device_conv2d_fwd_cpu_instance
set(DEVICE_CONV2D_FWD_CPU_INSTANCE_SOURCE
device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_instance.cpp
)
add_library(device_conv2d_fwd_cpu_instance SHARED ${DEVICE_CONV2D_FWD_CPU_INSTANCE_SOURCE})
target_compile_features(device_conv2d_fwd_cpu_instance PUBLIC)
set_target_properties(device_conv2d_fwd_cpu_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
install(TARGETS device_conv2d_fwd_cpu_instance LIBRARY DESTINATION lib)
clang_tidy_check(device_conv2d_fwd_cpu_instance)
#include <stdlib.h>
#include "convolution_forward_specialization_cpu.hpp"
#include "config.hpp"
#include "device_convnd_fwd_avx2_nhwc_kyxc_nhwk.hpp"
#include "element_wise_operation_cpu.hpp"
#include "device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace cpu {
namespace device {
namespace device_conv2d_fwd_avx2_instance {
using InType = float;
using WeiType = float;
using OutType = float;
using AccType = float;
using InLayout = ck::tensor_layout::gemm::RowMajor; // NHWC
using WeiLayout = ck::tensor_layout::gemm::ColumnMajor; // KYXC
static constexpr bool NonTemporalStore = false;
using PassThrough = ck::tensor_operation::cpu::element_wise::PassThrough;
using ThreadwiseGemmAvx2_MxN_4x24_Dispatch =
ck::cpu::ThreadwiseGemmAvx2_MxN_4x24_Dispatch<InType,
WeiType,
OutType,
InLayout,
WeiLayout,
NonTemporalStore>;
static constexpr auto ConvFwdDefault =
ck::tensor_operation::cpu::device::ConvolutionForwardSpecialization_t::Default;
static constexpr auto ConvFwd1x1P0 =
ck::tensor_operation::cpu::device::ConvolutionForwardSpecialization_t::Filter1x1Pad0;
static constexpr auto ConvFwd1x1S1P0 =
ck::tensor_operation::cpu::device::ConvolutionForwardSpecialization_t::Filter1x1Stride1Pad0;
using device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_f32_instances = std::tuple<
//#################################################################|InDataType|WeiDataType|OutDataType|AccDataType|InElementwiseOp|WeiElementwiseOp|OutElementwiseOp|ConvForwardSp|NumDimSpatial|MPerBlock|NPerBlock|KPerBlock|ThreadwiseGemm_Dispatch
DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<
float,
float,
float,
float,
PassThrough,
PassThrough,
PassThrough,
ConvFwdDefault,
2,
256,
128,
64,
ThreadwiseGemmAvx2_MxN_4x24_Dispatch>,
DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<
float,
float,
float,
float,
PassThrough,
PassThrough,
PassThrough,
ConvFwdDefault,
2,
512,
256,
128,
ThreadwiseGemmAvx2_MxN_4x24_Dispatch>,
DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<
float,
float,
float,
float,
PassThrough,
PassThrough,
PassThrough,
ConvFwdDefault,
2,
1024,
144,
128,
ThreadwiseGemmAvx2_MxN_4x24_Dispatch>>;
void add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk(
std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>& instances)
{
ck::tensor_operation::device::add_device_operation_instances(
instances, device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_f32_instances{});
}
} // namespace device_conv2d_fwd_avx2_instance
} // namespace device
} // namespace cpu
} // namespace tensor_operation
} // namespace ck
...@@ -32,6 +32,7 @@ set(PROFILER_SOURCE
src/profile_conv_fwd_bias_relu.cpp
src/profile_conv_fwd_bias_relu_add.cpp
src/profile_conv_fwd_bias_relu_atomic_add.cpp
src/profile_conv_fwd_cpu.cpp
src/profile_convnd_bwd_data.cpp
src/profile_reduce.cpp
src/profile_grouped_gemm.cpp
...@@ -51,6 +52,7 @@ target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_instance)
target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_instance)
target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_add_instance)
target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_atomic_add_instance)
target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_cpu_instance)
target_link_libraries(ckProfiler PRIVATE device_convnd_bwd_data_instance)
target_link_libraries(ckProfiler PRIVATE device_reduce_instance)
target_link_libraries(ckProfiler PRIVATE device_reduce_instance)
...
#pragma once
#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "tensor_layout.hpp"
#include "device_tensor.hpp"
#include "device_convnd_fwd_avx2_nhwc_kyxc_nhwk.hpp"
#include "element_wise_operation_cpu.hpp"
#include "reference_conv_fwd.hpp"
namespace ck {
namespace tensor_operation {
namespace cpu {
namespace device {
namespace device_conv2d_fwd_avx2_instance {
void add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk(
std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>& instances);
} // namespace device_conv2d_fwd_avx2_instance
} // namespace device
} // namespace cpu
} // namespace tensor_operation
} // namespace ck
namespace ck {
namespace profiler {
#define AVX2_DATA_ALIGNMENT 32 // 32-byte alignment for 256-bit AVX2 registers
template <int NDimSpatial,
typename InDataType,
typename WeiDataType,
typename OutDataType,
typename InLayout,
typename WeiLayout,
typename OutLayout>
void profile_conv_cpu_fwd_impl(int do_verification,
int init_method,
bool do_log,
int nrepeat,
ck::index_t N,
ck::index_t K,
ck::index_t C,
std::vector<ck::index_t> input_spatial_lengths,
std::vector<ck::index_t> filter_spatial_lengths,
std::vector<ck::index_t> output_spatial_lengths,
std::vector<ck::index_t> conv_filter_strides,
std::vector<ck::index_t> conv_filter_dilations,
std::vector<ck::index_t> input_left_pads,
std::vector<ck::index_t> input_right_pads)
{
const ck::index_t Y = filter_spatial_lengths[0];
const ck::index_t X = filter_spatial_lengths[1];
const ck::index_t Hi = input_spatial_lengths[0];
const ck::index_t Wi = input_spatial_lengths[1];
const ck::index_t Ho = output_spatial_lengths[0];
const ck::index_t Wo = output_spatial_lengths[1];
auto f_host_tensor_descriptor =
[](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W, auto layout) {
if constexpr(is_same<decltype(layout), ck::tensor_layout::convolution::NCHW>::value ||
is_same<decltype(layout), ck::tensor_layout::convolution::KCYX>::value ||
is_same<decltype(layout), ck::tensor_layout::convolution::NKHW>::value)
{
return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
std::vector<std::size_t>({C_ * H * W, H * W, W, 1}));
}
else if constexpr(is_same<decltype(layout), tensor_layout::convolution::NHWC>::value ||
is_same<decltype(layout), tensor_layout::convolution::KYXC>::value ||
is_same<decltype(layout), tensor_layout::convolution::NHWK>::value)
{
return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
std::vector<std::size_t>({C_ * H * W, 1, W * C_, C_}));
}
};
Tensor<InDataType> in_n_c_hi_wi(f_host_tensor_descriptor(N, C, Hi, Wi, InLayout{}));
Tensor<WeiDataType> wei_k_c_y_x(f_host_tensor_descriptor(K, C, Y, X, WeiLayout{}));
Tensor<OutDataType> out_n_k_ho_wo_host_result(
f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));
Tensor<OutDataType> out_n_k_ho_wo_device_result(
f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));
std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl;
std::cout << "wei_k_c_y_x: " << wei_k_c_y_x.mDesc << std::endl;
std::cout << "out_n_k_ho_wo: " << out_n_k_ho_wo_host_result.mDesc << std::endl;
switch(init_method)
{
case 0: break;
case 1:
in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
break;
default:
in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
}
using InElementOp = ck::tensor_operation::cpu::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::cpu::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::cpu::element_wise::PassThrough;
const auto in_element_op = InElementOp{};
const auto wei_element_op = WeiElementOp{};
const auto out_element_op = OutElementOp{};
if(do_verification)
{
using ReferenceConvFwdInstance = ck::tensor_operation::host::ReferenceConvFwd<InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp>;
auto ref_conv = ReferenceConvFwdInstance{};
auto ref_invoker = ref_conv.MakeInvoker();
auto ref_argument = ref_conv.MakeArgument(in_n_c_hi_wi,
wei_k_c_y_x,
out_n_k_ho_wo_host_result,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
in_element_op,
wei_element_op,
out_element_op);
ref_invoker.Run(ref_argument);
}
DeviceAlignedMemCPU in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpace(),
AVX2_DATA_ALIGNMENT);
DeviceAlignedMemCPU wei_device_buf(sizeof(WeiDataType) * wei_k_c_y_x.mDesc.GetElementSpace(),
AVX2_DATA_ALIGNMENT);
DeviceAlignedMemCPU out_device_buf(sizeof(OutDataType) *
out_n_k_ho_wo_device_result.mDesc.GetElementSpace(),
AVX2_DATA_ALIGNMENT);
in_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
wei_device_buf.ToDevice(wei_k_c_y_x.mData.data());
using PassThrough = ck::tensor_operation::cpu::element_wise::PassThrough;
using DeviceConvFwdNoOpPtr =
ck::tensor_operation::cpu::device::DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>;
// add device Conv instances
std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
ck::tensor_operation::cpu::device::device_conv2d_fwd_avx2_instance::
add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk(conv_ptrs);
if(conv_ptrs.size() <= 0)
{
throw std::runtime_error("wrong! no device Conv instance found");
}
std::string best_conv_name;
float best_ave_time = 0;
float best_gflops = 0;
float best_gb_per_sec = 0;
// profile device Conv instances
for(auto& conv_ptr : conv_ptrs)
{
auto argument_ptr = conv_ptr->MakeArgumentPointer(
static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
N,
K,
C,
input_spatial_lengths,
filter_spatial_lengths,
output_spatial_lengths,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
in_element_op,
wei_element_op,
out_element_op);
auto invoker_ptr = conv_ptr->MakeInvokerPointer();
if(conv_ptr->IsSupportedArgument(argument_ptr.get()))
{
std::string conv_name = conv_ptr->GetTypeString();
float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
std::size_t num_btype = sizeof(InDataType) * (N * C * Hi * Wi) +
sizeof(WeiDataType) * (K * C * Y * X) +
sizeof(OutDataType) * (N * K * Ho * Wo);
float gflops = static_cast<float>(flop) / 1.E6 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << gflops << " GFlops, " << gb_per_sec
<< " GB/s, " << conv_name << std::endl;
if(gflops > best_gflops)
{
best_conv_name = conv_name;
best_gflops = gflops;
best_ave_time = ave_time;
best_gb_per_sec = gb_per_sec;
}
if(do_verification)
{
memcpy(out_n_k_ho_wo_device_result.mData.data(),
out_device_buf.mpDeviceBuf,
out_device_buf.mMemSize);
check_error(out_n_k_ho_wo_host_result, out_n_k_ho_wo_device_result);
if(do_log)
{
LogRangeAsType<float>(std::cout << "in : ", in_n_c_hi_wi.mData, ",")
<< std::endl;
LogRangeAsType<float>(std::cout << "wei: ", wei_k_c_y_x.mData, ",")
<< std::endl;
LogRangeAsType<float>(
std::cout << "out_host : ", out_n_k_ho_wo_host_result.mData, ",")
<< std::endl;
LogRangeAsType<float>(
std::cout << "out_device: ", out_n_k_ho_wo_device_result.mData, ",")
<< std::endl;
}
}
}
}
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gflops << " GFlops, "
<< best_gb_per_sec << " GB/s, " << best_conv_name << std::endl;
}
} // namespace profiler
} // namespace ck
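A hypothetical driver (not part of this commit) showing how profile_conv_cpu_fwd_impl above could be invoked for a small fp32 NHWC/KYXC/NHWK case; all sizes are made up for illustration:

inline void run_example_profile()
{
    ck::profiler::profile_conv_cpu_fwd_impl<2,
                                            float,
                                            float,
                                            float,
                                            ck::tensor_layout::convolution::NHWC,
                                            ck::tensor_layout::convolution::KYXC,
                                            ck::tensor_layout::convolution::NHWK>(
        /*do_verification=*/1,
        /*init_method=*/1,
        /*do_log=*/false,
        /*nrepeat=*/10,
        /*N=*/1,
        /*K=*/64,
        /*C=*/32,
        /*input_spatial_lengths=*/{28, 28},
        /*filter_spatial_lengths=*/{3, 3},
        /*output_spatial_lengths=*/{28, 28},
        /*conv_filter_strides=*/{1, 1},
        /*conv_filter_dilations=*/{1, 1},
        /*input_left_pads=*/{1, 1},
        /*input_right_pads=*/{1, 1});
}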