Commit 505194d7 authored by carlushuang

rename to example

parent ffb13372
@@ -10,6 +10,11 @@ include_directories(BEFORE
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/warp
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/thread
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/element
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/cpu/device
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/cpu/grid
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/cpu/block
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/cpu/thread
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/cpu/element
${PROJECT_SOURCE_DIR}/library/include/ck/library/host_tensor
${PROJECT_SOURCE_DIR}/library/include/ck/library/reference_tensor_operation/cpu
${PROJECT_SOURCE_DIR}/library/include/ck/library/reference_tensor_operation/gpu
@@ -51,3 +56,5 @@ add_subdirectory(17_convnd_bwd_data_xdl)
add_subdirectory(15_grouped_gemm)
add_subdirectory(16_gemm_reduce)
add_subdirectory(18_batched_gemm_reduce)
add_subdirectory(cpu_01_conv2d_fwd)
add_example_executable(example_cpu_conv2d_fwd cpu_conv2d_fwd.cpp)
target_link_libraries(example_cpu_conv2d_fwd PRIVATE device_conv2d_fwd_cpu_instance)
set_target_properties(example_cpu_conv2d_fwd PROPERTIES LINK_FLAGS "${OMP_LINK_FLAG}")
target_link_libraries(example_cpu_conv2d_fwd PRIVATE "${OMP_LIBRARY}")
target_compile_options(example_cpu_conv2d_fwd PRIVATE "${OMP_CXX_FLAG}")
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <limits>
#include <sstream>
#include <string>
#include <vector>
#include <omp.h>
#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "tensor_layout.hpp"
#include "device_tensor.hpp"
#include "device_convnd_fwd_avx2_nhwc_kyxc_nhwk.hpp"
#include "element_wise_operation_cpu.hpp"
#include "reference_conv_fwd.hpp"
#include "dynamic_buffer_cpu.hpp"
#define AVX2_DATA_ALIGNMENT 32
#define TEST_FUSION_PASSTHROUGH 0
#define TEST_FUSION_RELU 1
#define TEST_FUSION TEST_FUSION_PASSTHROUGH
#define TEST_LAYOUT_NHWC_KYXC_NHWK 0
#define TEST_LAYOUT_NHWC_KYXCK8_NHWK 1
#define TEST_LAYOUT TEST_LAYOUT_NHWC_KYXCK8_NHWK
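// Weight layouts under test: plain KYXC, or KYXCK8 where K is pre-packed in
// groups of 8 (see transpose_kyxc_2_kyxc8k below), matching the 8-float width
// of an AVX2 register.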
using F32 = float;
using F16 = ck::half_t;
namespace ck {
namespace tensor_operation {
namespace cpu {
namespace device {
namespace device_conv2d_fwd_avx2_instance {
using PassThrough = ck::tensor_operation::cpu::element_wise::PassThrough;
using Relu = ck::tensor_operation::cpu::element_wise::Relu;
void add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk(
std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>& instances);
void add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_local_c(
std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>& instances);
void add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_mt(
std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>& instances);
void add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_relu(
std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, Relu>>& instances);
void add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_local_c_relu(
std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, Relu>>& instances);
void add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_mt_relu(
std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, Relu>>& instances);
void add_device_conv2d_fwd_avx2_nhwc_kyxck8_nhwk(
std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>& instances);
void add_device_conv2d_fwd_avx2_nhwc_kyxck8_nhwk_local_c(
std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>& instances);
void add_device_conv2d_fwd_avx2_nhwc_kyxck8_nhwk_mt(
std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>& instances);
void add_device_conv2d_fwd_avx2_nhwc_kyxck8_nhwk_relu(
std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, Relu>>& instances);
void add_device_conv2d_fwd_avx2_nhwc_kyxck8_nhwk_local_c_relu(
std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, Relu>>& instances);
void add_device_conv2d_fwd_avx2_nhwc_kyxck8_nhwk_mt_relu(
std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, Relu>>& instances);
} // namespace device_conv2d_fwd_avx2_instance
} // namespace device
} // namespace cpu
} // namespace tensor_operation
} // namespace ck
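// The factory functions declared above are implemented in the
// device_conv2d_fwd_cpu_instance library this example links against (see the
// CMakeLists.txt above); each one appends its kernel variants to the instance
// vector. The "_mt" variants are selected below when OpenMP reports more than
// one thread, and the "_local_c" variants when K is not a multiple of 8.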
using InElementOp = ck::tensor_operation::cpu::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::cpu::element_wise::PassThrough;
#if TEST_FUSION == TEST_FUSION_PASSTHROUGH
using OutElementOp = ck::tensor_operation::cpu::element_wise::PassThrough;
#endif
#if TEST_FUSION == TEST_FUSION_RELU
using OutElementOp = ck::tensor_operation::cpu::element_wise::Relu;
#endif
template <typename T>
static bool
check_out(const Tensor<T>& ref, const Tensor<T>& result, double nrms, int per_pixel_check = 0)
{
int error_count = 0;
float max_diff = 1e-5;
double square_difference = .0;
double mag1 = .0;
double mag2 = .0;
for(std::size_t i = 0; i < ref.mData.size(); ++i)
{
double ri = (double)ref.mData[i];
double pi = (double)result.mData[i];
double d = ri - pi;
if(per_pixel_check)
{
if(max_diff < std::abs(d))
{
error_count++;
printf("idx:%3d, ref:%f, res:%f (diff:%f)\n",
i,
double(ref.mData[i]),
double(result.mData[i]),
d);
}
}
square_difference += d * d;
if(std::abs(mag1) < std::abs(ri))
mag1 = ri;
if(std::abs(mag2) < std::abs(pi))
mag2 = pi;
}
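// Normalized RMS error: ||ref - res||_2 / (sqrt(n) * max|value|), i.e. the
// RMS difference scaled by the largest magnitude seen in either tensor, so
// the tolerance is independent of the value range.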
double mag = std::max({std::fabs(mag1), std::fabs(mag2), std::numeric_limits<double>::min()});
double computed_nrms = std::sqrt(square_difference) / (std::sqrt(ref.mData.size()) * mag);
if(computed_nrms >= nrms)
printf("nrms:%lf, mag1:%lf, mag2:%lf, expected_nrms is %1f\n",
computed_nrms,
mag1,
mag2,
nrms);
return computed_nrms < nrms && error_count == 0;
}
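// Repack weights from KYXC to KYXCK8: view K as (K/8, 8) and move the 8-wide
// sub-dimension innermost, so consecutive K values land in one AVX2 vector.
// Assumes K is a multiple of 8.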
template <typename T>
void transpose_kyxc_2_kyxc8k(Tensor<T>& dst,
const Tensor<T>& src,
ck::index_t K,
ck::index_t Y,
ck::index_t X,
ck::index_t C)
{
ck::index_t batch = K / 8;
ck::index_t row = 8;
ck::index_t col = C * Y * X;
for(auto i_b = 0; i_b < batch; i_b++)
{
for(auto i_r = 0; i_r < row; i_r++)
{
for(auto i_c = 0; i_c < col; i_c++)
{
ck::index_t src_idx = i_b * row * col + i_r * col + i_c;
ck::index_t dst_idx = i_b * col * row + i_c * row + i_r;
dst.mData[dst_idx] = src.mData[src_idx];
}
}
}
}
int main(int argc, char* argv[])
{
int data_type = 0;
int init_method = 0;
// Conv shape
ck::index_t N = 2;
ck::index_t K = 256;
ck::index_t C = 192;
ck::index_t Y = 3;
ck::index_t X = 3;
ck::index_t Hi = 71;
ck::index_t Wi = 71;
ck::index_t conv_stride_h = 1;
ck::index_t conv_stride_w = 1;
ck::index_t conv_dilation_h = 1;
ck::index_t conv_dilation_w = 1;
ck::index_t in_left_pad_h = 1;
ck::index_t in_left_pad_w = 1;
ck::index_t in_right_pad_h = 1;
ck::index_t in_right_pad_w = 1;
if(argc == 1)
{
data_type = 0;
init_method = 1;
}
else if(argc == 3)
{
data_type = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
}
else if(argc == 18)
{
data_type = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
N = std::stoi(argv[3]);
K = std::stoi(argv[4]);
C = std::stoi(argv[5]);
Y = std::stoi(argv[6]);
X = std::stoi(argv[7]);
Hi = std::stoi(argv[8]);
Wi = std::stoi(argv[9]);
conv_stride_h = std::stoi(argv[10]);
conv_stride_w = std::stoi(argv[11]);
conv_dilation_h = std::stoi(argv[12]);
conv_dilation_w = std::stoi(argv[13]);
in_left_pad_h = std::stoi(argv[14]);
in_left_pad_w = std::stoi(argv[15]);
in_right_pad_h = std::stoi(argv[16]);
in_right_pad_w = std::stoi(argv[17]);
}
else
{
printf("arg1: data type (0=fp32, 1=fp16)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3 to 17: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
"RightPx\n");
exit(1);
}
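// Example invocation reproducing the built-in defaults (fp32, integer init):
//   ./example_cpu_conv2d_fwd 0 1 2 256 192 3 3 71 71 1 1 1 1 1 1 1 1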
auto Run = [&](auto input_type, auto wei_type, auto out_type) {
using InDataType = decltype(input_type);
using WeiDataType = decltype(wei_type);
using OutDataType = decltype(out_type);
using ReferenceConvFwdInstance = ck::tensor_operation::host::ReferenceConvFwd<InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp>;
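// Standard conv output size: with effective filter extent
// YEff = (Y - 1) * dilation_h + 1, the output height is
// Ho = (Hi + pad_top + pad_bottom - YEff) / stride_h + 1 (likewise for W).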
const ck::index_t YEff = (Y - 1) * conv_dilation_h + 1;
const ck::index_t XEff = (X - 1) * conv_dilation_w + 1;
const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
const std::vector<ck::index_t> input_spatial_lengths{{Hi, Wi}};
const std::vector<ck::index_t> filter_spatial_lengths{{Y, X}};
const std::vector<ck::index_t> output_spatial_lengths{{Ho, Wo}};
const std::vector<ck::index_t> conv_filter_strides{{conv_stride_h, conv_stride_w}};
const std::vector<ck::index_t> conv_filter_dilations{{conv_dilation_h, conv_dilation_w}};
const std::vector<ck::index_t> input_left_pads{{in_left_pad_h, in_left_pad_w}};
const std::vector<ck::index_t> input_right_pads{{in_right_pad_h, in_right_pad_w}};
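// Host tensors use logical NCHW dimensions, but the strides below put C
// innermost, so the underlying memory is NHWC as the nhwc_* kernels expect.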
auto f_host_tensor_descriptor = [](std::size_t N_,
std::size_t C_,
std::size_t H_,
std::size_t W_) {
return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H_, W_}),
std::vector<std::size_t>({C_ * H_ * W_, 1, W_ * C_, C_}));
};
Tensor<InDataType> in_n_c_hi_wi(f_host_tensor_descriptor(N, C, Hi, Wi));
Tensor<WeiDataType> wei_k_c_y_x(f_host_tensor_descriptor(K, C, Y, X));
#if TEST_LAYOUT == TEST_LAYOUT_NHWC_KYXCK8_NHWK
Tensor<WeiDataType> wei_k_c_y_x_k8(
f_host_tensor_descriptor(K, C, Y, X)); // TODO: This is only to hold data
#endif
Tensor<OutDataType> out_n_k_ho_wo_host_result(f_host_tensor_descriptor(N, K, Ho, Wo));
Tensor<OutDataType> out_n_k_ho_wo_device_result(f_host_tensor_descriptor(N, K, Ho, Wo));
std::cout << "in (N, C, Hi, Wi): " << in_n_c_hi_wi.mDesc << std::endl;
std::cout << "wei(K, C, Y, X): " << wei_k_c_y_x.mDesc << std::endl;
std::cout << "out(N, K, Ho, Wo): " << out_n_k_ho_wo_host_result.mDesc << std::endl;
std::cout << "LPad(H, W):" << in_left_pad_h << "," << in_left_pad_w
<< ", RPad(H, W):" << in_right_pad_h << "," << in_right_pad_w
<< ", Stride(H, W):" << conv_stride_h << ", " << conv_stride_w
<< ", Dilation(H, W):" << conv_dilation_h << ", " << conv_dilation_w
<< ", Threads:" << omp_get_max_threads() << std::endl;
int per_pixel_check = 0;
switch(init_method)
{
case 0:
in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_1<InDataType>{});
wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_1<WeiDataType>{});
per_pixel_check = 1;
break;
case 1:
in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
// in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_1<InDataType>{});
wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
// wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_1<WeiDataType>{});
per_pixel_check = 1;
break;
case 2:
in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
break;
case 3:
#define PACK_32(v24, v16, v8, v0) \
(((v24 & 0xff) << 24) | ((v16 & 0xff) << 16) | ((v8 & 0xff) << 8) | ((v0 & 0xff) << 0))
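// init_method 3: pack the four coordinates into the bytes of each 32-bit
// value, so a mismatching element can be traced back to its (n, c, h, w) or
// (k, c, y, x) index when the buffers are dumped as hex.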
for(auto i_n = 0; i_n < N; i_n++)
{
for(auto i_c = 0; i_c < C; i_c++)
{
for(auto i_hi = 0; i_hi < Hi; i_hi++)
{
for(auto i_wi = 0; i_wi < Wi; i_wi++)
{
uint32_t v = PACK_32(i_n, i_c, i_hi, i_wi);
in_n_c_hi_wi(i_n, i_c, i_hi, i_wi) = *reinterpret_cast<float*>(&v);
}
}
}
}
for(auto i_k = 0; i_k < K; i_k++)
{
for(auto i_c = 0; i_c < C; i_c++)
{
for(auto i_y = 0; i_y < Y; i_y++)
{
for(auto i_x = 0; i_x < X; i_x++)
{
uint32_t v = PACK_32(i_k, i_c, i_y, i_x);
wei_k_c_y_x(i_k, i_c, i_y, i_x) = *reinterpret_cast<float*>(&v);
}
}
}
}
break;
default:
in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{0, 1});
wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-1, 1});
}
DeviceAlignedMemCPU in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpace(),
AVX2_DATA_ALIGNMENT);
DeviceAlignedMemCPU wei_device_buf(
sizeof(WeiDataType) * wei_k_c_y_x.mDesc.GetElementSpace(), AVX2_DATA_ALIGNMENT);
DeviceAlignedMemCPU out_device_buf(sizeof(OutDataType) *
out_n_k_ho_wo_host_result.mDesc.GetElementSpace(),
AVX2_DATA_ALIGNMENT);
in_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
#if TEST_LAYOUT == TEST_LAYOUT_NHWC_KYXC_NHWK
wei_device_buf.ToDevice(wei_k_c_y_x.mData.data());
#endif
#if TEST_LAYOUT == TEST_LAYOUT_NHWC_KYXCK8_NHWK
transpose_kyxc_2_kyxc8k(wei_k_c_y_x_k8, wei_k_c_y_x, K, Y, X, C);
wei_device_buf.ToDevice(wei_k_c_y_x_k8.mData.data());
#endif
// get host result
{
auto ref_conv = ReferenceConvFwdInstance{};
auto ref_invoker = ref_conv.MakeInvoker();
auto ref_argument = ref_conv.MakeArgument(in_n_c_hi_wi,
wei_k_c_y_x,
out_n_k_ho_wo_host_result,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
InElementOp{},
WeiElementOp{},
OutElementOp{});
ref_invoker.Run(ref_argument);
}
using PassThrough = ck::tensor_operation::cpu::element_wise::PassThrough;
using Relu = ck::tensor_operation::cpu::element_wise::Relu;
#if TEST_FUSION == TEST_FUSION_PASSTHROUGH
using DeviceConvFwdNoOpPtr = ck::tensor_operation::cpu::device::
DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>;
#endif
#if TEST_FUSION == TEST_FUSION_RELU
using DeviceConvFwdNoOpPtr =
ck::tensor_operation::cpu::device::DeviceConvFwdPtr<PassThrough, PassThrough, Relu>;
#endif
// add device Conv instances
std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
if constexpr(ck::is_same_v<ck::remove_cv_t<InDataType>, float> &&
ck::is_same_v<ck::remove_cv_t<WeiDataType>, float> &&
ck::is_same_v<ck::remove_cv_t<OutDataType>, float>)
{
#if TEST_LAYOUT == TEST_LAYOUT_NHWC_KYXC_NHWK
#if TEST_FUSION == TEST_FUSION_PASSTHROUGH
if(omp_get_max_threads() > 1)
{
ck::tensor_operation::cpu::device::device_conv2d_fwd_avx2_instance::
add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_mt(conv_ptrs);
ck::tensor_operation::cpu::device::device_conv2d_fwd_avx2_instance::
add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk(conv_ptrs);
}
else
{
if(K % 8 == 0)
ck::tensor_operation::cpu::device::device_conv2d_fwd_avx2_instance::
add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk(conv_ptrs);
else
ck::tensor_operation::cpu::device::device_conv2d_fwd_avx2_instance::
add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_local_c(conv_ptrs);
}
#endif
#if TEST_FUSION == TEST_FUSION_RELU
if(omp_get_max_threads() > 1)
{
ck::tensor_operation::cpu::device::device_conv2d_fwd_avx2_instance::
add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_mt_relu(conv_ptrs);
ck::tensor_operation::cpu::device::device_conv2d_fwd_avx2_instance::
add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_relu(conv_ptrs);
}
else
{
if(K % 8 == 0)
ck::tensor_operation::cpu::device::device_conv2d_fwd_avx2_instance::
add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_relu(conv_ptrs);
else
ck::tensor_operation::cpu::device::device_conv2d_fwd_avx2_instance::
add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_local_c_relu(conv_ptrs);
}
#endif
#endif
#if TEST_LAYOUT == TEST_LAYOUT_NHWC_KYXCK8_NHWK
#if TEST_FUSION == TEST_FUSION_PASSTHROUGH
if(omp_get_max_threads() > 1)
{
ck::tensor_operation::cpu::device::device_conv2d_fwd_avx2_instance::
add_device_conv2d_fwd_avx2_nhwc_kyxck8_nhwk_mt(conv_ptrs);
ck::tensor_operation::cpu::device::device_conv2d_fwd_avx2_instance::
add_device_conv2d_fwd_avx2_nhwc_kyxck8_nhwk(conv_ptrs);
}
else
{
if(K % 8 == 0)
ck::tensor_operation::cpu::device::device_conv2d_fwd_avx2_instance::
add_device_conv2d_fwd_avx2_nhwc_kyxck8_nhwk(conv_ptrs);
else
ck::tensor_operation::cpu::device::device_conv2d_fwd_avx2_instance::
add_device_conv2d_fwd_avx2_nhwc_kyxck8_nhwk_local_c(conv_ptrs);
}
#endif
#if TEST_FUSION == TEST_FUSION_RELU
if(omp_get_max_threads() > 1)
{
ck::tensor_operation::cpu::device::device_conv2d_fwd_avx2_instance::
add_device_conv2d_fwd_avx2_nhwc_kyxck8_nhwk_mt_relu(conv_ptrs);
ck::tensor_operation::cpu::device::device_conv2d_fwd_avx2_instance::
add_device_conv2d_fwd_avx2_nhwc_kyxck8_nhwk_relu(conv_ptrs);
}
else
{
if(K % 8 == 0)
ck::tensor_operation::cpu::device::device_conv2d_fwd_avx2_instance::
add_device_conv2d_fwd_avx2_nhwc_kyxck8_nhwk_relu(conv_ptrs);
else
ck::tensor_operation::cpu::device::device_conv2d_fwd_avx2_instance::
add_device_conv2d_fwd_avx2_nhwc_kyxck8_nhwk_local_c_relu(conv_ptrs);
}
#endif
#endif
}
if(conv_ptrs.empty())
{
throw std::runtime_error("wrong! no device Conv instance found");
}
// profile device Conv instances
bool success = true;
double fastest_kernel_time = std::numeric_limits<double>::max();
std::string fastest_kernel_name = "";
double fastest_kernel_gflops = 0;
for(auto& conv_ptr : conv_ptrs)
{
auto argument_ptr = conv_ptr->MakeArgumentPointer(
static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
N,
K,
C,
input_spatial_lengths,
filter_spatial_lengths,
output_spatial_lengths,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
InElementOp{},
WeiElementOp{},
OutElementOp{});
if(conv_ptr->IsSupportedArgument(argument_ptr.get()))
{
auto invoker_ptr = conv_ptr->MakeInvokerPointer();
double time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{}, 10);
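// Each output point takes C*Y*X multiply-accumulates; counting a MAC as two
// flops gives total_flop. Run() reports milliseconds, hence
// gflops = flop * 1e-6 / ms.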
double total_flop = static_cast<double>(2) * N * C * Ho * Wo * K * Y * X;
double gflops = (total_flop * 1e-6) / time;
out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data());
if(!check_out(out_n_k_ho_wo_host_result,
out_n_k_ho_wo_device_result,
1e-6,
per_pixel_check))
{
std::cout << "Fail Info: " << conv_ptr->GetTypeString() << std::endl;
success = false;
}
else
{
std::cout << "Pass Info: " << conv_ptr->GetTypeString() << ", Time:" << time
<< "ms, Gflops:" << gflops << std::endl;
if(time < fastest_kernel_time)
{
fastest_kernel_time = time;
fastest_kernel_name = conv_ptr->GetTypeString();
fastest_kernel_gflops = gflops;
}
}
}
else
{
std::cout << "Not support Info: " << conv_ptr->GetTypeString() << std::endl;
}
}
if(fastest_kernel_time != std::numeric_limits<double>::max())
{
std::cout << " fastest:" << fastest_kernel_name << ", time:" << fastest_kernel_time
<< "ms, Gflops:" << fastest_kernel_gflops << std::endl;
}
if(success)
{
std::cout << "test conv2d fwd cpu: Pass" << std::endl;
return 0;
}
else
{
std::cout << "test conv2d fwd cpu: Fail" << std::endl;
return -1;
}
};
if(data_type == 0)
{
return Run(F32(), F32(), F32());
}
else
{
return 1;
}
}
@@ -69,6 +69,4 @@ add_subdirectory(reduce)
add_subdirectory(conv2d_bwd_weight)
add_subdirectory(convnd_bwd_data)
add_subdirectory(cpu_ukernel)
add_subdirectory(cpu_threadwise_transfer)
add_subdirectory(convnd_fwd_cpu)
# DO NOT add client_app; it is tested independently via CI
add_test_executable(test_conv2d_fwd_cpu conv2d_fwd_cpu.cpp)
target_link_libraries(test_conv2d_fwd_cpu PRIVATE host_tensor)
target_link_libraries(test_conv2d_fwd_cpu PRIVATE device_conv2d_fwd_cpu_instance)
# CMake 3.13 introduces target_link_directories, which would be a better fit here
set_target_properties(test_conv2d_fwd_cpu PROPERTIES LINK_FLAGS "${OMP_LINK_FLAG}")
target_link_libraries(test_conv2d_fwd_cpu PRIVATE "${OMP_LIBRARY}")
target_compile_options(test_conv2d_fwd_cpu PRIVATE "${OMP_CXX_FLAG}")
add_test_executable(test_cpu_threadwise_transfer cpu_threadwise_transfer.cpp)
target_link_libraries(test_cpu_threadwise_transfer PRIVATE host_tensor)
# CMake 3.13 introduces target_link_directories, which would be a better fit here
set_target_properties(test_cpu_threadwise_transfer PROPERTIES LINK_FLAGS -Wl,-rpath,/opt/rocm/llvm/lib )
target_link_libraries(test_cpu_threadwise_transfer PRIVATE /opt/rocm/llvm/lib/libomp.so)
target_compile_options(test_cpu_threadwise_transfer PRIVATE -fopenmp=libomp -Wno-unused-command-line-argument)
#include <iostream>
#include <initializer_list>
#include <cstdlib>
#include <string>
#include <sstream>
#include <tuple>
#include <memory>
#include <half.hpp>
#include <omp.h>
#include "host_tensor.hpp"
#include "tensor_layout.hpp"
#include "device.hpp"
#include "config.hpp"
#include "print.hpp"
#include "cpuid.hpp"
#include "threadwise_tensor_slice_transfer_avx2.hpp"
#include "element_wise_operation_cpu.hpp"
#include "dynamic_buffer_cpu.hpp"
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
// using AType = half_float::half;
// using BType = half_float::half;
using AType = float;
using BType = float;
using CType = float;
#define NTStore false
using PassThrough = ck::tensor_operation::cpu::element_wise::PassThrough;
static inline int conv_out_size(int in_size, int pad, int dilation, int ksize, int stride)
{
return (in_size + 2 * pad - dilation * (ksize - 1) - 1) / stride + 1;
}
#define MC 16
#define NC 24
#define KC 32
#define IsInputPadded true
#define IsInputCBlockTranspose false
#define CBlockMVector 8
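// MC/NC/KC are GEMM cache-blocking tile sizes (only MC and KC are exercised
// here): one MC x KC tile of the implicit GEMM input matrix is staged into
// input_cblock_mem. CBlockMVector = 8 matches the 8-float AVX2 vector width
// used when the M dimension is packed.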
template <typename T>
static inline void dump_memory(T* ptr, ck::index_t elem)
{
for(ck::index_t i = 0; i < elem; i++)
{
std::cout << i << ": 0x" << std::hex << ptr[i] << std::dec << std::endl;
}
}
int main(int argc, char** argv)
{
int n = 2;
int hi = 8;
int wi = 6;
int c = 8;
int fy = 3;
int fx = 3;
int dy = 1;
int dx = 1;
int sy = 1;
int sx = 1;
int py = 0;
int px = 0;
if(argc > 12)
{
n = std::atoi(argv[1]);
hi = std::atoi(argv[2]);
wi = std::atoi(argv[3]);
c = std::atoi(argv[4]);
fy = std::atoi(argv[5]);
fx = std::atoi(argv[6]);
dy = std::atoi(argv[7]);
dx = std::atoi(argv[8]);
sy = std::atoi(argv[9]);
sx = std::atoi(argv[10]);
py = std::atoi(argv[11]);
px = std::atoi(argv[12]);
}
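// Example invocation reproducing the built-in defaults:
//   ./test_cpu_threadwise_transfer 2 8 6 8 3 3 1 1 1 1 0 0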
int ho = conv_out_size(hi, py, dy, fy, sy);
int wo = conv_out_size(wi, px, dx, fx, sx);
DeviceAlignedMemCPU input_mem(n * c * hi * wi * sizeof(AType), 32);
DeviceAlignedMemCPU input_cblock_mem(MC * KC * sizeof(AType), 32);
auto gen_input_buffer =
[&](AType* ptr, ck::index_t N, ck::index_t Hi, ck::index_t Wi, ck::index_t C) {
for(auto i_n = 0; i_n < N; i_n++)
{
for(auto i_hi = 0; i_hi < Hi; i_hi++)
{
for(auto i_wi = 0; i_wi < Wi; i_wi++)
{
for(auto i_c = 0; i_c < C; i_c++)
{
auto index = i_n * Hi * Wi * C + i_hi * Wi * C + i_wi * C + i_c;
auto value = ((i_n & 0xff) << 24) | ((i_hi & 0xff) << 16) |
((i_wi & 0xff) << 8) | ((i_c & 0xff) << 0);
ptr[index] = *reinterpret_cast<AType*>(&value);
}
}
}
}
};
gen_input_buffer(reinterpret_cast<AType*>(input_mem.mpDeviceBuf), n, hi, wi, c);
const auto input_desc = [&]() {
const auto in_n_hi_wi_c_grid_desc =
ck::make_naive_tensor_descriptor_packed(ck::make_tuple(n, hi, wi, c));
const auto in_n_hip_wip_c_grid_desc = ck::transform_tensor_descriptor(
in_n_hi_wi_c_grid_desc,
ck::make_tuple(ck::make_pass_through_transform(n),
ck::make_pad_transform(hi, py, py),
ck::make_pad_transform(wi, px, px),
ck::make_pass_through_transform(c)),
ck::make_tuple(
ck::Sequence<0>{}, ck::Sequence<1>{}, ck::Sequence<2>{}, ck::Sequence<3>{}),
ck::make_tuple(
ck::Sequence<0>{}, ck::Sequence<1>{}, ck::Sequence<2>{}, ck::Sequence<3>{}));
const auto in_n_y_ho_x_wo_c_grid_desc = ck::transform_tensor_descriptor(
in_n_hip_wip_c_grid_desc,
ck::make_tuple(ck::make_pass_through_transform(n),
ck::make_embed_transform(ck::make_tuple(fy, ho), ck::make_tuple(dy, sy)),
ck::make_embed_transform(ck::make_tuple(fx, wo), ck::make_tuple(dx, sx)),
ck::make_pass_through_transform(c)),
ck::make_tuple(
ck::Sequence<0>{}, ck::Sequence<1>{}, ck::Sequence<2>{}, ck::Sequence<3>{}),
ck::make_tuple(
ck::Sequence<0>{}, ck::Sequence<1, 2>{}, ck::Sequence<3, 4>{}, ck::Sequence<5>{}));
const auto in_gemm_m_k_grid_desc = ck::transform_tensor_descriptor(
in_n_y_ho_x_wo_c_grid_desc,
ck::make_tuple(ck::make_merge_transform(ck::make_tuple(n, ho, wo)),
ck::make_merge_transform(ck::make_tuple(fy, fx, c))),
ck::make_tuple(ck::Sequence<0, 2, 4>{}, ck::Sequence<1, 3, 5>{}),
ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}));
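// Implicit-GEMM (im2col) view of the input, built purely from descriptor
// transforms with no data movement: pad H/W, embed each (filter, output)
// window pair, then merge down to a 2D matrix with M = N*Ho*Wo rows and
// K = Y*X*C columns.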
if constexpr(IsInputPadded)
{
const auto gemm_m_raw = n * ho * wo;
const auto gemm_m_pad = ck::math::integer_least_multiple(gemm_m_raw, MC) - gemm_m_raw;
const auto gemm_k_raw = c * fy * fx;
const auto gemm_k_pad = ck::math::integer_least_multiple(gemm_k_raw, KC) - gemm_k_raw;
const auto in_gemm_pm_pk_grid_desc = ck::transform_tensor_descriptor(
in_gemm_m_k_grid_desc,
ck::make_tuple(ck::make_right_pad_transform(gemm_m_raw, gemm_m_pad),
ck::make_right_pad_transform(gemm_k_raw, gemm_k_pad)),
ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}),
ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}));
if constexpr(IsInputCBlockTranspose)
{
constexpr auto I0 = ck::Number<0>{};
constexpr auto I1 = ck::Number<1>{};
const auto in_gemm_pm0_pk_pm1 = ck::transform_tensor_descriptor(
in_gemm_pm_pk_grid_desc,
ck::make_tuple(
ck::make_unmerge_transform(ck::make_tuple(
in_gemm_pm_pk_grid_desc.GetLength(I0) / CBlockMVector, CBlockMVector)),
ck::make_pass_through_transform(in_gemm_pm_pk_grid_desc.GetLength(I1))),
ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}),
ck::make_tuple(ck::Sequence<0, 2>{}, ck::Sequence<1>{}));
return in_gemm_pm0_pk_pm1;
}
else
return in_gemm_pm_pk_grid_desc;
}
else
{
return in_gemm_m_k_grid_desc;
}
}();
const auto input_cblock_desc = [&]() {
if constexpr(IsInputCBlockTranspose)
{
const auto in_cblock_m_k_m8 = ck::make_naive_tensor_descriptor_packed(
ck::make_tuple(MC / CBlockMVector, KC, CBlockMVector));
return in_cblock_m_k_m8;
}
else
{
return ck::make_naive_tensor_descriptor_packed(ck::make_tuple(MC, KC));
}
}();
constexpr auto get_dim_access_order = []() {
if constexpr(IsInputCBlockTranspose)
return ck::Sequence<1, 0, 2>{};
else
return ck::Sequence<0, 1>{};
};
constexpr auto get_slice_length = []() {
if constexpr(IsInputCBlockTranspose)
return ck::Sequence<MC / CBlockMVector, KC, CBlockMVector>{};
else
return ck::Sequence<MC, KC>{};
};
using threadwise_transfer_t = ck::cpu::ThreadwiseTensorSliceTransferAvx2<
AType, // SrcData
AType, // DstData
decltype(input_desc), // SrcDesc
decltype(input_cblock_desc), // DstDesc
PassThrough, // ElementwiseOperation
decltype(get_slice_length()), // SliceLengths
decltype(get_dim_access_order()), // DimAccessOrder
1, // VectorDim
1, // ScalarPerVector
ck::InMemoryDataOperationEnum::Set, // InMemoryDataOperationEnum
false, // SrcResetCoordinateAfterRun
true // DstResetCoordinateAfterRun
>;
static constexpr ck::index_t nDim =
ck::remove_reference_t<decltype(input_desc)>::GetNumOfDimension();
input_desc.Print();
auto threadwise_transfer = threadwise_transfer_t{input_desc,
ck::make_zero_multi_index<nDim>(),
input_cblock_desc,
ck::make_zero_multi_index<nDim>(),
PassThrough{}};
auto input_buf = ck::cpu::make_dynamic_buffer<ck::AddressSpaceEnum::Global>(
static_cast<AType*>(input_mem.mpDeviceBuf), input_mem.mMemSize / sizeof(AType));
auto input_cblock = ck::cpu::make_dynamic_buffer<ck::AddressSpaceEnum::Global>(
static_cast<AType*>(input_cblock_mem.mpDeviceBuf),
input_cblock_mem.mMemSize / sizeof(AType));
constexpr auto fwd_move_step = []() {
if constexpr(IsInputCBlockTranspose)
return ck::make_multi_index(0, KC, 0); // m/8 * k * 8
else
return ck::make_multi_index(0, KC);
};
threadwise_transfer.RunGeneric(input_desc, input_buf, input_cblock_desc, input_cblock);
printf("----------------------\n");
threadwise_transfer.MoveSrcSliceWindow(input_desc, fwd_move_step());
// threadwise_transfer.RunGeneric(input_desc, input_buf , input_cblock_desc, input_cblock);
dump_memory(reinterpret_cast<uint32_t*>(input_mem.mpDeviceBuf),
input_mem.mMemSize / sizeof(AType));
std::cout << "======================" << std::endl;
dump_memory(reinterpret_cast<uint32_t*>(input_cblock_mem.mpDeviceBuf),
input_cblock_mem.mMemSize / sizeof(AType));
}