Commit c0f698d5 authored by carlushuang

Add test for threadwise transfer. Currently, static_ford in threadwise transfer cannot support a large MC*KC tile size.
parent e6ee6594
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <half.hpp>
#include "profile_cpu_conv_fwd_cpu_impl.hpp"
enum struct ConvDataType
{
F32_F32_F32, // 0
F16_F16_F16, // 1
};
enum struct ConvInputLayout
{
NCHW, // 0
NHWC, // 1
};
enum struct ConvWeightLayout
{
KCYX, // 0
KYXC, // 1
};
enum struct ConvOutputLayout
{
NKHW, // 0
NHWK, // 1
};
int profile_conv_fwd_cpu(int argc, char* argv[])
{
if(argc != 25)
{
printf("arg1: tensor operation (conv_fwd_cpu: ForwardConvolution)\n");
printf("arg2: data type (0: fp32; 1: fp16)\n");
printf("arg3: input tensor layout (0: NCHW; 1: NHWC)\n");
printf("arg4: weight tensor layout (0: KCYX; 1: KYXC)\n");
printf("arg5: output tensor layout (0: NKHW; 1: NHWK)\n");
printf("arg6: verification (0: no; 1: yes)\n");
printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg8: print tensor value (0: no; 1: yes)\n");
printf("arg9: run kernel # of times (>1)\n");
printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
"RightPx\n");
exit(1);
}
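    // hypothetical invocation (assumes the profiler binary is named ckProfiler; the
    // shape values below are illustrative only, not from this commit):
    //   ./ckProfiler conv_fwd_cpu 0 1 1 1 1 1 0 5 128 256 192 3 3 71 71 2 2 1 1 1 1 1 1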
const auto data_type = static_cast<ConvDataType>(std::stoi(argv[2]));
const auto in_layout = static_cast<ConvInputLayout>(std::stoi(argv[3]));
const auto wei_layout = static_cast<ConvWeightLayout>(std::stoi(argv[4]));
const auto out_layout = static_cast<ConvOutputLayout>(std::stoi(argv[5]));
const bool do_verification = std::stoi(argv[6]);
const int init_method = std::stoi(argv[7]);
const bool do_log = std::stoi(argv[8]);
const int nrepeat = std::stoi(argv[9]);
const ck::index_t N = std::stoi(argv[10]);
const ck::index_t K = std::stoi(argv[11]);
const ck::index_t C = std::stoi(argv[12]);
const ck::index_t Y = std::stoi(argv[13]);
const ck::index_t X = std::stoi(argv[14]);
const ck::index_t Hi = std::stoi(argv[15]);
const ck::index_t Wi = std::stoi(argv[16]);
const ck::index_t conv_stride_h = std::stoi(argv[17]);
const ck::index_t conv_stride_w = std::stoi(argv[18]);
const ck::index_t conv_dilation_h = std::stoi(argv[19]);
const ck::index_t conv_dilation_w = std::stoi(argv[20]);
const ck::index_t in_left_pad_h = std::stoi(argv[21]);
const ck::index_t in_left_pad_w = std::stoi(argv[22]);
const ck::index_t in_right_pad_h = std::stoi(argv[23]);
const ck::index_t in_right_pad_w = std::stoi(argv[24]);
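    // YEff/XEff are the dilated filter spans; output size follows the standard conv formula:
    //   Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1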
const ck::index_t YEff = (Y - 1) * conv_dilation_h + 1;
const ck::index_t XEff = (X - 1) * conv_dilation_w + 1;
const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
if(data_type == ConvDataType::F32_F32_F32 && in_layout == ConvInputLayout::NHWC &&
wei_layout == ConvWeightLayout::KYXC && out_layout == ConvOutputLayout::NHWK)
{
ck::profiler::profile_conv_fwd_cpu_impl<2,
float,
float,
float,
ck::tensor_layout::convolution::NHWC,
ck::tensor_layout::convolution::KYXC,
ck::tensor_layout::convolution::NHWK>(
do_verification,
init_method,
do_log,
nrepeat,
N,
K,
C,
std::vector<ck::index_t>{Hi, Wi},
std::vector<ck::index_t>{Y, X},
std::vector<ck::index_t>{Ho, Wo},
std::vector<ck::index_t>{conv_stride_h, conv_stride_w},
std::vector<ck::index_t>{conv_dilation_h, conv_dilation_w},
std::vector<ck::index_t>{in_left_pad_h, in_left_pad_w},
std::vector<ck::index_t>{in_right_pad_h, in_right_pad_w});
}
else
{
throw std::runtime_error("wrong! this Conv data_type & layout is not implemented");
}
return 1;
}
...@@ -69,6 +69,10 @@ int main(int argc, char* argv[])
    {
        return profile_conv_fwd_bias_relu_atomic_add(argc, argv);
    }
    else if(strcmp(argv[1], "conv_fwd_cpu") == 0)
    {
        return profile_conv_fwd_cpu(argc, argv);
    }
    else if(strcmp(argv[1], "conv1d_bwd_data") == 0)
    {
        return profile_convnd_bwd_data(argc, argv, 1);
...@@ -98,6 +102,7 @@ int main(int argc, char* argv[])
           " conv_fwd_bias_relu: ForwardConvolution+Bias+ReLU\n"
           " conv_fwd_bias_relu_add: ForwardConvolution+Bias+ReLU+Add\n"
           " conv_fwd_bias_relu_atomic_add: ForwardConvolution+Bias+ReLU+AtomicAdd\n"
           " conv_fwd_cpu: ForwardConvolution\n"
           " conv1d_bwd_data: BackwardConvolution data 1 dim\n"
           " conv2d_bwd_data: BackwardConvolution data 2 dim\n"
           " conv3d_bwd_data: BackwardConvolution data 3 dim\n"
......
#!/bin/bash
rm -f CMakeCache.txt
rm -f *.cmake
rm -rf CMakeFiles
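# CPU codegen flags: AVX2 + FMA + F16C; HALF_ENABLE_F16C_INTRINSICS=0 makes half.hpp
# use its software fp16 conversions instead of F16C intrinsics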
AVX2_FLAGS='-m64 -mavx2 -mf16c -mfma -DHALF_ENABLE_F16C_INTRINSICS=0'
rm -rf build/
mkdir build && cd build
MY_PROJECT_SOURCE=..
MY_PROJECT_INSTALL=../install.dir
rm -rf $MY_PROJECT_INSTALL
mkdir $MY_PROJECT_INSTALL
cmake \
-D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL} \
-D BUILD_DEV=OFF \
-D CMAKE_BUILD_TYPE=Release \
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 -ftemplate-backtrace-limit=0 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -gline-tables-only $AVX2_FLAGS " \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-D CMAKE_PREFIX_PATH=/opt/rocm \
-D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
${MY_PROJECT_SOURCE}
...@@ -10,7 +10,11 @@ include_directories(BEFORE
    ${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/warp
    ${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/thread
    ${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/element
    ${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/cpu/device
    ${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/cpu/grid
    ${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/cpu/block
    ${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/cpu/thread
    ${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/cpu/element
    ${PROJECT_SOURCE_DIR}/library/include/ck/library/host_tensor
    ${PROJECT_SOURCE_DIR}/library/include/ck/library/tensor_operation_instance
    ${PROJECT_SOURCE_DIR}/library/include/ck/library/tensor_operation_instance/gpu/reduce
...@@ -32,6 +36,7 @@ function(add_test_executable TEST_NAME)
    add_dependencies(check ${TEST_NAME})
endfunction(add_test_executable TEST_NAME)
add_subdirectory(magic_number_division)
add_subdirectory(space_filling_curve)
add_subdirectory(conv_util)
...@@ -45,3 +50,5 @@ add_subdirectory(grouped_gemm)
add_subdirectory(convnd_fwd)
add_subdirectory(reduce)
add_subdirectory(cpu_ukernel)
add_subdirectory(cpu_threadwise_transfer)
add_subdirectory(convnd_fwd_cpu)
add_test_executable(test_conv2d_fwd_cpu conv2d_fwd_cpu.cpp)
target_link_libraries(test_conv2d_fwd_cpu PRIVATE host_tensor)
target_link_libraries(test_conv2d_fwd_cpu PRIVATE device_conv2d_fwd_cpu_instance)
# CMake 3.13 introduces target_link_directories, which would be a better fit here
set_target_properties(test_conv2d_fwd_cpu PROPERTIES LINK_FLAGS -Wl,-rpath,/opt/rocm/llvm/lib )
target_link_libraries(test_conv2d_fwd_cpu PRIVATE /opt/rocm/llvm/lib/libomp.so)
target_compile_options(test_conv2d_fwd_cpu PRIVATE -fopenmp=libomp -Wno-unused-command-line-argument)
#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "tensor_layout.hpp"
#include "device_tensor.hpp"
#include "device_convnd_fwd_avx2_nhwc_kyxc_nhwk.hpp"
#include "element_wise_operation_cpu.hpp"
#include "reference_conv_fwd.hpp"
#include "element_wise_operation_cpu.hpp"
#include "dynamic_buffer_cpu.hpp"
#define AVX2_DATA_ALIGNMENT 32
namespace ck {
namespace tensor_operation {
namespace cpu {
namespace device {
namespace device_conv2d_fwd_avx2_instance {
void add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk(
    std::vector<DeviceConvFwdPtr<ck::tensor_operation::cpu::element_wise::PassThrough,
                                 ck::tensor_operation::cpu::element_wise::PassThrough,
                                 ck::tensor_operation::cpu::element_wise::PassThrough>>&
        instances);
} // namespace device_conv2d_fwd_avx2_instance
} // namespace device
} // namespace cpu
} // namespace tensor_operation
} // namespace ck
using InElementOp  = ck::tensor_operation::cpu::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::cpu::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::cpu::element_wise::PassThrough;
// F32/F16 are used by Run() below; assuming ck::half_t from config.hpp for the fp16 path
using F32 = float;
using F16 = ck::half_t;
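// absolute-error check: fails on the first element pair differing by more than 1e-6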
template <typename T>
static bool check_out(const Tensor<T>& ref, const Tensor<T>& result)
{
    float max_diff = 1e-6;
    for(std::size_t i = 0; i < ref.mData.size(); ++i)
    {
        float diff = std::abs(double(ref.mData[i]) - double(result.mData[i]));
        if(max_diff < diff)
        {
            return false;
        }
    }
    return true;
}
int main(int argc, char* argv[])
{
int data_type = 0;
int init_method = 0;
// Conv shape
ck::index_t N = 128;
ck::index_t K = 256;
ck::index_t C = 192;
ck::index_t Y = 3;
ck::index_t X = 3;
ck::index_t Hi = 71;
ck::index_t Wi = 71;
ck::index_t conv_stride_h = 2;
ck::index_t conv_stride_w = 2;
ck::index_t conv_dilation_h = 1;
ck::index_t conv_dilation_w = 1;
ck::index_t in_left_pad_h = 1;
ck::index_t in_left_pad_w = 1;
ck::index_t in_right_pad_h = 1;
ck::index_t in_right_pad_w = 1;
    if(argc == 1)
    {
        // default to fp32; only fp32 AVX2 instances are registered below
        data_type   = 0;
        init_method = 1;
    }
else if(argc == 3)
{
data_type = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
}
else if(argc == 18)
{
data_type = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
N = std::stoi(argv[3]);
K = std::stoi(argv[4]);
C = std::stoi(argv[5]);
Y = std::stoi(argv[6]);
X = std::stoi(argv[7]);
Hi = std::stoi(argv[8]);
Wi = std::stoi(argv[9]);
conv_stride_h = std::stoi(argv[10]);
conv_stride_w = std::stoi(argv[11]);
conv_dilation_h = std::stoi(argv[12]);
conv_dilation_w = std::stoi(argv[13]);
in_left_pad_h = std::stoi(argv[14]);
in_left_pad_w = std::stoi(argv[15]);
in_right_pad_h = std::stoi(argv[16]);
in_right_pad_w = std::stoi(argv[17]);
}
else
{
printf("arg1: data type (0=fp32, 1=fp16)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3 to 17: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
"RightPx\n");
exit(1);
}
auto Run = [&](auto input_type, auto wei_type, auto out_type, auto acc_type) {
using InDataType = decltype(input_type);
using WeiDataType = decltype(wei_type);
using OutDataType = decltype(out_type);
using AccDataType = decltype(acc_type);
        using ReferenceConvFwdInstance =
            ck::tensor_operation::host::ReferenceConvFwd<InDataType,
                                                         WeiDataType,
                                                         OutDataType,
                                                         InElementOp,
                                                         WeiElementOp,
                                                         OutElementOp>;
const ck::index_t YEff = (Y - 1) * conv_dilation_h + 1;
const ck::index_t XEff = (X - 1) * conv_dilation_w + 1;
const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
const std::vector<ck::index_t> input_spatial_lengths{{Hi, Wi}};
const std::vector<ck::index_t> filter_spatial_lengths{{Y, X}};
const std::vector<ck::index_t> output_spatial_lengths{{Ho, Wo}};
const std::vector<ck::index_t> conv_filter_strides{{conv_stride_h, conv_stride_w}};
const std::vector<ck::index_t> conv_filter_dilations{{conv_dilation_h, conv_dilation_w}};
const std::vector<ck::index_t> input_left_pads{{in_left_pad_h, in_left_pad_w}};
const std::vector<ck::index_t> input_right_pads{{in_right_pad_h, in_right_pad_w}};
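        // logical dims stay (N, C, H, W) but the strides below describe NHWC memory,
        // so the host tensors index like NCHW while matching the device data layout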
auto f_host_tensor_descriptor = [](std::size_t N_,
std::size_t C_,
std::size_t H_,
std::size_t W_) {
return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H_, W_}),
std::vector<std::size_t>({C_ * H_ * W_, 1, W_ * C_, C_}));
};
        Tensor<InDataType> in_n_hi_wi_c(f_host_tensor_descriptor(N, C, Hi, Wi));
        Tensor<WeiDataType> wei_k_y_x_c(f_host_tensor_descriptor(K, C, Y, X));
        Tensor<OutDataType> out_n_ho_wo_k_host_result(f_host_tensor_descriptor(N, K, Ho, Wo));
        Tensor<OutDataType> out_n_ho_wo_k_device_result(f_host_tensor_descriptor(N, K, Ho, Wo));
        std::cout << "in (N, C, Hi, Wi): " << in_n_hi_wi_c.mDesc << std::endl;
        std::cout << "wei(K, C, Y, X): " << wei_k_y_x_c.mDesc << std::endl;
        std::cout << "out(N, K, Ho, Wo): " << out_n_ho_wo_k_host_result.mDesc << std::endl;
        switch(init_method)
        {
        case 0: break;
        case 1:
            in_n_hi_wi_c.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
            wei_k_y_x_c.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
            break;
        case 2:
            in_n_hi_wi_c.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
            wei_k_y_x_c.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
            break;
        default:
            in_n_hi_wi_c.GenerateTensorValue(GeneratorTensor_1<InDataType>{1});
            wei_k_y_x_c.GenerateTensorValue(GeneratorTensor_1<WeiDataType>{1});
        }
        DeviceAlignedMemCPU in_device_buf(sizeof(InDataType) * in_n_hi_wi_c.mDesc.GetElementSpace(),
                                          AVX2_DATA_ALIGNMENT);
        DeviceAlignedMemCPU wei_device_buf(
            sizeof(WeiDataType) * wei_k_y_x_c.mDesc.GetElementSpace(), AVX2_DATA_ALIGNMENT);
        DeviceAlignedMemCPU out_device_buf(
            sizeof(OutDataType) * out_n_ho_wo_k_device_result.mDesc.GetElementSpace(),
            AVX2_DATA_ALIGNMENT);
        in_device_buf.ToDevice(in_n_hi_wi_c.mData.data());
        wei_device_buf.ToDevice(wei_k_y_x_c.mData.data());
        // reset output to zero
        out_n_ho_wo_k_device_result.GenerateTensorValue(GeneratorTensor_1<OutDataType>{0});
        out_device_buf.ToDevice(out_n_ho_wo_k_device_result.mData.data());
// get host result
{
            auto ref_conv     = ReferenceConvFwdInstance{};
            auto ref_invoker  = ref_conv.MakeInvoker();
            auto ref_argument = ref_conv.MakeArgument(in_n_hi_wi_c,
                                                      wei_k_y_x_c,
                                                      out_n_ho_wo_k_host_result,
                                                      conv_filter_strides,
                                                      conv_filter_dilations,
                                                      input_left_pads,
                                                      input_right_pads,
                                                      InElementOp{},
                                                      WeiElementOp{},
                                                      OutElementOp{});
ref_invoker.Run(ref_argument);
}
using PassThrough = ck::tensor_operation::cpu::element_wise::PassThrough;
        using DeviceConvFwdNoOpPtr = ck::tensor_operation::cpu::device::
            DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>;
// add device Conv instances
std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
if constexpr(ck::is_same_v<ck::remove_cv_t<InDataType>, float> &&
ck::is_same_v<ck::remove_cv_t<WeiDataType>, float> &&
ck::is_same_v<ck::remove_cv_t<OutDataType>, float>)
{
ck::tensor_operation::cpu::device::device_conv2d_fwd_avx2_instance::
add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk(conv_ptrs);
}
if(conv_ptrs.size() <= 0)
{
throw std::runtime_error("wrong! no device Conv instance found");
}
// profile device Conv instances
bool success = true;
for(auto& conv_ptr : conv_ptrs)
{
auto argument_ptr = conv_ptr->MakeArgumentPointer(
static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
N,
K,
C,
input_spatial_lengths,
filter_spatial_lengths,
output_spatial_lengths,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
InElementOp{},
WeiElementOp{},
OutElementOp{});
if(conv_ptr->IsSupportedArgument(argument_ptr.get()))
{
auto invoker_ptr = conv_ptr->MakeInvokerPointer();
invoker_ptr->Run(argument_ptr.get(), 1);
                out_device_buf.FromDevice(out_n_ho_wo_k_device_result.mData.data());
                if(!check_out(out_n_ho_wo_k_host_result, out_n_ho_wo_k_device_result))
{
std::cout << "Fail Info: " << conv_ptr->GetTypeString() << std::endl;
success = false;
}
else
{
std::cout << "Pass Info: " << conv_ptr->GetTypeString() << std::endl;
}
}
else
{
std::cout << "Not support Info: " << conv_ptr->GetTypeString() << std::endl;
}
}
if(success)
{
std::cout << "test conv2d fwd cpu : Pass" << std::endl;
return 0;
}
else
{
std::cout << "test conv2d fwd cpu: Fail " << std::endl;
return -1;
}
};
if(data_type == 0)
{
return Run(F32(), F32(), F32(), F32());
}
else if(data_type == 1)
{
return Run(F16(), F16(), F16(), F32());
}
else
{
return 1;
}
}
add_test_executable(test_cpu_threadwise_transfer cpu_threadwise_transfer.cpp)
target_link_libraries(test_cpu_threadwise_transfer PRIVATE host_tensor)
# CMake 3.13 introduces target_link_directories, which would be a better fit here
set_target_properties(test_cpu_threadwise_transfer PROPERTIES LINK_FLAGS -Wl,-rpath,/opt/rocm/llvm/lib )
target_link_libraries(test_cpu_threadwise_transfer PRIVATE /opt/rocm/llvm/lib/libomp.so)
target_compile_options(test_cpu_threadwise_transfer PRIVATE -fopenmp=libomp -Wno-unused-command-line-argument)
#include <iostream>
#include <initializer_list>
#include <cstdlib>
#include <string>
#include <sstream>
#include <tuple>
#include <memory>
#include <half.hpp>
#include <omp.h>
#include "host_tensor.hpp"
#include "tensor_layout.hpp"
#include "device.hpp"
#include "config.hpp"
#include "print.hpp"
#include "cpuid.hpp"
#include "threadwise_tensor_slice_transfer_avx2.hpp"
#include "element_wise_operation_cpu.hpp"
#include "dynamic_buffer_cpu.hpp"
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
// using AType = half_float::half;
// using BType = half_float::half;
using AType = float;
using BType = float;
using CType = float;
#define NTStore false
using PassThrough = ck::tensor_operation::cpu::element_wise::PassThrough;
static inline int conv_out_size(int in_size, int pad, int dilation, int ksize, int stride)
{
return (in_size + 2 * pad - dilation * (ksize - 1) - 1) / stride + 1;
}
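// cache-block tile sizes for the transfer: one MC x KC tile of the im2col input matrix
// (the commit message notes static_ford breaks down when MC*KC gets large)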
#define MC 16
#define NC 24
#define KC 32
#define IsInputPadded true
#define IsInputCBlockTranspose false
#define CBlockMVector 8
template <typename T>
static inline void dump_memory(T* ptr, ck::index_t elem)
{
for(ck::index_t i = 0; i < elem; i++)
{
std::cout << i << ": 0x" << std::hex << ptr[i] << std::dec << std::endl;
}
}
int main(int argc, char** argv)
{
int n = 2;
int hi = 8;
int wi = 6;
int c = 8;
int fy = 3;
int fx = 3;
int dy = 1;
int dx = 1;
int sy = 1;
int sx = 1;
int py = 0;
int px = 0;
if(argc > 12)
{
n = std::atoi(argv[1]);
hi = std::atoi(argv[2]);
wi = std::atoi(argv[3]);
c = std::atoi(argv[4]);
fy = std::atoi(argv[5]);
fx = std::atoi(argv[6]);
dy = std::atoi(argv[7]);
dx = std::atoi(argv[8]);
sy = std::atoi(argv[9]);
sx = std::atoi(argv[10]);
py = std::atoi(argv[11]);
px = std::atoi(argv[12]);
}
int ho = conv_out_size(hi, py, dy, fy, sy);
int wo = conv_out_size(wi, px, dx, fx, sx);
DeviceAlignedMemCPU input_mem(n * c * hi * wi * sizeof(AType), 32);
DeviceAlignedMemCPU input_cblock_mem(MC * KC * sizeof(AType), 32);
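    // pack (n, hi, wi, c) into the 4 bytes of each element so the raw hex output of
    // dump_memory() can be decoded back to coordinates by eye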
auto gen_input_buffer =
[&](AType* ptr, ck::index_t N, ck::index_t Hi, ck::index_t Wi, ck::index_t C) {
for(auto i_n = 0; i_n < N; i_n++)
{
for(auto i_hi = 0; i_hi < Hi; i_hi++)
{
for(auto i_wi = 0; i_wi < Wi; i_wi++)
{
for(auto i_c = 0; i_c < C; i_c++)
{
auto index = i_n * Hi * Wi * C + i_hi * Wi * C + i_wi * C + i_c;
auto value = ((i_n & 0xff) << 24) | ((i_hi & 0xff) << 16) |
((i_wi & 0xff) << 8) | ((i_c & 0xff) << 0);
ptr[index] = *reinterpret_cast<AType*>(&value);
}
}
}
}
};
gen_input_buffer(reinterpret_cast<AType*>(input_mem.mpDeviceBuf), n, hi, wi, c);
const auto input_desc = [&]() {
const auto in_n_hi_wi_c_grid_desc =
ck::make_naive_tensor_descriptor_packed(ck::make_tuple(n, hi, wi, c));
const auto in_n_hip_wip_c_grid_desc = ck::transform_tensor_descriptor(
in_n_hi_wi_c_grid_desc,
ck::make_tuple(ck::make_pass_through_transform(n),
ck::make_pad_transform(hi, py, py),
ck::make_pad_transform(wi, px, px),
ck::make_pass_through_transform(c)),
ck::make_tuple(
ck::Sequence<0>{}, ck::Sequence<1>{}, ck::Sequence<2>{}, ck::Sequence<3>{}),
ck::make_tuple(
ck::Sequence<0>{}, ck::Sequence<1>{}, ck::Sequence<2>{}, ck::Sequence<3>{}));
const auto in_n_y_ho_x_wo_c_grid_desc = ck::transform_tensor_descriptor(
in_n_hip_wip_c_grid_desc,
ck::make_tuple(ck::make_pass_through_transform(n),
ck::make_embed_transform(ck::make_tuple(fy, ho), ck::make_tuple(dy, sy)),
ck::make_embed_transform(ck::make_tuple(fx, wo), ck::make_tuple(dx, sx)),
ck::make_pass_through_transform(c)),
ck::make_tuple(
ck::Sequence<0>{}, ck::Sequence<1>{}, ck::Sequence<2>{}, ck::Sequence<3>{}),
ck::make_tuple(
ck::Sequence<0>{}, ck::Sequence<1, 2>{}, ck::Sequence<3, 4>{}, ck::Sequence<5>{}));
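        // im2col view: gemm_m = n*ho*wo (output pixels), gemm_k = fy*fx*c (filter window)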
const auto in_gemm_m_k_grid_desc = ck::transform_tensor_descriptor(
in_n_y_ho_x_wo_c_grid_desc,
ck::make_tuple(ck::make_merge_transform(ck::make_tuple(n, ho, wo)),
ck::make_merge_transform(ck::make_tuple(fy, fx, c))),
ck::make_tuple(ck::Sequence<0, 2, 4>{}, ck::Sequence<1, 3, 5>{}),
ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}));
if constexpr(IsInputPadded)
{
const auto gemm_m_raw = n * ho * wo;
const auto gemm_m_pad = ck::math::integer_least_multiple(gemm_m_raw, MC) - gemm_m_raw;
const auto gemm_k_raw = c * fy * fx;
const auto gemm_k_pad = ck::math::integer_least_multiple(gemm_k_raw, KC) - gemm_k_raw;
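            // right-pad gemm_m/gemm_k up to multiples of MC/KC so the slice window
            // always covers whole tiles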
const auto in_gemm_pm_pk_grid_desc = ck::transform_tensor_descriptor(
in_gemm_m_k_grid_desc,
ck::make_tuple(ck::make_right_pad_transform(gemm_m_raw, gemm_m_pad),
ck::make_right_pad_transform(gemm_k_raw, gemm_k_pad)),
ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}),
ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}));
if constexpr(IsInputCBlockTranspose)
{
constexpr auto I0 = ck::Number<0>{};
constexpr auto I1 = ck::Number<1>{};
const auto in_gemm_pm0_pk_pm1 = ck::transform_tensor_descriptor(
in_gemm_pm_pk_grid_desc,
ck::make_tuple(
ck::make_unmerge_transform(ck::make_tuple(
in_gemm_pm_pk_grid_desc.GetLength(I0) / CBlockMVector, CBlockMVector)),
ck::make_pass_through_transform(in_gemm_pm_pk_grid_desc.GetLength(I1))),
ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}),
ck::make_tuple(ck::Sequence<0, 2>{}, ck::Sequence<1>{}));
return in_gemm_pm0_pk_pm1;
}
else
return in_gemm_pm_pk_grid_desc;
}
else
{
return in_gemm_m_k_grid_desc;
}
}();
const auto input_cblock_desc = [&]() {
if constexpr(IsInputCBlockTranspose)
{
const auto in_cblock_m_k_m8 = ck::make_naive_tensor_descriptor_packed(
ck::make_tuple(MC / CBlockMVector, KC, CBlockMVector));
return in_cblock_m_k_m8;
}
else
{
return ck::make_naive_tensor_descriptor_packed(ck::make_tuple(MC, KC));
}
}();
constexpr auto get_dim_access_order = []() {
if constexpr(IsInputCBlockTranspose)
return ck::Sequence<1, 0, 2>{};
else
return ck::Sequence<0, 1>{};
};
constexpr auto get_slice_length = []() {
if constexpr(IsInputCBlockTranspose)
return ck::Sequence<MC / CBlockMVector, KC, CBlockMVector>{};
else
return ck::Sequence<MC, KC>{};
};
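    // dim 1 is the vector dim but ScalarPerVector is 1, so the slice window is copied
    // element by element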
using threadwise_transfer_t = ck::cpu::ThreadwiseTensorSliceTransferAvx2<
AType, // SrcData
AType, // DstData
decltype(input_desc), // SrcDesc
decltype(input_cblock_desc), // DstDesc
PassThrough, // ElementwiseOperation
decltype(get_slice_length()), // SliceLengths
decltype(get_dim_access_order()), // DimAccessOrder
1, // VectorDim
1, // ScalarPerVector
ck::InMemoryDataOperationEnum_t::Set, // InMemoryDataOperationEnum_t
false, // SrcResetCoordinateAfterRun
true // DstResetCoordinateAfterRun
>;
static constexpr ck::index_t nDim =
ck::remove_reference_t<decltype(input_desc)>::GetNumOfDimension();
auto threadwise_transfer = threadwise_transfer_t{input_desc,
ck::make_zero_multi_index<nDim>(),
input_cblock_desc,
ck::make_zero_multi_index<nDim>(),
PassThrough{}};
auto input_buf = ck::cpu::make_dynamic_buffer<ck::AddressSpaceEnum_t::Global>(
static_cast<AType*>(input_mem.mpDeviceBuf), input_mem.mMemSize / sizeof(AType));
auto input_cblock = ck::cpu::make_dynamic_buffer<ck::AddressSpaceEnum_t::Global>(
static_cast<AType*>(input_cblock_mem.mpDeviceBuf),
input_cblock_mem.mMemSize / sizeof(AType));
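    // advance the source window by one KC tile along gemm_k after each transfer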
constexpr auto fwd_move_step = []() {
if constexpr(IsInputCBlockTranspose)
return ck::make_multi_index(0, KC, 0); // m/8 * k * 8
else
return ck::make_multi_index(0, KC);
};
threadwise_transfer.RunGeneric(input_desc, input_buf, input_cblock_desc, input_cblock);
printf("----------------------\n");
threadwise_transfer.MoveSrcSliceWindow(input_desc, fwd_move_step());
// threadwise_transfer.RunGeneric(input_desc, input_buf , input_cblock_desc, input_cblock);
dump_memory(reinterpret_cast<uint32_t*>(input_mem.mpDeviceBuf),
input_mem.mMemSize / sizeof(AType));
std::cout << "======================" << std::endl;
dump_memory(reinterpret_cast<uint32_t*>(input_cblock_mem.mpDeviceBuf),
input_cblock_mem.mMemSize / sizeof(AType));
}
...@@ -235,42 +235,42 @@ void test_ukernel(ukenrel_t uk, ...@@ -235,42 +235,42 @@ void test_ukernel(ukenrel_t uk,
auto invoke_uk = [&](ck::cpu::ThreadwiseGemmParam& param, float* current_mat_c) { auto invoke_uk = [&](ck::cpu::ThreadwiseGemmParam& param, float* current_mat_c) {
if constexpr(std::is_same<Row, ALayout>::value && std::is_same<Row, BLayout>::value) if constexpr(std::is_same<Row, ALayout>::value && std::is_same<Row, BLayout>::value)
{ {
assert(m % uk.Mr_ == 0 && n == uk.Nr_); assert(m % uk.ThreadMr == 0 && n == uk.ThreadNr);
FloatA* p_a = mat_a; FloatA* p_a = mat_a;
float* p_c = current_mat_c; float* p_c = current_mat_c;
param.p_a = p_a; param.p_a = p_a;
param.p_c = p_c; param.p_c = p_c;
for(uint32_t i_m = 0; i_m < m; i_m += uk.Mr_) for(uint32_t i_m = 0; i_m < m; i_m += uk.ThreadMr)
{ {
uk.Run(&param); uk.Run(&param);
p_a += uk.Mr_ * k; p_a += uk.ThreadMr * k;
p_c += uk.Mr_ * n; p_c += uk.ThreadMr * n;
param.p_a = p_a; param.p_a = p_a;
param.p_c = p_c; param.p_c = p_c;
} }
} }
else if constexpr(std::is_same<Row, ALayout>::value && std::is_same<Col, BLayout>::value) else if constexpr(std::is_same<Row, ALayout>::value && std::is_same<Col, BLayout>::value)
{ {
assert(m % uk.Mr_ == 0 && n % uk.Nr_ == 0); assert(m % uk.ThreadMr == 0 && n % uk.ThreadNr == 0);
FloatA* p_a = mat_a; FloatA* p_a = mat_a;
float* p_c = current_mat_c; float* p_c = current_mat_c;
param.p_a = p_a; param.p_a = p_a;
param.p_b = mat_b; param.p_b = mat_b;
param.p_c = p_c; param.p_c = p_c;
for(uint32_t i_m = 0; i_m < m; i_m += uk.Mr_) for(uint32_t i_m = 0; i_m < m; i_m += uk.ThreadMr)
{ {
float* p_c_n = p_c; float* p_c_n = p_c;
FloatB* p_b_n = mat_b; FloatB* p_b_n = mat_b;
for(uint32_t i_n = 0; i_n < n; i_n += uk.Nr_) for(uint32_t i_n = 0; i_n < n; i_n += uk.ThreadNr)
{ {
uk.Run(&param); uk.Run(&param);
p_b_n += uk.Nr_ * k; // Nr_/8*k*8 p_b_n += uk.ThreadNr * k; // ThreadNr/8*k*8
p_c_n += uk.Nr_; p_c_n += uk.ThreadNr;
param.p_b = p_b_n; param.p_b = p_b_n;
param.p_c = p_c_n; param.p_c = p_c_n;
} }
p_a += uk.Mr_ * k; p_a += uk.ThreadMr * k;
p_c += uk.Mr_ * n; p_c += uk.ThreadMr * n;
param.p_a = p_a; param.p_a = p_a;
param.p_b = mat_b; param.p_b = mat_b;
param.p_c = p_c; param.p_c = p_c;
...@@ -278,28 +278,28 @@ void test_ukernel(ukenrel_t uk, ...@@ -278,28 +278,28 @@ void test_ukernel(ukenrel_t uk,
} }
else if constexpr(std::is_same<Col, ALayout>::value && std::is_same<Row, BLayout>::value) else if constexpr(std::is_same<Col, ALayout>::value && std::is_same<Row, BLayout>::value)
{ {
assert(m == uk.Mr_ && n == uk.Nr_); assert(m == uk.ThreadMr && n == uk.ThreadNr);
uk.Run(&param); uk.Run(&param);
} }
else else
{ {
assert(m % uk.Mr_ == 0 && n % uk.Nr_ == 0); assert(m % uk.ThreadMr == 0 && n % uk.ThreadNr == 0);
FloatB* p_b = mat_b; FloatB* p_b = mat_b;
float* p_c = current_mat_c; float* p_c = current_mat_c;
param.p_b = p_b; param.p_b = p_b;
param.p_c = p_c; param.p_c = p_c;
for(uint32_t i_n = 0; i_n < n; i_n += uk.Nr_) for(uint32_t i_n = 0; i_n < n; i_n += uk.ThreadNr)
{ {
uk.Run(&param); uk.Run(&param);
p_b += uk.Nr_ * k; // Nr_/8*k*8 p_b += uk.ThreadNr * k; // ThreadNr/8*k*8
p_c += uk.Nr_; p_c += uk.ThreadNr;
param.p_b = p_b; param.p_b = p_b;
param.p_c = p_c; param.p_c = p_c;
} }
} }
}; };
printf("gemm_uk_%dx%d_%c%c: ", uk.Mr_, uk.Nr_, ALayout::name[0], BLayout::name[0]); printf("gemm_uk_%dx%d_%c%c: ", uk.ThreadMr, uk.ThreadNr, ALayout::name[0], BLayout::name[0]);
fflush(stdout); fflush(stdout);
// printf("%s: ", typeid(uk).name());fflush(stdout); // printf("%s: ", typeid(uk).name());fflush(stdout);
...@@ -309,8 +309,8 @@ void test_ukernel(ukenrel_t uk, ...@@ -309,8 +309,8 @@ void test_ukernel(ukenrel_t uk,
{ {
int tid = omp_get_thread_num(); int tid = omp_get_thread_num();
DeviceAlignedMemCPU private_c_mem(m * n * sizeof(float), 32); DeviceAlignedMemCPU private_c_mem(m * n * sizeof(float), 32);
float* private_c = reinterpret_cast<float*>(private_c_mem.mpDeviceBuf); // float* private_c = reinterpret_cast<float*>(private_c_mem.mpDeviceBuf);
// float * private_c = mat_c + tid * m * n; float* private_c = mat_c + tid * m * n;
ck::cpu::ThreadwiseGemmParam param; ck::cpu::ThreadwiseGemmParam param;
param.p_a = mat_a; param.p_a = mat_a;
...@@ -386,10 +386,10 @@ void test_cpu_ukernel(float alpha, uint32_t m, uint32_t n, uint32_t k) ...@@ -386,10 +386,10 @@ void test_cpu_ukernel(float alpha, uint32_t m, uint32_t n, uint32_t k)
ck::static_for<0, std::tuple_size_v<thread_gemm_instance>, 1>{}([&](auto i) { ck::static_for<0, std::tuple_size_v<thread_gemm_instance>, 1>{}([&](auto i) {
using uk_type = std::tuple_element_t<i, thread_gemm_instance>; using uk_type = std::tuple_element_t<i, thread_gemm_instance>;
if(m % uk_type::Mr_ != 0 || n % uk_type::Nr_ != 0) if(m % uk_type::ThreadMr != 0 || n % uk_type::ThreadNr != 0)
return; return;
if((m != uk_type::Mr_ && std::is_same<typename uk_type::ALayout_, Col>::value) || if((m != uk_type::ThreadMr && std::is_same<typename uk_type::MatrixALayout, Col>::value) ||
(n != uk_type::Nr_ && std::is_same<typename uk_type::BLayout_, Row>::value)) (n != uk_type::ThreadNr && std::is_same<typename uk_type::MatrixBLayout, Row>::value))
// only k is the fast changing dim of A/B can we do muldiplt m, n // only k is the fast changing dim of A/B can we do muldiplt m, n
return; return;
......