Reorganize files, Part 1 (#119)

* delete obsolete files * move files * build * update cmake * update cmake * fix build * reorg examples * update cmake for example and test

Reorganize files, Part 1 (#119)
* delete obsolete files * move files * build * update cmake * update cmake * fix build * reorg examples * update cmake for example and test
5d37d7bf · Chao Liu · GitHub · 245f7414 · 5d37d7bf · 5d37d7bf
Unverified Commit 5d37d7bf authored Mar 08, 2022 by Chao Liu Committed by GitHub Mar 08, 2022
20 changed files
--- a/example/11_conv2d_bwd_wgt/CMakeLists.txt
+++ b/example/11_conv2d_bwd_wgt/CMakeLists.txt
+add_example_executable(example_conv2d_bwd_wgt_xdl conv2d_bwd_wgt_xdl.cpp)
--- a/example/13_conv2d_backward_weight_xdl/README.md
+++ b/example/13_conv2d_backward_weight_xdl/README.md
--- a/example/13_conv2d_backward_weight_xdl/main.cpp
+++ b/example/13_conv2d_backward_weight_xdl/main.cpp
--- a/example/12_reduce/CMakeLists.txt
+++ b/example/12_reduce/CMakeLists.txt
+add_example_executable(example_reduce_blockwise reduce_blockwise.cpp)
--- a/example/13_reduce_blockwise/reduce_blockwise.cpp
+++ b/example/13_reduce_blockwise/reduce_blockwise.cpp
@@ -14,7 +14,6 @@
 #include "device_reduce_blockwise.hpp"
 #include "host_reduce_util.hpp"
 #include "host_generic_reduction.hpp"
 #include "reduction_enums.hpp"
 #include "reduction_operator_mapping.hpp"

--- a/example/13_pool2d_fwd/CMakeLists.txt
+++ b/example/13_pool2d_fwd/CMakeLists.txt
+add_example_executable(example_pool2d_fwd pool2d_fwd.cpp)
--- a/example/12_pool2d_fwd/pool2d_fwd.cpp
+++ b/example/12_pool2d_fwd/pool2d_fwd.cpp
@@ -12,7 +12,7 @@
 #include "device_tensor.hpp"
 #include "tensor_layout.hpp"
 #include "reduction_operator.hpp"
-#include "device_operation/include/device_pool2d_fwd_nhwc_nhwc.hpp"
+#include "device_pool2d_fwd_nhwc_nhwc.hpp"
 using InDataType  = ck::half_t;
 using OutDataType = ck::half_t;

--- a/example/7_conv2d_fwd_xdl_bias_relu_atomic_add/README.md
+++ b/example/7_conv2d_fwd_xdl_bias_relu_atomic_add/README.md
-# Instructions for ```conv_xdl_bias_relu_add``` Example
-## Docker script
-```bash
-docker run                                                                   \
-it                                                                          \
--rm                                                                         \
--privileged                                                                 \
--group-add sudo                                                             \
-w /root/workspace                                                           \
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace                                \
-rocm/tensorflow:rocm4.3.1-tf2.6-dev                                          \
-/bin/bash
-```
-## Build ```conv_xdl_bias_relu_add```
-```bash
-mkdir build && cd build
-```
-```bash
-# Need to specify target ID, example below is gfx908
-cmake                                                                  \
-D BUILD_DEV=OFF                                                       \
-D CMAKE_BUILD_TYPE=Release                                            \
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 "   \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc                              \
-D CMAKE_PREFIX_PATH=/opt/rocm                                         \
-..
-```
-```bash
- make -j conv_xdl_bias_relu_add
-```
-## Run ```conv_xdl_bias_relu_add```
-```bash
-#arg1: verification (0=no, 1=yes)
-#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
-#arg3: run kernel # of times (>1)
-#arg4 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx
-./example/conv_xdl_bias_relu_add 0 1 5
-```
-Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16)
-```
-in_n_c_hi_wi: dim 4, lengths {128, 192, 71, 71}, strides {967872, 1, 13632, 192}
-wei_k_c_y_x: dim 4, lengths {256, 192, 3, 3}, strides {1728, 1, 576, 192}
-out_n_k_ho_wo: dim 4, lengths {128, 256, 36, 36}, strides {331776, 1, 9216, 256}
-bias_k: dim 1, lengths {256}, strides {1}
-resi_n_k_ho_wo: dim 4, lengths {128, 256, 36, 36}, strides {331776, 1, 9216, 256}
-arg.a_grid_desc_k0_m_k1_{216, 165888, 8}
-arg.b_grid_desc_k0_n_k1_{216, 256, 8}
-arg.c_grid_desc_m_n_{ 165888, 256}
-arg.c0_grid_desc_m_n_{ 165888, 256}
-arg.c1_grid_desc_m_n_{ 165888, 256}
-launch_and_time_kernel: grid_dim {1296, 1, 1}, block_dim {256, 1, 1}
-Warm up
-Start running 5 times...
-Perf: 1.71779 ms, 85.4396 TFlops, 194.2 GB/s
-```
--- a/example/7_conv2d_fwd_xdl_bias_relu_atomic_add/conv2d_fwd_xdl_bias_relu_atomic_add.cpp
+++ b/example/7_conv2d_fwd_xdl_bias_relu_atomic_add/conv2d_fwd_xdl_bias_relu_atomic_add.cpp
-#include <iostream>
-#include <numeric>
-#include <initializer_list>
-#include <cstdlib>
-#include <stdlib.h>
-#include <half.hpp>
-#include "config.hpp"
-#include "print.hpp"
-#include "device.hpp"
-#include "host_tensor.hpp"
-#include "host_tensor_generator.hpp"
-#include "device_tensor.hpp"
-#include "tensor_layout.hpp"
-#include "device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp"
-#include "element_wise_operation.hpp"
-#include "convolution_utility.hpp"
-using InDataType  = ck::half_t;
-using WeiDataType = ck::half_t;
-using OutDataType = ck::half_t;
-using AccDataType = float;
-template <ck::index_t... Is>
-using S = ck::Sequence<Is...>;
-using InLayout  = ck::tensor_layout::convolution::NHWC;
-using WeiLayout = ck::tensor_layout::convolution::KYXC;
-using OutLayout = ck::tensor_layout::convolution::NHWK;
-using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
-using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
-using OutElementOp = ck::tensor_operation::element_wise::AddRelu;
-static constexpr auto MemoryAtomicAdd = ck::InMemoryDataOperationEnum_t::AtomicAdd;
-static constexpr auto ConvFwdDefault =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Default;
-// clang-format off
-using DeviceConvFwdInstance = ck::tensor_operation::device::
-    DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
-    // clang-format off
-//      |    InData|     WeiData|     OutData|     AccData|          In|         Wei|           Out|             Out|    ConvForward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|
-//      |      Type|        Type|        Type|        Type| Elementwise| Elementwise|   Elementwise|    GlobalMemory| Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
-//      |          |            |            |            |   Operation|   Operation|     Operation|   DataOperation|               |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
-//      |          |            |            |            |            |            |              |                |               |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
-        <InDataType, WeiDataType, OutDataType, AccDataType, InElementOp, WeiElementOp, OutElementOp, MemoryAtomicAdd, ConvFwdDefault,   256,   128,   256,     4,  8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1,  8, 1, 1,32>,               2>;
-// clang-format on
-template <typename TIn,
-          typename TWei,
-          typename TOut,
-          typename InElementOp,
-          typename WeiElementOp,
-          typename OutElementOp>
-void host_reference_calculation(const Tensor<TIn>& in_n_c_hi_wi,
-                                const Tensor<TWei>& wei_k_c_y_x,
-                                Tensor<TOut>& out_n_k_ho_wo,
-                                const Tensor<TOut>& bias_k,
-                                const std::vector<ck::index_t>& conv_strides,
-                                const std::vector<ck::index_t>& conv_dilations,
-                                const std::vector<ck::index_t>& in_left_pads,
-                                const std::vector<ck::index_t>& /* in_right_pads */,
-                                const InElementOp& in_element_op,
-                                const WeiElementOp& wei_element_op,
-                                const OutElementOp& out_element_op)
-{
-    auto f_nchw = [&](auto n, auto k, auto ho, auto wo) {
-        float v_acc = 0;
-        for(int c = 0; c < wei_k_c_y_x.mDesc.GetLengths()[1]; ++c)
-        {
-            for(int y = 0; y < wei_k_c_y_x.mDesc.GetLengths()[2]; ++y)
-            {
-                int hi = ho * conv_strides[0] + y * conv_dilations[0] - in_left_pads[0];
-                for(int x = 0; x < wei_k_c_y_x.mDesc.GetLengths()[3]; ++x)
-                {
-                    int wi = wo * conv_strides[1] + x * conv_dilations[1] - in_left_pads[1];
-                    if(hi >= 0 && hi < in_n_c_hi_wi.mDesc.GetLengths()[2] && wi >= 0 &&
-                       wi < in_n_c_hi_wi.mDesc.GetLengths()[3])
-                    {
-                        float v_in;
-                        float v_wei;
-                        in_element_op(v_in, static_cast<const float>(in_n_c_hi_wi(n, c, hi, wi)));
-                        wei_element_op(v_wei, static_cast<const float>(wei_k_c_y_x(k, c, y, x)));
-                        v_acc += v_in * v_wei;
-                    }
-                }
-            }
-        }
-        float v_out;
-        out_element_op(v_out, v_acc, static_cast<float>(bias_k(k)));
-        out_n_k_ho_wo(n, k, ho, wo) += v_out;
-    };
-    make_ParallelTensorFunctor(f_nchw,
-                               out_n_k_ho_wo.mDesc.GetLengths()[0],
-                               out_n_k_ho_wo.mDesc.GetLengths()[1],
-                               out_n_k_ho_wo.mDesc.GetLengths()[2],
-                               out_n_k_ho_wo.mDesc.GetLengths()[3])(
-        std::thread::hardware_concurrency());
-}
-int main(int argc, char* argv[])
-{
-    bool do_verification = 0;
-    int init_method      = 0;
-    int nrepeat          = 5;
-    // Conv shape
-    ck::index_t N               = 128;
-    ck::index_t K               = 256;
-    ck::index_t C               = 192;
-    ck::index_t Y               = 3;
-    ck::index_t X               = 3;
-    ck::index_t Hi              = 71;
-    ck::index_t Wi              = 71;
-    ck::index_t conv_stride_h   = 2;
-    ck::index_t conv_stride_w   = 2;
-    ck::index_t conv_dilation_h = 1;
-    ck::index_t conv_dilation_w = 1;
-    ck::index_t in_left_pad_h   = 1;
-    ck::index_t in_left_pad_w   = 1;
-    ck::index_t in_right_pad_h  = 1;
-    ck::index_t in_right_pad_w  = 1;
-    if(argc == 4)
-    {
-        do_verification = std::stoi(argv[1]);
-        init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
-    }
-    else if(argc == 19)
-    {
-        do_verification = std::stoi(argv[1]);
-        init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
-        N               = std::stoi(argv[4]);
-        K               = std::stoi(argv[5]);
-        C               = std::stoi(argv[6]);
-        Y               = std::stoi(argv[7]);
-        X               = std::stoi(argv[8]);
-        Hi              = std::stoi(argv[9]);
-        Wi              = std::stoi(argv[10]);
-        conv_stride_h   = std::stoi(argv[11]);
-        conv_stride_w   = std::stoi(argv[12]);
-        conv_dilation_h = std::stoi(argv[13]);
-        conv_dilation_w = std::stoi(argv[14]);
-        in_left_pad_h   = std::stoi(argv[15]);
-        in_left_pad_w   = std::stoi(argv[16]);
-        in_right_pad_h  = std::stoi(argv[17]);
-        in_right_pad_w  = std::stoi(argv[18]);
-    }
-    else
-    {
-        printf("arg1: verification (0=no, 1=yes)\n");
-        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg3: run kernel # of times (>1)\n");
-        printf("arg4 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
-               "RightPx\n");
-        exit(0);
-    }
-    const std::vector<ck::index_t> conv_filter_strides{conv_stride_h, conv_stride_w};
-    const std::vector<ck::index_t> conv_filter_dilations{conv_dilation_h, conv_dilation_w};
-    const std::vector<ck::index_t> input_left_pads{in_left_pad_h, in_left_pad_w};
-    const std::vector<ck::index_t> input_right_pads{in_right_pad_h, in_right_pad_w};
-    const auto output_spatial_lengths =
-        ck::tensor_operation::ConvolutionUtility::ComputeOutputSpatialLengths({Hi, Wi},
-                                                                              {Y, X},
-                                                                              conv_filter_strides,
-                                                                              conv_filter_dilations,
-                                                                              input_left_pads,
-                                                                              input_right_pads);
-    const ck::index_t Ho = output_spatial_lengths[0];
-    const ck::index_t Wo = output_spatial_lengths[1];
-    // tensor layout
-    auto f_host_tensor_descriptor = [](std::size_t N_,
-                                       std::size_t C_,
-                                       std::size_t H,
-                                       std::size_t W,
-                                       auto layout) {
-        if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NCHW>::value ||
-                     ck::is_same<decltype(layout), ck::tensor_layout::convolution::KCYX>::value ||
-                     ck::is_same<decltype(layout), ck::tensor_layout::convolution::NKHW>::value)
-        {
-            return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
-                                        std::vector<std::size_t>({C_ * H * W, H * W, W, 1}));
-        }
-        else if constexpr(ck::is_same<decltype(layout),
-                                      ck::tensor_layout::convolution::NHWC>::value ||
-                          ck::is_same<decltype(layout),
-                                      ck::tensor_layout::convolution::KYXC>::value ||
-                          ck::is_same<decltype(layout),
-                                      ck::tensor_layout::convolution::NHWK>::value)
-        {
-            return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
-                                        std::vector<std::size_t>({C_ * H * W, 1, W * C_, C_}));
-        }
-    };
-    Tensor<InDataType> in_n_c_hi_wi(f_host_tensor_descriptor(N, C, Hi, Wi, InLayout{}));
-    Tensor<WeiDataType> wei_k_c_y_x(f_host_tensor_descriptor(K, C, Y, X, WeiLayout{}));
-    Tensor<OutDataType> out_n_k_ho_wo_host_result(
-        f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));
-    Tensor<OutDataType> out_n_k_ho_wo_device_result(
-        f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));
-    // bias: assume contiguous 1d vector
-    Tensor<OutDataType> bias_k(
-        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(K)})));
-    std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl;
-    std::cout << "wei_k_c_y_x: " << wei_k_c_y_x.mDesc << std::endl;
-    std::cout << "out_n_k_ho_wo: " << out_n_k_ho_wo_host_result.mDesc << std::endl;
-    std::cout << "bias_k: " << bias_k.mDesc << std::endl;
-    switch(init_method)
-    {
-    case 0: break;
-    case 1:
-        in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
-        wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
-        out_n_k_ho_wo_host_result.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
-        bias_k.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
-        break;
-    default:
-        in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
-        wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
-        out_n_k_ho_wo_host_result.GenerateTensorValue(GeneratorTensor_3<OutDataType>{-0.5, 0.5});
-        bias_k.GenerateTensorValue(GeneratorTensor_3<OutDataType>{0.0, 1.0});
-    }
-    DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpace());
-    DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_k_c_y_x.mDesc.GetElementSpace());
-    DeviceMem out_device_buf(sizeof(OutDataType) *
-                             out_n_k_ho_wo_device_result.mDesc.GetElementSpace());
-    DeviceMem bias_device_buf(sizeof(OutDataType) * bias_k.mDesc.GetElementSpace());
-    in_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
-    wei_device_buf.ToDevice(wei_k_c_y_x.mData.data());
-    out_device_buf.ToDevice(out_n_k_ho_wo_host_result.mData.data());
-    bias_device_buf.ToDevice(bias_k.mData.data());
-    auto conv    = DeviceConvFwdInstance{};
-    auto invoker = conv.MakeInvoker();
-    auto argument =
-        conv.MakeArgument(static_cast<const InDataType*>(in_device_buf.GetDeviceBuffer()),
-                          static_cast<const WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
-                          static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
-                          static_cast<const OutDataType*>(bias_device_buf.GetDeviceBuffer()),
-                          N,
-                          K,
-                          C,
-                          std::vector<ck::index_t>{Hi, Wi},
-                          std::vector<ck::index_t>{Y, X},
-                          std::vector<ck::index_t>{Ho, Wo},
-                          conv_filter_strides,
-                          conv_filter_dilations,
-                          input_left_pads,
-                          input_right_pads,
-                          InElementOp{},
-                          WeiElementOp{},
-                          OutElementOp{});
-    if(!conv.IsSupportedArgument(argument))
-    {
-        throw std::runtime_error(
-            "wrong! device operator with the specified compilation parameters does "
-            "not support this problem");
-    }
-    float ave_time = invoker.Run(argument, nrepeat);
-    std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
-    std::size_t num_btype = sizeof(InDataType) * (N * C * Hi * Wi) +
-                            sizeof(WeiDataType) * (K * C * Y * X) +
-                            sizeof(OutDataType) * (N * K * Ho * Wo) + sizeof(OutDataType) * (K);
-    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
-    float gb_per_sec = num_btype / 1.E6 / ave_time;
-    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
-              << std::endl;
-    if(do_verification)
-    {
-        host_reference_calculation(in_n_c_hi_wi,
-                                   wei_k_c_y_x,
-                                   out_n_k_ho_wo_host_result,
-                                   bias_k,
-                                   conv_filter_strides,
-                                   conv_filter_dilations,
-                                   input_left_pads,
-                                   input_right_pads,
-                                   InElementOp{},
-                                   WeiElementOp{},
-                                   OutElementOp{});
-        out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data());
-        check_error(out_n_k_ho_wo_host_result, out_n_k_ho_wo_device_result);
-    }
-}
--- a/example/9_conv2d_fwd_xdl_int8/README.md
+++ b/example/9_conv2d_fwd_xdl_int8/README.md
-# Instructions for ```conv2d_fwd_xdl``` Example
-## Docker script
-```bash
-docker run                                                                   \
-it                                                                          \
--rm                                                                         \
--privileged                                                                 \
--group-add sudo                                                             \
-w /root/workspace                                                           \
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace                                \
-rocm/tensorflow:rocm4.3.1-tf2.6-dev                                          \
-/bin/bash
-```
-## Build ```conv2d_fwd_xdl```
-```bash
-mkdir build && cd build
-```
-```bash
-# Need to specify target ID, example below is gfx908
-cmake                                                                  \
-D BUILD_DEV=OFF                                                       \
-D CMAKE_BUILD_TYPE=Release                                            \
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 "   \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc                              \
-D CMAKE_PREFIX_PATH=/opt/rocm                                         \
-..
-```
-```bash
- make -j conv2d_fwd_xdl
-```
-## Run ```conv2d_fwd_xdl_int8```
-```bash
-#arg1: verification (0=no, 1=yes)
-#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
-#arg3: run kernel # of times (>1)
-#arg4 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx
-./example/conv2d_fwd_xdl_int8 0 1 5
-```
-Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16)
-```
-in_n_c_hi_wi: dim 4, lengths {128, 192, 71, 71}, strides {967872, 1, 13632, 192}
-wei_k_c_y_x: dim 4, lengths {256, 192, 3, 3}, strides {1728, 1, 576, 192}
-out_n_k_ho_wo: dim 4, lengths {128, 256, 36, 36}, strides {331776, 1, 9216, 256}
-arg.a_grid_desc_k0_m_k1_{216, 165888, 8}
-arg.b_grid_desc_k0_n_k1_{216, 256, 8}
-arg.c_grid_desc_m_n_{ 165888, 256}
-launch_and_time_kernel: grid_dim {1296, 1, 1}, block_dim {256, 1, 1}
-Warm up
-Start running 5 times...
-Perf: 1.43206 ms, 102.486 TFlops, 232.947 GB/s
-```
--- a/example/CMakeLists.txt
+++ b/example/CMakeLists.txt
 include_directories(BEFORE
-    ${PROJECT_SOURCE_DIR}
+    ${PROJECT_SOURCE_DIR}/include/ck
-    ${PROJECT_SOURCE_DIR}/host/host_tensor/include
+    ${PROJECT_SOURCE_DIR}/include/ck/utility
-    ${PROJECT_SOURCE_DIR}/host/device/include
+    ${PROJECT_SOURCE_DIR}/include/ck/tensor_description
-    ${PROJECT_SOURCE_DIR}/device_operation/include
+    ${PROJECT_SOURCE_DIR}/include/ck/tensor
-    ${PROJECT_SOURCE_DIR}/reference_operation/include
+    ${PROJECT_SOURCE_DIR}/include/ck/problem_transform
-    ${PROJECT_SOURCE_DIR}/composable_kernel/include
+    ${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/device
-    ${PROJECT_SOURCE_DIR}/composable_kernel/include/utility
+    ${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/grid
-    ${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_description
+    ${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/block
-    ${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_operation
+    ${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/warp
-    ${PROJECT_SOURCE_DIR}/composable_kernel/include/problem_transform
+    ${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/thread
-    ${PROJECT_SOURCE_DIR}/external/rocm/include
+    ${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/element
-    ${PROJECT_SOURCE_DIR}/device_operation_reference/include
+    ${PROJECT_SOURCE_DIR}/library/include/ck/library/host_tensor
+    ${PROJECT_SOURCE_DIR}/library/include/ck/library/reference_tensor_operation/cpu
+    ${PROJECT_SOURCE_DIR}/library/include/ck/library/reference_tensor_operation/gpu
+    ${PROJECT_SOURCE_DIR}/external/include/half
 )
-set(GEMM_XDL_SOURCE 1_gemm_xdl/gemm_xdl.cpp)
+add_custom_target(examples)
-set(GEMM_XDL_INT8_SOURCE 1_gemm_xdl/gemm_xdl_int8.cpp)
-set(GEMM_XDL_BF16_SOURCE 1_gemm_xdl/gemm_xdl_bf16.cpp)
-set(GEMM_XDL_BIAS_RELU_SOURCE 2_gemm_xdl_bias_relu/gemm_xdl_bias_relu.cpp)
-set(GEMM_XDL_BIAS_RELU_ADD_SOURCE 3_gemm_xdl_bias_relu_add/gemm_xdl_bias_relu_add.cpp)
-set(CONV2D_FWD_XDL_SOURCE 4_conv2d_fwd_xdl/conv2d_fwd_xdl.cpp)
-set(CONV2D_FWD_XDL_BIAS_RELU_SOURCE 5_conv2d_fwd_xdl_bias_relu/conv2d_fwd_xdl_bias_relu.cpp)
-set(CONV2D_FWD_XDL_BIAS_RELU_ADD_SOURCE 6_conv2d_fwd_xdl_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp)
-set(CONV2D_FWD_XDL_BIAS_RELU_ATOMIC_ADD_SOURCE 7_conv2d_fwd_xdl_bias_relu_atomic_add/conv2d_fwd_xdl_bias_relu_atomic_add.cpp)
-set(GEMM_XDL_ALPHA_BETA_SOURCE 8_gemm_xdl_alpha_beta/gemm_xdl_alpha_beta.cpp)
-set(CONV2D_FWD_XDL_INT8_SOURCE 9_conv2d_fwd_xdl_int8/conv2d_fwd_xdl_int8.cpp)
-set(CONV2D_WRW_XDL_SOURCE 13_conv2d_backward_weight_xdl/main.cpp)
-set(CONV3D_FWD_XDL_SOURCE 10_conv3d_fwd_xdl/conv3d_fwd_xdl.cpp)
-set(CONVND_FWD_XDL_SOURCE 11_convnd_fwd_xdl/convnd_fwd_xdl.cpp)
-set(CONV2D_BWD_DATA_XDL_SOURCE 12_conv2d_bwd_data_xdl/conv2d_bwd_data_xdl.cpp)
-set(POOL2D_FWD_SOURCE 12_pool2d_fwd/pool2d_fwd.cpp)
-set(REDUCE_BLOCKWISE_SOURCE 13_reduce_blockwise/reduce_blockwise.cpp)
-add_executable(gemm_xdl ${GEMM_XDL_SOURCE})
+function(add_example_executable EXAMPLE_NAME)
-add_executable(gemm_xdl_int8 ${GEMM_XDL_INT8_SOURCE})
+    message("adding example ${EXAMPLE_NAME}")
-add_executable(gemm_xdl_bf16 ${GEMM_XDL_BF16_SOURCE})
+    add_executable(${EXAMPLE_NAME} ${ARGN})
-add_executable(gemm_xdl_bias_relu ${GEMM_XDL_BIAS_RELU_SOURCE})
+    target_link_libraries(${EXAMPLE_NAME} PRIVATE host_tensor)
-add_executable(gemm_xdl_bias_relu_add ${GEMM_XDL_BIAS_RELU_ADD_SOURCE})
+    add_dependencies(examples ${EXAMPLE_NAME})
-add_executable(conv2d_fwd_xdl ${CONV2D_FWD_XDL_SOURCE})
+endfunction(add_example_executable EXAMPLE_NAME)
-add_executable(conv2d_fwd_xdl_bias_relu ${CONV2D_FWD_XDL_BIAS_RELU_SOURCE})
-add_executable(conv2d_fwd_xdl_bias_relu_add ${CONV2D_FWD_XDL_BIAS_RELU_ADD_SOURCE})
-add_executable(conv2d_fwd_xdl_bias_relu_atomic_add ${CONV2D_FWD_XDL_BIAS_RELU_ATOMIC_ADD_SOURCE})
-add_executable(gemm_xdl_alpha_beta ${GEMM_XDL_ALPHA_BETA_SOURCE})
-add_executable(conv2d_fwd_xdl_int8 ${CONV2D_FWD_XDL_INT8_SOURCE})
-add_executable(conv2d_wrw_xdl ${CONV2D_WRW_XDL_SOURCE})
-add_executable(conv3d_fwd_xdl ${CONV3D_FWD_XDL_SOURCE})
-add_executable(convnd_fwd_xdl ${CONVND_FWD_XDL_SOURCE})
-add_executable(conv2d_bwd_data_xdl ${CONV2D_BWD_DATA_XDL_SOURCE})
-add_executable(pool2d_fwd ${POOL2D_FWD_SOURCE})
-add_executable(reduce_blockwise ${REDUCE_BLOCKWISE_SOURCE})
-target_link_libraries(gemm_xdl PRIVATE host_tensor)
-target_link_libraries(gemm_xdl_int8 PRIVATE host_tensor)
-target_link_libraries(gemm_xdl_bf16 PRIVATE host_tensor)
-target_link_libraries(gemm_xdl_bias_relu PRIVATE host_tensor)
-target_link_libraries(gemm_xdl_bias_relu_add PRIVATE host_tensor)
-target_link_libraries(conv2d_fwd_xdl PRIVATE host_tensor)
-target_link_libraries(conv2d_fwd_xdl_bias_relu PRIVATE host_tensor)
-target_link_libraries(conv2d_fwd_xdl_bias_relu_add PRIVATE host_tensor)
-target_link_libraries(conv2d_fwd_xdl_bias_relu_atomic_add PRIVATE host_tensor)
-target_link_libraries(gemm_xdl_alpha_beta PRIVATE host_tensor)
-target_link_libraries(conv2d_fwd_xdl_int8 PRIVATE host_tensor)
-target_link_libraries(conv2d_wrw_xdl PRIVATE host_tensor)
-target_link_libraries(conv3d_fwd_xdl PRIVATE host_tensor)
-target_link_libraries(convnd_fwd_xdl PRIVATE host_tensor)
-target_link_libraries(conv2d_bwd_data_xdl PRIVATE host_tensor)
-target_link_libraries(pool2d_fwd PRIVATE host_tensor)
-target_link_libraries(reduce_blockwise PRIVATE host_tensor)
+add_subdirectory(01_gemm)
+add_subdirectory(02_gemm_alpha_beta)
+add_subdirectory(03_gemm_bias_relu)
+add_subdirectory(04_gemm_bias_relu_add)
+add_subdirectory(05_conv2d_fwd)
+add_subdirectory(06_conv2d_fwd_bias_relu)
+add_subdirectory(07_conv2d_fwd_bias_relu_add)
+add_subdirectory(08_conv3d_fwd)
+add_subdirectory(09_convnd_fwd)
+add_subdirectory(10_conv2d_bwd_data)
+add_subdirectory(11_conv2d_bwd_wgt)
+add_subdirectory(12_reduce)
+add_subdirectory(13_pool2d_fwd)
--- a/external/half/include/half.hpp
+++ b/external/half/include/half.hpp
--- a/host/CMakeLists.txt
+++ b/host/CMakeLists.txt
-add_subdirectory(host_tensor)
--- a/host/solver/include/conv_igemm_fwd_v6r1_dlops_nchw_kcyx_nkhw.hpp
+++ b/host/solver/include/conv_igemm_fwd_v6r1_dlops_nchw_kcyx_nkhw.hpp
-#ifndef CONV_IGEMM_FWD_V6R1_DLOPS_NCHW_KCYX_NKHW_HPP
-#define CONV_IGEMM_FWD_V6R1_DLOPS_NCHW_KCYX_NKHW_HPP
-#include <numeric>
-#include <sstream>
-namespace ck {
-namespace driver {
-struct CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw
-{
-    auto GetCompileParameterString() const
-    {
-        auto param = std::stringstream();
-        // clang-format off
-        param <<
-            " -DCK_PARAM_ABDataTypeEnum=" << 
-                ABDataTypeEnum <<
-            " -DCK_PARAM_AccDataTypeEnum=" << 
-                AccDataTypeEnum <<
-            " -DCK_PARAM_CDataTypeEnum=" << 
-                CDataTypeEnum <<
-            " -DCK_PARAM_BlockSize=" << 
-                BlockSize <<
-            " -DCK_PARAM_GN0=" << 
-                GN0 <<
-            " -DCK_PARAM_GK1=" << 
-                GK1 <<
-            " -DCK_PARAM_GM1PerBlockGM11=" 
-                << GM1PerBlockGM11 <<
-            " -DCK_PARAM_GN1PerBlockGN11=" <<
-                GN1PerBlockGN11 <<
-            " -DCK_PARAM_GK0PerBlock=" <<
-                GK0PerBlock <<
-            " -DCK_PARAM_BM1PerThreadBM11=" <<
-                BM1PerThreadBM11 <<
-            " -DCK_PARAM_BN1PerThreadBN11=" <<
-                BN1PerThreadBN11 <<
-            " -DCK_PARAM_BK0PerThread=" <<
-                BK0PerThread <<
-            " -DCK_PARAM_BM10BN10ThreadClusterBM10Xs=" <<
-                BM10BN10ThreadClusterBM10Xs[0] << "," <<
-                BM10BN10ThreadClusterBM10Xs[1] <<
-            " -DCK_PARAM_BM10BN10ThreadClusterBN10Xs=" <<
-                BM10BN10ThreadClusterBN10Xs[0] << "," <<
-                BM10BN10ThreadClusterBN10Xs[1] <<
-            " -DCK_PARAM_ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1=" <<
-                ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[0] << "," <<
-                ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[1] << "," <<
-                ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[2] << "," <<
-                ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[3] << "," <<
-                ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[4] <<
-            " -DCK_PARAM_ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1=" <<
-                ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[0] << "," <<
-                ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[1] << "," <<
-                ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[2] << "," <<
-                ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[3] << "," <<
-                ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[4] << 
-            " -DCK_PARAM_ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1=" <<
-                ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[0] << "," <<
-                ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[1] << "," <<
-                ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[2] << "," <<
-                ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[3] << "," <<
-                ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[4] <<
-            " -DCK_PARAM_ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1=" <<
-                ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[0] << "," <<
-                ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[1] << "," <<
-                ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[2] << "," <<
-                ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[3] << "," <<
-                ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[4] <<
-            " -DCK_PARAM_BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1=" <<
-                BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[0] << "," <<
-                BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[1] << "," <<
-                BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[2] << "," <<
-                BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[3] << "," <<
-                BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[4] <<
-            " -DCK_PARAM_BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1=" <<
-                BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[0] << "," <<
-                BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[1] << "," <<
-                BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[2] << "," <<
-                BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[3] << "," <<
-                BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[4] << 
-            " -DCK_PARAM_BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1=" <<
-                BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[0] << "," <<
-                BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[1] << "," <<
-                BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[2] << "," <<
-                BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[3] << "," <<
-                BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[4] << 
-            " -DCK_PARAM_BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1=" <<
-                BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[0] << "," <<
-                BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[1] << "," <<
-                BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[2] << "," <<
-                BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[3] << "," <<
-                BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[4] << 
-            " -DCK_PARAM_CThreadTransferDstScalarPerVector=" <<
-                CThreadTransferDstScalarPerVector <<
-            " -DCK_PARAM_HasMainKBlockLoop=" <<
-                static_cast<int>(HasMainKBlockLoop) <<
-            " -DCK_PARAM_HasDoubleTailKBlockLoop=" <<
-                static_cast<int>(HasDoubleTailKBlockLoop);
-        // clang-format on
-        return param.str();
-    }
-    ck::DataTypeEnum_t ABDataTypeEnum  = ck::DataTypeEnum_t::Unknown;
-    ck::DataTypeEnum_t AccDataTypeEnum = ck::DataTypeEnum_t::Unknown;
-    ck::DataTypeEnum_t CDataTypeEnum   = ck::DataTypeEnum_t::Unknown;
-    int BlockSize = -1;
-    int GN0 = -1;
-    int GK1 = -1;
-    int GM1PerBlockGM11 = -1;
-    int GN1PerBlockGN11 = -1;
-    int GK0PerBlock     = -1;
-    int BM1PerThreadBM11 = -1;
-    int BN1PerThreadBN11 = -1;
-    int BK0PerThread     = -1;
-    std::array<int, 2> BM10BN10ThreadClusterBM10Xs = {-1, -1};
-    std::array<int, 2> BM10BN10ThreadClusterBN10Xs = {-1, -1};
-    std::array<int, 5> ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1 = {
-        -1, -1, -1, -1, -1};
-    std::array<int, 5> ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1 = {
-        -1, -1, -1, -1, -1};
-    std::array<int, 5> ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 = {
-        -1, -1, -1, -1, -1};
-    std::array<int, 5> ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 = {
-        -1, -1, -1, -1, -1};
-    std::array<int, 5> BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1 = {
-        -1, -1, -1, -1, -1};
-    std::array<int, 5> BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1 = {
-        -1, -1, -1, -1, -1};
-    std::array<int, 5> BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 = {
-        -1, -1, -1, -1, -1};
-    std::array<int, 5> BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 = {
-        -1, -1, -1, -1, -1};
-    int CThreadTransferDstScalarPerVector = -1;
-    bool HasMainKBlockLoop       = false;
-    bool HasDoubleTailKBlockLoop = false;
-};
-struct TunableConvIgemmFwdV6r1DlopsNchwKcyxNkhw // one candidate tuning; constraints noted below are the ones checked by ConvIgemmFwdV6r1DlopsNchwKcyxNkhw
-{ // field order is load-bearing: the rows of the tunable list are brace-initialized positionally
-    ck::DataTypeEnum_t ABDataTypeEnum; // must equal the problem's input and weight data types
-    ck::DataTypeEnum_t CDataTypeEnum; // must equal the problem's output data type
-    int BlockSize; // must equal product(BM10BN10ThreadClusterBM10Xs) * product(BM10BN10ThreadClusterBN10Xs)
-    int GN0; // must divide N
-    int GK1; // must divide C
-    int GM1PerBlockGM11; // must divide K
-    int GN1PerBlockGN11; // must divide (N / GN0) * Ho * Wo
-    int GK0PerBlock; // must divide (C / GK1) * Y * X
-    int BM1PerThreadBM11; // GM1PerBlockGM11 must equal 2 * product(BM10BN10ThreadClusterBM10Xs) * BM1PerThreadBM11
-    int BN1PerThreadBN11; // GN0 * GN1PerBlockGN11 must equal 2 * product(BM10BN10ThreadClusterBN10Xs) * BN1PerThreadBN11
-    int BK0PerThread; // must divide GK0PerBlock
-    std::array<int, 2> BM10BN10ThreadClusterBM10Xs;
-    std::array<int, 2> BM10BN10ThreadClusterBN10Xs;
-    std::array<int, 5> ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1; // times the cluster lengths (element-wise) must equal {GK0PerBlock, 1, 1, GM1PerBlockGM11, GK1}
-    std::array<int, 5> ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1; // product of all entries must not exceed BlockSize
-    std::array<int, 5> ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1; // each entry must divide the thread slice length; only entry [0] may differ from 1
-    std::array<int, 5> ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1; // each entry must divide the thread slice length; entry [3] may exceed 1 only when entry [4] == GK1
-    std::array<int, 5> BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1; // times the cluster lengths (element-wise) must equal {GK0PerBlock, GN0, 1, GN1PerBlockGN11, GK1}
-    std::array<int, 5> BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1; // product of all entries must not exceed BlockSize
-    std::array<int, 5> BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1; // each entry must divide the thread slice length; only entry [3] may differ from 1
-    std::array<int, 5> BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1; // each entry must divide the thread slice length; entry [3] may exceed 1 only when entry [4] == GK1
-};
-// Builds the hard-coded list of candidate tunings: nine rows each for fp32, fp16 and int8.
-// Every row is brace-initialized positionally, so its columns follow the field order of
-// TunableConvIgemmFwdV6r1DlopsNchwKcyxNkhw.
-inline static auto generate_tunable_list_conv_igemm_fwd_v6r1_dlops_nchw_kcyx_nkhw()
-{
-    using Tunable  = TunableConvIgemmFwdV6r1DlopsNchwKcyxNkhw;
-    using DataType = ck::DataTypeEnum_t;
-    constexpr DataType f32 = DataType::Float;
-    constexpr DataType f16 = DataType::Half;
-    constexpr DataType i8  = DataType::Int8;
-    std::vector<Tunable> candidates{
-        // clang-format off
-        // fp32 rows (GK1 == 1)
-        {f32, f32, 256, 1, 1, 128, 128, 16, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 2, 1}, {4, 1, 1,  64, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {2, 1, 1, 4, 1}, { 8, 1, 1,  32, 1}, {1, 1, 1, 4, 1}, {1, 1, 1, 4, 1}},
-        {f32, f32, 256, 1, 1, 128, 128,  8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 1}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 1}, { 8, 1, 1,  32, 1}, {1, 1, 1, 4, 1}, {1, 1, 1, 4, 1}},
-        {f32, f32, 256, 1, 1, 128, 128,  8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 1}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 1}, { 8, 1, 1,  32, 1}, {1, 1, 1, 2, 1}, {1, 1, 1, 4, 1}},
-        {f32, f32, 256, 1, 1, 128, 128,  8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 1}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 1}, { 8, 1, 1,  32, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 1}},
-        {f32, f32, 256, 1, 1, 128, 128,  8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 1}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {4, 1, 1, 1, 1}, { 2, 1, 1, 128, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
-        {f32, f32, 256, 2, 1, 128,  64,  8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 1}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {2, 2, 1, 1, 1}, { 4, 1, 1,  64, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
-        {f32, f32, 256, 4, 1, 128,  32,  8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 1}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 4, 1, 1, 1}, { 8, 1, 1,  32, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
-        {f32, f32, 256, 8, 1, 128,  16, 16, 4, 4, 1, {8, 2}, {8, 2}, {8, 1, 1, 1, 1}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 8, 1, 1, 1}, {16, 1, 1,  16, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
-        {f32, f32, 128, 1, 1,  64, 128,  8, 4, 4, 1, {4, 2}, {8, 2}, {4, 1, 1, 1, 1}, {2, 1, 1,  64, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {8, 1, 1, 1, 1}, { 1, 1, 1, 128, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
-        // fp16 rows (GK1 == 2)
-        {f16, f16, 256, 1, 2, 128, 128, 16, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 2, 2}, {4, 1, 1,  64, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {2, 1, 1, 4, 2}, { 8, 1, 1,  32, 1}, {1, 1, 1, 4, 1}, {1, 1, 1, 4, 1}},
-        {f16, f16, 256, 1, 2, 128, 128,  8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 2}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 2}, { 8, 1, 1,  32, 1}, {1, 1, 1, 4, 1}, {1, 1, 1, 4, 1}},
-        {f16, f16, 256, 1, 2, 128, 128,  8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 2}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 2}, { 8, 1, 1,  32, 1}, {1, 1, 1, 2, 1}, {1, 1, 1, 4, 1}},
-        {f16, f16, 256, 1, 2, 128, 128,  8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 2}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 2}, { 8, 1, 1,  32, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 1}},
-        {f16, f16, 256, 1, 2, 128, 128,  8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 2}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {4, 1, 1, 1, 2}, { 2, 1, 1, 128, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
-        {f16, f16, 256, 2, 2, 128,  64,  8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 2}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {2, 2, 1, 1, 2}, { 4, 1, 1,  64, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
-        {f16, f16, 256, 4, 2, 128,  32,  8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 2}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 4, 1, 1, 2}, { 8, 1, 1,  32, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
-        {f16, f16, 256, 8, 2, 128,  16, 16, 4, 4, 1, {8, 2}, {8, 2}, {8, 1, 1, 1, 2}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 8, 1, 1, 2}, {16, 1, 1,  16, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
-        {f16, f16, 128, 1, 2,  64, 128,  8, 4, 4, 1, {4, 2}, {8, 2}, {4, 1, 1, 1, 2}, {2, 1, 1,  64, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {8, 1, 1, 1, 2}, { 1, 1, 1, 128, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
-        // int8 rows (GK1 == 4)
-        { i8,  i8, 256, 1, 4, 128, 128, 16, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 2, 4}, {4, 1, 1,  64, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {2, 1, 1, 4, 4}, { 8, 1, 1,  32, 1}, {1, 1, 1, 4, 1}, {1, 1, 1, 4, 1}},
-        { i8,  i8, 256, 1, 4, 128, 128,  8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 4}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 4}, { 8, 1, 1,  32, 1}, {1, 1, 1, 4, 1}, {1, 1, 1, 4, 1}},
-        { i8,  i8, 256, 1, 4, 128, 128,  8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 4}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 4}, { 8, 1, 1,  32, 1}, {1, 1, 1, 2, 1}, {1, 1, 1, 4, 1}},
-        { i8,  i8, 256, 1, 4, 128, 128,  8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 4}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 4}, { 8, 1, 1,  32, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 1}},
-        { i8,  i8, 256, 1, 4, 128, 128,  8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 4}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {4, 1, 1, 1, 4}, { 2, 1, 1, 128, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
-        { i8,  i8, 256, 2, 4, 128,  64,  8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 4}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {2, 2, 1, 1, 4}, { 4, 1, 1,  64, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
-        { i8,  i8, 256, 4, 4, 128,  32,  8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 4}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 4, 1, 1, 4}, { 8, 1, 1,  32, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
-        { i8,  i8, 256, 8, 4, 128,  16, 16, 4, 4, 1, {8, 2}, {8, 2}, {8, 1, 1, 1, 4}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 8, 1, 1, 4}, {16, 1, 1,  16, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
-        { i8,  i8, 128, 1, 4,  64, 128,  8, 4, 4, 1, {4, 2}, {8, 2}, {4, 1, 1, 1, 4}, {2, 1, 1,  64, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {8, 1, 1, 1, 4}, { 1, 1, 1, 128, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}}
-        // clang-format on
-    };
-    return candidates;
-}
-// Static helpers for the v6r1 DLOPS NCHW/KCYX/NKHW forward-convolution implicit GEMM:
-// derive a compile parameter from a tunable, validate it against a problem description,
-// and report block, grid and workspace sizes.
-// TODO make this common interface and write specs for it
-struct ConvIgemmFwdV6r1DlopsNchwKcyxNkhw
-{
-    // Combines one tunable with the problem description into a compile parameter.
-    // Returns {compile_param, true} on success. Returns {default-constructed, false} when
-    // the problem data types differ from the tunable's, the A/B data type is not
-    // Float/Half/Int8, GK1 or GK0PerBlock is not positive, GK1 does not divide C, or
-    // GK0PerBlock does not divide GK0 = (C / GK1) * Y * X.
-    static auto
-    CalculateCompileParameterBasedOnTunable(const ConvolutionProblemDescriptor& conv_problem_desc,
-                                            const TunableConvIgemmFwdV6r1DlopsNchwKcyxNkhw& tunable)
-    {
-        // single place that spells out the "not applicable" result
-        const auto rejected = [] {
-            return std::make_tuple(CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw{}, false);
-        };
-        const int C  = conv_problem_desc.C;
-        const int Y  = conv_problem_desc.Y;
-        const int X  = conv_problem_desc.X;
-        const int Ho = conv_problem_desc.Ho;
-        const int Wo = conv_problem_desc.Wo;
-        if(!(conv_problem_desc.InDataTypeEnum == tunable.ABDataTypeEnum &&
-             conv_problem_desc.WeiDataTypeEnum == tunable.ABDataTypeEnum &&
-             conv_problem_desc.OutDataTypeEnum == tunable.CDataTypeEnum))
-            return rejected();
-        const auto ABDataTypeEnum = conv_problem_desc.InDataTypeEnum;
-        const auto CDataTypeEnum  = conv_problem_desc.OutDataTypeEnum;
-        // Float and Half accumulate in Float, Int8 accumulates in Int32
-        DataTypeEnum_t AccDataTypeEnum = DataTypeEnum_t::Unknown;
-        if(ABDataTypeEnum == DataTypeEnum_t::Float || ABDataTypeEnum == DataTypeEnum_t::Half)
-        {
-            AccDataTypeEnum = DataTypeEnum_t::Float;
-        }
-        else if(ABDataTypeEnum == DataTypeEnum_t::Int8)
-        {
-            AccDataTypeEnum = DataTypeEnum_t::Int32;
-        }
-        else
-        {
-            return rejected();
-        }
-        const int GK1         = tunable.GK1;
-        const int GN11        = tunable.GN1PerBlockGN11;
-        const int BN11        = tunable.BN1PerThreadBN11;
-        const int GK0PerBlock = tunable.GK0PerBlock;
-        // both values are used as divisors below; a zero would be undefined behaviour
-        if(GK1 <= 0 || GK0PerBlock <= 0)
-            return rejected();
-        // C threadwise copy: {BN11} or {BN} or {BN1} or {GN11} is Dst vector dim
-        const int CThreadTransferDstScalarPerVector = gcd(4, GN11, BN11, Ho * Wo);
-        // C is split as C = C1 * C0 with C0 = GK1
-        const int C0 = GK1;
-        if(!(C % C0 == 0))
-            return rejected();
-        const int C1  = C / C0;
-        const int GK0 = C1 * Y * X;
-        if(!(GK0 % GK0PerBlock == 0))
-            return rejected();
-        // positional initialization: the order below is the field order of the compile parameter
-        return std::make_tuple(
-            CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw{
-                ABDataTypeEnum,
-                AccDataTypeEnum,
-                CDataTypeEnum,
-                tunable.BlockSize,
-                tunable.GN0,
-                GK1,
-                tunable.GM1PerBlockGM11,
-                GN11,
-                GK0PerBlock,
-                tunable.BM1PerThreadBM11,
-                BN11,
-                tunable.BK0PerThread,
-                tunable.BM10BN10ThreadClusterBM10Xs,
-                tunable.BM10BN10ThreadClusterBN10Xs,
-                tunable.ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1,
-                tunable.ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1,
-                tunable.ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1,
-                tunable.ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1,
-                tunable.BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1,
-                tunable.BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1,
-                tunable.BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1,
-                tunable.BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1,
-                CThreadTransferDstScalarPerVector,
-                HasMainKBlockLoop(GK0, GK0PerBlock),
-                HasDoubleTailKBlockLoop(GK0, GK0PerBlock)},
-            true);
-    }
-    // Walks the tunable list in order and returns {compile_param, true} for the first entry
-    // that both converts and validates for this problem, or {default-constructed, false}.
-    static auto GetDefaultCompileParameter(const ConvolutionProblemDescriptor& conv_problem_desc)
-    {
-        for(const auto& tunable : generate_tunable_list_conv_igemm_fwd_v6r1_dlops_nchw_kcyx_nkhw())
-        {
-            CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw compile_param{};
-            bool found = false;
-            std::tie(compile_param, found) =
-                CalculateCompileParameterBasedOnTunable(conv_problem_desc, tunable);
-            if(found && IsValidCompileParameter(conv_problem_desc, compile_param))
-                return std::make_tuple(compile_param, true);
-        }
-        return std::make_tuple(CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw{}, false);
-    }
-    // True when at least one entry of the tunable list yields a valid compile parameter.
-    static bool IsApplicable(const ConvolutionProblemDescriptor& conv_problem_desc)
-    {
-        return std::get<1>(GetDefaultCompileParameter(conv_problem_desc));
-    }
-    // Checks a compile parameter against a problem description. Returns false as soon as
-    // any check fails: non-positive tuning values, data types, gridwise divisibility and
-    // K-loop flags, A and B blockwise copies, blockwise GEMM shape, C threadwise copy.
-    static bool
-    IsValidCompileParameter(const ConvolutionProblemDescriptor& conv_problem_desc,
-                            const CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw& compile_param)
-    {
-        const int N  = conv_problem_desc.N;
-        const int K  = conv_problem_desc.K;
-        const int C  = conv_problem_desc.C;
-        const int Y  = conv_problem_desc.Y;
-        const int X  = conv_problem_desc.X;
-        const int Ho = conv_problem_desc.Ho;
-        const int Wo = conv_problem_desc.Wo;
-        const int GK1         = compile_param.GK1;
-        const int GN0         = compile_param.GN0;
-        const int GM11        = compile_param.GM1PerBlockGM11;
-        const int GN11        = compile_param.GN1PerBlockGN11;
-        const int BM11        = compile_param.BM1PerThreadBM11;
-        const int BN11        = compile_param.BN1PerThreadBN11;
-        const int GK0PerBlock = compile_param.GK0PerBlock;
-        // every tuning value is used below as a divisor or a length; rejecting non-positive
-        // values first keeps all `%` and `/` operations well defined
-        if(!(compile_param.BlockSize > 0 && GN0 > 0 && GK1 > 0 && GM11 > 0 && GN11 > 0 &&
-             GK0PerBlock > 0 && BM11 > 0 && BN11 > 0 && compile_param.BK0PerThread > 0 &&
-             compile_param.CThreadTransferDstScalarPerVector > 0))
-            return false;
-        if(!(AllPositive(compile_param.BM10BN10ThreadClusterBM10Xs) &&
-             AllPositive(compile_param.BM10BN10ThreadClusterBN10Xs) &&
-             AllPositive(compile_param.ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1) &&
-             AllPositive(compile_param.ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1) &&
-             AllPositive(compile_param.ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1) &&
-             AllPositive(compile_param.ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1) &&
-             AllPositive(compile_param.BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1) &&
-             AllPositive(compile_param.BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1) &&
-             AllPositive(compile_param.BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1) &&
-             AllPositive(compile_param.BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1)))
-            return false;
-        const int C0 = GK1;
-        const int N0 = GN0;
-        if(!(C % C0 == 0))
-            return false;
-        const int C1 = C / C0;
-        if(!(N % N0 == 0))
-            return false;
-        const int N1 = N / N0;
-        const int GM0 = 1;
-        const int GM1 = K;
-        const int GN1 = N1 * Ho * Wo;
-        const int GK0 = C1 * Y * X;
-        // check data type
-        {
-            if(!(conv_problem_desc.InDataTypeEnum == conv_problem_desc.WeiDataTypeEnum &&
-                 conv_problem_desc.InDataTypeEnum == compile_param.ABDataTypeEnum))
-                return false;
-            // same output-type requirement as CalculateCompileParameterBasedOnTunable
-            if(!(conv_problem_desc.OutDataTypeEnum == compile_param.CDataTypeEnum))
-                return false;
-            if(compile_param.ABDataTypeEnum == DataTypeEnum_t::Float ||
-               compile_param.ABDataTypeEnum == DataTypeEnum_t::Half)
-            {
-                if(!(compile_param.AccDataTypeEnum == DataTypeEnum_t::Float))
-                    return false;
-            }
-            else if(compile_param.ABDataTypeEnum == DataTypeEnum_t::Int8)
-            {
-                if(!(compile_param.AccDataTypeEnum == DataTypeEnum_t::Int32))
-                    return false;
-            }
-            else
-            {
-                // any other A/B data type is unsupported, matching
-                // CalculateCompileParameterBasedOnTunable
-                return false;
-            }
-        }
-        // check gridwise contraction
-        {
-            if(!(GM1 % GM11 == 0 && GN1 % GN11 == 0 && GK0 % GK0PerBlock == 0))
-                return false;
-            if(!(HasMainKBlockLoop(GK0, GK0PerBlock) == compile_param.HasMainKBlockLoop &&
-                 HasDoubleTailKBlockLoop(GK0, GK0PerBlock) ==
-                     compile_param.HasDoubleTailKBlockLoop))
-                return false;
-        }
-        // check A blockwise copy
-        {
-            const auto& src_vector_lengths =
-                compile_param.ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1;
-            const auto& dst_vector_lengths =
-                compile_param.ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1;
-            if(!IsValidBlockwiseCopy(
-                   compile_param.BlockSize,
-                   std::array<int, 5>{GK0PerBlock, GM0, 1, GM11, GK1},
-                   compile_param.ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1,
-                   compile_param.ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1,
-                   src_vector_lengths,
-                   dst_vector_lengths))
-                return false;
-            // check Src vectorization, GK0 is global mem vector dim
-            if(!(src_vector_lengths[1] == 1 && src_vector_lengths[2] == 1 &&
-                 src_vector_lengths[3] == 1 && src_vector_lengths[4] == 1))
-                return false;
-            // check Dst vectorization, {GM11, GK1} are LDS vector dims
-            if(!IsValidLdsDstVectorization(dst_vector_lengths, GM11, GK1))
-                return false;
-        }
-        // check B blockwise copy
-        {
-            const auto& src_vector_lengths =
-                compile_param.BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1;
-            const auto& dst_vector_lengths =
-                compile_param.BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1;
-            if(!IsValidBlockwiseCopy(
-                   compile_param.BlockSize,
-                   std::array<int, 5>{GK0PerBlock, GN0, 1, GN11, GK1},
-                   compile_param.BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1,
-                   compile_param.BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1,
-                   src_vector_lengths,
-                   dst_vector_lengths))
-                return false;
-            // check Src vectorization: {GN11} is global mem vector dim
-            if(!(src_vector_lengths[0] == 1 && src_vector_lengths[1] == 1 &&
-                 src_vector_lengths[2] == 1 && src_vector_lengths[4] == 1))
-                return false;
-            // check Src tensor layout related vectorization
-            if(Y == 1 && X == 1 && conv_problem_desc.ConvStrideH == 1 &&
-               conv_problem_desc.ConvStrideW == 1 && conv_problem_desc.InLeftPadH == 0 &&
-               conv_problem_desc.InLeftPadW == 0 && conv_problem_desc.InRightPadH == 0 &&
-               conv_problem_desc.InRightPadW == 0)
-            {
-                // 1x1 filter, unit stride, no padding: vector must divide Ho * Wo
-                if(!((Ho * Wo) % src_vector_lengths[3] == 0))
-                    return false;
-            }
-            else if(conv_problem_desc.ConvStrideW == 1 && conv_problem_desc.InLeftPadW == 0 &&
-                    conv_problem_desc.InRightPadW == 0)
-            {
-                // unit stride and no padding along W only: vector must divide Wo
-                if(!(Wo % src_vector_lengths[3] == 0))
-                    return false;
-            }
-            else
-            {
-                // otherwise no Src vectorization is accepted
-                if(!(src_vector_lengths[3] == 1))
-                    return false;
-            }
-            // check Dst vectorization: {GN11, GK1} are LDS vector dims
-            if(!IsValidLdsDstVectorization(dst_vector_lengths, GN11, GK1))
-                return false;
-        }
-        // check blockwise GEMM
-        {
-            const int BM10 = std::accumulate(compile_param.BM10BN10ThreadClusterBM10Xs.begin(),
-                                             compile_param.BM10BN10ThreadClusterBM10Xs.end(),
-                                             1,
-                                             std::multiplies<int>{});
-            const int BN10 = std::accumulate(compile_param.BM10BN10ThreadClusterBN10Xs.begin(),
-                                             compile_param.BM10BN10ThreadClusterBN10Xs.end(),
-                                             1,
-                                             std::multiplies<int>{});
-            if(!(compile_param.BlockSize == BM10 * BN10))
-                return false;
-            const int BM = GM0 * GM11;
-            const int BN = GN0 * GN11;
-            const int BM1 = BM10 * BM11;
-            const int BN1 = BN10 * BN11;
-            if(!(BM % BM1 == 0 && BN % BN1 == 0))
-                return false;
-            const int BM0 = BM / BM1;
-            const int BN0 = BN / BN1;
-            // blockwise GEMM currently only support BM0 == 2 && BN0 == 2
-            if(!(BM0 == 2 && BN0 == 2))
-                return false;
-            if(!(GK0PerBlock % compile_param.BK0PerThread == 0))
-                return false;
-        }
-        // check C threadwise copy
-        {
-            // {BN11} or {BN} or {BN1} or {GN11} is Dst vector dim
-            const int dst_vector_len_gn11 = compile_param.CThreadTransferDstScalarPerVector;
-            // check slice length vs Dst vector length:
-            if(!(BN11 % dst_vector_len_gn11 == 0 && GN11 % dst_vector_len_gn11 == 0))
-                return false;
-            // check Dst memory layout related vectorization:
-            if(!((Ho * Wo) % dst_vector_len_gn11 == 0))
-                return false;
-        }
-        return true;
-    }
-    // Returns the BlockSize stored in the compile parameter; the problem is not consulted.
-    static int GetBlockSize(const ConvolutionProblemDescriptor&,
-                            const CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw& compile_param)
-    {
-        return compile_param.BlockSize;
-    }
-    // Returns (K / GM1PerBlockGM11) * (((N / GN0) * Ho * Wo) / GN1PerBlockGN11).
-    // Performs no checks of its own: GN0, GM1PerBlockGM11 and GN1PerBlockGN11 must be
-    // non-zero, which holds for any parameter accepted by IsValidCompileParameter.
-    static int GetGridSize(const ConvolutionProblemDescriptor& conv_problem_desc,
-                           const CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw& compile_param)
-    {
-        const int N  = conv_problem_desc.N;
-        const int K  = conv_problem_desc.K;
-        const int Ho = conv_problem_desc.Ho;
-        const int Wo = conv_problem_desc.Wo;
-        const int N0 = compile_param.GN0;
-        const int N1 = N / N0;
-        const int GM1 = K;
-        const int GN1 = N1 * Ho * Wo;
-        const int GM11 = compile_param.GM1PerBlockGM11;
-        const int GN11 = compile_param.GN1PerBlockGN11;
-        const int GM10 = GM1 / GM11;
-        const int GN10 = GN1 / GN11;
-        return GM10 * GN10;
-    }
-    // Fixed workspace size in bytes, independent of problem and compile parameter.
-    static std::size_t GetWorkSpaceSize(const ConvolutionProblemDescriptor&,
-                                        const CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw&)
-    {
-        // workspace is used to save transformed tensor descriptors created by prepare kernel
-        return 4096L;
-    }
-    // Upper bound on the workspace size; same constant as GetWorkSpaceSize.
-    static std::size_t GetMaxWorkSpaceSize(const ConvolutionProblemDescriptor&) { return 4096L; }
-    // Returns the full list of candidate tunables.
-    static auto GetTunableList()
-    {
-        return generate_tunable_list_conv_igemm_fwd_v6r1_dlops_nchw_kcyx_nkhw();
-    }
-    private:
-    // True when every entry of the array is strictly positive.
-    template <std::size_t N>
-    static bool AllPositive(const std::array<int, N>& values)
-    {
-        for(const int value : values)
-        {
-            if(value <= 0)
-                return false;
-        }
-        return true;
-    }
-    // K-loop flag shared by parameter derivation and validation; GK0PerBlock must be positive.
-    static bool HasMainKBlockLoop(int GK0, int GK0PerBlock)
-    {
-        return (GK0 + GK0PerBlock) / (2 * GK0PerBlock) > 1;
-    }
-    // True when the number of GK0PerBlock-sized steps in GK0 is even; GK0PerBlock must be positive.
-    static bool HasDoubleTailKBlockLoop(int GK0, int GK0PerBlock)
-    {
-        return (GK0 / GK0PerBlock) % 2 == 0;
-    }
-    // Checks shared by the A and B blockwise copies: the cluster must fit in the block,
-    // cluster * thread slice must equal the block slice in every dimension, and both vector
-    // lengths must divide the thread slice. All vector lengths must be positive.
-    static bool IsValidBlockwiseCopy(int block_size,
-                                     const std::array<int, 5>& block_slice_lengths,
-                                     const std::array<int, 5>& cluster_lengths,
-                                     const std::array<int, 5>& thread_slice_lengths,
-                                     const std::array<int, 5>& src_vector_lengths,
-                                     const std::array<int, 5>& dst_vector_lengths)
-    {
-        // check number of working thread
-        const int num_work_thread = std::accumulate(
-            cluster_lengths.begin(), cluster_lengths.end(), 1, std::multiplies<int>{});
-        if(!(block_size >= num_work_thread))
-            return false;
-        for(std::size_t i = 0; i < block_slice_lengths.size(); ++i)
-        {
-            // check block slice lengths vs thread slice lengths vs cluster lengths
-            if(!(cluster_lengths[i] * thread_slice_lengths[i] == block_slice_lengths[i]))
-                return false;
-            // check thread slice lengths vs vector lengths
-            if(!(thread_slice_lengths[i] % src_vector_lengths[i] == 0 &&
-                 thread_slice_lengths[i] % dst_vector_lengths[i] == 0))
-                return false;
-        }
-        return true;
-    }
-    // Dst vectorization rule shared by A and B: either entry [4] covers all of GK1 and
-    // entry [3] divides per_block_length, or the vector spans GK1 only (entry [3] == 1 and
-    // entry [4] divides GK1). Entries [3] and [4] must be positive.
-    static bool IsValidLdsDstVectorization(const std::array<int, 5>& dst_vector_lengths,
-                                           int per_block_length,
-                                           int GK1)
-    {
-        if(dst_vector_lengths[4] == GK1)
-            return per_block_length % dst_vector_lengths[3] == 0;
-        return dst_vector_lengths[3] == 1 && GK1 % dst_vector_lengths[4] == 0;
-    }
-};
-} // namespace driver
-} // namespace ck
-#endif
--- a/host/solver/include/conv_tunable_fwd_v4r4_dlops_nchw_kcyx_nkhw.hpp
+++ b/host/solver/include/conv_tunable_fwd_v4r4_dlops_nchw_kcyx_nkhw.hpp
-#ifndef CONV_TUNABLE_FWD_V4R4_DLOPS_NCHW_KCYX_NKHW_HPP
-#define CONV_TUNABLE_FWD_V4R4_DLOPS_NCHW_KCYX_NKHW_HPP
-struct tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw // plain aggregate of tuning values; by its name, for the v4r4 DLOPS NCHW/KCYX/NKHW forward convolution - TODO confirm against its consumer
-{ // keep the field order: default_tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw is brace-initialized positionally
-    int BlockSize;
-    int MPerBlock;
-    int NPerBlock;
-    int KPerBlock;
-    int M1PerThread;
-    int N1PerThread;
-    int KPerThread;
-    int M1N1ThreadClusterM10;
-    int M1N1ThreadClusterN10;
-    int M1N1ThreadClusterM11;
-    int M1N1ThreadClusterN11;
-    std::array<int, 3> ABlockTransferThreadSliceLengths_K_M0_M1; // A-prefixed group: eight fields, three-element arrays presumably indexed (K, M0, M1) as the names suggest - verify
-    std::array<int, 3> ABlockTransferThreadClusterLengths_K_M0_M1;
-    std::array<int, 3> ABlockTransferThreadClusterArrangeOrder;
-    std::array<int, 3> ABlockTransferSrcAccessOrder;
-    int ABlockTransferSrcVectorDim;
-    int ABlockTransferSrcScalarPerVector;
-    int ABlockTransferDstScalarPerVector_M1;
-    bool AThreadTransferSrcResetCoordinateAfterRun;
-    std::array<int, 3> BBlockTransferThreadSliceLengths_K_N0_N1; // B-prefixed group: mirrors the A group field for field, with N0/N1 in place of M0/M1
-    std::array<int, 3> BBlockTransferThreadClusterLengths_K_N0_N1;
-    std::array<int, 3> BBlockTransferThreadClusterArrangeOrder;
-    std::array<int, 3> BBlockTransferSrcAccessOrder;
-    int BBlockTransferSrcVectorDim;
-    int BBlockTransferSrcScalarPerVector;
-    int BBlockTransferDstScalarPerVector_N1;
-    bool BThreadTransferSrcResetCoordinateAfterRun;
-    std::array<int, 6> CThreadTransferSrcDstAccessOrder; // C-prefixed group: three fields; this one has six entries (the default is a permutation of 0..5)
-    int CThreadTransferSrcDstVectorDim;
-    int CThreadTransferDstScalarPerVector;
-};
-static tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw // NOTE(review): non-const static in a header - every including translation unit gets its own mutable copy
-    default_tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw = { // positional aggregate init: 30 values in member declaration order
-        256,       128,       128, 8, 4,         4,           1, // BlockSize, MPerBlock, NPerBlock, KPerBlock, M1PerThread, N1PerThread, KPerThread
-        8,         8,         2,   2, {4, 1, 1}, {2, 1, 128}, {2, 1, 0}, // M1N1ThreadClusterM10, ..N10, ..M11, ..N11, A slice lengths, A cluster lengths, A cluster arrange order
-        {2, 1, 0}, 0,         4,   1, false,     {4, 1, 1},   {2, 1, 128}, // A src access order, A src vector dim, A src scalar/vector, A dst scalar/vector (M1), A reset flag, B slice lengths, B cluster lengths
-        {0, 1, 2}, {0, 1, 2}, 2,   1, 1,         false,       {3, 4, 5, 0, 1, 2}, // B cluster arrange order, B src access order, B src vector dim, B src scalar/vector, B dst scalar/vector (N1), B reset flag, C access order
-        5,         1}; // CThreadTransferSrcDstVectorDim, CThreadTransferDstScalarPerVector
-#endif
--- a/host/solver/include/conv_tunable_fwd_v4r4_xdlops_nchw_kcyx_nkhw.hpp
+++ b/host/solver/include/conv_tunable_fwd_v4r4_xdlops_nchw_kcyx_nkhw.hpp
-#ifndef CONV_TUNABLE_FWD_V4R4_XDLOPS_NCHW_KCYX_NKHW_HPP
-#define CONV_TUNABLE_FWD_V4R4_XDLOPS_NCHW_KCYX_NKHW_HPP
-struct tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw // plain aggregate of 28 tuning values; it is initialised positionally elsewhere in this header, so member order is part of the contract
-{
-    int BlockSize; // NOTE(review): presumably threads per workgroup - confirm against the kernel launch code
-    int MPerBlock; // per-block tile extents (M, N, K)
-    int NPerBlock;
-    int KPerBlock;
-    int MPerXDL; // per-XDL extents; NOTE(review): "XDL" presumably refers to the xdlops instruction tile - confirm
-    int NPerXDL;
-    int K1; // K1 also appears as the last dimension of the A/B transfer arrays below
-    int MRepeat;
-    int NRepeat;
-    std::array<int, 3> ABlockTransferThreadSliceLengths_K0_M_K1; // A block transfer: 3-entry arrays, dimensions named K0, M, K1; NOTE(review): no <array> include is visible in this header
-    std::array<int, 3> ABlockTransferThreadClusterLengths_K0_M_K1;
-    std::array<int, 3> ABlockTransferThreadClusterArrangeOrder; // NOTE(review): presumably a permutation of {0, 1, 2} - confirm
-    std::array<int, 3> ABlockTransferSrcAccessOrder;
-    int ABlockTransferSrcVectorDim;
-    int ABlockTransferSrcScalarPerVector;
-    int ABlockTransferDstScalarPerVector_K1;
-    bool AThreadTransferSrcResetCoordinateAfterRun;
-    std::array<int, 3> BBlockTransferThreadSliceLengths_K0_N_K1; // B block transfer: mirrors the A group with dimensions K0, N, K1
-    std::array<int, 3> BBlockTransferThreadClusterLengths_K0_N_K1;
-    std::array<int, 3> BBlockTransferThreadClusterArrangeOrder;
-    std::array<int, 3> BBlockTransferSrcAccessOrder;
-    int BBlockTransferSrcVectorDim;
-    int BBlockTransferSrcScalarPerVector;
-    int BBlockTransferDstScalarPerVector_K1;
-    bool BThreadTransferSrcResetCoordinateAfterRun;
-    std::array<int, 8> CThreadTransferSrcDstAccessOrder; // 8 entries, unlike the 3-entry A/B arrays
-    int CThreadTransferSrcDstVectorDim;
-    int CThreadTransferDstScalarPerVector;
-};
-static tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw // NOTE(review): non-const static in a header - every including translation unit gets its own mutable copy
-    default_tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw = { // positional aggregate init: one value per member, in declaration order
-        256,                      // BlockSize
-        128,                      // MPerBlock
-        128,                      // NPerBlock
-        4,                        // KPerBlock
-        32,                       // MPerXDL
-        32,                       // NPerXDL
-        4,                        // K1
-        2,                        // MRepeat
-        2,                        // NRepeat
-        {1, 2, 4},                // ABlockTransferThreadSliceLengths_K0_M_K1
-        {4, 64, 1},               // ABlockTransferThreadClusterLengths_K0_M_K1
-        {1, 0, 2},                // ABlockTransferThreadClusterArrangeOrder
-        {1, 0, 2},                // ABlockTransferSrcAccessOrder
-        2,                        // ABlockTransferSrcVectorDim
-        1,                        // ABlockTransferSrcScalarPerVector
-        4,                        // ABlockTransferDstScalarPerVector_K1
-        false,                    // AThreadTransferSrcResetCoordinateAfterRun
-        {1, 2, 4},                // BBlockTransferThreadSliceLengths_K0_N_K1
-        {4, 64, 1},               // BBlockTransferThreadClusterLengths_K0_N_K1
-        {0, 2, 1},                // BBlockTransferThreadClusterArrangeOrder
-        {1, 0, 2},                // BBlockTransferSrcAccessOrder
-        1,                        // BBlockTransferSrcVectorDim
-        1,                        // BBlockTransferSrcScalarPerVector
-        4,                        // BBlockTransferDstScalarPerVector_K1
-        false,                    // BThreadTransferSrcResetCoordinateAfterRun
-        {3, 0, 1, 2, 7, 5, 4, 6}, // CThreadTransferSrcDstAccessOrder
-        7,                        // CThreadTransferSrcDstVectorDim
-        1                         // CThreadTransferDstScalarPerVector
-};
-#endif
--- a/host/solver/include/conv_tunable_fwd_v4r4_xdlops_nhwc_kyxc_nhwk.hpp
+++ b/host/solver/include/conv_tunable_fwd_v4r4_xdlops_nhwc_kyxc_nhwk.hpp
-#ifndef CONV_TUNABLE_FWD_V4R4_XDLOPS_NHWC_KYXC_NHWK_HPP
-#define CONV_TUNABLE_FWD_V4R4_XDLOPS_NHWC_KYXC_NHWK_HPP
-struct tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk // plain aggregate of 28 tuning values; it is initialised positionally elsewhere in this header, so member order is part of the contract
-{
-    int BlockSize; // NOTE(review): presumably threads per workgroup - confirm against the kernel launch code
-    int MPerBlock; // per-block tile extents (M, N, K)
-    int NPerBlock;
-    int KPerBlock;
-    int MPerWave; // per-wave extents
-    int NPerWave;
-    int K1; // K1 also appears as the last dimension of the A/B transfer arrays below
-    int MRepeat;
-    int NRepeat;
-    std::array<int, 3> ABlockTransferThreadSliceLengths_K0_M_K1; // A block transfer: 3-entry arrays, dimensions named K0, M, K1; NOTE(review): no <array> include is visible in this header
-    std::array<int, 3> ABlockTransferThreadClusterLengths_K0_M_K1;
-    std::array<int, 3> ABlockTransferThreadClusterArrangeOrder; // NOTE(review): presumably a permutation of {0, 1, 2} - confirm
-    std::array<int, 3> ABlockTransferSrcAccessOrder;
-    int ABlockTransferSrcVectorDim;
-    int ABlockTransferSrcScalarPerVector;
-    int ABlockTransferDstScalarPerVector_K1;
-    bool AThreadTransferSrcResetCoordinateAfterRun;
-    std::array<int, 3> BBlockTransferThreadSliceLengths_K0_N_K1; // B block transfer: mirrors the A group with dimensions K0, N, K1
-    std::array<int, 3> BBlockTransferThreadClusterLengths_K0_N_K1;
-    std::array<int, 3> BBlockTransferThreadClusterArrangeOrder;
-    std::array<int, 3> BBlockTransferSrcAccessOrder;
-    int BBlockTransferSrcVectorDim;
-    int BBlockTransferSrcScalarPerVector;
-    int BBlockTransferDstScalarPerVector_K1;
-    bool BThreadTransferSrcResetCoordinateAfterRun;
-    std::array<int, 8> CThreadTransferSrcDstAccessOrder; // 8 entries, unlike the 3-entry A/B arrays
-    int CThreadTransferSrcDstVectorDim;
-    int CThreadTransferDstScalarPerVector;
-};
-static tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk // NOTE(review): non-const static in a header - every including translation unit gets its own mutable copy
-    default_tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk = { // positional aggregate init: one value per member, in declaration order
-        256,                      // BlockSize
-        128,                      // MPerBlock
-        128,                      // NPerBlock
-        4,                        // KPerBlock
-        32,                       // MPerWave
-        32,                       // NPerWave
-        4,                        // K1
-        2,                        // MRepeat
-        2,                        // NRepeat
-        {1, 2, 4},                // ABlockTransferThreadSliceLengths_K0_M_K1
-        {4, 64, 1},               // ABlockTransferThreadClusterLengths_K0_M_K1
-        {1, 0, 2},                // ABlockTransferThreadClusterArrangeOrder
-        {1, 0, 2},                // ABlockTransferSrcAccessOrder
-        2,                        // ABlockTransferSrcVectorDim
-        4,                        // ABlockTransferSrcScalarPerVector
-        4,                        // ABlockTransferDstScalarPerVector_K1
-        false,                    // AThreadTransferSrcResetCoordinateAfterRun
-        {1, 2, 4},                // BBlockTransferThreadSliceLengths_K0_N_K1
-        {4, 64, 1},               // BBlockTransferThreadClusterLengths_K0_N_K1
-        {1, 0, 2},                // BBlockTransferThreadClusterArrangeOrder
-        {1, 0, 2},                // BBlockTransferSrcAccessOrder
-        2,                        // BBlockTransferSrcVectorDim
-        4,                        // BBlockTransferSrcScalarPerVector
-        4,                        // BBlockTransferDstScalarPerVector_K1
-        false,                    // BThreadTransferSrcResetCoordinateAfterRun
-        {2, 3, 0, 1, 7, 5, 4, 6}, // CThreadTransferSrcDstAccessOrder
-        7,                        // CThreadTransferSrcDstVectorDim
-        1                         // CThreadTransferDstScalarPerVector
-};
-#endif
--- a/host/solver/include/convolution_problem_descriptor.hpp
+++ b/host/solver/include/convolution_problem_descriptor.hpp
-#ifndef CONVOLUTION_PROBLEM_DESCRIPTOR
-#define CONVOLUTION_PROBLEM_DESCRIPTOR
-namespace ck {
-namespace driver {
-struct ConvolutionProblemDescriptor // value type bundling the sizes, strides, dilations, pads and data-type enums of one convolution problem
-{
-    ConvolutionProblemDescriptor() = default; // NOTE(review): no member initialisers exist, so a default-initialised object has indeterminate ints; value-initialise with {} to get zeros
-    ConvolutionProblemDescriptor(int N_, // each argument is copied verbatim into the member of the same name (without the trailing underscore); nothing is validated
-                                 int K_,
-                                 int C_,
-                                 int Y_,
-                                 int X_,
-                                 int Hi_,
-                                 int Wi_,
-                                 int Ho_,
-                                 int Wo_,
-                                 int ConvStrideH_,
-                                 int ConvStrideW_,
-                                 int ConvDilationH_,
-                                 int ConvDilationW_,
-                                 int InLeftPadH_,
-                                 int InLeftPadW_,
-                                 int InRightPadH_,
-                                 int InRightPadW_,
-                                 ck::DataTypeEnum_t InDataTypeEnum_,
-                                 ck::DataTypeEnum_t WeiDataTypeEnum_,
-                                 ck::DataTypeEnum_t OutDataTypeEnum_)
-        : N{N_}, // initialiser order matches the member declaration order below
-          K{K_},
-          C{C_},
-          Y{Y_},
-          X{X_},
-          Hi{Hi_},
-          Wi{Wi_},
-          Ho{Ho_},
-          Wo{Wo_},
-          ConvStrideH{ConvStrideH_},
-          ConvStrideW{ConvStrideW_},
-          ConvDilationH{ConvDilationH_},
-          ConvDilationW{ConvDilationW_},
-          InLeftPadH{InLeftPadH_},
-          InLeftPadW{InLeftPadW_},
-          InRightPadH{InRightPadH_},
-          InRightPadW{InRightPadW_},
-          InDataTypeEnum{InDataTypeEnum_},
-          WeiDataTypeEnum{WeiDataTypeEnum_},
-          OutDataTypeEnum{OutDataTypeEnum_}
-    {
-    }
-    int N; // NOTE(review): presumably batch size - confirm
-    int K; // NOTE(review): presumably output channels - confirm
-    int C; // NOTE(review): presumably input channels - confirm
-    int Y; // NOTE(review): presumably filter height - confirm
-    int X; // NOTE(review): presumably filter width - confirm
-    int Hi; // NOTE(review): presumably input height/width (Hi, Wi) - confirm
-    int Wi;
-    int Ho; // NOTE(review): presumably output height/width (Ho, Wo) - confirm; stored as given, never derived from the other fields here
-    int Wo;
-    int ConvStrideH; // strides, dilations and pads are only stored; nothing in this struct reads them
-    int ConvStrideW;
-    int ConvDilationH;
-    int ConvDilationW;
-    int InLeftPadH;
-    int InLeftPadW;
-    int InRightPadH;
-    int InRightPadW;
-    ck::DataTypeEnum_t InDataTypeEnum; // element types of the input, weight and output tensors
-    ck::DataTypeEnum_t WeiDataTypeEnum;
-    ck::DataTypeEnum_t OutDataTypeEnum;
-    std::size_t CalculateFlop() const { return 2L * N * K * C * Y * X * Ho * Wo; } // 2 * N*K*C*Y*X*Ho*Wo; the 2L literal makes the product long before conversion to size_t; NOTE(review): can overflow where long is 32 bits
-};
-} // namespace driver
-} // namespace ck
-#endif
--- a/host/solver/include/solver_common.hpp
+++ b/host/solver/include/solver_common.hpp
-#ifndef CK_SOLVER_COMMON_HPP
-#define CK_SOLVER_COMMON_HPP
-namespace ck {
-namespace driver {
-// greatest common divisor (aka highest common factor) of two ints via the recursive Euclidean algorithm; signs are dropped first, so the result is non-negative, and gcd(0, 0) yields 0
-inline int gcd(int x, int y)
-{
-    if(x < 0) // normalise the sign of x first; NOTE(review): -x overflows when x is INT_MIN
-    {
-        return gcd(-x, y);
-    }
-    else if(y < 0) // then the sign of y (same INT_MIN caveat)
-    {
-        return gcd(x, -y);
-    }
-    else if(x == y || x == 0) // gcd(a, a) == a and gcd(0, b) == b
-    {
-        return y;
-    }
-    else if(y == 0) // gcd(a, 0) == a
-    {
-        return x;
-    }
-    else if(x > y) // both positive and different: reduce the larger modulo the smaller
-    {
-        return gcd(x % y, y);
-    }
-    else
-    {
-        return gcd(x, y % x);
-    }
-}
-template <typename X,
-          typename... Ys,
-          typename std::enable_if<sizeof...(Ys) >= 2, bool>::type = false> // participates only when called with three or more arguments in total
-auto gcd(X x, Ys... ys)
-{
-    return gcd(x, gcd(ys...)); // reduce the tail first, then combine with x; a two-argument tail falls through to gcd(int, int)
-}
-} // namespace driver
-} // namespace ck
-#endif
--- a/composable_kernel/include/config.hpp
+++ b/composable_kernel/include/config.hpp