Commit 9dce6851 authored by Jing Zhang's avatar Jing Zhang
Browse files

merge develop

parents 3cc57101 5d37d7bf
# Project-wide include search paths, prepended (BEFORE) so in-tree headers
# shadow any installed copies of the same names.
# NOTE(review): both the legacy layout (host/, device_operation/,
# composable_kernel/) and the new include/ck + library/include layout are
# listed side by side - presumably transitional after the merge; confirm the
# legacy paths are still required before removing either set.
include_directories(BEFORE
${PROJECT_SOURCE_DIR}
${PROJECT_SOURCE_DIR}/host/host_tensor/include
${PROJECT_SOURCE_DIR}/host/device/include
${PROJECT_SOURCE_DIR}/device_operation/include
${PROJECT_SOURCE_DIR}/reference_operation/include
${PROJECT_SOURCE_DIR}/composable_kernel/include
${PROJECT_SOURCE_DIR}/composable_kernel/include/utility
${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_description
${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_operation
${PROJECT_SOURCE_DIR}/composable_kernel/include/problem_transform
${PROJECT_SOURCE_DIR}/external/rocm/include
${PROJECT_SOURCE_DIR}/device_operation_reference/include
${PROJECT_SOURCE_DIR}/include/ck
${PROJECT_SOURCE_DIR}/include/ck/utility
${PROJECT_SOURCE_DIR}/include/ck/tensor_description
${PROJECT_SOURCE_DIR}/include/ck/tensor
${PROJECT_SOURCE_DIR}/include/ck/problem_transform
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/device
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/grid
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/block
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/warp
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/thread
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/element
${PROJECT_SOURCE_DIR}/library/include/ck/library/host_tensor
${PROJECT_SOURCE_DIR}/library/include/ck/library/reference_tensor_operation/cpu
${PROJECT_SOURCE_DIR}/library/include/ck/library/reference_tensor_operation/gpu
${PROJECT_SOURCE_DIR}/external/include/half
)
# Source lists for the legacy standalone example executables, one variable per
# numbered example directory. Each variable is consumed by a matching
# add_executable() call below.
set(GEMM_XDL_SOURCE 1_gemm_xdl/gemm_xdl.cpp)
set(GEMM_XDL_BIAS_RELU_SOURCE 2_gemm_xdl_bias_relu/gemm_xdl_bias_relu.cpp)
set(GEMM_XDL_BIAS_RELU_ADD_SOURCE 3_gemm_xdl_bias_relu_add/gemm_xdl_bias_relu_add.cpp)
set(CONV2D_FWD_XDL_SOURCE 4_conv2d_fwd_xdl/conv2d_fwd_xdl.cpp)
set(CONV2D_FWD_XDL_BIAS_RELU_SOURCE 5_conv2d_fwd_xdl_bias_relu/conv2d_fwd_xdl_bias_relu.cpp)
set(CONV2D_FWD_XDL_BIAS_RELU_ADD_SOURCE 6_conv2d_fwd_xdl_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp)
set(CONV2D_FWD_XDL_BIAS_RELU_ATOMIC_ADD_SOURCE 7_conv2d_fwd_xdl_bias_relu_atomic_add/conv2d_fwd_xdl_bias_relu_atomic_add.cpp)
set(GEMM_XDL_ALPHA_BETA_SOURCE 8_gemm_xdl_alpha_beta/gemm_xdl_alpha_beta.cpp)
set(CONV2D_FWD_XDL_INT8_SOURCE 9_conv2d_fwd_xdl_int8/conv2d_fwd_xdl_int8.cpp)
set(CONV3D_FWD_XDL_SOURCE 10_conv3d_fwd_xdl/conv3d_fwd_xdl.cpp)
set(CONVND_FWD_XDL_SOURCE 11_convnd_fwd_xdl/convnd_fwd_xdl.cpp)
set(GROUPED_GEMM_XDL_SOURCE 12_grouped_gemm_xdl/grouped_gemm_xdl.cpp)
# Aggregate target: new-style examples (added via add_example_executable in the
# subdirectories below) hang off this so `make examples` builds them all.
add_custom_target(examples)
# Legacy standalone example executables, one per source list defined above.
# NOTE(review): these are NOT attached to the `examples` aggregate target -
# presumably intentional during the migration; confirm.
add_executable(gemm_xdl ${GEMM_XDL_SOURCE})
add_executable(grouped_gemm_xdl ${GROUPED_GEMM_XDL_SOURCE})
add_executable(gemm_xdl_bias_relu ${GEMM_XDL_BIAS_RELU_SOURCE})
add_executable(gemm_xdl_bias_relu_add ${GEMM_XDL_BIAS_RELU_ADD_SOURCE})
add_executable(conv2d_fwd_xdl ${CONV2D_FWD_XDL_SOURCE})
add_executable(conv2d_fwd_xdl_bias_relu ${CONV2D_FWD_XDL_BIAS_RELU_SOURCE})
add_executable(conv2d_fwd_xdl_bias_relu_add ${CONV2D_FWD_XDL_BIAS_RELU_ADD_SOURCE})
add_executable(conv2d_fwd_xdl_bias_relu_atomic_add ${CONV2D_FWD_XDL_BIAS_RELU_ATOMIC_ADD_SOURCE})
add_executable(gemm_xdl_alpha_beta ${GEMM_XDL_ALPHA_BETA_SOURCE})
add_executable(conv2d_fwd_xdl_int8 ${CONV2D_FWD_XDL_INT8_SOURCE})
add_executable(conv3d_fwd_xdl ${CONV3D_FWD_XDL_SOURCE})
add_executable(convnd_fwd_xdl ${CONVND_FWD_XDL_SOURCE})
# Register one example executable:
#   add_example_executable(<name> <src>...)
# compiles the remaining arguments (ARGN) into <name>, links it against the
# host_tensor support library, and attaches it to the aggregate `examples`
# target so `make examples` builds every registered example.
function(add_example_executable EXAMPLE_NAME)
    # STATUS routes the notice to stdout instead of stderr.
    message(STATUS "adding example ${EXAMPLE_NAME}")
    add_executable(${EXAMPLE_NAME} ${ARGN})
    target_link_libraries(${EXAMPLE_NAME} PRIVATE host_tensor)
    add_dependencies(examples ${EXAMPLE_NAME})
endfunction()
# Link every legacy standalone example against the host_tensor support library
# (mirrors what add_example_executable does for the new-style examples).
target_link_libraries(gemm_xdl PRIVATE host_tensor)
target_link_libraries(grouped_gemm_xdl PRIVATE host_tensor)
target_link_libraries(gemm_xdl_bias_relu PRIVATE host_tensor)
target_link_libraries(gemm_xdl_bias_relu_add PRIVATE host_tensor)
target_link_libraries(conv2d_fwd_xdl PRIVATE host_tensor)
target_link_libraries(conv2d_fwd_xdl_bias_relu PRIVATE host_tensor)
target_link_libraries(conv2d_fwd_xdl_bias_relu_add PRIVATE host_tensor)
target_link_libraries(conv2d_fwd_xdl_bias_relu_atomic_add PRIVATE host_tensor)
target_link_libraries(gemm_xdl_alpha_beta PRIVATE host_tensor)
target_link_libraries(conv2d_fwd_xdl_int8 PRIVATE host_tensor)
target_link_libraries(conv3d_fwd_xdl PRIVATE host_tensor)
target_link_libraries(convnd_fwd_xdl PRIVATE host_tensor)
# New-style numbered example directories; each provides its own CMakeLists
# that registers targets (expected to use add_example_executable).
add_subdirectory(01_gemm)
add_subdirectory(02_gemm_alpha_beta)
add_subdirectory(03_gemm_bias_relu)
add_subdirectory(04_gemm_bias_relu_add)
add_subdirectory(05_conv2d_fwd)
add_subdirectory(06_conv2d_fwd_bias_relu)
add_subdirectory(07_conv2d_fwd_bias_relu_add)
add_subdirectory(08_conv3d_fwd)
add_subdirectory(09_convnd_fwd)
add_subdirectory(10_conv2d_bwd_data)
add_subdirectory(11_conv2d_bwd_wgt)
add_subdirectory(12_reduce)
add_subdirectory(13_pool2d_fwd)
add_subdirectory(14_grouped_gemm)
# host_tensor support library used by every example above.
add_subdirectory(host_tensor)
# (diff artifact removed: original file lacked a trailing newline)
#ifndef CONV_IGEMM_FWD_V6R1_DLOPS_NCHW_KCYX_NKHW_HPP
#define CONV_IGEMM_FWD_V6R1_DLOPS_NCHW_KCYX_NKHW_HPP
#include <array>
#include <cstddef>
#include <functional>
#include <numeric>
#include <sstream>
#include <tuple>
#include <vector>
namespace ck {
namespace driver {
// Compile-time parameter set for the conv igemm fwd v6r1 DLOPS kernel
// (NCHW input / KCYX weight / NKHW output). GetCompileParameterString()
// renders every member as a -DCK_PARAM_* compiler define for kernel
// (re)compilation.
//
// NOTE: member declaration order is part of the interface - callers
// aggregate-initialize this struct positionally (see
// CalculateCompileParameterBasedOnTunable below). Do not reorder.
struct CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw
{
    // Render all members as one " -DCK_PARAM_<name>=<value>" string;
    // array-valued members are emitted as comma-separated lists.
    auto GetCompileParameterString() const
    {
        auto param = std::stringstream();

        // Append " -DCK_PARAM_<name>=<v0>,<v1>,..." for an integer array.
        auto append_array_param = [&param](const char* name, const auto& arr) {
            param << " -DCK_PARAM_" << name << "=";
            for(std::size_t i = 0; i < arr.size(); ++i)
            {
                if(i > 0)
                    param << ",";
                param << arr[i];
            }
        };

        param << " -DCK_PARAM_ABDataTypeEnum=" << ABDataTypeEnum
              << " -DCK_PARAM_AccDataTypeEnum=" << AccDataTypeEnum
              << " -DCK_PARAM_CDataTypeEnum=" << CDataTypeEnum
              << " -DCK_PARAM_BlockSize=" << BlockSize
              << " -DCK_PARAM_GN0=" << GN0
              << " -DCK_PARAM_GK1=" << GK1
              << " -DCK_PARAM_GM1PerBlockGM11=" << GM1PerBlockGM11
              << " -DCK_PARAM_GN1PerBlockGN11=" << GN1PerBlockGN11
              << " -DCK_PARAM_GK0PerBlock=" << GK0PerBlock
              << " -DCK_PARAM_BM1PerThreadBM11=" << BM1PerThreadBM11
              << " -DCK_PARAM_BN1PerThreadBN11=" << BN1PerThreadBN11
              << " -DCK_PARAM_BK0PerThread=" << BK0PerThread;

        append_array_param("BM10BN10ThreadClusterBM10Xs", BM10BN10ThreadClusterBM10Xs);
        append_array_param("BM10BN10ThreadClusterBN10Xs", BM10BN10ThreadClusterBN10Xs);
        append_array_param("ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1",
                           ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1);
        append_array_param("ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1",
                           ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1);
        append_array_param("ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1",
                           ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1);
        append_array_param("ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1",
                           ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1);
        append_array_param("BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1",
                           BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1);
        append_array_param("BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1",
                           BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1);
        append_array_param("BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1",
                           BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1);
        append_array_param("BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1",
                           BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1);

        // bools are emitted as 0/1 so the kernel source sees integer macros
        param << " -DCK_PARAM_CThreadTransferDstScalarPerVector="
              << CThreadTransferDstScalarPerVector
              << " -DCK_PARAM_HasMainKBlockLoop=" << static_cast<int>(HasMainKBlockLoop)
              << " -DCK_PARAM_HasDoubleTailKBlockLoop="
              << static_cast<int>(HasDoubleTailKBlockLoop);

        return param.str();
    }

    // All members default to sentinel "invalid" values; a default-constructed
    // instance signals "no valid parameters" (paired with a false flag by the
    // factory functions below).

    // data types of A/B operands, the accumulator, and the C output
    ck::DataTypeEnum_t ABDataTypeEnum  = ck::DataTypeEnum_t::Unknown;
    ck::DataTypeEnum_t AccDataTypeEnum = ck::DataTypeEnum_t::Unknown;
    ck::DataTypeEnum_t CDataTypeEnum   = ck::DataTypeEnum_t::Unknown;

    // launch / tiling scalars
    int BlockSize       = -1;
    int GN0             = -1;
    int GK1             = -1;
    int GM1PerBlockGM11 = -1;
    int GN1PerBlockGN11 = -1;
    int GK0PerBlock     = -1;
    int BM1PerThreadBM11 = -1;
    int BN1PerThreadBN11 = -1;
    int BK0PerThread     = -1;

    // thread-cluster factorizations
    std::array<int, 2> BM10BN10ThreadClusterBM10Xs = {-1, -1};
    std::array<int, 2> BM10BN10ThreadClusterBN10Xs = {-1, -1};

    // A-operand blockwise-copy lengths
    std::array<int, 5> ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1 = {
        -1, -1, -1, -1, -1};
    std::array<int, 5> ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1 = {
        -1, -1, -1, -1, -1};
    std::array<int, 5> ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 = {
        -1, -1, -1, -1, -1};
    std::array<int, 5> ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 = {
        -1, -1, -1, -1, -1};

    // B-operand blockwise-copy lengths
    std::array<int, 5> BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1 = {
        -1, -1, -1, -1, -1};
    std::array<int, 5> BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1 = {
        -1, -1, -1, -1, -1};
    std::array<int, 5> BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 = {
        -1, -1, -1, -1, -1};
    std::array<int, 5> BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 = {
        -1, -1, -1, -1, -1};

    // C-operand threadwise-copy vector width and K-loop shape flags
    int CThreadTransferDstScalarPerVector = -1;
    bool HasMainKBlockLoop       = false;
    bool HasDoubleTailKBlockLoop = false;
};
// One tuning-search candidate for the conv igemm fwd v6r1 DLOPS kernel.
// A tunable is expanded into a full CompileParameter... (adding derived
// fields such as AccDataTypeEnum and the K-loop flags) by
// ConvIgemmFwdV6r1DlopsNchwKcyxNkhw::CalculateCompileParameterBasedOnTunable.
// NOTE: member order is part of the interface - instances are
// aggregate-initialized positionally in generate_tunable_list_... below.
struct TunableConvIgemmFwdV6r1DlopsNchwKcyxNkhw
{
// operand / output data types
ck::DataTypeEnum_t ABDataTypeEnum;
ck::DataTypeEnum_t CDataTypeEnum;
// launch / tiling scalars
int BlockSize;
int GN0;
int GK1;
int GM1PerBlockGM11;
int GN1PerBlockGN11;
int GK0PerBlock;
int BM1PerThreadBM11;
int BN1PerThreadBN11;
int BK0PerThread;
// thread-cluster factorizations
std::array<int, 2> BM10BN10ThreadClusterBM10Xs;
std::array<int, 2> BM10BN10ThreadClusterBN10Xs;
// A-operand blockwise-copy lengths
std::array<int, 5> ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1;
std::array<int, 5> ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1;
std::array<int, 5> ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1;
std::array<int, 5> ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1;
// B-operand blockwise-copy lengths
std::array<int, 5> BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1;
std::array<int, 5> BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1;
std::array<int, 5> BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1;
std::array<int, 5> BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1;
};
// Built-in tuning table for the v6r1 DLOPS forward-convolution kernel,
// grouped by data type (fp32 / fp16 / int8). Each row aggregate-initializes
// a TunableConvIgemmFwdV6r1DlopsNchwKcyxNkhw in member-declaration order:
// {ABType, CType, BlockSize, GN0, GK1, GM11, GN11, GK0PerBlock,
//  BM11, BN11, BK0PerThread, BM10Xs, BN10Xs,
//  A slice, A cluster, A src-vec, A dst-vec,
//  B slice, B cluster, B src-vec, B dst-vec}.
// GetDefaultCompileParameter() scans this list in order and takes the first
// row that validates, so rows are effectively in preference order.
inline static auto generate_tunable_list_conv_igemm_fwd_v6r1_dlops_nchw_kcyx_nkhw()
{
constexpr auto f32 = ck::DataTypeEnum_t::Float;
constexpr auto f16 = ck::DataTypeEnum_t::Half;
constexpr auto i8 = ck::DataTypeEnum_t::Int8;
return std::vector<TunableConvIgemmFwdV6r1DlopsNchwKcyxNkhw>{
// clang-format off
// fp32
{f32, f32, 256, 1, 1, 128, 128, 16, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 2, 1}, {4, 1, 1, 64, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {2, 1, 1, 4, 1}, { 8, 1, 1, 32, 1}, {1, 1, 1, 4, 1}, {1, 1, 1, 4, 1}},
{f32, f32, 256, 1, 1, 128, 128, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 1}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 1}, { 8, 1, 1, 32, 1}, {1, 1, 1, 4, 1}, {1, 1, 1, 4, 1}},
{f32, f32, 256, 1, 1, 128, 128, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 1}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 1}, { 8, 1, 1, 32, 1}, {1, 1, 1, 2, 1}, {1, 1, 1, 4, 1}},
{f32, f32, 256, 1, 1, 128, 128, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 1}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 1}, { 8, 1, 1, 32, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 1}},
{f32, f32, 256, 1, 1, 128, 128, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 1}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {4, 1, 1, 1, 1}, { 2, 1, 1, 128, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
{f32, f32, 256, 2, 1, 128, 64, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 1}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {2, 2, 1, 1, 1}, { 4, 1, 1, 64, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
{f32, f32, 256, 4, 1, 128, 32, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 1}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 4, 1, 1, 1}, { 8, 1, 1, 32, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
{f32, f32, 256, 8, 1, 128, 16, 16, 4, 4, 1, {8, 2}, {8, 2}, {8, 1, 1, 1, 1}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 8, 1, 1, 1}, {16, 1, 1, 16, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
{f32, f32, 128, 1, 1, 64, 128, 8, 4, 4, 1, {4, 2}, {8, 2}, {4, 1, 1, 1, 1}, {2, 1, 1, 64, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {8, 1, 1, 1, 1}, { 1, 1, 1, 128, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
// fp16
{f16, f16, 256, 1, 2, 128, 128, 16, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 2, 2}, {4, 1, 1, 64, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {2, 1, 1, 4, 2}, { 8, 1, 1, 32, 1}, {1, 1, 1, 4, 1}, {1, 1, 1, 4, 1}},
{f16, f16, 256, 1, 2, 128, 128, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 2}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 2}, { 8, 1, 1, 32, 1}, {1, 1, 1, 4, 1}, {1, 1, 1, 4, 1}},
{f16, f16, 256, 1, 2, 128, 128, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 2}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 2}, { 8, 1, 1, 32, 1}, {1, 1, 1, 2, 1}, {1, 1, 1, 4, 1}},
{f16, f16, 256, 1, 2, 128, 128, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 2}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 2}, { 8, 1, 1, 32, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 1}},
{f16, f16, 256, 1, 2, 128, 128, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 2}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {4, 1, 1, 1, 2}, { 2, 1, 1, 128, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
{f16, f16, 256, 2, 2, 128, 64, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 2}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {2, 2, 1, 1, 2}, { 4, 1, 1, 64, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
{f16, f16, 256, 4, 2, 128, 32, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 2}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 4, 1, 1, 2}, { 8, 1, 1, 32, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
{f16, f16, 256, 8, 2, 128, 16, 16, 4, 4, 1, {8, 2}, {8, 2}, {8, 1, 1, 1, 2}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 8, 1, 1, 2}, {16, 1, 1, 16, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
{f16, f16, 128, 1, 2, 64, 128, 8, 4, 4, 1, {4, 2}, {8, 2}, {4, 1, 1, 1, 2}, {2, 1, 1, 64, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {8, 1, 1, 1, 2}, { 1, 1, 1, 128, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
// i8
{ i8, i8, 256, 1, 4, 128, 128, 16, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 2, 4}, {4, 1, 1, 64, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {2, 1, 1, 4, 4}, { 8, 1, 1, 32, 1}, {1, 1, 1, 4, 1}, {1, 1, 1, 4, 1}},
{ i8, i8, 256, 1, 4, 128, 128, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 4}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 4}, { 8, 1, 1, 32, 1}, {1, 1, 1, 4, 1}, {1, 1, 1, 4, 1}},
{ i8, i8, 256, 1, 4, 128, 128, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 4}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 4}, { 8, 1, 1, 32, 1}, {1, 1, 1, 2, 1}, {1, 1, 1, 4, 1}},
{ i8, i8, 256, 1, 4, 128, 128, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 4}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 4}, { 8, 1, 1, 32, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 1}},
{ i8, i8, 256, 1, 4, 128, 128, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 4}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {4, 1, 1, 1, 4}, { 2, 1, 1, 128, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
{ i8, i8, 256, 2, 4, 128, 64, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 4}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {2, 2, 1, 1, 4}, { 4, 1, 1, 64, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
{ i8, i8, 256, 4, 4, 128, 32, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 4}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 4, 1, 1, 4}, { 8, 1, 1, 32, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
{ i8, i8, 256, 8, 4, 128, 16, 16, 4, 4, 1, {8, 2}, {8, 2}, {8, 1, 1, 1, 4}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 8, 1, 1, 4}, {16, 1, 1, 16, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
{ i8, i8, 128, 1, 4, 64, 128, 8, 4, 4, 1, {4, 2}, {8, 2}, {4, 1, 1, 1, 4}, {2, 1, 1, 64, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {8, 1, 1, 1, 4}, { 1, 1, 1, 128, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}}
// clang-format on
};
}
// TODO make this common interface and write specs for it
// Driver-side description of the conv igemm fwd v6r1 DLOPS kernel for NCHW
// (input) / KCYX (weight) / NKHW (output) layouts: expands tunables into
// compile parameters, validates them against a convolution problem
// descriptor, and reports launch configuration (block/grid/workspace sizes).
struct ConvIgemmFwdV6r1DlopsNchwKcyxNkhw
{
    // Map a tunable onto concrete compile parameters for the given problem.
    // Returns {params, true} on success, or {default params, false} when the
    // tunable's data types or tiling do not fit the problem.
    static auto
    CalculateCompileParameterBasedOnTunable(const ConvolutionProblemDescriptor& conv_problem_desc,
                                            const TunableConvIgemmFwdV6r1DlopsNchwKcyxNkhw& tunable)
    {
        const int C  = conv_problem_desc.C;
        const int Y  = conv_problem_desc.Y;
        const int X  = conv_problem_desc.X;
        const int Ho = conv_problem_desc.Ho;
        const int Wo = conv_problem_desc.Wo;

        // in/wei/out data types must match what the tunable was built for
        if(!(conv_problem_desc.InDataTypeEnum == tunable.ABDataTypeEnum &&
             conv_problem_desc.WeiDataTypeEnum == tunable.ABDataTypeEnum &&
             conv_problem_desc.OutDataTypeEnum == tunable.CDataTypeEnum))
            return std::make_tuple(CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw{}, false);

        const auto ABDataTypeEnum = conv_problem_desc.InDataTypeEnum;
        const auto CDataTypeEnum  = conv_problem_desc.OutDataTypeEnum;

        // accumulate in f32 for f32/f16 operands, in i32 for int8 operands
        DataTypeEnum_t AccDataTypeEnum;

        if(ABDataTypeEnum == DataTypeEnum_t::Float || ABDataTypeEnum == DataTypeEnum_t::Half)
        {
            AccDataTypeEnum = DataTypeEnum_t::Float;
        }
        else if(ABDataTypeEnum == DataTypeEnum_t::Int8)
        {
            AccDataTypeEnum = DataTypeEnum_t::Int32;
        }
        else
        {
            return std::make_tuple(CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw{}, false);
        }

        const int BlockSize = tunable.BlockSize;

        const int GN0 = tunable.GN0;
        const int GK1 = tunable.GK1;

        const int GM11        = tunable.GM1PerBlockGM11;
        const int GN11        = tunable.GN1PerBlockGN11;
        const int GK0PerBlock = tunable.GK0PerBlock;

        const int BM11         = tunable.BM1PerThreadBM11;
        const int BN11         = tunable.BN1PerThreadBN11;
        const int BK0PerThread = tunable.BK0PerThread;

        const auto BM10BN10ThreadClusterBM10Xs = tunable.BM10BN10ThreadClusterBM10Xs;
        const auto BM10BN10ThreadClusterBN10Xs = tunable.BM10BN10ThreadClusterBN10Xs;

        const auto ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1 =
            tunable.ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1;
        const auto ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1 =
            tunable.ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1;
        const auto ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 =
            tunable.ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1;
        const auto ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 =
            tunable.ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1;

        const auto BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1 =
            tunable.BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1;
        const auto BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1 =
            tunable.BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1;
        const auto BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 =
            tunable.BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1;
        const auto BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 =
            tunable.BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1;

        // C threadwise copy: {BN11} or {BN} or {BN1} or {GN11} is Dst vector dim
        // (gcd here is presumably a ck variadic helper, not std::gcd - confirm)
        const int CThreadTransferDstScalarPerVector = gcd(4, GN11, BM11, Ho * Wo);

        // input channels are split as C = C0 * C1 with C0 == GK1
        const int C0 = GK1;

        if(!(C % C0 == 0))
            return std::make_tuple(CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw{}, false);

        const int C1 = C / C0;

        const int GK0 = C1 * Y * X;

        if(!(GK0 % GK0PerBlock == 0))
            return std::make_tuple(CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw{}, false);

        // shape of the double-buffered K loop (must match the same computation
        // in IsValidCompileParameter)
        const bool HasMainKBlockLoop       = ((GK0 + GK0PerBlock) / (2 * GK0PerBlock) > 1);
        const bool HasDoubleTailKBlockLoop = ((GK0 / GK0PerBlock) % 2 == 0);

        // positional aggregate init: order must match the member declaration
        // order of CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw
        return std::make_tuple(
            CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw{
                ABDataTypeEnum,
                AccDataTypeEnum,
                CDataTypeEnum,
                BlockSize,
                GN0,
                GK1,
                GM11,
                GN11,
                GK0PerBlock,
                BM11,
                BN11,
                BK0PerThread,
                BM10BN10ThreadClusterBM10Xs,
                BM10BN10ThreadClusterBN10Xs,
                ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1,
                ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1,
                ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1,
                ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1,
                BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1,
                BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1,
                BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1,
                BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1,
                CThreadTransferDstScalarPerVector,
                HasMainKBlockLoop,
                HasDoubleTailKBlockLoop},
            true);
    }

    // Scan the built-in tuning table and return {params, true} for the first
    // entry that both converts and validates for this problem; {default
    // params, false} when none fits.
    static auto GetDefaultCompileParameter(const ConvolutionProblemDescriptor& conv_problem_desc)
    {
        for(const auto& tunable : generate_tunable_list_conv_igemm_fwd_v6r1_dlops_nchw_kcyx_nkhw())
        {
            CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw compile_param{};
            bool found = false;

            std::tie(compile_param, found) =
                CalculateCompileParameterBasedOnTunable(conv_problem_desc, tunable);

            if(found && IsValidCompileParameter(conv_problem_desc, compile_param))
                return std::make_tuple(compile_param, true);
        }

        return std::make_tuple(CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw{}, false);
    }

    // True when at least one built-in tunable produces valid compile
    // parameters for this problem.
    static bool IsApplicable(const ConvolutionProblemDescriptor& conv_problem_desc)
    {
        bool found = false;

        std::tie(std::ignore, found) = GetDefaultCompileParameter(conv_problem_desc);

        return found;
    }

    // Exhaustive legality check of a compile-parameter set against a problem:
    // data types, gridwise contraction divisibility, A/B blockwise-copy
    // cluster/slice/vector constraints, blockwise GEMM shape, and C
    // threadwise-copy vectorization.
    static bool
    IsValidCompileParameter(const ConvolutionProblemDescriptor& conv_problem_desc,
                            const CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw& compile_param)
    {
        const int N  = conv_problem_desc.N;
        const int K  = conv_problem_desc.K;
        const int C  = conv_problem_desc.C;
        const int Y  = conv_problem_desc.Y;
        const int X  = conv_problem_desc.X;
        const int Ho = conv_problem_desc.Ho;
        const int Wo = conv_problem_desc.Wo;

        const int GK1  = compile_param.GK1;
        const int GN0  = compile_param.GN0;
        const int GM11 = compile_param.GM1PerBlockGM11;
        const int GN11 = compile_param.GN1PerBlockGN11;
        const int BM11 = compile_param.BM1PerThreadBM11;
        const int BN11 = compile_param.BN1PerThreadBN11;

        // channel and batch splits: C = C0 * C1 (C0 == GK1), N = N0 * N1
        const int C0 = GK1;
        const int N0 = GN0;

        if(!(C % C0 == 0))
            return false;

        const int C1 = C / C0;

        if(!(N % N0 == 0))
            return false;

        const int N1 = N / N0;

        // GEMM view of the convolution
        const int GM0 = 1;
        const int GM1 = K;
        const int GN1 = N1 * Ho * Wo;
        const int GK0 = C1 * Y * X;

        // check data type
        {
            if(!(conv_problem_desc.InDataTypeEnum == conv_problem_desc.WeiDataTypeEnum &&
                 conv_problem_desc.InDataTypeEnum == compile_param.ABDataTypeEnum))
                return false;

            if(compile_param.ABDataTypeEnum == DataTypeEnum_t::Float ||
               compile_param.ABDataTypeEnum == DataTypeEnum_t::Half)
            {
                if(!(compile_param.AccDataTypeEnum == DataTypeEnum_t::Float))
                    return false;
            }
            else if(compile_param.ABDataTypeEnum == DataTypeEnum_t::Int8)
            {
                if(!(compile_param.AccDataTypeEnum == DataTypeEnum_t::Int32))
                    return false;
            }
        }

        // check gridwise contraction
        {
            if(!(GM1 % GM11 == 0 && GN1 % GN11 == 0 && GK0 % compile_param.GK0PerBlock == 0))
                return false;

            // K-loop flags must agree with how the parameters were derived
            const bool has_main_k_block_loop =
                ((GK0 + compile_param.GK0PerBlock) / (2 * compile_param.GK0PerBlock) > 1);

            const bool has_double_tail_k_block_loop = ((GK0 / compile_param.GK0PerBlock) % 2 == 0);

            if(!(has_main_k_block_loop == compile_param.HasMainKBlockLoop &&
                 has_double_tail_k_block_loop == compile_param.HasDoubleTailKBlockLoop))
                return false;
        }

        // check A blockwise copy
        {
            const auto block_slice_lengths =
                std::array<int, 5>{compile_param.GK0PerBlock, GM0, 1, GM11, GK1};
            const auto& cluster_lengths =
                compile_param.ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1;
            const auto& thread_slice_lengths =
                compile_param.ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1;
            const auto& src_vector_lengths =
                compile_param.ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1;
            const auto& dst_vector_lengths =
                compile_param.ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1;

            // check number of working thread
            const int num_work_thread = std::accumulate(
                cluster_lengths.begin(), cluster_lengths.end(), 1, std::multiplies<int>{});

            if(!(compile_param.BlockSize >= num_work_thread))
                return false;

            // check block slice lengths vs thread slice lengths vs cluster lengths
            for(int i = 0; i < 5; ++i)
            {
                if(!(cluster_lengths[i] * thread_slice_lengths[i] == block_slice_lengths[i]))
                    return false;
            }

            // check thread slice lengths vs vector lengths
            for(int i = 0; i < 5; ++i)
            {
                if(!(thread_slice_lengths[i] % src_vector_lengths[i] == 0))
                    return false;

                if(!(thread_slice_lengths[i] % dst_vector_lengths[i] == 0))
                    return false;
            }

            // check Src vectorization, GK0 is global mem vector dim
            if(!(src_vector_lengths[1] == 1 && src_vector_lengths[2] == 1 &&
                 src_vector_lengths[3] == 1 && src_vector_lengths[4] == 1))
                return false;

            // check Dst vectorization, {GM11, GK1} are LDS vector dims
            if(dst_vector_lengths[4] == GK1)
            { // vectorize on {GM11, GK1}
                if(!(GM11 % dst_vector_lengths[3] == 0))
                    return false;
            }
            else
            { // vectorize on {GK1} only
                if(!(GK1 % dst_vector_lengths[4] == 0))
                    return false;

                if(!(dst_vector_lengths[3] == 1))
                    return false;
            }
        }

        // check B blockwise copy
        {
            const auto block_slice_lengths =
                std::array<int, 5>{compile_param.GK0PerBlock, GN0, 1, GN11, GK1};
            const auto& cluster_lengths =
                compile_param.BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1;
            const auto& thread_slice_lengths =
                compile_param.BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1;
            const auto& src_vector_lengths =
                compile_param.BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1;
            const auto& dst_vector_lengths =
                compile_param.BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1;

            // check number of working thread
            const int num_work_thread = std::accumulate(
                cluster_lengths.begin(), cluster_lengths.end(), 1, std::multiplies<int>{});

            if(!(compile_param.BlockSize >= num_work_thread))
                return false;

            // check block slice lengths vs thread slice lengths vs cluster lengths
            for(int i = 0; i < 5; ++i)
            {
                if(!(cluster_lengths[i] * thread_slice_lengths[i] == block_slice_lengths[i]))
                    return false;
            }

            // check thread slice lengths vs vector lengths
            for(int i = 0; i < 5; ++i)
            {
                if(!(thread_slice_lengths[i] % src_vector_lengths[i] == 0 &&
                     thread_slice_lengths[i] % dst_vector_lengths[i] == 0))
                    return false;
            }

            // check Src vectorization: {GN11} is global mem vector dim
            if(!(src_vector_lengths[0] == 1 && src_vector_lengths[1] == 1 &&
                 src_vector_lengths[2] == 1 && src_vector_lengths[4] == 1))
                return false;

            // check Src tensor layout related vectorization
            if(Y == 1 && X == 1 && conv_problem_desc.ConvStrideH == 1 &&
               conv_problem_desc.ConvStrideW == 1 && conv_problem_desc.InLeftPadH == 0 &&
               conv_problem_desc.InLeftPadW == 0 && conv_problem_desc.InRightPadH == 0 &&
               conv_problem_desc.InRightPadW == 0)
            {
                // 1x1, stride-1, unpadded: whole Ho*Wo plane is contiguous
                if(!((Ho * Wo) % src_vector_lengths[3] == 0))
                    return false;
            }
            else if(conv_problem_desc.ConvStrideW == 1 && conv_problem_desc.InLeftPadW == 0 &&
                    conv_problem_desc.InRightPadW == 0)
            {
                // only rows are contiguous
                if(!(Wo % src_vector_lengths[3] == 0))
                    return false;
            }
            else
            {
                if(!(src_vector_lengths[3] == 1))
                    return false;
            }

            // check Dst vectorization: {GN11, GK1} are LDS vector dims
            if(dst_vector_lengths[4] == GK1)
            { // vectorize on {GN11, GK1}
                if(!(GN11 % dst_vector_lengths[3] == 0))
                    return false;
            }
            else
            { // vectorize on {GK1} only
                if(!(dst_vector_lengths[3] == 1))
                    return false;

                if(!(GK1 % dst_vector_lengths[4] == 0))
                    return false;
            }
        }

        // check blockwise GEMM
        {
            const int BM10 = std::accumulate(compile_param.BM10BN10ThreadClusterBM10Xs.begin(),
                                             compile_param.BM10BN10ThreadClusterBM10Xs.end(),
                                             1,
                                             std::multiplies<int>{});

            const int BN10 = std::accumulate(compile_param.BM10BN10ThreadClusterBN10Xs.begin(),
                                             compile_param.BM10BN10ThreadClusterBN10Xs.end(),
                                             1,
                                             std::multiplies<int>{});

            if(!(compile_param.BlockSize == BM10 * BN10))
                return false;

            const int BM = GM0 * GM11;
            const int BN = GN0 * GN11;

            const int BM1 = BM10 * BM11;
            const int BN1 = BN10 * BN11;

            if(!(BM % BM1 == 0 && BN % BN1 == 0))
                return false;

            const int BM0 = BM / BM1;
            const int BN0 = BN / BN1;

            // blockwise GEMM currently only support BM0 == 2 && BN0 == 2
            if(!(BM0 == 2 && BN0 == 2))
                return false;

            if(!(compile_param.GK0PerBlock % compile_param.BK0PerThread == 0))
                return false;
        }

        // check C threadwise copy
        {
            // {BN11} or {BN} or {BN1} or {GN11} is Dst vector dim
            const int dst_vector_len_gn11 = compile_param.CThreadTransferDstScalarPerVector;

            // check slice length vs Dst vector length:
            if(!(BN11 % dst_vector_len_gn11 == 0 && GN11 % dst_vector_len_gn11 == 0))
                return false;

            // check Dst memory layout related vectorization:
            if(!((Ho * Wo) % compile_param.CThreadTransferDstScalarPerVector == 0))
                return false;
        }

        return true;
    }

    // Thread-block size the kernel should be launched with.
    static int GetBlockSize(const ConvolutionProblemDescriptor&,
                            const CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw& compile_param)
    {
        return compile_param.BlockSize;
    }

    // Number of thread blocks: one per (GM10, GN10) output tile.
    // Assumes divisibility already established by IsValidCompileParameter.
    static int GetGridSize(const ConvolutionProblemDescriptor& conv_problem_desc,
                           const CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw& compile_param)
    {
        const int N  = conv_problem_desc.N;
        const int K  = conv_problem_desc.K;
        const int Ho = conv_problem_desc.Ho;
        const int Wo = conv_problem_desc.Wo;

        const int N0 = compile_param.GN0;
        const int N1 = N / N0;

        const int GM1 = K;
        const int GN1 = N1 * Ho * Wo;

        const int GM11 = compile_param.GM1PerBlockGM11;
        const int GN11 = compile_param.GN1PerBlockGN11;

        const int GM10 = GM1 / GM11;
        const int GN10 = GN1 / GN11;

        return GM10 * GN10;
    }

    static std::size_t GetWorkSpaceSize(const ConvolutionProblemDescriptor&,
                                        const CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw&)
    {
        // workspace is used for save transformed tensor descritpors created by prepare kernel
        return 4096L;
    }

    static std::size_t GetMaxWorkSpaceSize(const ConvolutionProblemDescriptor&) { return 4096L; }

    static auto GetTunableList()
    {
        return generate_tunable_list_conv_igemm_fwd_v6r1_dlops_nchw_kcyx_nkhw();
    }
};
} // namespace driver
} // namespace ck
#endif
#ifndef CONV_TUNABLE_FWD_V4R4_DLOPS_NCHW_KCYX_NKHW_HPP
#define CONV_TUNABLE_FWD_V4R4_DLOPS_NCHW_KCYX_NKHW_HPP
// Tunable parameter set for the dynamic conv fwd v4r4 DLOPS kernel
// (NCHW/KCYX/NKHW layouts).
// NOTE: member order is part of the interface - the default instance below
// aggregate-initializes this struct positionally.
struct tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw
{
// launch / tiling scalars
int BlockSize;
int MPerBlock;
int NPerBlock;
int KPerBlock;
int M1PerThread;
int N1PerThread;
int KPerThread;
// thread-cluster factorization
int M1N1ThreadClusterM10;
int M1N1ThreadClusterN10;
int M1N1ThreadClusterM11;
int M1N1ThreadClusterN11;
// A-operand blockwise-copy configuration
std::array<int, 3> ABlockTransferThreadSliceLengths_K_M0_M1;
std::array<int, 3> ABlockTransferThreadClusterLengths_K_M0_M1;
std::array<int, 3> ABlockTransferThreadClusterArrangeOrder;
std::array<int, 3> ABlockTransferSrcAccessOrder;
int ABlockTransferSrcVectorDim;
int ABlockTransferSrcScalarPerVector;
int ABlockTransferDstScalarPerVector_M1;
bool AThreadTransferSrcResetCoordinateAfterRun;
// B-operand blockwise-copy configuration
std::array<int, 3> BBlockTransferThreadSliceLengths_K_N0_N1;
std::array<int, 3> BBlockTransferThreadClusterLengths_K_N0_N1;
std::array<int, 3> BBlockTransferThreadClusterArrangeOrder;
std::array<int, 3> BBlockTransferSrcAccessOrder;
int BBlockTransferSrcVectorDim;
int BBlockTransferSrcScalarPerVector;
int BBlockTransferDstScalarPerVector_N1;
bool BThreadTransferSrcResetCoordinateAfterRun;
// C-operand threadwise-copy configuration
std::array<int, 6> CThreadTransferSrcDstAccessOrder;
int CThreadTransferSrcDstVectorDim;
int CThreadTransferDstScalarPerVector;
};
// Default tuning configuration for the v4r4 dlops NCHW kernel, annotated per
// field (matching the commented style used by the xdlops defaults).
static tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw
    default_tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw = {
        256,                // BlockSize
        128,                // MPerBlock
        128,                // NPerBlock
        8,                  // KPerBlock
        4,                  // M1PerThread
        4,                  // N1PerThread
        1,                  // KPerThread
        8,                  // M1N1ThreadClusterM10
        8,                  // M1N1ThreadClusterN10
        2,                  // M1N1ThreadClusterM11
        2,                  // M1N1ThreadClusterN11
        {4, 1, 1},          // ABlockTransferThreadSliceLengths_K_M0_M1
        {2, 1, 128},        // ABlockTransferThreadClusterLengths_K_M0_M1
        {2, 1, 0},          // ABlockTransferThreadClusterArrangeOrder
        {2, 1, 0},          // ABlockTransferSrcAccessOrder
        0,                  // ABlockTransferSrcVectorDim
        4,                  // ABlockTransferSrcScalarPerVector
        1,                  // ABlockTransferDstScalarPerVector_M1
        false,              // AThreadTransferSrcResetCoordinateAfterRun
        {4, 1, 1},          // BBlockTransferThreadSliceLengths_K_N0_N1
        {2, 1, 128},        // BBlockTransferThreadClusterLengths_K_N0_N1
        {0, 1, 2},          // BBlockTransferThreadClusterArrangeOrder
        {0, 1, 2},          // BBlockTransferSrcAccessOrder
        2,                  // BBlockTransferSrcVectorDim
        1,                  // BBlockTransferSrcScalarPerVector
        1,                  // BBlockTransferDstScalarPerVector_N1
        false,              // BThreadTransferSrcResetCoordinateAfterRun
        {3, 4, 5, 0, 1, 2}, // CThreadTransferSrcDstAccessOrder
        5,                  // CThreadTransferSrcDstVectorDim
        1                   // CThreadTransferDstScalarPerVector
};
#endif
#ifndef CONV_TUNABLE_FWD_V4R4_XDLOPS_NCHW_KCYX_NKHW_HPP
#define CONV_TUNABLE_FWD_V4R4_XDLOPS_NCHW_KCYX_NKHW_HPP
// Tuning-parameter set for the v4r4 xdlops forward convolution kernel
// (NCHW input, KCYX weight, NKHW output layout). A plain aggregate filled by
// positional brace-initialization, so field order is part of the contract.
struct tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw
{
    // workgroup size, per-block GEMM tile sizes, and per-XDL (MFMA) tiling
    int BlockSize;
    int MPerBlock;
    int NPerBlock;
    int KPerBlock;
    int MPerXDL;
    int NPerXDL;
    int K1;
    int MRepeat;
    int NRepeat;
    // blockwise transfer of the A matrix: slice/cluster shapes and access orders
    std::array<int, 3> ABlockTransferThreadSliceLengths_K0_M_K1;
    std::array<int, 3> ABlockTransferThreadClusterLengths_K0_M_K1;
    std::array<int, 3> ABlockTransferThreadClusterArrangeOrder;
    std::array<int, 3> ABlockTransferSrcAccessOrder;
    int ABlockTransferSrcVectorDim;
    int ABlockTransferSrcScalarPerVector;
    int ABlockTransferDstScalarPerVector_K1;
    bool AThreadTransferSrcResetCoordinateAfterRun;
    // blockwise transfer of the B matrix: slice/cluster shapes and access orders
    std::array<int, 3> BBlockTransferThreadSliceLengths_K0_N_K1;
    std::array<int, 3> BBlockTransferThreadClusterLengths_K0_N_K1;
    std::array<int, 3> BBlockTransferThreadClusterArrangeOrder;
    std::array<int, 3> BBlockTransferSrcAccessOrder;
    int BBlockTransferSrcVectorDim;
    int BBlockTransferSrcScalarPerVector;
    int BBlockTransferDstScalarPerVector_K1;
    bool BThreadTransferSrcResetCoordinateAfterRun;
    // threadwise transfer of the C result
    std::array<int, 8> CThreadTransferSrcDstAccessOrder;
    int CThreadTransferSrcDstVectorDim;
    int CThreadTransferDstScalarPerVector;
};
// Default tuning configuration for the v4r4 xdlops NCHW kernel.
static tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw
    default_tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw = {
        256,         // BlockSize
        128,         // MPerBlock
        128,         // NPerBlock
        4,           // KPerBlock
        32,          // MPerXDL
        32,          // NPerXDL
        4,           // K1
        2,           // MRepeat
        2,           // NRepeat
        {1, 2, 4},   // ABlockTransferThreadSliceLengths_K0_M_K1
        {4, 64, 1},  // ABlockTransferThreadClusterLengths_K0_M_K1
        {1, 0, 2},   // ABlockTransferThreadClusterArrangeOrder
        {1, 0, 2},   // ABlockTransferSrcAccessOrder
        2,           // ABlockTransferSrcVectorDim
        1,           // ABlockTransferSrcScalarPerVector
        4,           // ABlockTransferDstScalarPerVector_K1
        false,       // AThreadTransferSrcResetCoordinateAfterRun
        {1, 2, 4},   // BBlockTransferThreadSliceLengths_K0_N_K1
        {4, 64, 1},  // BBlockTransferThreadClusterLengths_K0_N_K1
        {0, 2, 1},   // BBlockTransferThreadClusterArrangeOrder
        {1, 0, 2},   // BBlockTransferSrcAccessOrder
        1,           // BBlockTransferSrcVectorDim
        1,           // BBlockTransferSrcScalarPerVector
        4,           // BBlockTransferDstScalarPerVector_K1
        false,       // BThreadTransferSrcResetCoordinateAfterRun
        {3, 0, 1, 2, 7, 5, 4, 6}, // CThreadTransferSrcDstAccessOrder
        7,           // CThreadTransferSrcDstVectorDim
        1            // CThreadTransferDstScalarPerVector
};
#endif
#ifndef CONV_TUNABLE_FWD_V4R4_XDLOPS_NHWC_KYXC_NHWK_HPP
#define CONV_TUNABLE_FWD_V4R4_XDLOPS_NHWC_KYXC_NHWK_HPP
// Tuning-parameter set for the v4r4 xdlops forward convolution kernel
// (NHWC input, KYXC weight, NHWK output layout). A plain aggregate filled by
// positional brace-initialization, so field order is part of the contract.
struct tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk
{
    // workgroup size, per-block GEMM tile sizes, and per-wave tiling
    int BlockSize;
    int MPerBlock;
    int NPerBlock;
    int KPerBlock;
    int MPerWave;
    int NPerWave;
    int K1;
    int MRepeat;
    int NRepeat;
    // blockwise transfer of the A matrix: slice/cluster shapes and access orders
    std::array<int, 3> ABlockTransferThreadSliceLengths_K0_M_K1;
    std::array<int, 3> ABlockTransferThreadClusterLengths_K0_M_K1;
    std::array<int, 3> ABlockTransferThreadClusterArrangeOrder;
    std::array<int, 3> ABlockTransferSrcAccessOrder;
    int ABlockTransferSrcVectorDim;
    int ABlockTransferSrcScalarPerVector;
    int ABlockTransferDstScalarPerVector_K1;
    bool AThreadTransferSrcResetCoordinateAfterRun;
    // blockwise transfer of the B matrix: slice/cluster shapes and access orders
    std::array<int, 3> BBlockTransferThreadSliceLengths_K0_N_K1;
    std::array<int, 3> BBlockTransferThreadClusterLengths_K0_N_K1;
    std::array<int, 3> BBlockTransferThreadClusterArrangeOrder;
    std::array<int, 3> BBlockTransferSrcAccessOrder;
    int BBlockTransferSrcVectorDim;
    int BBlockTransferSrcScalarPerVector;
    int BBlockTransferDstScalarPerVector_K1;
    bool BThreadTransferSrcResetCoordinateAfterRun;
    // threadwise transfer of the C result
    std::array<int, 8> CThreadTransferSrcDstAccessOrder;
    int CThreadTransferSrcDstVectorDim;
    int CThreadTransferDstScalarPerVector;
};
// Default tuning configuration for the v4r4 xdlops NHWC kernel.
static tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk
    default_tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk = {
        256,         // BlockSize
        128,         // MPerBlock
        128,         // NPerBlock
        4,           // KPerBlock
        32,          // MPerWave
        32,          // NPerWave
        4,           // K1
        2,           // MRepeat
        2,           // NRepeat
        {1, 2, 4},   // ABlockTransferThreadSliceLengths_K0_M_K1
        {4, 64, 1},  // ABlockTransferThreadClusterLengths_K0_M_K1
        {1, 0, 2},   // ABlockTransferThreadClusterArrangeOrder
        {1, 0, 2},   // ABlockTransferSrcAccessOrder
        2,           // ABlockTransferSrcVectorDim
        4,           // ABlockTransferSrcScalarPerVector
        4,           // ABlockTransferDstScalarPerVector_K1
        false,       // AThreadTransferSrcResetCoordinateAfterRun
        {1, 2, 4},   // BBlockTransferThreadSliceLengths_K0_N_K1
        {4, 64, 1},  // BBlockTransferThreadClusterLengths_K0_N_K1
        {1, 0, 2},   // BBlockTransferThreadClusterArrangeOrder
        {1, 0, 2},   // BBlockTransferSrcAccessOrder
        2,           // BBlockTransferSrcVectorDim
        4,           // BBlockTransferSrcScalarPerVector
        4,           // BBlockTransferDstScalarPerVector_K1
        false,       // BThreadTransferSrcResetCoordinateAfterRun
        {2, 3, 0, 1, 7, 5, 4, 6}, // CThreadTransferSrcDstAccessOrder
        7,           // CThreadTransferSrcDstVectorDim
        1            // CThreadTransferDstScalarPerVector
};
#endif
#ifndef CONVOLUTION_PROBLEM_DESCRIPTOR
#define CONVOLUTION_PROBLEM_DESCRIPTOR
namespace ck {
namespace driver {
// Plain-old-data descriptor of a 2d forward convolution problem:
// tensor extents (NCHW-style naming), filter strides/dilations, input
// paddings, and the element data types of the input/weight/output tensors.
struct ConvolutionProblemDescriptor
{
    ConvolutionProblemDescriptor() = default;

    ConvolutionProblemDescriptor(int N_,
                                 int K_,
                                 int C_,
                                 int Y_,
                                 int X_,
                                 int Hi_,
                                 int Wi_,
                                 int Ho_,
                                 int Wo_,
                                 int ConvStrideH_,
                                 int ConvStrideW_,
                                 int ConvDilationH_,
                                 int ConvDilationW_,
                                 int InLeftPadH_,
                                 int InLeftPadW_,
                                 int InRightPadH_,
                                 int InRightPadW_,
                                 ck::DataTypeEnum_t InDataTypeEnum_,
                                 ck::DataTypeEnum_t WeiDataTypeEnum_,
                                 ck::DataTypeEnum_t OutDataTypeEnum_)
        : N{N_},
          K{K_},
          C{C_},
          Y{Y_},
          X{X_},
          Hi{Hi_},
          Wi{Wi_},
          Ho{Ho_},
          Wo{Wo_},
          ConvStrideH{ConvStrideH_},
          ConvStrideW{ConvStrideW_},
          ConvDilationH{ConvDilationH_},
          ConvDilationW{ConvDilationW_},
          InLeftPadH{InLeftPadH_},
          InLeftPadW{InLeftPadW_},
          InRightPadH{InRightPadH_},
          InRightPadW{InRightPadW_},
          InDataTypeEnum{InDataTypeEnum_},
          WeiDataTypeEnum{WeiDataTypeEnum_},
          OutDataTypeEnum{OutDataTypeEnum_}
    {
    }

    int N;  // batch size
    int K;  // output channels (number of filters)
    int C;  // input channels
    int Y;  // filter height
    int X;  // filter width
    int Hi; // input height
    int Wi; // input width
    int Ho; // output height
    int Wo; // output width
    int ConvStrideH;
    int ConvStrideW;
    int ConvDilationH;
    int ConvDilationW;
    int InLeftPadH;
    int InLeftPadW;
    int InRightPadH;
    int InRightPadW;
    ck::DataTypeEnum_t InDataTypeEnum;
    ck::DataTypeEnum_t WeiDataTypeEnum;
    ck::DataTypeEnum_t OutDataTypeEnum;

    // Total floating-point work: 2 * N * K * C * Y * X * Ho * Wo (one multiply
    // plus one add per MAC). Accumulate in std::size_t from the first factor so
    // the product is evaluated in 64-bit unsigned arithmetic; the previous
    // `2L * ...` evaluated the whole product in `long`, which is 32-bit on
    // LLP64 platforms (e.g. Windows) and overflows for realistic problem sizes.
    std::size_t CalculateFlop() const
    {
        return static_cast<std::size_t>(2) * N * K * C * Y * X * Ho * Wo;
    }
};
} // namespace driver
} // namespace ck
#endif
#ifndef CK_SOLVER_COMMON_HPP
#define CK_SOLVER_COMMON_HPP
namespace ck {
namespace driver {
// greatest common divisor, aka highest common factor
// Greatest common divisor, aka highest common factor, of two integers.
// Signs are ignored (gcd(-4, 6) == 2) and gcd(0, 0) == 0 by convention,
// matching the recursive formulation this replaces.
// NOTE(review): negating INT_MIN overflows — assumes |x| and |y| fit in int.
inline int gcd(int x, int y)
{
    // Work with magnitudes so the Euclidean loop below sees non-negatives.
    if(x < 0)
    {
        x = -x;
    }
    if(y < 0)
    {
        y = -y;
    }

    // Iterative Euclidean algorithm: gcd(x, y) == gcd(y, x mod y).
    while(y != 0)
    {
        const int remainder = x % y;

        x = y;
        y = remainder;
    }

    return x;
}
// Variadic overload: folds the two-argument gcd over three or more values,
// i.e. gcd(a, b, c, ...) == gcd(a, gcd(b, c, ...)). The enable_if constraint
// (at least two trailing arguments, so three in total) keeps the two-argument
// overload above as the base case of the recursion.
template <typename T,
          typename... Rest,
          typename std::enable_if<sizeof...(Rest) >= 2, bool>::type = false>
auto gcd(T first, Rest... rest)
{
    return gcd(first, gcd(rest...));
}
} // namespace driver
} // namespace ck
#endif
......@@ -151,6 +151,12 @@
#define CK_WORKAROUND_SWDEV_XXXXXX_THREAD_WISE_COPY_V1R5_TYPE_CONVERT_ISSUE 1
#endif
// workaround for verification failure, caused by a compiler regression, in conv bwd-data fp16
// with some tuning parameters
#ifndef CK_WORKAROUND_SWDEV_325164
#define CK_WORKAROUND_SWDEV_325164 1
#endif
namespace ck {
enum InMemoryDataOperationEnum_t
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment