remove online compilation from CK

ae98b52a · Chao Liu · cb954213 · ae98b52a · cb954213 · cb954213
Commit ae98b52a authored Aug 07, 2021 by Chao Liu
20 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
 cmake_minimum_required(VERSION 2.8.3)
-project(modular_convolution)
+project(composable_kernel)
 list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
-include(TargetFlags)
-include(AddKernels)
 ## C++
 enable_language(CXX)
 set(CMAKE_CXX_STANDARD 17)

--- a/cmake/AddKernels.cmake
+++ b/cmake/AddKernels.cmake
-function(add_kernels SRC_DIR KERNEL_FILES)
-    set(INIT_KERNELS_LIST)
-    set(KERNELS_DECLS)
-    foreach(KERNEL_FILE ${KERNEL_FILES})
-        if("${CMAKE_VERSION}" VERSION_LESS 3.0)
-            configure_file(${KERNEL_FILE} ${KERNEL_FILE}.delete)
-        else()
-            set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${KERNEL_FILE})
-        endif()
-        get_filename_component(BASE_NAME ${KERNEL_FILE} NAME_WE)
-        string(TOUPPER "${BASE_NAME}" KEY_NAME)
-        string(MAKE_C_IDENTIFIER "${KEY_NAME}" VAR_NAME)
-	string(APPEND KERNELS_DECLS "extern const size_t APP_KERNEL_${VAR_NAME}_SIZE;\n")
-	string(APPEND KERNELS_DECLS "extern const unsigned char APP_KERNEL_${VAR_NAME}[];\n")
-	list(APPEND INIT_KERNELS_LIST "    { \"${KEY_NAME}\", std::string(reinterpret_cast<const char*>(APP_KERNEL_${VAR_NAME}), APP_KERNEL_${VAR_NAME}_SIZE) }")
-    endforeach()
-    string(REPLACE ";" ",\n" INIT_KERNELS "${INIT_KERNELS_LIST}")
-    configure_file(${SRC_DIR}/kernel.cpp.in ${PROJECT_BINARY_DIR}/kernel.cpp)
-endfunction()
-function(add_kernel_includes SRC_DIR KERNEL_FILES)
-    set(INIT_KERNELS_LIST)
-    foreach(KERNEL_FILE ${KERNEL_FILES})
-        if("${CMAKE_VERSION}" VERSION_LESS 3.0)
-            configure_file(${KERNEL_FILE} ${KERNEL_FILE}.delete)
-        else()
-            set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${KERNEL_FILE})
-        endif()
-        get_filename_component(BASE_NAME ${KERNEL_FILE} NAME_WE)
-        get_filename_component(FILE_NAME ${KERNEL_FILE} NAME)
-        string(TOUPPER "${BASE_NAME}" KEY_NAME)
-        string(MAKE_C_IDENTIFIER "${KEY_NAME}" VAR_NAME)
-        list(APPEND INIT_KERNELS_LIST "    { \"${FILE_NAME}\", std::string(reinterpret_cast<const char*>(${VAR_NAME}), ${VAR_NAME}_SIZE) }")
-    endforeach()
-    string(REPLACE ";" ",\n" INIT_KERNELS "${INIT_KERNELS_LIST}")
-    configure_file(${SRC_DIR}/kernel_includes.cpp.in ${PROJECT_BINARY_DIR}/kernel_includes.cpp)
-endfunction()
--- a/cmake/TargetFlags.cmake
+++ b/cmake/TargetFlags.cmake
-function(get_target_property2 VAR TARGET PROPERTY)
-    get_target_property(_pflags ${TARGET} ${PROPERTY})
-    if(_pflags)
-        set(${VAR} ${_pflags} PARENT_SCOPE)
-    else()
-        set(${VAR} "" PARENT_SCOPE)
-    endif()
-endfunction()
-macro(append_flags FLAGS TARGET PROPERTY PREFIX)
-    get_target_property2(_pflags ${TARGET} ${PROPERTY})
-    foreach(FLAG ${_pflags})
-        if(TARGET ${FLAG})
-            target_flags(_pflags2 ${FLAG})
-            string(APPEND ${FLAGS} " ${_pflags2}")
-        else()
-            string(APPEND ${FLAGS} " ${PREFIX}${FLAG}")
-        endif()
-    endforeach()
-endmacro()
-macro(append_link_flags FLAGS TARGET PROPERTY)
-    get_target_property2(_pflags ${TARGET} ${PROPERTY})
-    foreach(FLAG ${_pflags})
-        if(TARGET ${FLAG})
-            target_flags(_pflags2 ${FLAG})
-            string(APPEND ${FLAGS} " ${_pflags2}")
-        elseif(FLAG MATCHES "^-.*")
-            string(APPEND ${FLAGS} " ${FLAG}")
-        elseif(EXISTS ${FLAG})
-            string(APPEND ${FLAGS} " ${FLAG}")
-        else()
-            string(APPEND ${FLAGS} " -l${FLAG}")
-        endif()
-    endforeach()
-endmacro()
-function(target_flags FLAGS TARGET)
-    set(_flags)
-    append_flags(_flags ${TARGET} "INTERFACE_COMPILE_OPTIONS" "")
-    append_flags(_flags ${TARGET} "INTERFACE_COMPILE_DEFINITIONS" "-D")
-    append_flags(_flags ${TARGET} "INTERFACE_INCLUDE_DIRECTORIES" "-isystem ")
-    append_flags(_flags ${TARGET} "INTERFACE_LINK_DIRECTORIES" "-L ")
-    append_flags(_flags ${TARGET} "INTERFACE_LINK_OPTIONS" "")
-    append_link_flags(_flags ${TARGET} "INTERFACE_LINK_LIBRARIES" "")
-    # message("_flags: ${_flags}")
-    set(${FLAGS} ${_flags} PARENT_SCOPE)
-endfunction()
--- a/host/CMakeLists.txt
+++ b/host/CMakeLists.txt
 add_subdirectory(host_tensor)
-add_subdirectory(online_compile)
 add_subdirectory(driver_offline)
-add_subdirectory(driver_online)
--- a/host/driver_online/CMakeLists.txt
+++ b/host/driver_online/CMakeLists.txt
-include_directories(BEFORE
-    include
-    ${PROJECT_BINARY_DIR}/host/online_compile/include
-    ${PROJECT_SOURCE_DIR}/host/online_compile/include
-    ${PROJECT_SOURCE_DIR}/host/host_tensor/include
-    ${PROJECT_SOURCE_DIR}/host/solver/include
-    ${PROJECT_SOURCE_DIR}/composable_kernel/include
-    ${PROJECT_SOURCE_DIR}/composable_kernel/include/utility
-    ${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_description
-    ${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_operation
-    ${PROJECT_SOURCE_DIR}/composable_kernel/include/problem_transform
-    ${PROJECT_SOURCE_DIR}/composable_kernel/include/driver
-    ${PROJECT_SOURCE_DIR}/external/rocm/include
-    ${PROJECT_SOURCE_DIR}/external/half/include
-)
-set(CONV_FWD_DRIVER_ONLINE_SOURCE conv_fwd_driver_online.cpp)
-add_executable(conv_fwd_driver_online ${CONV_FWD_DRIVER_ONLINE_SOURCE})
-target_link_libraries(conv_fwd_driver_online PRIVATE host_tensor)
-target_link_libraries(conv_fwd_driver_online PRIVATE online_compile)
--- a/host/driver_online/conv_fwd_driver_online.cpp
+++ b/host/driver_online/conv_fwd_driver_online.cpp
-#include <iostream>
-#include <numeric>
-#include <initializer_list>
-#include <cstdlib>
-#include <stdlib.h>
-#include <half.hpp>
-#include "config.hpp"
-#include "print.hpp"
-#include "device.hpp"
-#include "host_tensor.hpp"
-#include "host_tensor_generator.hpp"
-#include "conv_common.hpp"
-#include "host_conv.hpp"
-#include "device_tensor.hpp"
-#include "handle.hpp"
-#include "hipCheck.hpp"
-#include "online_device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp"
-#include "online_device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp"
-#include "online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp"
-#include "online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.hpp"
-#define USE_CONV_FWD_V4R4_NCHW 1
-#define USE_CONV_FWD_V6R1_NCHW 1
-#define USE_CONV_FWD_V4R4_XDLOPS_NCHW 1
-#define USE_CONV_FWD_V4R4_XDLOPS_NHWC 1
-enum ConvForwardAlgo
-{
-    V4R4NCHW,    // 0
-    V6R1NCHW,    // 1
-    V4R4XDLNCHW, // 2
-    V4R4XDLNHWC  // 3
-};
-int main(int argc, char* argv[])
-{
-    using namespace ck;
-    using namespace ck::driver;
-    using size_t = std::size_t;
-    hipStream_t stream;
-    online_compile::Handle* handle;
-    MY_HIP_CHECK(hipStreamCreate(&stream));
-    handle = new online_compile::Handle(stream);
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
-    constexpr auto I2 = Number<2>{};
-    constexpr auto I3 = Number<3>{};
-    constexpr auto I4 = Number<4>{};
-    constexpr auto I5 = Number<5>{};
-    constexpr auto I6 = Number<6>{};
-    if(argc != 22)
-    {
-        printf("arg1 to 5: layout, algo, do_verification, init_method, do_log, nrepeat\n");
-        printf("rest: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx\n");
-        exit(1);
-    }
-    const ConvTensorLayout layout = static_cast<ConvTensorLayout>(atoi(argv[1]));
-    const ConvForwardAlgo algo    = static_cast<ConvForwardAlgo>(atoi(argv[2]));
-    const bool do_verification    = atoi(argv[3]);
-    const int init_method         = atoi(argv[4]);
-    const bool do_log             = atoi(argv[5]);
-    const int nrepeat             = atoi(argv[6]);
-    const index_t N  = atoi(argv[7]);
-    const index_t K  = atoi(argv[8]);
-    const index_t C  = atoi(argv[9]);
-    const index_t Y  = atoi(argv[10]);
-    const index_t X  = atoi(argv[11]);
-    const index_t Hi = atoi(argv[12]);
-    const index_t Wi = atoi(argv[13]);
-    const index_t conv_stride_h   = atoi(argv[14]);
-    const index_t conv_stride_w   = atoi(argv[15]);
-    const index_t conv_dilation_h = atoi(argv[16]);
-    const index_t conv_dilation_w = atoi(argv[17]);
-    const index_t in_left_pad_h   = atoi(argv[18]);
-    const index_t in_left_pad_w   = atoi(argv[19]);
-    const index_t in_right_pad_h  = atoi(argv[20]);
-    const index_t in_right_pad_w  = atoi(argv[21]);
-    const index_t YEff = (Y - 1) * conv_dilation_h + 1;
-    const index_t XEff = (X - 1) * conv_dilation_w + 1;
-    const index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
-    const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
-#if 1
-    using in_data_t  = float;
-    using acc_data_t = float;
-    using out_data_t = float;
-#elif 0
-    using in_data_t  = half_t;
-    using acc_data_t = float;
-    using out_data_t = half_t;
-#elif 1
-    using in_data_t  = int8_t;
-    using acc_data_t = int32_t;
-    using out_data_t = int8_t;
-#endif
-    std::vector<std::size_t> in_lengths_host(4), wei_lengths_host(4), out_lengths_host(4);
-    switch(layout)
-    {
-    case ConvTensorLayout::NCHW:
-        // NCHW
-        in_lengths_host[0] = static_cast<std::size_t>(N);
-        in_lengths_host[1] = static_cast<std::size_t>(C);
-        in_lengths_host[2] = static_cast<std::size_t>(Hi);
-        in_lengths_host[3] = static_cast<std::size_t>(Wi);
-        wei_lengths_host[0] = static_cast<std::size_t>(K);
-        wei_lengths_host[1] = static_cast<std::size_t>(C);
-        wei_lengths_host[2] = static_cast<std::size_t>(Y);
-        wei_lengths_host[3] = static_cast<std::size_t>(X);
-        out_lengths_host[0] = static_cast<std::size_t>(N);
-        out_lengths_host[1] = static_cast<std::size_t>(K);
-        out_lengths_host[2] = static_cast<std::size_t>(Ho);
-        out_lengths_host[3] = static_cast<std::size_t>(Wo);
-        break;
-    case ConvTensorLayout::NHWC:
-        // NHWC
-        in_lengths_host[0] = static_cast<std::size_t>(N);
-        in_lengths_host[1] = static_cast<std::size_t>(Hi);
-        in_lengths_host[2] = static_cast<std::size_t>(Wi);
-        in_lengths_host[3] = static_cast<std::size_t>(C);
-        wei_lengths_host[0] = static_cast<std::size_t>(K);
-        wei_lengths_host[1] = static_cast<std::size_t>(Y);
-        wei_lengths_host[2] = static_cast<std::size_t>(X);
-        wei_lengths_host[3] = static_cast<std::size_t>(C);
-        out_lengths_host[0] = static_cast<std::size_t>(N);
-        out_lengths_host[1] = static_cast<std::size_t>(Ho);
-        out_lengths_host[2] = static_cast<std::size_t>(Wo);
-        out_lengths_host[3] = static_cast<std::size_t>(K);
-        break;
-    default: throw std::runtime_error("wrong! not implemented");
-    }
-    Tensor<in_data_t> in(in_lengths_host);
-    Tensor<in_data_t> wei(wei_lengths_host);
-    Tensor<out_data_t> out_host(out_lengths_host);
-    Tensor<out_data_t> out_device(out_lengths_host);
-    std::cout << "layout: " << layout << std::endl;
-    ostream_HostTensorDescriptor(in.mDesc, std::cout << "in: ");
-    ostream_HostTensorDescriptor(wei.mDesc, std::cout << "wei: ");
-    ostream_HostTensorDescriptor(out_host.mDesc, std::cout << "out: ");
-    print_array("InLeftPads", make_tuple(in_left_pad_h, in_left_pad_w));
-    print_array("InRightPads", make_tuple(in_right_pad_h, in_right_pad_w));
-    print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
-    print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));
-    std::size_t num_thread = std::thread::hardware_concurrency();
-    switch(init_method)
-    {
-    case 0:
-        // no initialization
-        break;
-    case 1:
-        in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
-        wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
-        break;
-    case 2:
-        in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
-        wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
-        break;
-    case 3:
-        in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
-        wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
-        break;
-    case 4:
-        in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
-        wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
-        break;
-    case 5:
-        in.GenerateTensorValue(GeneratorTensor_3<float>{0.0, 1.0}, num_thread);
-        wei.GenerateTensorValue(GeneratorTensor_3<float>{-0.5, 0.5}, num_thread);
-        break;
-    default:
-        in.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread);
-        auto gen_wei = [](auto... is) {
-            return GeneratorTensor_2{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...);
-        };
-        wei.GenerateTensorValue(gen_wei, num_thread);
-    }
-    auto f_make_for_device_nchw = [&]() {
-        const auto in_lengths_dev  = make_tuple(N, C, Hi, Wi);
-        const auto wei_lengths_dev = make_tuple(K, C, Y, X);
-        const auto out_lengths_dev = make_tuple(N, K, Ho, Wo);
-        return make_tuple(in_lengths_dev, wei_lengths_dev, out_lengths_dev);
-    };
-    auto f_make_for_device_nhwc = [&]() {
-        const auto in_lengths_dev  = make_tuple(N, Hi, Wi, C);
-        const auto wei_lengths_dev = make_tuple(K, Y, X, C);
-        const auto out_lengths_dev = make_tuple(N, Ho, Wo, K);
-        return make_tuple(in_lengths_dev, wei_lengths_dev, out_lengths_dev);
-    };
-    const auto conv_strides   = make_tuple(conv_stride_h, conv_stride_w);
-    const auto conv_dilations = make_tuple(conv_dilation_h, conv_dilation_w);
-    const auto in_left_pads   = make_tuple(in_left_pad_h, in_left_pad_w);
-    const auto in_right_pads  = make_tuple(in_right_pad_h, in_right_pad_w);
-#if USE_CONV_FWD_V4R4_NCHW
-    if(algo == ConvForwardAlgo::V4R4NCHW)
-    {
-        if(layout != ConvTensorLayout::NCHW)
-        {
-            throw std::runtime_error("wrong! layout");
-        }
-        const auto tmp = f_make_for_device_nchw();
-        tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw* tunable =
-            &default_tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw;
-        online_device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw<
-            in_data_t,
-            acc_data_t,
-            out_data_t>(handle,
-                        tmp[I0],
-                        tmp[I1],
-                        tmp[I2],
-                        conv_strides,
-                        conv_dilations,
-                        in_left_pads,
-                        in_right_pads,
-                        in,
-                        wei,
-                        out_device,
-                        tunable,
-                        nrepeat);
-    }
-#endif
-#if USE_CONV_FWD_V6R1_NCHW
-    if(algo == ConvForwardAlgo::V6R1NCHW)
-    {
-        if(layout != ConvTensorLayout::NCHW)
-        {
-            throw std::runtime_error("wrong! layout");
-        }
-        const auto tmp = f_make_for_device_nchw();
-#if 1
-        const CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw compile_param = {
-            get_datatype_enum_from_type<in_data_t>::value,
-            get_datatype_enum_from_type<acc_data_t>::value,
-            get_datatype_enum_from_type<out_data_t>::value,
-            256,
-            4,
-            1,
-            128,
-            32,
-            8,
-            4,
-            4,
-            1,
-            {8, 2},
-            {8, 2},
-            {4, 1, 1, 1, 1},
-            {2, 1, 1, 128, 1},
-            {4, 1, 1, 1, 1},
-            {1, 1, 1, 1, 1},
-            {1, 4, 1, 1, 1},
-            {8, 1, 1, 32, 1},
-            {1, 1, 1, 1, 1},
-            {1, 1, 1, 1, 1},
-            4,
-            true,
-            true};
-#elif 0
-        const CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw compile_param = {
-            get_datatype_enum_from_type<in_data_t>::value,
-            get_datatype_enum_from_type<acc_data_t>::value,
-            get_datatype_enum_from_type<out_data_t>::value,
-            256,
-            4,
-            2,
-            128,
-            32,
-            8,
-            4,
-            4,
-            1,
-            {8, 2},
-            {8, 2},
-            {4, 1, 1, 1, 2},
-            {2, 1, 1, 128, 1},
-            {4, 1, 1, 1, 1},
-            {1, 1, 1, 1, 1},
-            {1, 4, 1, 1, 2},
-            {8, 1, 1, 32, 1},
-            {1, 1, 1, 1, 1},
-            {1, 1, 1, 1, 1},
-            4,
-            true,
-            true};
-#elif 1
-        const CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw compile_param = {
-            get_datatype_enum_from_type<in_data_t>::value,
-            get_datatype_enum_from_type<acc_data_t>::value,
-            get_datatype_enum_from_type<out_data_t>::value,
-            256,
-            4,
-            4,
-            128,
-            32,
-            8,
-            4,
-            4,
-            1,
-            {8, 2},
-            {8, 2},
-            {4, 1, 1, 1, 4},
-            {2, 1, 1, 128, 1},
-            {4, 1, 1, 1, 1},
-            {1, 1, 1, 1, 1},
-            {1, 4, 1, 1, 4},
-            {8, 1, 1, 32, 1},
-            {1, 1, 1, 1, 1},
-            {1, 1, 1, 1, 1},
-            4,
-            true,
-            true};
-#endif
-        online_device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw<
-            in_data_t,
-            acc_data_t,
-            out_data_t>(handle,
-                        tmp[I0],
-                        tmp[I1],
-                        tmp[I2],
-                        conv_strides,
-                        conv_dilations,
-                        in_left_pads,
-                        in_right_pads,
-                        in,
-                        wei,
-                        out_device,
-                        compile_param,
-                        nrepeat);
-    }
-#endif
-#if USE_CONV_FWD_V4R4_XDLOPS_NCHW
-    if(algo == ConvForwardAlgo::V4R4XDLNCHW)
-    {
-        if(layout != ConvTensorLayout::NCHW)
-        {
-            throw std::runtime_error("wrong! layout");
-        }
-        const auto tmp = f_make_for_device_nchw();
-        tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw* tunable =
-            &default_tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw;
-        online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw<
-            in_data_t,
-            acc_data_t,
-            out_data_t>(handle,
-                        tmp[I0],
-                        tmp[I1],
-                        tmp[I2],
-                        conv_strides,
-                        conv_dilations,
-                        in_left_pads,
-                        in_right_pads,
-                        in,
-                        wei,
-                        out_device,
-                        tunable,
-                        nrepeat);
-    }
-#endif
-#if USE_CONV_FWD_V4R4_XDLOPS_NHWC
-    if(algo == ConvForwardAlgo::V4R4XDLNHWC)
-    {
-        if(layout != ConvTensorLayout::NHWC)
-        {
-            throw std::runtime_error("wrong! layout");
-        }
-        const auto tmp = f_make_for_device_nhwc();
-        tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk* tunable =
-            &default_tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk;
-        online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk<
-            in_data_t,
-            acc_data_t,
-            out_data_t>(handle,
-                        tmp[I0],
-                        tmp[I1],
-                        tmp[I2],
-                        conv_strides,
-                        conv_dilations,
-                        in_left_pads,
-                        in_right_pads,
-                        in,
-                        wei,
-                        out_device,
-                        tunable,
-                        nrepeat);
-    }
-#endif
-    if(do_verification)
-    {
-        host_direct_convolution(in,
-                                wei,
-                                out_host,
-                                make_tuple(conv_stride_h, conv_stride_w),
-                                make_tuple(conv_dilation_h, conv_dilation_w),
-                                make_tuple(in_left_pad_h, in_left_pad_w),
-                                make_tuple(in_right_pad_h, in_right_pad_w),
-                                layout);
-        check_error(out_host, out_device);
-#if 0
-        if(do_log)
-        {
-            LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl;
-            LogRangeAsType<float>(std::cout << "wei: ", wei.mData, ",") << std::endl;
-            LogRangeAsType<float>(std::cout << "out_host  : ", out_host.mData, ",") << std::endl;
-            LogRangeAsType<float>(std::cout << "out_device: ", out_device.mData, ",") << std::endl;
-        }
-#endif
-    }
-    delete handle;
-    MY_HIP_CHECK(hipStreamDestroy(stream));
-}
--- a/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp
+++ b/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp
-#pragma once
-#include "device.hpp"
-#include "host_tensor.hpp"
-#include "handle.hpp"
-#include "online_driver_common.hpp"
-#include "dynamic_tensor_descriptor.hpp"
-#include "dynamic_tensor_descriptor_helper.hpp"
-#include "transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw.hpp"
-#include "conv_tunable_fwd_v4r4_dlops_nchw_kcyx_nkhw.hpp"
-namespace detail_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw {
-template <typename TInWei, typename TAcc, typename TOut>
-static std::string get_network_config_string_from_types()
-{
-    using namespace ck;
-    std::string out;
-    out += std::to_string(get_datatype_enum_from_type<TInWei>::value) + "_" +
-           std::to_string(get_datatype_enum_from_type<TAcc>::value) + "_" +
-           std::to_string(get_datatype_enum_from_type<TOut>::value);
-    return (out);
-};
-static std::string
-get_network_config_string_from_tunable(const tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw* pt)
-{
-    std::string out("TUN_");
-    out += std::to_string(pt->BlockSize) + "_";
-    out += std::to_string(pt->MPerBlock) + "x" + std::to_string(pt->NPerBlock) + "x" +
-           std::to_string(pt->KPerBlock) + "_";
-    out += std::to_string(pt->M1PerThread) + "x" + std::to_string(pt->N1PerThread) + "x" +
-           std::to_string(pt->KPerThread) + "_";
-    out += std::to_string(pt->M1N1ThreadClusterM10) + "x" +
-           std::to_string(pt->M1N1ThreadClusterN10) + "x" +
-           std::to_string(pt->M1N1ThreadClusterM11) + "x" +
-           std::to_string(pt->M1N1ThreadClusterN11) + "_";
-    out += std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[0]) + "x" +
-           std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[1]) + "x" +
-           std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[2]) + "_";
-    out += std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[0]) + "x" +
-           std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[1]) + "x" +
-           std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[2]) + "_";
-    out += std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "x" +
-           std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "x" +
-           std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]) + "_";
-    out += std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "x" +
-           std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "x" +
-           std::to_string(pt->ABlockTransferSrcAccessOrder[2]) + "_";
-    out += std::to_string(pt->ABlockTransferSrcVectorDim) + "_";
-    out += std::to_string(pt->ABlockTransferSrcScalarPerVector) + "_";
-    out += std::to_string(pt->ABlockTransferDstScalarPerVector_M1) + "_";
-    out += std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun) + "_";
-    out += std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[0]) + "x" +
-           std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[1]) + "x" +
-           std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[2]) + "_";
-    out += std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[0]) + "x" +
-           std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[1]) + "x" +
-           std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[2]) + "_";
-    out += std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "x" +
-           std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "x" +
-           std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]) + "_";
-    out += std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "x" +
-           std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "x" +
-           std::to_string(pt->BBlockTransferSrcAccessOrder[2]) + "_";
-    out += std::to_string(pt->BBlockTransferSrcVectorDim) + "_";
-    out += std::to_string(pt->BBlockTransferSrcScalarPerVector) + "_";
-    out += std::to_string(pt->BBlockTransferDstScalarPerVector_N1) + "_";
-    out += std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun) + "_";
-    out += std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "x" +
-           std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "x" +
-           std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "x" +
-           std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "x" +
-           std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "x" +
-           std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]) + "_";
-    out += std::to_string(pt->CThreadTransferSrcDstVectorDim) + "_";
-    out += std::to_string(pt->CThreadTransferDstScalarPerVector);
-    return (out);
-};
-template <typename TInWei, typename TAcc, typename TOut>
-static std::string get_definition_string_from_types()
-{
-    using namespace ck;
-    std::string out;
-    out +=
-        " -DCK_PARAM_ABDataTypeEnum=" + std::to_string(get_datatype_enum_from_type<TInWei>::value) +
-        " -DCK_PARAM_AccDataTypeEnum=" + std::to_string(get_datatype_enum_from_type<TAcc>::value) +
-        " -DCK_PARAM_CDataTypeEnum=" + std::to_string(get_datatype_enum_from_type<TOut>::value);
-    return (out);
-};
-static std::string
-get_definition_string_from_tunable(const tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw* pt)
-{
-    std::string out;
-    out += " -DCK_PARAM_BlockSize=" + std::to_string(pt->BlockSize);
-    out += " -DCK_PARAM_MPerBlock=" + std::to_string(pt->MPerBlock) +
-           " -DCK_PARAM_NPerBlock=" + std::to_string(pt->NPerBlock) +
-           " -DCK_PARAM_KPerBlock=" + std::to_string(pt->KPerBlock);
-    out += " -DCK_PARAM_M1PerThread=" + std::to_string(pt->M1PerThread) +
-           " -DCK_PARAM_N1PerThread=" + std::to_string(pt->N1PerThread) +
-           " -DCK_PARAM_KPerThread=" + std::to_string(pt->KPerThread);
-    out += " -DCK_PARAM_M1N1ThreadClusterM10=" + std::to_string(pt->M1N1ThreadClusterM10) +
-           " -DCK_PARAM_M1N1ThreadClusterN10=" + std::to_string(pt->M1N1ThreadClusterN10) +
-           " -DCK_PARAM_M1N1ThreadClusterM11=" + std::to_string(pt->M1N1ThreadClusterM11) +
-           " -DCK_PARAM_M1N1ThreadClusterN11=" + std::to_string(pt->M1N1ThreadClusterN11);
-    out += " -DCK_PARAM_ABlockTransferThreadSliceLengths_K_M0_M1=" +
-           std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[0]) + "," +
-           std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[1]) + "," +
-           std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[2]);
-    out += " -DCK_PARAM_ABlockTransferThreadClusterLengths_K_M0_M1=" +
-           std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[0]) + "," +
-           std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[1]) + "," +
-           std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[2]);
-    out += " -DCK_PARAM_ABlockTransferThreadClusterArrangeOrder=" +
-           std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "," +
-           std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "," +
-           std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]);
-    out += " -DCK_PARAM_ABlockTransferSrcAccessOrder=" +
-           std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "," +
-           std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "," +
-           std::to_string(pt->ABlockTransferSrcAccessOrder[2]);
-    out +=
-        " -DCK_PARAM_ABlockTransferSrcVectorDim=" + std::to_string(pt->ABlockTransferSrcVectorDim);
-    out += " -DCK_PARAM_ABlockTransferSrcScalarPerVector=" +
-           std::to_string(pt->ABlockTransferSrcScalarPerVector);
-    out += " -DCK_PARAM_ABlockTransferDstScalarPerVector_M1=" +
-           std::to_string(pt->ABlockTransferDstScalarPerVector_M1);
-    out += " -DCK_PARAM_AThreadTransferSrcResetCoordinateAfterRun=" +
-           std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun);
-    out += " -DCK_PARAM_BBlockTransferThreadSliceLengths_K_N0_N1=" +
-           std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[0]) + "," +
-           std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[1]) + "," +
-           std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[2]);
-    out += " -DCK_PARAM_BBlockTransferThreadClusterLengths_K_N0_N1=" +
-           std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[0]) + "," +
-           std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[1]) + "," +
-           std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[2]);
-    out += " -DCK_PARAM_BBlockTransferThreadClusterArrangeOrder=" +
-           std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "," +
-           std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "," +
-           std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]);
-    out += " -DCK_PARAM_BBlockTransferSrcAccessOrder=" +
-           std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "," +
-           std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "," +
-           std::to_string(pt->BBlockTransferSrcAccessOrder[2]);
-    out +=
-        " -DCK_PARAM_BBlockTransferSrcVectorDim=" + std::to_string(pt->BBlockTransferSrcVectorDim);
-    out += " -DCK_PARAM_BBlockTransferSrcScalarPerVector=" +
-           std::to_string(pt->BBlockTransferSrcScalarPerVector);
-    out += " -DCK_PARAM_BBlockTransferDstScalarPerVector_N1=" +
-           std::to_string(pt->BBlockTransferDstScalarPerVector_N1);
-    out += " -DCK_PARAM_BThreadTransferSrcResetCoordinateAfterRun=" +
-           std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun);
-    out += " -DCK_PARAM_CThreadTransferSrcDstAccessOrder=" +
-           std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "," +
-           std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "," +
-           std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "," +
-           std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "," +
-           std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "," +
-           std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]);
-    out += " -DCK_PARAM_CThreadTransferSrcDstVectorDim=" +
-           std::to_string(pt->CThreadTransferSrcDstVectorDim);
-    out += " -DCK_PARAM_CThreadTransferDstScalarPerVector=" +
-           std::to_string(pt->CThreadTransferDstScalarPerVector);
-    return (out);
-};
-} // namespace detail_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw
-template <typename TInWei,
-          typename TAcc,
-          typename TOut,
-          typename InLengths,
-          typename WeiLengths,
-          typename OutLengths,
-          typename ConvStrides,
-          typename ConvDilations,
-          typename InLeftPads,
-          typename InRightPads>
-void online_device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw(
-    online_compile::Handle* handle,
-    const InLengths& in_n_c_hi_wi_lengths,
-    const WeiLengths& wei_k_c_y_x_lengths,
-    const OutLengths& out_n_k_ho_wo_lengths,
-    const ConvStrides& conv_strides,
-    const ConvDilations& conv_dilations,
-    const InLeftPads& in_left_pads,
-    const InRightPads& in_right_pads,
-    const Tensor<TInWei>& in_n_c_hi_wi,
-    const Tensor<TInWei>& wei_k_c_y_x,
-    Tensor<TOut>& out_n_k_ho_wo,
-    const tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw* tunable,
-    ck::index_t nrepeat)
-{
-    using namespace ck;
-    using namespace ck::driver;
-    using namespace detail_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw;
-    using size_t = std::size_t;
-    /////////////////////////////////////////////////////////////////////////////////////////////////////////////
-    // The follow codes are only used for computing the grid_size, hasMainKBlockLoop,
-    // hasDoubleTailKBlockLoop
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
-    constexpr auto I2 = Number<2>{};
-    constexpr auto I3 = Number<3>{};
-    const auto in_n_c_hi_wi_desc =
-        make_dynamic_naive_tensor_descriptor_packed_v2(in_n_c_hi_wi_lengths);
-    const auto wei_k_c_y_x_desc =
-        make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_c_y_x_lengths);
-    const auto out_n_k_ho_wo_desc =
-        make_dynamic_naive_tensor_descriptor_packed_v2(out_n_k_ho_wo_lengths);
-    const auto descs =
-        transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw_pad(wei_k_c_y_x_desc,
-                                                                        in_n_c_hi_wi_desc,
-                                                                        out_n_k_ho_wo_desc,
-                                                                        conv_strides,
-                                                                        conv_dilations,
-                                                                        in_left_pads,
-                                                                        in_right_pads);
-    const auto a_k_m_grid_desc = descs[I0];
-    const auto c_m_n_grid_desc = descs[I2];
-    const auto M               = c_m_n_grid_desc.GetLength(I0);
-    const auto N               = c_m_n_grid_desc.GetLength(I1);
-    const auto K               = a_k_m_grid_desc.GetLength(I0);
-    const index_t grid_size            = (M / tunable->MPerBlock) * (N / tunable->NPerBlock);
-    const bool hasMainKBlockLoop       = ((K + tunable->KPerBlock) / (2 * tunable->KPerBlock) > 1);
-    const bool hasDoubleTailKBlockLoop = ((K / tunable->KPerBlock) % 2 == 0);
-    /////////////////////////////////////////////////////////////////////////////////////////////////////////////
-    // these buffers are usually provided by the user application
-    DeviceMem in_n_c_hi_wi_dev_buf(sizeof(TInWei) * in_n_c_hi_wi.mDesc.GetElementSpace());
-    DeviceMem wei_k_c_y_x_dev_buf(sizeof(TInWei) * wei_k_c_y_x.mDesc.GetElementSpace());
-    DeviceMem out_n_k_ho_wo_dev_buf(sizeof(TOut) * out_n_k_ho_wo.mDesc.GetElementSpace());
-    in_n_c_hi_wi_dev_buf.ToDevice(in_n_c_hi_wi.mData.data());
-    wei_k_c_y_x_dev_buf.ToDevice(wei_k_c_y_x.mData.data());
-    out_n_k_ho_wo_dev_buf.ToDevice(out_n_k_ho_wo.mData.data());
-    // these are workspace buffers that should be expressed to the user by the corresponding
-    // workspace API
-    DeviceMem workspace_buf(4096);
-    void* a_k_m0_m1_grid_desc_dev_buf = workspace_buf.GetDeviceBuffer();
-    void* b_k_n0_n1_grid_desc_dev_buf =
-        static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 1024);
-    void* c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf =
-        static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 2048);
-    void* c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf =
-        static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 3072);
-    const std::vector<size_t> vld  = {static_cast<size_t>(tunable->BlockSize), 1, 1};
-    const std::vector<size_t> vgd1 = {static_cast<size_t>(tunable->BlockSize), 1, 1};
-    const std::vector<size_t> vgd2 = {static_cast<size_t>(grid_size * tunable->BlockSize), 1, 1};
-    std::string program_name =
-        "dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.cpp";
-    std::string algo_name = "implicit_gemm_conv_fwd_v4r4_dlops_nchw";
-    std::string param = " -std=c++17 ";
-    std::string network_config;
-    param += get_definition_string_from_types<TInWei, TAcc, TOut>() + " " +
-             get_definition_string_from_tunable(tunable) +
-             " -DCK_PARAM_HAS_MAIN_KBLOCK_LOOP=" + std::to_string(hasMainKBlockLoop) +
-             " -DCK_PARAM_HAS_DOUBLE_TAIL_KBLOCK_LOOP=" + std::to_string(hasDoubleTailKBlockLoop);
-    network_config = get_network_config_string_from_types<TInWei, TAcc, TOut>() + "_" +
-                     get_network_config_string_from_tunable(tunable) + "_" +
-                     std::to_string(hasMainKBlockLoop) + "_" +
-                     std::to_string(hasDoubleTailKBlockLoop);
-    std::vector<float> kernel1_times;
-    std::vector<float> kernel2_times;
-    for(index_t i = 0; i < nrepeat; ++i)
-    {
-        KernelTimer timer1, timer2;
-        std::string kernel_name;
-        kernel_name = "dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw_prepare";
-        auto network_config_1 = network_config + "_1";
-        timer1.Start();
-        handle->AddKernel(algo_name, network_config_1, program_name, kernel_name, vld, vgd1, param)(
-            static_cast<index_t>(in_n_c_hi_wi_lengths[I0]),
-            static_cast<index_t>(in_n_c_hi_wi_lengths[I1]),
-            static_cast<index_t>(in_n_c_hi_wi_lengths[I2]),
-            static_cast<index_t>(in_n_c_hi_wi_lengths[I3]),
-            static_cast<index_t>(wei_k_c_y_x_lengths[I0]),
-            static_cast<index_t>(wei_k_c_y_x_lengths[I2]),
-            static_cast<index_t>(wei_k_c_y_x_lengths[I3]),
-            conv_strides[I0],
-            conv_strides[I1],
-            conv_dilations[I0],
-            conv_dilations[I1],
-            in_left_pads[I0],
-            in_left_pads[I1],
-            in_right_pads[I0],
-            in_right_pads[I1],
-            a_k_m0_m1_grid_desc_dev_buf,
-            b_k_n0_n1_grid_desc_dev_buf,
-            c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf,
-            c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf);
-        timer1.End();
-        kernel_name = "dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw";
-        auto network_config_2 = network_config + "_2";
-        timer2.Start();
-        handle->AddKernel(algo_name, network_config_2, program_name, kernel_name, vld, vgd2, param)(
-            reinterpret_cast<const TInWei*>(wei_k_c_y_x_dev_buf.GetDeviceBuffer()),
-            reinterpret_cast<const TInWei*>(in_n_c_hi_wi_dev_buf.GetDeviceBuffer()),
-            reinterpret_cast<TOut*>(out_n_k_ho_wo_dev_buf.GetDeviceBuffer()),
-            (const void*)(a_k_m0_m1_grid_desc_dev_buf),
-            (const void*)(b_k_n0_n1_grid_desc_dev_buf),
-            (const void*)(c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf),
-            (const void*)(c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf));
-        timer2.End();
-        kernel1_times.push_back(timer1.GetElapsedTime());
-        kernel2_times.push_back(timer2.GetElapsedTime());
-    }
-    {
-        auto ave_time1 =
-            std::accumulate(
-                std::next(kernel1_times.begin()), kernel1_times.end(), 0., std::plus<float>{}) /
-            (nrepeat - 1);
-        auto ave_time2 =
-            std::accumulate(
-                std::next(kernel2_times.begin()), kernel2_times.end(), 0., std::plus<float>{}) /
-            (nrepeat - 1);
-        const auto N = in_n_c_hi_wi_lengths[I0];
-        const auto C = in_n_c_hi_wi_lengths[I1];
-        const auto K  = out_n_k_ho_wo_lengths[I1];
-        const auto Ho = out_n_k_ho_wo_lengths[I2];
-        const auto Wo = out_n_k_ho_wo_lengths[I3];
-        const auto Y = wei_k_c_y_x_lengths[I2];
-        const auto X = wei_k_c_y_x_lengths[I3];
-        float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) /
-                     (std::size_t(1000) * 1000 * 1000) / (ave_time1 + ave_time2);
-        std::cout << "Average time : " << ave_time1 + ave_time2 << " ms(" << ave_time1 << ", "
-                  << ave_time2 << "), " << perf << " TFlop/s" << std::endl;
-    };
-    // copy result back to host
-    out_n_k_ho_wo_dev_buf.FromDevice(out_n_k_ho_wo.mData.data());
-}
--- a/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp
+++ b/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp
-#include "device.hpp"
-#include "host_tensor.hpp"
-#include "handle.hpp"
-#include "online_driver_common.hpp"
-#include "dynamic_tensor_descriptor.hpp"
-#include "dynamic_tensor_descriptor_helper.hpp"
-#include "conv_tunable_fwd_v4r4_xdlops_nchw_kcyx_nkhw.hpp"
-namespace detail_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw {
-template <typename TInWei, typename TAcc, typename TOut>
-static std::string get_network_config_string_from_types()
-{
-    using namespace ck;
-    std::string out;
-    out += std::to_string(get_datatype_enum_from_type<TInWei>::value) + "_" +
-           std::to_string(get_datatype_enum_from_type<TAcc>::value) + "_" +
-           std::to_string(get_datatype_enum_from_type<TOut>::value);
-    return (out);
-};
-static std::string
-get_network_config_string_from_tunable(const tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw* pt)
-{
-    std::string out("TUN_");
-    out += std::to_string(pt->BlockSize) + "_";
-    out += std::to_string(pt->MPerBlock) + "x" + std::to_string(pt->NPerBlock) + "x" +
-           std::to_string(pt->KPerBlock) + "_";
-    out += std::to_string(pt->MPerWave) + "x" + std::to_string(pt->NPerWave) + "x" +
-           std::to_string(pt->MRepeat) + "x" + std::to_string(pt->NRepeat) + "x" +
-           std::to_string(pt->K1) + "_";
-    out += std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[0]) + "x" +
-           std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[1]) + "x" +
-           std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[2]) + "_";
-    out += std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[0]) + "x" +
-           std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[1]) + "x" +
-           std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[2]) + "_";
-    out += std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "x" +
-           std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "x" +
-           std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]) + "_";
-    out += std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "x" +
-           std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "x" +
-           std::to_string(pt->ABlockTransferSrcAccessOrder[2]) + "_";
-    out += std::to_string(pt->ABlockTransferSrcVectorDim) + "_";
-    out += std::to_string(pt->ABlockTransferSrcScalarPerVector) + "_";
-    out += std::to_string(pt->ABlockTransferDstScalarPerVector_K1) + "_";
-    out += std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun) + "_";
-    out += std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[0]) + "x" +
-           std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[1]) + "x" +
-           std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[2]) + "_";
-    out += std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[0]) + "x" +
-           std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[1]) + "x" +
-           std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[2]) + "_";
-    out += std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "x" +
-           std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "x" +
-           std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]) + "_";
-    out += std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "x" +
-           std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "x" +
-           std::to_string(pt->BBlockTransferSrcAccessOrder[2]) + "_";
-    out += std::to_string(pt->BBlockTransferSrcVectorDim) + "_";
-    out += std::to_string(pt->BBlockTransferSrcScalarPerVector) + "_";
-    out += std::to_string(pt->BBlockTransferDstScalarPerVector_K1) + "_";
-    out += std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun) + "_";
-    out += std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "x" +
-           std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "x" +
-           std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "x" +
-           std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "x" +
-           std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "x" +
-           std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]) + "x" +
-           std::to_string(pt->CThreadTransferSrcDstAccessOrder[6]) + "x" +
-           std::to_string(pt->CThreadTransferSrcDstAccessOrder[7]) + "_";
-    out += std::to_string(pt->CThreadTransferSrcDstVectorDim) + "_";
-    out += std::to_string(pt->CThreadTransferDstScalarPerVector);
-    return (out);
-};
-template <typename TInWei, typename TAcc, typename TOut>
-static std::string get_definition_string_from_types()
-{
-    using namespace ck;
-    std::string out;
-    out +=
-        " -DCK_PARAM_ABDataTypeEnum=" + std::to_string(get_datatype_enum_from_type<TInWei>::value) +
-        " -DCK_PARAM_AccDataTypeEnum=" + std::to_string(get_datatype_enum_from_type<TAcc>::value) +
-        " -DCK_PARAM_CDataTypeEnum=" + std::to_string(get_datatype_enum_from_type<TOut>::value);
-    return (out);
-};
-static std::string
-get_definition_string_from_tunable(const tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw* pt)
-{
-    std::string out;
-    out += " -DCK_PARAM_BlockSize=" + std::to_string(pt->BlockSize);
-    out += " -DCK_PARAM_MPerBlock=" + std::to_string(pt->MPerBlock) +
-           " -DCK_PARAM_NPerBlock=" + std::to_string(pt->NPerBlock) +
-           " -DCK_PARAM_KPerBlock=" + std::to_string(pt->KPerBlock);
-    out += " -DCK_PARAM_MPerWave=" + std::to_string(pt->MPerWave) +
-           " -DCK_PARAM_NPerWave=" + std::to_string(pt->NPerWave) +
-           " -DCK_PARAM_K1=" + std::to_string(pt->K1) +
-           " -DCK_PARAM_MRepeat=" + std::to_string(pt->MRepeat) +
-           " -DCK_PARAM_NRepeat=" + std::to_string(pt->NRepeat);
-    out += " -DCK_PARAM_ABlockTransferThreadSliceLengths_K0_M_K1=" +
-           std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[0]) + "," +
-           std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[1]) + "," +
-           std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[2]);
-    out += " -DCK_PARAM_ABlockTransferThreadClusterLengths_K0_M_K1=" +
-           std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[0]) + "," +
-           std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[1]) + "," +
-           std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[2]);
-    out += " -DCK_PARAM_ABlockTransferThreadClusterArrangeOrder=" +
-           std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "," +
-           std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "," +
-           std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]);
-    out += " -DCK_PARAM_ABlockTransferSrcAccessOrder=" +
-           std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "," +
-           std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "," +
-           std::to_string(pt->ABlockTransferSrcAccessOrder[2]);
-    out +=
-        " -DCK_PARAM_ABlockTransferSrcVectorDim=" + std::to_string(pt->ABlockTransferSrcVectorDim);
-    out += " -DCK_PARAM_ABlockTransferSrcScalarPerVector=" +
-           std::to_string(pt->ABlockTransferSrcScalarPerVector);
-    out += " -DCK_PARAM_ABlockTransferDstScalarPerVector_K1=" +
-           std::to_string(pt->ABlockTransferDstScalarPerVector_K1);
-    out += " -DCK_PARAM_AThreadTransferSrcResetCoordinateAfterRun=" +
-           std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun);
-    out += " -DCK_PARAM_BBlockTransferThreadSliceLengths_K0_N_K1=" +
-           std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[0]) + "," +
-           std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[1]) + "," +
-           std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[2]);
-    out += " -DCK_PARAM_BBlockTransferThreadClusterLengths_K0_N_K1=" +
-           std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[0]) + "," +
-           std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[1]) + "," +
-           std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[2]);
-    out += " -DCK_PARAM_BBlockTransferThreadClusterArrangeOrder=" +
-           std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "," +
-           std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "," +
-           std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]);
-    out += " -DCK_PARAM_BBlockTransferSrcAccessOrder=" +
-           std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "," +
-           std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "," +
-           std::to_string(pt->BBlockTransferSrcAccessOrder[2]);
-    out +=
-        " -DCK_PARAM_BBlockTransferSrcVectorDim=" + std::to_string(pt->BBlockTransferSrcVectorDim);
-    out += " -DCK_PARAM_BBlockTransferSrcScalarPerVector=" +
-           std::to_string(pt->BBlockTransferSrcScalarPerVector);
-    out += " -DCK_PARAM_BBlockTransferDstScalarPerVector_K1=" +
-           std::to_string(pt->BBlockTransferDstScalarPerVector_K1);
-    out += " -DCK_PARAM_BThreadTransferSrcResetCoordinateAfterRun=" +
-           std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun);
-    out += " -DCK_PARAM_CThreadTransferSrcDstAccessOrder=" +
-           std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "," +
-           std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "," +
-           std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "," +
-           std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "," +
-           std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "," +
-           std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]) + "," +
-           std::to_string(pt->CThreadTransferSrcDstAccessOrder[6]) + "," +
-           std::to_string(pt->CThreadTransferSrcDstAccessOrder[7]);
-    out += " -DCK_PARAM_CThreadTransferSrcDstVectorDim=" +
-           std::to_string(pt->CThreadTransferSrcDstVectorDim);
-    out += " -DCK_PARAM_CThreadTransferDstScalarPerVector=" +
-           std::to_string(pt->CThreadTransferDstScalarPerVector);
-    return (out);
-};
-} // namespace detail_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw
-template <typename TInWei,
-          typename TAcc,
-          typename TOut,
-          typename InLengths,
-          typename WeiLengths,
-          typename OutLengths,
-          typename ConvStrides,
-          typename ConvDilations,
-          typename InLeftPads,
-          typename InRightPads>
-void online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw(
-    online_compile::Handle* handle,
-    const InLengths& in_n_c_hi_wi_lengths,
-    const WeiLengths& wei_k_c_y_x_lengths,
-    const OutLengths& out_n_k_ho_wo_lengths,
-    const ConvStrides& conv_strides,
-    const ConvDilations& conv_dilations,
-    const InLeftPads& in_left_pads,
-    const InRightPads& in_right_pads,
-    const Tensor<TInWei>& in_n_c_hi_wi,
-    const Tensor<TInWei>& wei_k_c_y_x,
-    Tensor<TOut>& out_n_k_ho_wo,
-    const tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw* tunable,
-    ck::index_t nrepeat)
-{
-    using namespace ck;
-    using namespace ck::driver;
-    using namespace detail_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw;
-    using size_t = std::size_t;
-    /////////////////////////////////////////////////////////////////////////////////////////////////////////////
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
-    constexpr auto I2 = Number<2>{};
-    constexpr auto I3 = Number<3>{};
-    const auto in_n_c_hi_wi_desc =
-        make_dynamic_naive_tensor_descriptor_packed_v2(in_n_c_hi_wi_lengths);
-    const auto wei_k_c_y_x_desc =
-        make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_c_y_x_lengths);
-    const auto out_n_k_ho_wo_desc =
-        make_dynamic_naive_tensor_descriptor_packed_v2(out_n_k_ho_wo_lengths);
-    const auto n  = in_n_c_hi_wi_desc.GetLength(I0);
-    const auto c  = in_n_c_hi_wi_desc.GetLength(I1);
-    const auto hi = in_n_c_hi_wi_desc.GetLength(I2);
-    const auto wi = in_n_c_hi_wi_desc.GetLength(I3);
-    const auto k  = wei_k_c_y_x_desc.GetLength(I0);
-    const auto y  = wei_k_c_y_x_desc.GetLength(I2);
-    const auto x  = wei_k_c_y_x_desc.GetLength(I3);
-    const auto ho = out_n_k_ho_wo_desc.GetLength(I2);
-    const auto wo = out_n_k_ho_wo_desc.GetLength(I3);
-    const auto M  = k;
-    const auto N  = n * ho * wo;
-    const auto K  = c * y * x;
-    const auto K0 = K / tunable->K1;
-    const index_t grid_size = (M / tunable->MPerBlock) * (N / tunable->NPerBlock);
-    /////////////////////////////////////////////////////////////////////////////////////////////////////////////
-    // these buffers are usually provided by the user application
-    DeviceMem in_n_c_hi_wi_dev_buf(sizeof(TInWei) * in_n_c_hi_wi.mDesc.GetElementSpace());
-    DeviceMem wei_k_c_y_x_dev_buf(sizeof(TInWei) * wei_k_c_y_x.mDesc.GetElementSpace());
-    DeviceMem out_n_k_ho_wo_dev_buf(sizeof(TOut) * out_n_k_ho_wo.mDesc.GetElementSpace());
-    in_n_c_hi_wi_dev_buf.ToDevice(in_n_c_hi_wi.mData.data());
-    wei_k_c_y_x_dev_buf.ToDevice(wei_k_c_y_x.mData.data());
-    out_n_k_ho_wo_dev_buf.ToDevice(out_n_k_ho_wo.mData.data());
-    // these are workspace buffers that should be expressed to the user by the corresponding
-    // workspace API
-    DeviceMem workspace_buf(4096);
-    void* a_k_m0_m1_grid_desc_dev_buf = workspace_buf.GetDeviceBuffer();
-    void* b_k_n0_n1_grid_desc_dev_buf =
-        static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 1024);
-    void* c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf =
-        static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 2048);
-    void* c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf =
-        static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 3072);
-    const std::vector<size_t> vld  = {static_cast<size_t>(tunable->BlockSize), 1, 1};
-    const std::vector<size_t> vgd1 = {static_cast<size_t>(tunable->BlockSize), 1, 1};
-    const std::vector<size_t> vgd2 = {static_cast<size_t>(grid_size * tunable->BlockSize), 1, 1};
-    std::string program_name =
-        "dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.cpp";
-    std::string algo_name = "implicit_gemm_conv_fwd_v4r4_xdlops_nchw";
-    std::string param = " -std=c++17 ";
-    std::string network_config;
-    param += get_definition_string_from_types<TInWei, TAcc, TOut>() + " " + " -DCK_USE_AMD_XDLOPS" +
-             get_definition_string_from_tunable(tunable);
-    network_config = get_network_config_string_from_types<TInWei, TAcc, TOut>() + "_" +
-                     get_network_config_string_from_tunable(tunable);
-    std::vector<float> kernel1_times;
-    std::vector<float> kernel2_times;
-    for(index_t i = 0; i < nrepeat; ++i)
-    {
-        KernelTimer timer1, timer2;
-        std::string kernel_name;
-        kernel_name =
-            "dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw_prepare";
-        auto network_config_1 = network_config + "_1";
-        timer1.Start();
-        handle->AddKernel(algo_name, network_config_1, program_name, kernel_name, vld, vgd1, param)(
-            static_cast<index_t>(in_n_c_hi_wi_lengths[I0]),
-            static_cast<index_t>(in_n_c_hi_wi_lengths[I1]),
-            static_cast<index_t>(in_n_c_hi_wi_lengths[I2]),
-            static_cast<index_t>(in_n_c_hi_wi_lengths[I3]),
-            static_cast<index_t>(wei_k_c_y_x_lengths[I0]),
-            static_cast<index_t>(wei_k_c_y_x_lengths[I2]),
-            static_cast<index_t>(wei_k_c_y_x_lengths[I3]),
-            conv_strides[I0],
-            conv_strides[I1],
-            conv_dilations[I0],
-            conv_dilations[I1],
-            in_left_pads[I0],
-            in_left_pads[I1],
-            in_right_pads[I0],
-            in_right_pads[I1],
-            a_k_m0_m1_grid_desc_dev_buf,
-            b_k_n0_n1_grid_desc_dev_buf,
-            c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf,
-            c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf);
-        timer1.End();
-        kernel_name = "dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw";
-        auto network_config_2 = network_config + "_2";
-        timer2.Start();
-        handle->AddKernel(algo_name, network_config_2, program_name, kernel_name, vld, vgd2, param)(
-            reinterpret_cast<const TInWei*>(wei_k_c_y_x_dev_buf.GetDeviceBuffer()),
-            reinterpret_cast<const TInWei*>(in_n_c_hi_wi_dev_buf.GetDeviceBuffer()),
-            reinterpret_cast<TOut*>(out_n_k_ho_wo_dev_buf.GetDeviceBuffer()),
-            (const void*)(a_k_m0_m1_grid_desc_dev_buf),
-            (const void*)(b_k_n0_n1_grid_desc_dev_buf),
-            (const void*)(c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf),
-            (const void*)(c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf));
-        timer2.End();
-        kernel1_times.push_back(timer1.GetElapsedTime());
-        kernel2_times.push_back(timer2.GetElapsedTime());
-    }
-    {
-        auto ave_time1 =
-            std::accumulate(
-                std::next(kernel1_times.begin()), kernel1_times.end(), 0., std::plus<float>{}) /
-            (nrepeat - 1);
-        auto ave_time2 =
-            std::accumulate(
-                std::next(kernel2_times.begin()), kernel2_times.end(), 0., std::plus<float>{}) /
-            (nrepeat - 1);
-        const auto N = in_n_c_hi_wi_lengths[I0];
-        const auto C = in_n_c_hi_wi_lengths[I1];
-        const auto K  = out_n_k_ho_wo_lengths[I1];
-        const auto Ho = out_n_k_ho_wo_lengths[I2];
-        const auto Wo = out_n_k_ho_wo_lengths[I3];
-        const auto Y = wei_k_c_y_x_lengths[I2];
-        const auto X = wei_k_c_y_x_lengths[I3];
-        float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) /
-                     (std::size_t(1000) * 1000 * 1000) / (ave_time1 + ave_time2);
-        std::cout << "Average time : " << ave_time1 + ave_time2 << " ms(" << ave_time1 << ", "
-                  << ave_time2 << "), " << perf << " TFlop/s" << std::endl;
-    };
-    // copy result back to host
-    out_n_k_ho_wo_dev_buf.FromDevice(out_n_k_ho_wo.mData.data());
-}
--- a/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.hpp
+++ b/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.hpp
-#include "device.hpp"
-#include "host_tensor.hpp"
-#include "handle.hpp"
-#include "online_driver_common.hpp"
-#include "dynamic_tensor_descriptor.hpp"
-#include "dynamic_tensor_descriptor_helper.hpp"
-#include "transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp"
-#include "conv_tunable_fwd_v4r4_xdlops_nhwc_kyxc_nhwk.hpp"
-namespace detail_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk {
-template <typename TInWei, typename TAcc, typename TOut>
-static std::string get_network_config_string_from_types()
-{
-    using namespace ck;
-    std::string out;
-    out += std::to_string(get_datatype_enum_from_type<TInWei>::value) + "_" +
-           std::to_string(get_datatype_enum_from_type<TAcc>::value) + "_" +
-           std::to_string(get_datatype_enum_from_type<TOut>::value);
-    return (out);
-};
-static std::string
-get_network_config_string_from_tunable(const tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk* pt)
-{
-    std::string out("TUN_");
-    out += std::to_string(pt->BlockSize) + "_";
-    out += std::to_string(pt->MPerBlock) + "x" + std::to_string(pt->NPerBlock) + "x" +
-           std::to_string(pt->KPerBlock) + "_";
-    out += std::to_string(pt->MPerWave) + "x" + std::to_string(pt->NPerWave) + "x" +
-           std::to_string(pt->MRepeat) + "x" + std::to_string(pt->NRepeat) + "x" +
-           std::to_string(pt->K1) + "_";
-    out += std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[0]) + "x" +
-           std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[1]) + "x" +
-           std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[2]) + "_";
-    out += std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[0]) + "x" +
-           std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[1]) + "x" +
-           std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[2]) + "_";
-    out += std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "x" +
-           std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "x" +
-           std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]) + "_";
-    out += std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "x" +
-           std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "x" +
-           std::to_string(pt->ABlockTransferSrcAccessOrder[2]) + "_";
-    out += std::to_string(pt->ABlockTransferSrcVectorDim) + "_";
-    out += std::to_string(pt->ABlockTransferSrcScalarPerVector) + "_";
-    out += std::to_string(pt->ABlockTransferDstScalarPerVector_K1) + "_";
-    out += std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun) + "_";
-    out += std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[0]) + "x" +
-           std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[1]) + "x" +
-           std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[2]) + "_";
-    out += std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[0]) + "x" +
-           std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[1]) + "x" +
-           std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[2]) + "_";
-    out += std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "x" +
-           std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "x" +
-           std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]) + "_";
-    out += std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "x" +
-           std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "x" +
-           std::to_string(pt->BBlockTransferSrcAccessOrder[2]) + "_";
-    out += std::to_string(pt->BBlockTransferSrcVectorDim) + "_";
-    out += std::to_string(pt->BBlockTransferSrcScalarPerVector) + "_";
-    out += std::to_string(pt->BBlockTransferDstScalarPerVector_K1) + "_";
-    out += std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun) + "_";
-    out += std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "x" +
-           std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "x" +
-           std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "x" +
-           std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "x" +
-           std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "x" +
-           std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]) + "x" +
-           std::to_string(pt->CThreadTransferSrcDstAccessOrder[6]) + "x" +
-           std::to_string(pt->CThreadTransferSrcDstAccessOrder[7]) + "_";
-    out += std::to_string(pt->CThreadTransferSrcDstVectorDim) + "_";
-    out += std::to_string(pt->CThreadTransferDstScalarPerVector);
-    return (out);
-};
-template <typename TInWei, typename TAcc, typename TOut>
-static std::string get_definition_string_from_types()
-{
-    using namespace ck;
-    std::string out;
-    out +=
-        " -DCK_PARAM_ABDataTypeEnum=" + std::to_string(get_datatype_enum_from_type<TInWei>::value) +
-        " -DCK_PARAM_AccDataTypeEnum=" + std::to_string(get_datatype_enum_from_type<TAcc>::value) +
-        " -DCK_PARAM_CDataTypeEnum=" + std::to_string(get_datatype_enum_from_type<TOut>::value);
-    return (out);
-};
-static std::string
-get_definition_string_from_tunable(const tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk* pt)
-{
-    std::string out;
-    out += " -DCK_PARAM_BlockSize=" + std::to_string(pt->BlockSize);
-    out += " -DCK_PARAM_MPerBlock=" + std::to_string(pt->MPerBlock) +
-           " -DCK_PARAM_NPerBlock=" + std::to_string(pt->NPerBlock) +
-           " -DCK_PARAM_KPerBlock=" + std::to_string(pt->KPerBlock);
-    out += " -DCK_PARAM_MPerWave=" + std::to_string(pt->MPerWave) +
-           " -DCK_PARAM_NPerWave=" + std::to_string(pt->NPerWave) +
-           " -DCK_PARAM_K1=" + std::to_string(pt->K1) +
-           " -DCK_PARAM_MRepeat=" + std::to_string(pt->MRepeat) +
-           " -DCK_PARAM_NRepeat=" + std::to_string(pt->NRepeat);
-    out += " -DCK_PARAM_ABlockTransferThreadSliceLengths_K0_M_K1=" +
-           std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[0]) + "," +
-           std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[1]) + "," +
-           std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[2]);
-    out += " -DCK_PARAM_ABlockTransferThreadClusterLengths_K0_M_K1=" +
-           std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[0]) + "," +
-           std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[1]) + "," +
-           std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[2]);
-    out += " -DCK_PARAM_ABlockTransferThreadClusterArrangeOrder=" +
-           std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "," +
-           std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "," +
-           std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]);
-    out += " -DCK_PARAM_ABlockTransferSrcAccessOrder=" +
-           std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "," +
-           std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "," +
-           std::to_string(pt->ABlockTransferSrcAccessOrder[2]);
-    out +=
-        " -DCK_PARAM_ABlockTransferSrcVectorDim=" + std::to_string(pt->ABlockTransferSrcVectorDim);
-    out += " -DCK_PARAM_ABlockTransferSrcScalarPerVector=" +
-           std::to_string(pt->ABlockTransferSrcScalarPerVector);
-    out += " -DCK_PARAM_ABlockTransferDstScalarPerVector_K1=" +
-           std::to_string(pt->ABlockTransferDstScalarPerVector_K1);
-    out += " -DCK_PARAM_AThreadTransferSrcResetCoordinateAfterRun=" +
-           std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun);
-    out += " -DCK_PARAM_BBlockTransferThreadSliceLengths_K0_N_K1=" +
-           std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[0]) + "," +
-           std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[1]) + "," +
-           std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[2]);
-    out += " -DCK_PARAM_BBlockTransferThreadClusterLengths_K0_N_K1=" +
-           std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[0]) + "," +
-           std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[1]) + "," +
-           std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[2]);
-    out += " -DCK_PARAM_BBlockTransferThreadClusterArrangeOrder=" +
-           std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "," +
-           std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "," +
-           std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]);
-    out += " -DCK_PARAM_BBlockTransferSrcAccessOrder=" +
-           std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "," +
-           std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "," +
-           std::to_string(pt->BBlockTransferSrcAccessOrder[2]);
-    out +=
-        " -DCK_PARAM_BBlockTransferSrcVectorDim=" + std::to_string(pt->BBlockTransferSrcVectorDim);
-    out += " -DCK_PARAM_BBlockTransferSrcScalarPerVector=" +
-           std::to_string(pt->BBlockTransferSrcScalarPerVector);
-    out += " -DCK_PARAM_BBlockTransferDstScalarPerVector_K1=" +
-           std::to_string(pt->BBlockTransferDstScalarPerVector_K1);
-    out += " -DCK_PARAM_BThreadTransferSrcResetCoordinateAfterRun=" +
-           std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun);
-    out += " -DCK_PARAM_CThreadTransferSrcDstAccessOrder=" +
-           std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "," +
-           std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "," +
-           std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "," +
-           std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "," +
-           std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "," +
-           std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]) + "," +
-           std::to_string(pt->CThreadTransferSrcDstAccessOrder[6]) + "," +
-           std::to_string(pt->CThreadTransferSrcDstAccessOrder[7]);
-    out += " -DCK_PARAM_CThreadTransferSrcDstVectorDim=" +
-           std::to_string(pt->CThreadTransferSrcDstVectorDim);
-    out += " -DCK_PARAM_CThreadTransferDstScalarPerVector=" +
-           std::to_string(pt->CThreadTransferDstScalarPerVector);
-    return (out);
-};
-} // namespace detail_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk
-template <typename TInWei,
-          typename TAcc,
-          typename TOut,
-          typename InLengths,
-          typename WeiLengths,
-          typename OutLengths,
-          typename ConvStrides,
-          typename ConvDilations,
-          typename InLeftPads,
-          typename InRightPads>
-void online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk(
-    online_compile::Handle* handle,
-    const InLengths& in_n_hi_wi_c_lengths,
-    const WeiLengths& wei_k_y_x_c_lengths,
-    const OutLengths& out_n_ho_wo_k_lengths,
-    const ConvStrides& conv_strides,
-    const ConvDilations& conv_dilations,
-    const InLeftPads& in_left_pads,
-    const InRightPads& in_right_pads,
-    const Tensor<TInWei>& in_n_hi_wi_c,
-    const Tensor<TInWei>& wei_k_y_x_c,
-    Tensor<TOut>& out_n_ho_wo_k,
-    const tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk* tunable,
-    ck::index_t nrepeat)
-{
-    using namespace ck;
-    using namespace detail_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk;
-    using size_t = std::size_t;
-    /////////////////////////////////////////////////////////////////////////////////////////////////////////////
-    // The follow codes are only used for computing the grid_size, hasMainKBlockLoop,
-    // hasDoubleTailKBlockLoop
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
-    constexpr auto I2 = Number<2>{};
-    constexpr auto I3 = Number<3>{};
-    const auto in_n_hi_wi_c_desc =
-        make_dynamic_naive_tensor_descriptor_packed_v2(in_n_hi_wi_c_lengths);
-    const auto wei_k_y_x_c_desc =
-        make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_y_x_c_lengths);
-    const auto out_n_ho_wo_k_desc =
-        make_dynamic_naive_tensor_descriptor_packed_v2(out_n_ho_wo_k_lengths);
-    const auto n  = in_n_hi_wi_c_desc.GetLength(I0);
-    const auto hi = in_n_hi_wi_c_desc.GetLength(I1);
-    const auto wi = in_n_hi_wi_c_desc.GetLength(I2);
-    const auto c  = in_n_hi_wi_c_desc.GetLength(I3);
-    const auto k = wei_k_y_x_c_desc.GetLength(I0);
-    const auto y = wei_k_y_x_c_desc.GetLength(I1);
-    const auto x = wei_k_y_x_c_desc.GetLength(I2);
-    const auto ho = out_n_ho_wo_k_desc.GetLength(I1);
-    const auto wo = out_n_ho_wo_k_desc.GetLength(I2);
-    const auto M  = k;
-    const auto N  = n * ho * wo;
-    const auto K  = c * y * x;
-    const auto K0 = K / tunable->K1;
-    const index_t grid_size = (M / tunable->MPerBlock) * (N / tunable->NPerBlock);
-    // these buffers are usually provided by the user application
-    DeviceMem in_n_hi_wi_c_dev_buf(sizeof(TInWei) * in_n_hi_wi_c.mDesc.GetElementSpace());
-    DeviceMem wei_k_y_x_c_dev_buf(sizeof(TInWei) * wei_k_y_x_c.mDesc.GetElementSpace());
-    DeviceMem out_n_ho_wo_k_dev_buf(sizeof(TOut) * out_n_ho_wo_k.mDesc.GetElementSpace());
-    in_n_hi_wi_c_dev_buf.ToDevice(in_n_hi_wi_c.mData.data());
-    wei_k_y_x_c_dev_buf.ToDevice(wei_k_y_x_c.mData.data());
-    out_n_ho_wo_k_dev_buf.ToDevice(out_n_ho_wo_k.mData.data());
-    // these are workspace buffers that should be expressed to the user by the corresponding
-    // workspace API
-    DeviceMem workspace_buf(4096);
-    void* a_k0_m_k1_grid_desc_dev_buf = workspace_buf.GetDeviceBuffer();
-    void* b_k0_n_k1_grid_desc_dev_buf =
-        static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 1024);
-    void* c_m0_m1_m2_n_grid_desc_dev_buf =
-        static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 2048);
-    void* c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf =
-        static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 3072);
-    const std::vector<size_t> vld  = {static_cast<size_t>(tunable->BlockSize), 1, 1};
-    const std::vector<size_t> vgd1 = {static_cast<size_t>(tunable->BlockSize), 1, 1};
-    const std::vector<size_t> vgd2 = {static_cast<size_t>(grid_size * tunable->BlockSize), 1, 1};
-    std::string program_name =
-        "dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.cpp";
-    std::string algo_name = "implicit_gemm_conv_fwd_v4r4_xdlops_nhwc";
-    std::string param = " -std=c++17 ";
-    std::string network_config;
-    param += get_definition_string_from_types<TInWei, TAcc, TOut>() + " -DCK_USE_AMD_XDLOPS ";
-    param += get_definition_string_from_tunable(tunable);
-    network_config = get_network_config_string_from_types<TInWei, TAcc, TOut>() + "_" +
-                     get_network_config_string_from_tunable(tunable);
-    std::vector<float> kernel1_times;
-    std::vector<float> kernel2_times;
-    for(index_t i = 0; i < nrepeat; ++i)
-    {
-        KernelTimer timer1, timer2;
-        std::string kernel_name;
-        kernel_name =
-            "dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk_prepare";
-        auto network_config_1 = network_config + "_1";
-        timer1.Start();
-        handle->AddKernel(algo_name, network_config_1, program_name, kernel_name, vld, vgd1, param)(
-            static_cast<index_t>(in_n_hi_wi_c_lengths[I0]),
-            static_cast<index_t>(in_n_hi_wi_c_lengths[I1]),
-            static_cast<index_t>(in_n_hi_wi_c_lengths[I2]),
-            static_cast<index_t>(in_n_hi_wi_c_lengths[I3]),
-            static_cast<index_t>(wei_k_y_x_c_lengths[I0]),
-            static_cast<index_t>(wei_k_y_x_c_lengths[I1]),
-            static_cast<index_t>(wei_k_y_x_c_lengths[I2]),
-            conv_strides[I0],
-            conv_strides[I1],
-            conv_dilations[I0],
-            conv_dilations[I1],
-            in_left_pads[I0],
-            in_left_pads[I1],
-            in_right_pads[I0],
-            in_right_pads[I1],
-            a_k0_m_k1_grid_desc_dev_buf,
-            b_k0_n_k1_grid_desc_dev_buf,
-            c_m0_m1_m2_n_grid_desc_dev_buf,
-            c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf);
-        timer1.End();
-        kernel_name = "dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk";
-        auto network_config_2 = network_config + "_2";
-        timer2.Start();
-        handle->AddKernel(algo_name, network_config_2, program_name, kernel_name, vld, vgd2, param)(
-            reinterpret_cast<const TInWei*>(in_n_hi_wi_c_dev_buf.GetDeviceBuffer()),
-            reinterpret_cast<const TInWei*>(wei_k_y_x_c_dev_buf.GetDeviceBuffer()),
-            reinterpret_cast<TOut*>(out_n_ho_wo_k_dev_buf.GetDeviceBuffer()),
-            (const void*)(a_k0_m_k1_grid_desc_dev_buf),
-            (const void*)(b_k0_n_k1_grid_desc_dev_buf),
-            (const void*)(c_m0_m1_m2_n_grid_desc_dev_buf),
-            (const void*)(c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf));
-        timer2.End();
-        kernel1_times.push_back(timer1.GetElapsedTime());
-        kernel2_times.push_back(timer2.GetElapsedTime());
-    }
-    {
-        auto ave_time1 =
-            std::accumulate(
-                std::next(kernel1_times.begin()), kernel1_times.end(), 0., std::plus<float>{}) /
-            (nrepeat - 1);
-        auto ave_time2 =
-            std::accumulate(
-                std::next(kernel2_times.begin()), kernel2_times.end(), 0., std::plus<float>{}) /
-            (nrepeat - 1);
-        const auto N = in_n_hi_wi_c_lengths[I0];
-        const auto C = in_n_hi_wi_c_lengths[I3];
-        const auto Ho = out_n_ho_wo_k_lengths[I1];
-        const auto Wo = out_n_ho_wo_k_lengths[I2];
-        const auto K  = out_n_ho_wo_k_lengths[I3];
-        const auto Y = wei_k_y_x_c_lengths[I1];
-        const auto X = wei_k_y_x_c_lengths[I2];
-        float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) /
-                     (std::size_t(1000) * 1000 * 1000) / ave_time2;
-        std::cout << "Average time : " << ave_time1 + ave_time2 << " ms(" << ave_time1 << ", "
-                  << ave_time2 << "), " << perf << " TFlop/s" << std::endl;
-    };
-    // copy result back to host
-    out_n_ho_wo_k_dev_buf.FromDevice(out_n_ho_wo_k.mData.data());
-}
--- a/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp
+++ b/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp
-#pragma once
-#include "device.hpp"
-#include "host_tensor.hpp"
-#include "handle.hpp"
-#include "online_driver_common.hpp"
-#include "convolution_problem_descriptor.hpp"
-#include "dynamic_tensor_descriptor.hpp"
-#include "dynamic_tensor_descriptor_helper.hpp"
-#include "transform_forward_convolution_into_gemm_v6r1_nchw_kcyx_nkhw.hpp"
-#include "conv_igemm_fwd_v6r1_dlops_nchw_kcyx_nkhw.hpp"
-template <typename TInWei,
-          typename TAcc,
-          typename TOut,
-          typename InLengths,
-          typename WeiLengths,
-          typename OutLengths,
-          typename ConvStrides,
-          typename ConvDilations,
-          typename InLeftPads,
-          typename InRightPads>
-void online_device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw(
-    online_compile::Handle* handle,
-    const InLengths& in_n_c_hi_wi_lengths,
-    const WeiLengths& wei_k_c_y_x_lengths,
-    const OutLengths& out_n_k_ho_wo_lengths,
-    const ConvStrides& conv_strides,
-    const ConvDilations& conv_dilations,
-    const InLeftPads& in_left_pads,
-    const InRightPads& in_right_pads,
-    const Tensor<TInWei>& in_n_c_hi_wi,
-    const Tensor<TInWei>& wei_k_c_y_x,
-    Tensor<TOut>& out_n_k_ho_wo,
-    const ck::driver::CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw& compile_param,
-    ck::index_t nrepeat)
-{
-    using namespace ck;
-    using namespace ck::driver;
-    using size_t = std::size_t;
-    std::cout << __func__ << std::endl;
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
-    constexpr auto I2 = Number<2>{};
-    constexpr auto I3 = Number<3>{};
-    ConvolutionProblemDescriptor conv_problem_desc{in_n_c_hi_wi_lengths[I0],
-                                                   out_n_k_ho_wo_lengths[I1],
-                                                   in_n_c_hi_wi_lengths[I1],
-                                                   wei_k_c_y_x_lengths[I2],
-                                                   wei_k_c_y_x_lengths[I3],
-                                                   in_n_c_hi_wi_lengths[I2],
-                                                   in_n_c_hi_wi_lengths[I3],
-                                                   out_n_k_ho_wo_lengths[I2],
-                                                   out_n_k_ho_wo_lengths[I3],
-                                                   conv_strides[I0],
-                                                   conv_strides[I1],
-                                                   conv_dilations[I0],
-                                                   conv_dilations[I1],
-                                                   in_left_pads[I0],
-                                                   in_left_pads[I1],
-                                                   in_right_pads[I0],
-                                                   in_right_pads[I1],
-                                                   get_datatype_enum_from_type<TInWei>::value,
-                                                   get_datatype_enum_from_type<TInWei>::value,
-                                                   get_datatype_enum_from_type<TOut>::value};
-    if(!ConvIgemmFwdV6r1DlopsNchwKcyxNkhw::IsValidCompileParameter(conv_problem_desc,
-                                                                   compile_param))
-    {
-        throw std::runtime_error("wrong! IsValidCompileParameter fail");
-    }
-    DeviceMem in_n_c_hi_wi_dev_buf(sizeof(TInWei) * in_n_c_hi_wi.mDesc.GetElementSpace());
-    DeviceMem wei_k_c_y_x_dev_buf(sizeof(TInWei) * wei_k_c_y_x.mDesc.GetElementSpace());
-    DeviceMem out_n_k_ho_wo_dev_buf(sizeof(TOut) * out_n_k_ho_wo.mDesc.GetElementSpace());
-    in_n_c_hi_wi_dev_buf.ToDevice(in_n_c_hi_wi.mData.data());
-    wei_k_c_y_x_dev_buf.ToDevice(wei_k_c_y_x.mData.data());
-    out_n_k_ho_wo_dev_buf.ToDevice(out_n_k_ho_wo.mData.data());
-    // workspace is used for save transformed tensor descritpors created by prepare kernel
-    DeviceMem workspace_dev_buf(
-        ConvIgemmFwdV6r1DlopsNchwKcyxNkhw::GetWorkSpaceSize(conv_problem_desc, compile_param));
-    const auto block_size = std::size_t(
-        ConvIgemmFwdV6r1DlopsNchwKcyxNkhw::GetBlockSize(conv_problem_desc, compile_param));
-    const auto grid_size = std::size_t(
-        ConvIgemmFwdV6r1DlopsNchwKcyxNkhw::GetGridSize(conv_problem_desc, compile_param));
-    const std::vector<size_t> vld1 = {1, 1, 1};
-    const std::vector<size_t> vgd1 = {1, 1, 1};
-    const std::vector<size_t> vld2 = {static_cast<size_t>(block_size), 1, 1};
-    const std::vector<size_t> vgd2 = {static_cast<size_t>(grid_size * block_size), 1, 1};
-    std::string program_name =
-        "dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.cpp";
-    std::string algo_name = "implicit_gemm_conv_fwd_v6r1_dlops_nchw";
-    std::string compile_param_string =
-        get_ck_hip_online_compile_common_flag() + compile_param.GetCompileParameterString();
-    std::string network_config = compile_param_string;
-    std::vector<float> kernel1_times;
-    std::vector<float> kernel2_times;
-    for(index_t i = 0; i < nrepeat + 1; ++i)
-    {
-        KernelTimer timer1, timer2;
-        std::string kernel_name;
-        kernel_name = "dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw_prepare";
-        auto network_config_1 = network_config + "_1";
-        timer1.Start();
-        handle->AddKernel(algo_name,
-                          network_config_1,
-                          program_name,
-                          kernel_name,
-                          vld1,
-                          vgd1,
-                          compile_param_string)(static_cast<index_t>(in_n_c_hi_wi_lengths[I0]),
-                                                static_cast<index_t>(in_n_c_hi_wi_lengths[I1]),
-                                                static_cast<index_t>(in_n_c_hi_wi_lengths[I2]),
-                                                static_cast<index_t>(in_n_c_hi_wi_lengths[I3]),
-                                                static_cast<index_t>(wei_k_c_y_x_lengths[I0]),
-                                                static_cast<index_t>(wei_k_c_y_x_lengths[I2]),
-                                                static_cast<index_t>(wei_k_c_y_x_lengths[I3]),
-                                                conv_strides[I0],
-                                                conv_strides[I1],
-                                                conv_dilations[I0],
-                                                conv_dilations[I1],
-                                                in_left_pads[I0],
-                                                in_left_pads[I1],
-                                                in_right_pads[I0],
-                                                in_right_pads[I1],
-                                                (void*)(workspace_dev_buf.GetDeviceBuffer()));
-        timer1.End();
-        kernel_name = "dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw";
-        auto network_config_2 = network_config + "_2";
-        timer2.Start();
-        handle->AddKernel(algo_name,
-                          network_config_2,
-                          program_name,
-                          kernel_name,
-                          vld2,
-                          vgd2,
-                          compile_param_string)(
-            reinterpret_cast<const TInWei*>(wei_k_c_y_x_dev_buf.GetDeviceBuffer()),
-            reinterpret_cast<const TInWei*>(in_n_c_hi_wi_dev_buf.GetDeviceBuffer()),
-            reinterpret_cast<TOut*>(out_n_k_ho_wo_dev_buf.GetDeviceBuffer()),
-            (const void*)(workspace_dev_buf.GetDeviceBuffer()));
-        timer2.End();
-        kernel1_times.push_back(timer1.GetElapsedTime());
-        kernel2_times.push_back(timer2.GetElapsedTime());
-    }
-    {
-        auto ave_time1 =
-            std::accumulate(
-                std::next(kernel1_times.begin()), kernel1_times.end(), 0., std::plus<float>{}) /
-            nrepeat;
-        auto ave_time2 =
-            std::accumulate(
-                std::next(kernel2_times.begin()), kernel2_times.end(), 0., std::plus<float>{}) /
-            nrepeat;
-        float perf = (float)(conv_problem_desc.CalculateFlop()) /
-                     (std::size_t(1000) * 1000 * 1000) / (ave_time1 + ave_time2);
-        std::cout << "Average time : " << ave_time1 + ave_time2 << " ms(" << ave_time1 << ", "
-                  << ave_time2 << "), " << perf << " TFlop/s" << std::endl;
-    };
-    // copy result back to host
-    out_n_k_ho_wo_dev_buf.FromDevice(out_n_k_ho_wo.mData.data());
-}
--- a/host/online_compile/CMakeLists.txt
+++ b/host/online_compile/CMakeLists.txt
-set(CMAKE_CXX_COMPILER /opt/rocm/llvm/bin/clang++)
-## for online-compiling of HIP kernels
-set(OLC_HIP_COMPILER ${CMAKE_CXX_COMPILER} CACHE PATH "")
-## reset to avoid the C++ options from the parent project
-set(CMAKE_CXX_FLAGS "")
-message("Compiling options for library and kernels: ${CMAKE_CXX_FLAGS}")
-# look for and register clang-offload-bundler
-if(OLC_HIP_COMPILER MATCHES ".*clang\\+\\+$")
-    find_program(OLC_OFFLOADBUNDLER_BIN clang-offload-bundler
-        PATH_SUFFIXES bin
-        PATHS
-	    /opt/rocm/llvm
-	    ${CMAKE_INSTALL_PREFIX}/llvm
-    )
-endif()
-if(OLC_OFFLOADBUNDLER_BIN)
-    message(STATUS "clang-offload-bundler found: ${OLC_OFFLOADBUNDLER_BIN}")
-    set(OLC_OFFLOADBUNDLER_BIN "${OLC_OFFLOADBUNDLER_BIN}")
-else()
-    # look for and register extractkernel
-    message(STATUS "clang-offload-bundler not found")
-    find_program(EXTRACTKERNEL_BIN extractkernel
-        PATH_SUFFIXES bin
-        PATHS
-            /opt/rocm/hip
-            /opt/rocm/hcc
-            /opt/rocm
-	    ${CMAKE_INSTALL_PREFIX}/hip
-            ${CMAKE_INSTALL_PREFIX}/hcc
-            ${CMAKE_INSTALL_PREFIX}
-    )
-    if(EXTRACTKERNEL_BIN)
-        message(STATUS "extractkernel found: ${EXTRACTKERNEL_BIN}")
-        set(EXTRACTKERNEL_BIN "${EXTRACTKERNEL_BIN}")
-    else()
-        message(FATAL_ERROR "extractkernel not found")
-    endif()
-endif()
-option(Boost_USE_STATIC_LIBS "Use boost static libraries" OFF)
-set(BOOST_COMPONENTS filesystem)
-add_definitions(-DBOOST_ALL_NO_LIB=1)
-find_package(Boost REQUIRED COMPONENTS ${BOOST_COMPONENTS})
-# HIP is always required
-find_package(hip REQUIRED PATHS /opt/rocm)
-message(STATUS "Build with HIP ${hip_VERSION}")
-target_flags(HIP_COMPILER_FLAGS hip::device)
-# Remove cuda arch flags
-string(REGEX REPLACE --cuda-gpu-arch=[a-z0-9]+ "" HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}")
-string(REGEX REPLACE --offload-arch=[a-z0-9]+ "" HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}")
-set(OLC_hip_VERSION_MAJOR "${hip_VERSION_MAJOR}")
-set(OLC_hip_VERSION_MINOR "${hip_VERSION_MINOR}")
-set(OLC_hip_VERSION_PATCH "${hip_VERSION_PATCH}")
-option(ENABLE_DEBUG "Build to enable debugging" ON)
-if(ENABLE_DEBUG)
-    set(OLC_DEBUG 1)
-else()
-    set(OLC_DEBUG 0)
-endif()
-configure_file("${PROJECT_SOURCE_DIR}/host/online_compile/include/config.h.in" "${PROJECT_BINARY_DIR}/host/online_compile/include/config.h")
-include_directories(BEFORE
-    ${PROJECT_BINARY_DIR}/host/online_compile/include
-)
-message(STATUS "Hip compiler flags: ${HIP_COMPILER_FLAGS}")
-## HIP_COMPILER_FLAGS will be used for on-line compiling of the HIP kernels
-set(HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS} ${HIP_ONLINE_COMPILER_FLAGS}")
-add_definitions("-DHIP_COMPILER_FLAGS=${HIP_COMPILER_FLAGS}")
-file(GLOB_RECURSE COMPOSABLE_KERNEL_INCLUDE_1 "${PROJECT_SOURCE_DIR}/composable_kernel/include/*/*.hpp")
-file(GLOB COMPOSABLE_KERNEL_INCLUDE_2 "${PROJECT_SOURCE_DIR}/external/rocm/include/bfloat16_dev.hpp")
-set(MCONV_KERNEL_INCLUDES
-    ${COMPOSABLE_KERNEL_INCLUDE_1}
-    ${COMPOSABLE_KERNEL_INCLUDE_2}
-   )
-file(GLOB_RECURSE MCONV_KERNELS "${PROJECT_SOURCE_DIR}/composable_kernel/src/kernel_wrapper/*.cpp")
-add_kernels(${CMAKE_CURRENT_SOURCE_DIR} "${MCONV_KERNELS}")
-add_kernel_includes(${CMAKE_CURRENT_SOURCE_DIR} "${MCONV_KERNEL_INCLUDES}")
-set(ONLINE_COMPILATION_SOURCE
-     ${PROJECT_BINARY_DIR}/kernel.cpp
-     ${PROJECT_BINARY_DIR}/kernel_includes.cpp
-)
-include_directories(BEFORE
-    ${PROJECT_BINARY_DIR}/host/online_compile/include
-    include
-)
-set(OLC_HIP_UTILITY_CPPS
-    hip_utility/logger.cpp
-    hip_utility/tmp_dir.cpp
-    hip_utility/md5.cpp  
-    hip_utility/exec_utils.cpp
-    hip_utility/target_properties.cpp  
-    hip_utility/handlehip.cpp
-    hip_utility/kernel_build_params.cpp  
-    hip_utility/hip_build_utils.cpp  
-    hip_utility/hipoc_program.cpp  
-    hip_utility/hipoc_kernel.cpp  
-    hip_utility/kernel_cache.cpp  
-    hip_utility/binary_cache.cpp
-   )
-list(APPEND OLC_SOURCES ${OLC_HIP_UTILITY_CPPS} ${OLC_HIP_UTILITY_HEADERS})
-## addkernels provide the tool to create inlined kernels in one header
-add_subdirectory(addkernels)
-function(inline_kernels_src KERNELS KERNEL_INCLUDES)
-    set(KERNEL_SRC_HPP_FILENAME batch_all.cpp.hpp)
-    set(KERNEL_SRC_HPP_PATH ${PROJECT_BINARY_DIR}/inlined_kernels/${KERNEL_SRC_HPP_FILENAME})
-    set(KERNEL_SRC_CPP_PATH ${PROJECT_BINARY_DIR}/inlined_kernels/batch_all.cpp)
-    add_custom_command(
-        OUTPUT ${KERNEL_SRC_HPP_PATH}
-        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
-        DEPENDS addkernels ${KERNELS} ${KERNEL_INCLUDES}
-        COMMAND $<TARGET_FILE:addkernels> -target ${KERNEL_SRC_HPP_PATH} -extern -source ${KERNELS}
-	COMMENT "Inlining All kernels"
-    )
-    configure_file(kernels_batch.cpp.in ${KERNEL_SRC_CPP_PATH})
-    list(APPEND OLC_SOURCES ${KERNEL_SRC_CPP_PATH} ${KERNEL_SRC_HPP_PATH})
-    set(OLC_SOURCES ${OLC_SOURCES} PARENT_SCOPE)
-endfunction()
-inline_kernels_src("${MCONV_KERNELS}" "${MCONV_KERNEL_INCLUDES}")
-list(APPEND ONLINE_COMPILATION_SOURCE ${OLC_SOURCES} ${PROJECT_BINARY_DIR}/olc_kernel_includes.h)
-add_custom_command(
-    OUTPUT ${PROJECT_BINARY_DIR}/olc_kernel_includes.h
-    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
-    DEPENDS addkernels ${MCONV_KERNEL_INCLUDES}
-    COMMAND $<TARGET_FILE:addkernels> -no-recurse -guard GUARD_OLC_KERNEL_INCLUDES_HPP_ -target ${PROJECT_BINARY_DIR}/olc_kernel_includes.h -source ${MCONV_KERNEL_INCLUDES}
-    COMMENT "Inlining HIP kernel includes"
-  )
-## the library target
-add_library(online_compile SHARED ${ONLINE_COMPILATION_SOURCE}) 
-target_include_directories(online_compile PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/online_compile/include/)
-target_include_directories(online_compile PRIVATE ${PROJECT_BINARY_DIR})
-target_include_directories(online_compile PRIVATE ${PROJECT_SOURCE_DIR}/external/half/include/)
-target_link_libraries(online_compile PRIVATE hip::device)
-target_link_libraries(online_compile INTERFACE hip::host)
-target_link_libraries(online_compile PRIVATE Boost::filesystem)
-target_compile_features(online_compile PUBLIC)
-set_target_properties(online_compile PROPERTIES POSITION_INDEPENDENT_CODE ON)
-install(TARGETS online_compile LIBRARY DESTINATION lib) 
--- a/host/online_compile/addkernels/CMakeLists.txt
+++ b/host/online_compile/addkernels/CMakeLists.txt
-################################################################################
-# 
-# MIT License
-# 
-# Copyright (c) 2017 Advanced Micro Devices, Inc.
-# 
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-# 
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
-# 
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-# 
-################################################################################
-set(ADD_KERNELS_SOURCE include_inliner.cpp addkernels.cpp)
-add_executable(addkernels EXCLUDE_FROM_ALL ${ADD_KERNELS_SOURCE})
--- a/host/online_compile/addkernels/addkernels.cpp
+++ b/host/online_compile/addkernels/addkernels.cpp
-/*******************************************************************************
- *
- * MIT License
- *
- * Copyright (c) 2021 Advanced Micro Devices, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- *******************************************************************************/
-#include "include_inliner.hpp"
-#include <algorithm>
-#include <fstream>
-#include <iomanip>
-#include <iostream>
-#include <memory>
-#include <sstream>
-#include <string>
-void Bin2Hex(std::istream& source,
-             std::ostream& target,
-             const std::string& variable,
-             bool nullTerminate,
-             size_t bufferSize,
-             size_t lineSize)
-{
-    source.seekg(0, std::ios::end);
-    std::unique_ptr<unsigned char[]> buffer(new unsigned char[bufferSize]);
-    std::streamoff sourceSize = source.tellg();
-    std::streamoff blockStart = 0;
-    if(variable.length() != 0)
-    {
-        target << "extern const size_t " << variable << "_SIZE;" << std::endl;
-        target << "extern const unsigned char " << variable << "[];" << std::endl;
-        target << "const size_t " << variable << "_SIZE = " << std::setbase(10) << sourceSize << ";"
-               << std::endl;
-        target << "const unsigned char " << variable << "[] = {" << std::endl;
-    }
-    target << std::setbase(16) << std::setfill('0');
-    source.seekg(0, std::ios::beg);
-    while(blockStart < sourceSize)
-    {
-        source.read(reinterpret_cast<char*>(buffer.get()), bufferSize);
-        std::streamoff pos       = source.tellg();
-        std::streamoff blockSize = (pos < 0 ? sourceSize : pos) - blockStart;
-        std::streamoff i         = 0;
-        while(i < blockSize)
-        {
-            size_t j   = i;
-            size_t end = std::min<size_t>(i + lineSize, blockSize);
-            for(; j < end; j++)
-                target << "0x" << std::setw(2) << static_cast<unsigned>(buffer[j]) << ",";
-            target << std::endl;
-            i = end;
-        }
-        blockStart += blockSize;
-    }
-    if(nullTerminate)
-        target << "0x00," << std::endl;
-    if(variable.length() != 0)
-    {
-        target << "};" << std::endl;
-    }
-}
-void PrintHelp()
-{
-    std::cout << "Usage: bin2hex {<option>}" << std::endl;
-    std::cout << "Option format: -<option name>[ <option value>]" << std::endl;
-    std::cout << std::endl;
-    std::cout << "Options:" << std::endl;
-    std::cout
-        << "[REQUIRED] -s[ource] {<path to file>}: files to be processed. Must be last argument."
-        << std::endl;
-    std::cout << "           -t[arget] <path>: target file. Default: std out." << std::endl;
-    std::cout << "           -l[ine-size] <number>: bytes in one line. Default: 16." << std::endl;
-    std::cout << "           -b[uffer] <number>: read buffer size. Default: 512." << std::endl;
-    std::cout << "           -g[uard] <string>: guard name. Default: no guard" << std::endl;
-    std::cout << "           -n[o-recurse] : dont expand include files recursively. Default: off"
-              << std::endl;
-}
-[[gnu::noreturn]] void WrongUsage(const std::string& error)
-{
-    std::cout << "Wrong usage: " << error << std::endl;
-    std::cout << std::endl;
-    PrintHelp();
-    std::exit(1);
-}
-[[gnu::noreturn]] void UnknownArgument(const std::string& arg)
-{
-    std::ostringstream ss;
-    ss << "unknown argument - " << arg;
-    WrongUsage(ss.str());
-}
-void Process(const std::string& sourcePath,
-             std::ostream& target,
-             size_t bufferSize,
-             size_t lineSize,
-             bool recurse,
-             bool as_extern)
-{
-    std::string fileName(sourcePath);
-    std::string extension, root;
-    std::stringstream inlinerTemp;
-    auto extPos   = fileName.rfind('.');
-    auto slashPos = fileName.rfind('/');
-    if(extPos != std::string::npos)
-    {
-        extension = fileName.substr(extPos + 1);
-        fileName  = fileName.substr(0, extPos);
-    }
-    if(slashPos != std::string::npos)
-    {
-        root     = fileName.substr(0, slashPos + 1);
-        fileName = fileName.substr(slashPos + 1);
-    }
-    std::string variable(fileName);
-    std::ifstream sourceFile(sourcePath, std::ios::in | std::ios::binary);
-    std::istream* source = &sourceFile;
-    if(!sourceFile.good())
-    {
-        std::cerr << "File not found: " << sourcePath << std::endl;
-        std::exit(1);
-    }
-    const auto is_asm    = extension == "s";
-    const auto is_cl     = extension == "cl";
-    const auto is_hip    = extension == "cpp";
-    const auto is_header = extension == "hpp";
-    if(is_asm || is_cl || is_hip || is_header)
-    {
-        IncludeInliner inliner;
-        try
-        {
-            if(is_asm)
-                inliner.Process(
-                    sourceFile, inlinerTemp, root, sourcePath, ".include", false, recurse);
-            else if(is_cl || is_header)
-                inliner.Process(
-                    sourceFile, inlinerTemp, root, sourcePath, "#include", true, recurse);
-            else if(is_hip)
-                inliner.Process(
-                    sourceFile, inlinerTemp, root, sourcePath, "<#not_include>", true, false);
-        }
-        catch(const InlineException& ex)
-        {
-            std::cerr << ex.What() << std::endl;
-            std::cerr << ex.GetTrace() << std::endl;
-            std::exit(1);
-        }
-        source = &inlinerTemp;
-    }
-    std::transform(variable.begin(), variable.end(), variable.begin(), ::toupper);
-    if(as_extern && variable.length() != 0)
-    {
-        variable = "APP_KERNEL_" + variable;
-    }
-    Bin2Hex(*source, target, variable, true, bufferSize, lineSize);
-}
-int main(int argsn, char** args)
-{
-    if(argsn == 1)
-    {
-        PrintHelp();
-        return 2;
-    }
-    std::string guard;
-    size_t bufferSize = 512;
-    size_t lineSize   = 16;
-    std::ofstream targetFile;
-    std::ostream* target = &std::cout;
-    bool recurse         = true;
-    bool as_extern       = false;
-    int i = 0;
-    while(++i < argsn && **args != '-')
-    {
-        std::string arg(args[i] + 1);
-        std::transform(arg.begin(), arg.end(), arg.begin(), ::tolower);
-        if(arg == "s" || arg == "source")
-        {
-            if(guard.length() > 0)
-            {
-                *target << "#ifndef " << guard << std::endl;
-                *target << "#define " << guard << std::endl;
-            }
-            *target << "#include <cstddef>" << std::endl;
-            while(++i < argsn)
-            {
-                Process(args[i], *target, bufferSize, lineSize, recurse, as_extern);
-            }
-            if(guard.length() > 0)
-            {
-                *target << "#endif" << std::endl;
-            }
-            return 0;
-        }
-        else if(arg == "t" || arg == "target")
-        {
-            targetFile.open(args[++i], std::ios::out);
-            target = &targetFile;
-        }
-        else if(arg == "l" || arg == "line-size")
-            lineSize = std::stol(args[++i]);
-        else if(arg == "b" || arg == "buffer")
-            bufferSize = std::stol(args[++i]);
-        else if(arg == "g" || arg == "guard")
-            guard = args[++i];
-        else if(arg == "n" || arg == "no-recurse")
-            recurse = false;
-        else if(arg == "e" || arg == "extern")
-            as_extern = true;
-        else
-            UnknownArgument(arg);
-    }
-    WrongUsage("source key is required");
-}
--- a/host/online_compile/addkernels/include_inliner.cpp
+++ b/host/online_compile/addkernels/include_inliner.cpp
-/*******************************************************************************
- *
- * MIT License
- *
- * Copyright (c) 2021 Advanced Micro Devices, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- *******************************************************************************/
-#include <algorithm>
-#include <exception>
-#include <fstream>
-#include <sstream>
-#ifdef _WIN32
-#include <windows.h>
-#endif
-#ifdef __linux__
-#include <linux/limits.h>
-#include <cstdlib>
-#endif // !WIN32
-#include "include_inliner.hpp"
-namespace PathHelpers {
-static int GetMaxPath()
-{
-#ifdef _WIN32
-    return MAX_PATH;
-#else
-    return PATH_MAX;
-#endif
-}
-static std::string GetAbsolutePath(const std::string& path)
-{
-    std::string result(GetMaxPath(), ' ');
-#ifdef _WIN32
-    const auto retval = GetFullPathName(path.c_str(), result.size(), &result[0], nullptr);
-    if(retval == 0)
-        return "";
-#else
-    auto* const retval = realpath(path.c_str(), &result[0]);
-    if(retval == nullptr)
-        return "";
-#endif
-    return result;
-}
-} // namespace PathHelpers
-std::string IncludeFileExceptionBase::What() const
-{
-    std::ostringstream ss;
-    ss << GetMessage() << ": <" << _file << ">";
-    return ss.str();
-}
-void IncludeInliner::Process(std::istream& input,
-                             std::ostream& output,
-                             const std::string& root,
-                             const std::string& file_name,
-                             const std::string& directive,
-                             bool allow_angle_brackets,
-                             bool recurse)
-{
-    ProcessCore(input, output, root, file_name, 0, directive, allow_angle_brackets, recurse);
-}
-void IncludeInliner::ProcessCore(std::istream& input,
-                                 std::ostream& output,
-                                 const std::string& root,
-                                 const std::string& file_name,
-                                 int line_number,
-                                 const std::string& directive,
-                                 bool allow_angle_brackets,
-                                 bool recurse)
-{
-    if(_include_depth >= include_depth_limit)
-        throw InlineStackOverflowException(GetIncludeStackTrace(0));
-    _include_depth++;
-    _included_stack_head =
-        std::make_shared<SourceFileDesc>(file_name, _included_stack_head, line_number);
-    auto current_line          = 0;
-    auto next_include_optional = false;
-    while(!input.eof())
-    {
-        std::string line;
-        std::string word;
-        std::getline(input, line);
-        std::istringstream line_parser(line);
-        line_parser >> word;
-        current_line++;
-        std::transform(word.begin(), word.end(), word.begin(), ::tolower);
-        const auto include_optional = next_include_optional;
-        next_include_optional       = false;
-        if(!word.empty() && word == "//inliner-include-optional")
-        {
-            if(include_optional)
-                throw IncludeExpectedException(GetIncludeStackTrace(current_line));
-            next_include_optional = true;
-            continue;
-        }
-        if(!word.empty() && word == directive && recurse)
-        {
-            auto first_quote_pos = line.find('"', static_cast<int>(line_parser.tellg()) + 1);
-            std::string::size_type second_quote_pos;
-            if(first_quote_pos != std::string::npos)
-            {
-                second_quote_pos = line.find('"', first_quote_pos + 1);
-                if(second_quote_pos == std::string::npos)
-                    throw WrongInlineDirectiveException(GetIncludeStackTrace(current_line));
-            }
-            else
-            {
-                if(!allow_angle_brackets)
-                    throw WrongInlineDirectiveException(GetIncludeStackTrace(current_line));
-                first_quote_pos = line.find('<', static_cast<int>(line_parser.tellg()) + 1);
-                if(first_quote_pos == std::string::npos)
-                    throw WrongInlineDirectiveException(GetIncludeStackTrace(current_line));
-                second_quote_pos = line.find('>', first_quote_pos + 1);
-                if(second_quote_pos == std::string::npos)
-                    throw WrongInlineDirectiveException(GetIncludeStackTrace(current_line));
-            }
-            const std::string include_file_path =
-                line.substr(first_quote_pos + 1, second_quote_pos - first_quote_pos - 1);
-            const std::string abs_include_file_path(
-                PathHelpers::GetAbsolutePath(root + "/" + include_file_path)); // NOLINT
-            if(abs_include_file_path.empty())
-            {
-                if(include_optional)
-                    continue;
-                throw IncludeNotFoundException(include_file_path,
-                                               GetIncludeStackTrace(current_line));
-            }
-            std::ifstream include_file(abs_include_file_path, std::ios::in);
-            if(!include_file.good())
-                throw IncludeCantBeOpenedException(include_file_path,
-                                                   GetIncludeStackTrace(current_line));
-            ProcessCore(include_file,
-                        output,
-                        root,
-                        include_file_path,
-                        current_line,
-                        directive,
-                        allow_angle_brackets,
-                        recurse);
-        }
-        else
-        {
-            if(include_optional)
-                throw IncludeExpectedException(GetIncludeStackTrace(current_line));
-            if(output.tellp() > 0)
-                output << std::endl;
-            output << line;
-        }
-    }
-    auto prev_file       = _included_stack_head->included_from;
-    _included_stack_head = prev_file;
-    _include_depth--;
-}
-std::string IncludeInliner::GetIncludeStackTrace(int line)
-{
-    std::ostringstream ss;
-    if(_included_stack_head == nullptr)
-        return "";
-    auto item = _included_stack_head;
-    ss << "    " << item->path << ":" << line;
-    while(item->included_from != nullptr)
-    {
-        ss << std::endl << "    from " << item->included_from->path << ":" << item->included_line;
-        item = item->included_from;
-    }
-    return ss.str();
-}
--- a/host/online_compile/addkernels/include_inliner.hpp
+++ b/host/online_compile/addkernels/include_inliner.hpp
-/*******************************************************************************
- *
- * MIT License
- *
- * Copyright (c) 2021 Advanced Micro Devices, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- *******************************************************************************/
-#ifndef SOURCE_INLINER_HPP
-#define SOURCE_INLINER_HPP
-#include "source_file_desc.hpp"
-#include <ostream>
-#include <memory>
-#include <stack>
-class InlineException : public std::exception
-{
-    public:
-    InlineException(const std::string& trace) : _trace(trace) {}
-    virtual std::string What() const = 0;
-    const std::string& GetTrace() const { return _trace; }
-    private:
-    std::string _trace;
-};
-class InlineStackOverflowException : public InlineException
-{
-    public:
-    InlineStackOverflowException(const std::string& trace) : InlineException(trace) {}
-    std::string What() const override
-    {
-        return "Include stack depth limit has been reached, possible circle includes";
-    }
-};
-class IncludeExpectedException : public InlineException
-{
-    public:
-    IncludeExpectedException(const std::string& trace) : InlineException(trace) {}
-    std::string What() const override { return "Include directive expected"; }
-};
-class WrongInlineDirectiveException : public InlineException
-{
-    public:
-    WrongInlineDirectiveException(const std::string& trace) : InlineException(trace) {}
-    std::string What() const override { return "Include directive has wrong format"; }
-};
-class IncludeFileExceptionBase : public InlineException
-{
-    public:
-    IncludeFileExceptionBase(const std::string& file, const std::string& trace)
-        : InlineException(trace), _file(file)
-    {
-    }
-    std::string What() const override;
-    virtual std::string GetMessage() const = 0;
-    private:
-    std::string _file;
-};
-class IncludeNotFoundException : public IncludeFileExceptionBase
-{
-    public:
-    IncludeNotFoundException(const std::string& file, const std::string& trace)
-        : IncludeFileExceptionBase(file, trace)
-    {
-    }
-    std::string GetMessage() const override
-    {
-        return "Include file not found (if it is optional put //inliner-include-optional on line "
-               "before it)";
-    }
-};
-class IncludeCantBeOpenedException : public IncludeFileExceptionBase
-{
-    public:
-    IncludeCantBeOpenedException(const std::string& file, const std::string& trace)
-        : IncludeFileExceptionBase(file, trace)
-    {
-    }
-    std::string GetMessage() const override { return "Can not open include file"; }
-};
-class IncludeInliner
-{
-    public:
-    int include_depth_limit = 256;
-    void Process(std::istream& input,
-                 std::ostream& output,
-                 const std::string& root,
-                 const std::string& file_name,
-                 const std::string& directive,
-                 bool allow_angle_brackets,
-                 bool recurse);
-    std::string GetIncludeStackTrace(int line);
-    private:
-    int _include_depth                                   = 0;
-    std::shared_ptr<SourceFileDesc> _included_stack_head = nullptr;
-    void ProcessCore(std::istream& input,
-                     std::ostream& output,
-                     const std::string& root,
-                     const std::string& file_name,
-                     int line_number,
-                     const std::string& directive,
-                     bool allow_angle_brackets,
-                     bool recurse);
-};
-#endif // !SOURCE_INLINER_HPP
--- a/host/online_compile/addkernels/source_file_desc.hpp
+++ b/host/online_compile/addkernels/source_file_desc.hpp
-/*******************************************************************************
- *
- * MIT License
- *
- * Copyright (c) 2021 Advanced Micro Devices, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- *******************************************************************************/
-#ifndef SOURCE_FILE_DESC_HPP
-#define SOURCE_FILE_DESC_HPP
-#include <string>
-#include <memory>
-class SourceFileDesc
-{
-    public:
-    const std::string path;
-    int included_line;
-    std::shared_ptr<SourceFileDesc> included_from;
-    SourceFileDesc(const std::string& path_, std::shared_ptr<SourceFileDesc> from, int line)
-        : path(path_), included_line(line), included_from(from)
-    {
-    }
-};
-#endif // SOURCE_FILE_DESC_HPP
--- a/host/online_compile/hip_utility/binary_cache.cpp
+++ b/host/online_compile/hip_utility/binary_cache.cpp
-/*******************************************************************************
- *
- * MIT License
- *
- * Copyright (c) 2017 Advanced Micro Devices, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- *******************************************************************************/
-#include <binary_cache.hpp>
-#include <handle.hpp>
-#include <md5.hpp>
-#include <env.hpp>
-#include <stringutils.hpp>
-#include <logger.hpp>
-#include <target_properties.hpp>
-#include <boost/filesystem.hpp>
-#include <fstream>
-#include <iostream>
-namespace online_compile {
-OLC_DECLARE_ENV_VAR(OLC_DISABLE_CACHE)
-OLC_DECLARE_ENV_VAR(HOME)
-static boost::filesystem::path ComputeCachePath()
-{
-    const char* home_dir = GetStringEnv(HOME{});
-    if(home_dir == nullptr || home_dir == std::string("/") || home_dir == std::string(""))
-    {
-        home_dir = "/tmp";
-    }
-    auto p = boost::filesystem::path{home_dir} / "_hip_binary_kernels_";
-    if(!boost::filesystem::exists(p))
-        boost::filesystem::create_directories(p);
-    return p;
-}
-boost::filesystem::path GetCachePath()
-{
-    static const boost::filesystem::path user_path = ComputeCachePath();
-    return user_path;
-}
-static bool IsCacheDisabled() { return online_compile::IsEnabled(OLC_DISABLE_CACHE{}); }
-boost::filesystem::path
-GetCacheFile(const std::string& device, const std::string& name, const std::string& args)
-{
-    // std::string filename = (is_kernel_str ? online_compile::md5(name) : name) + ".o";
-    std::string filename = name + ".o";
-    return GetCachePath() / online_compile::md5(device + ":" + args) / filename;
-}
-boost::filesystem::path LoadBinary(const TargetProperties& target,
-                                   const size_t num_cu,
-                                   const std::string& name,
-                                   const std::string& args)
-{
-    if(online_compile::IsCacheDisabled())
-        return {};
-    (void)num_cu;
-    auto f = GetCacheFile(target.DbId(), name, args);
-    if(boost::filesystem::exists(f))
-    {
-        return f.string();
-    }
-    else
-    {
-        return {};
-    }
-}
-void SaveBinary(const boost::filesystem::path& binary_path,
-                const TargetProperties& target,
-                const std::string& name,
-                const std::string& args)
-{
-    if(online_compile::IsCacheDisabled())
-    {
-        boost::filesystem::remove(binary_path);
-    }
-    else
-    {
-        auto p = GetCacheFile(target.DbId(), name, args);
-        boost::filesystem::create_directories(p.parent_path());
-        boost::filesystem::rename(binary_path, p);
-    }
-}
-} // namespace online_compile
--- a/host/online_compile/hip_utility/exec_utils.cpp
+++ b/host/online_compile/hip_utility/exec_utils.cpp
-/*******************************************************************************
- *
- * MIT License
- *
- * Copyright (c) 2019 Advanced Micro Devices, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- *******************************************************************************/
-#include <exec_utils.hpp>
-#include <manage_ptr.hpp>
-#include <istream>
-#include <ostream>
-#include <string>
-#include <cstdio>
-#include <array>
-#include <cassert>
-#ifdef __linux__
-#include <unistd.h>
-#include <cstdio>
-#include <sys/wait.h>
-#endif // __linux__
-namespace online_compile {
-namespace exec {
-int Run(const std::string& p, std::istream* in, std::ostream* out)
-{
-#ifdef __linux__
-    const auto redirect_stdin  = (in != nullptr);
-    const auto redirect_stdout = (out != nullptr);
-    assert(!(redirect_stdin && redirect_stdout));
-    const auto file_mode = redirect_stdout ? "r" : "w";
-    OLC_MANAGE_PTR(FILE*, pclose) pipe{popen(p.c_str(), file_mode)};
-    if(!pipe)
-        throw std::runtime_error("online_compile::exec::Run(): popen(" + p + ", " + file_mode +
-                                 ") failed");
-    if(redirect_stdin || redirect_stdout)
-    {
-        std::array<char, 1024> buffer{};
-        if(redirect_stdout)
-        {
-            while(feof(pipe.get()) == 0)
-                if(fgets(buffer.data(), buffer.size(), pipe.get()) != nullptr)
-                    *out << buffer.data();
-        }
-        else
-        {
-            while(!in->eof())
-            {
-                in->read(buffer.data(), buffer.size() - 1);
-                buffer[in->gcount()] = 0;
-                if(fputs(buffer.data(), pipe.get()) == EOF)
-                    throw std::runtime_error("online_compile::exec::Run(): fputs() failed");
-            }
-        }
-    }
-    auto status = pclose(pipe.release());
-    return WEXITSTATUS(status);
-#else
-    (void)p;
-    (void)in;
-    (void)out;
-    return -1;
-#endif // __linux__
-}
-} // namespace exec
-} // namespace online_compile
--- a/host/online_compile/hip_utility/handlehip.cpp
+++ b/host/online_compile/hip_utility/handlehip.cpp
-/*******************************************************************************
- *
- * MIT License
- *
- * Copyright (c) 2017-2020 Advanced Micro Devices, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- *******************************************************************************/
-#include <handle.hpp>
-#include <binary_cache.hpp>
-#include <env.hpp>
-#include <kernel_cache.hpp>
-#include <stringutils.hpp>
-#include <target_properties.hpp>
-#include <hipCheck.hpp>
-#include <write_file.hpp>
-#include <boost/filesystem.hpp>
-#include <boost/lexical_cast.hpp>
-#ifndef _WIN32
-#include <unistd.h>
-#endif
-#include <algorithm>
-#include <cassert>
-#include <chrono>
-#include <thread>
-OLC_DECLARE_ENV_VAR(OLC_DEVICE_CU)
-namespace online_compile {
-std::size_t GetAvailableMemory()
-{
-    size_t free, total;
-    MY_HIP_CHECK(hipMemGetInfo(&free, &total));
-    return free;
-}
-int get_device_id() // Get random device
-{
-    int device;
-    MY_HIP_CHECK(hipGetDevice(&device));
-    return device;
-}
-void set_device(int id) { MY_HIP_CHECK(hipSetDevice(id)); }
-int set_default_device()
-{
-    int n;
-    MY_HIP_CHECK(hipGetDeviceCount(&n));
-    // Pick device based on process id
-    auto pid = ::getpid();
-    assert(pid > 0);
-    set_device(pid % n);
-    return (pid % n);
-}
-struct HandleImpl
-{
-    using StreamPtr = std::shared_ptr<typename std::remove_pointer<hipStream_t>::type>;
-    HandleImpl() {}
-    StreamPtr create_stream()
-    {
-        hipStream_t result;
-        MY_HIP_CHECK(hipStreamCreate(&result));
-        return StreamPtr{result, &hipStreamDestroy};
-    }
-    static StreamPtr reference_stream(hipStream_t s) { return StreamPtr{s, null_deleter{}}; }
-    std::string get_device_name() const
-    {
-        hipDeviceProp_t props;
-        MY_HIP_CHECK(hipGetDeviceProperties(&props, device));
-        const std::string name(props.gcnArchName);
-        return name;
-    }
-    StreamPtr stream = nullptr;
-    int device       = -1;
-    KernelCache cache;
-    TargetProperties target_properties;
-};
-Handle::Handle(hipStream_t stream) : impl(new HandleImpl())
-{
-    this->impl->device = get_device_id();
-    if(stream == nullptr)
-        this->impl->stream = HandleImpl::reference_stream(nullptr);
-    else
-        this->impl->stream = HandleImpl::reference_stream(stream);
-    this->impl->target_properties.Init(this);
-}
-Handle::Handle() : impl(new HandleImpl())
-{
-    this->impl->device = get_device_id();
-    this->impl->stream = HandleImpl::reference_stream(nullptr);
-    this->impl->target_properties.Init(this);
-}
-Handle::~Handle() {}
-void Handle::SetStream(hipStream_t streamID) const
-{
-    this->impl->stream = HandleImpl::reference_stream(streamID);
-    this->impl->target_properties.Init(this);
-}
-hipStream_t Handle::GetStream() const { return impl->stream.get(); }
-KernelInvoke Handle::AddKernel(const std::string& algorithm,
-                               const std::string& network_config,
-                               const std::string& program_name,
-                               const std::string& kernel_name,
-                               const std::vector<size_t>& vld,
-                               const std::vector<size_t>& vgd,
-                               const std::string& params,
-                               std::size_t cache_index) const
-{
-    auto obj = this->impl->cache.AddKernel(
-        *this, algorithm, network_config, program_name, kernel_name, vld, vgd, params, cache_index);
-    return this->Run(obj);
-}
-void Handle::ClearKernels(const std::string& algorithm, const std::string& network_config) const
-{
-    this->impl->cache.ClearKernels(algorithm, network_config);
-}
-const std::vector<Kernel>& Handle::GetKernelsImpl(const std::string& algorithm,
-                                                  const std::string& network_config) const
-{
-    return this->impl->cache.GetKernels(algorithm, network_config);
-}
-bool Handle::HasKernel(const std::string& algorithm, const std::string& network_config) const
-{
-    return this->impl->cache.HasKernels(algorithm, network_config);
-}
-KernelInvoke Handle::Run(Kernel k) const { return k.Invoke(this->GetStream()); }
-Program Handle::LoadProgram(const std::string& program_name, std::string params) const
-{
-    if((!online_compile::EndsWith(program_name, ".mlir-cpp")) &&
-       (!online_compile::EndsWith(program_name, ".mlir")))
-    {
-        params += " -mcpu=" + this->GetTargetProperties().Name();
-    }
-    auto hsaco = online_compile::LoadBinary(
-        this->GetTargetProperties(), this->GetMaxComputeUnits(), program_name, params);
-    if(hsaco.empty())
-    {
-        auto p = HIPOCProgram{program_name, params, this->GetTargetProperties()};
-        auto path = online_compile::GetCachePath() / boost::filesystem::unique_path();
-        if(p.IsCodeObjectInMemory())
-            online_compile::WriteFile(p.GetCodeObjectBlob(), path);
-        else
-            boost::filesystem::copy_file(p.GetCodeObjectPathname(), path);
-        online_compile::SaveBinary(path, this->GetTargetProperties(), program_name, params);
-        return p;
-    }
-    else
-    {
-        return HIPOCProgram{program_name, hsaco};
-    }
-}
-bool Handle::HasProgram(const std::string& program_name, const std::string& params) const
-{
-    return this->impl->cache.HasProgram(program_name, params);
-}
-void Handle::AddProgram(Program prog,
-                        const std::string& program_name,
-                        const std::string& params) const
-{
-    this->impl->cache.AddProgram(prog, program_name, params);
-}
-void Handle::Finish() const { MY_HIP_CHECK(hipStreamSynchronize(this->GetStream())); }
-std::size_t Handle::GetLocalMemorySize() const
-{
-    int result;
-    MY_HIP_CHECK(hipDeviceGetAttribute(
-        &result, hipDeviceAttributeMaxSharedMemoryPerBlock, this->impl->device));
-    return result;
-}
-std::size_t Handle::GetGlobalMemorySize() const
-{
-    size_t result;
-    MY_HIP_CHECK(hipDeviceTotalMem(&result, this->impl->device));
-    return result;
-}
-std::size_t Handle::GetMaxComputeUnits() const
-{
-    int result;
-    const char* const num_cu = online_compile::GetStringEnv(OLC_DEVICE_CU{});
-    if(num_cu != nullptr && strlen(num_cu) > 0)
-    {
-        return boost::lexical_cast<std::size_t>(num_cu);
-    }
-    MY_HIP_CHECK(
-        hipDeviceGetAttribute(&result, hipDeviceAttributeMultiprocessorCount, this->impl->device));
-    return result;
-}
-std::size_t Handle::GetWavefrontWidth() const
-{
-    hipDeviceProp_t props{};
-    MY_HIP_CHECK(hipGetDeviceProperties(&props, this->impl->device));
-    auto result = static_cast<size_t>(props.warpSize);
-    return result;
-}
-std::string Handle::GetDeviceNameImpl() const { return this->impl->get_device_name(); }
-std::string Handle::GetDeviceName() const { return this->impl->target_properties.Name(); }
-const TargetProperties& Handle::GetTargetProperties() const
-{
-    return this->impl->target_properties;
-}
-std::ostream& Handle::Print(std::ostream& os) const
-{
-    os << "stream: " << this->impl->stream << ", device_id: " << this->impl->device;
-    return os;
-}
-} // namespace online_compile
--- a/host/online_compile/hip_utility/hip_build_utils.cpp
+++ b/host/online_compile/hip_utility/hip_build_utils.cpp
-/*******************************************************************************
- *
- * MIT License
- *
- * Copyright (c) 2019 Advanced Micro Devices, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- *******************************************************************************/
-#include <hip_build_utils.hpp>
-#include <stringutils.hpp>
-#include <tmp_dir.hpp>
-#include <env.hpp>
-#include <target_properties.hpp>
-#include <write_file.hpp>
-#include <exec_utils.hpp>
-#include <logger.hpp>
-#include <config.h>
-#include <boost/optional.hpp>
-#include <sstream>
-#include <string>
-#include <stdexcept>
-#include <iostream>
-OLC_DECLARE_ENV_VAR(OLC_DEBUG_HIP_VERBOSE)
-OLC_DECLARE_ENV_VAR(OLC_DEBUG_HIP_DUMP)
-#define OLC_HIP_COMPILER "/opt/rocm/llvm/bin/clang++"
-namespace online_compile {
-bool IsHccCompiler()
-{
-    static const auto isHcc = EndsWith(OLC_HIP_COMPILER, "hcc");
-    return isHcc;
-}
-bool IsHipClangCompiler()
-{
-    static const auto isClangXX = EndsWith(OLC_HIP_COMPILER, "clang++");
-    return isClangXX;
-}
-namespace {
-inline bool ProduceCoV3()
-{
-    // Otherwise, let's enable CO v3 for HIP kernels since ROCm 3.0.
-    return (HipCompilerVersion() >= external_tool_version_t{3, 0, -1});
-}
-/// Returns option for enabling/disabling CO v3 generation for the compiler
-/// that builds HIP kernels, depending on compiler version etc.
-inline const std::string& GetCoV3Option(const bool enable)
-{
-    /// \note PR #2166 uses the "--hcc-cov3" option when isHCC is true.
-    /// It's unclear why... HCC included in ROCm 2.8 does not support it,
-    /// perhaps it suits for some older HCC?
-    ///
-    /// These options are Ok for ROCm 3.0:
-    static const std::string option_enable{"-mcode-object-v3"};
-    static const std::string no_option{};
-    if(enable)
-        return option_enable;
-    else
-        return no_option;
-}
-} // namespace
-static boost::filesystem::path HipBuildImpl(boost::optional<TmpDir>& tmp_dir,
-                                            const std::string& filename,
-                                            std::string src,
-                                            std::string params,
-                                            const TargetProperties& target,
-                                            const bool testing_mode,
-                                            const bool sources_already_reside_on_filesystem)
-{
-#ifdef __linux__
-    // Write out the include files
-    // Let's assume includes are overkill for feature tests & optimize'em out.
-    if(!testing_mode)
-    {
-        auto inc_list = GetHipKernelIncList();
-        auto inc_path = tmp_dir->path;
-        boost::filesystem::create_directories(inc_path);
-        for(auto inc_file : inc_list)
-        {
-            auto inc_src = GetKernelInc(inc_file);
-            WriteFile(inc_src, inc_path / inc_file);
-        }
-    }
-    // Sources produced by MLIR-cpp already reside in tmp dir.
-    if(!sources_already_reside_on_filesystem)
-    {
-        src += "\nint main() {}\n";
-        WriteFile(src, tmp_dir->path / filename);
-    }
-    // cppcheck-suppress unreadVariable
-    const LcOptionTargetStrings lots(target);
-    auto env = std::string("");
-    if(IsHccCompiler())
-    {
-        params += " -amdgpu-target=" + target.Name();
-        params += " " + GetCoV3Option(ProduceCoV3());
-    }
-    else if(IsHipClangCompiler())
-    {
-        if(params.find("-std=") == std::string::npos)
-            params += " --std=c++11";
-        if(HipCompilerVersion() < external_tool_version_t{4, 1, 0})
-            params += " --cuda-gpu-arch=" + lots.device;
-        else
-            params += " --cuda-gpu-arch=" + lots.device + lots.xnack;
-        params += " --cuda-device-only";
-        params += " -c";
-        params += " -O3 ";
-    }
-    params += " -Wno-unused-command-line-argument -I. ";
-    params += OLC_STRINGIZE(HIP_COMPILER_FLAGS);
-    if(IsHccCompiler())
-    {
-        env += std::string("KMOPTLLC=\"-mattr=+enable-ds128 ");
-        if(HipCompilerVersion() >= external_tool_version_t{2, 8, 0})
-            env += " --amdgpu-spill-vgpr-to-agpr=0";
-        env += '\"';
-    }
-    else if(IsHipClangCompiler())
-    {
-        params += " -mllvm --amdgpu-spill-vgpr-to-agpr=0";
-        params += " -mllvm -amdgpu-early-inline-all=true";
-        params += " -mllvm -amdgpu-function-calls=false";
-    }
-    if(online_compile::IsEnabled(OLC_DEBUG_HIP_VERBOSE{}))
-    {
-        params += " -v";
-    }
-    if(online_compile::IsEnabled(OLC_DEBUG_HIP_DUMP{}))
-    {
-        if(IsHccCompiler())
-        {
-            params += " -gline-tables-only";
-            env += " KMDUMPISA=1";
-            env += " KMDUMPLLVM=1";
-        }
-        else if(IsHipClangCompiler())
-        {
-            params += " -gline-tables-only";
-            params += " -save-temps";
-        }
-    }
-    // hip version
-    params +=
-        std::string(" -DHIP_PACKAGE_VERSION_FLAT=") + std::to_string(HIP_PACKAGE_VERSION_FLAT);
-    params += " ";
-    auto bin_file = tmp_dir->path / (filename + ".o");
-    // compile
-    const std::string redirector = testing_mode ? " 1>/dev/null 2>&1" : "";
-    tmp_dir->Execute(env + std::string(" ") + OLC_HIP_COMPILER,
-                     params + filename + " -o " + bin_file.string() + redirector);
-    if(!boost::filesystem::exists(bin_file))
-        throw std::runtime_error(filename + " failed to compile");
-#ifdef EXTRACTKERNEL_BIN
-    if(IsHccCompiler())
-    {
-        // call extract kernel
-        tmp_dir->Execute(EXTRACTKERNEL_BIN, " -i " + bin_file.string());
-        auto hsaco =
-            std::find_if(boost::filesystem::directory_iterator{tmp_dir->path}, {}, [](auto entry) {
-                return (entry.path().extension() == ".hsaco");
-            });
-        if(hsaco == boost::filesystem::directory_iterator{})
-        {
-            fdt_log(LogLevel::Info, "HipBuild", "failed to find *.hsaco in ")
-                << hsaco->path().string() << std::endl;
-        }
-        return hsaco->path();
-    }
-#endif
-    return bin_file;
-#else
-    (void)filename;
-    (void)params;
-    throw std::runtimer_error("HIP kernels are only supported in Linux");
-#endif
-}
-boost::filesystem::path HipBuild(boost::optional<TmpDir>& tmp_dir,
-                                 const std::string& filename,
-                                 std::string src,
-                                 std::string params,
-                                 const TargetProperties& target,
-                                 const bool sources_already_reside_on_filesystem)
-{
-    return HipBuildImpl(
-        tmp_dir, filename, src, params, target, false, sources_already_reside_on_filesystem);
-}
-void bin_file_to_str(const boost::filesystem::path& file, std::string& buf)
-{
-    std::ifstream bin_file_ptr(file.string().c_str(), std::ios::binary);
-    std::ostringstream bin_file_strm;
-    bin_file_strm << bin_file_ptr.rdbuf();
-    buf = bin_file_strm.str();
-}
-static external_tool_version_t HipCompilerVersionImpl()
-{
-    external_tool_version_t version;
-    if(IsHccCompiler())
-    {
-        const std::string path(OLC_HIP_COMPILER);
-        const std::string mandatory_prefix("(based on HCC ");
-        do
-        {
-            if(path.empty() || !std::ifstream(path).good())
-                break;
-            std::stringstream out;
-            if(online_compile::exec::Run(path + " --version", nullptr, &out) != 0)
-                break;
-            std::string line;
-            while(!out.eof())
-            {
-                std::getline(out, line);
-                fdt_log() << line;
-                auto begin = line.find(mandatory_prefix);
-                if(begin == std::string::npos)
-                    continue;
-                begin += mandatory_prefix.size();
-                int v3, v2, v1 = v2 = v3 = -1;
-                char c2, c1 = c2 = 'X';
-                std::istringstream iss(line.substr(begin));
-                iss >> v1 >> c1 >> v2 >> c2 >> v3;
-                if(!iss.fail() && v1 >= 0)
-                {
-                    version.major = v1;
-                    if(c1 == '.' && v2 >= 0)
-                    {
-                        version.minor = v2;
-                        if(c2 == '.' && v3 >= 0)
-                            version.patch = v3;
-                    }
-                }
-                break;
-            }
-        } while(false);
-    }
-    else
-    {
-#ifdef HIP_PACKAGE_VERSION_MAJOR
-        fdt_log(
-            LogLevel::Info, "HipCompilerVersion", "Read version information from HIP package...");
-        version.major = HIP_PACKAGE_VERSION_MAJOR;
-#ifdef HIP_PACKAGE_VERSION_MINOR
-        version.minor = HIP_PACKAGE_VERSION_MINOR;
-#else
-        version.minor = 0;
-#endif
-#ifdef HIP_PACKAGE_VERSION_PATCH
-        version.patch = HIP_PACKAGE_VERSION_PATCH;
-#else
-        version.patch = 0;
-#endif
-#else // HIP_PACKAGE_VERSION_MAJOR is not defined. CMake failed to find HIP package.
-        fdt_log(LogLevel::Info, "HipCompilerVersion", "...assuming 3.2.0 (hip-clang RC)");
-        version.major = 3;
-        version.minor = 2;
-        version.patch = 0;
-#endif
-    }
-    fdt_log() << version.major << '.' << version.minor << '.' << version.patch << std::endl;
-    return version;
-}
-external_tool_version_t HipCompilerVersion()
-{
-    // NOLINTNEXTLINE (cppcoreguidelines-avoid-non-const-global-variables)
-    static auto once = HipCompilerVersionImpl();
-    return once;
-}
-bool operator>(const external_tool_version_t& lhs, const external_tool_version_t& rhs)
-{
-    if(lhs.major > rhs.major)
-        return true;
-    else if(lhs.major == rhs.major)
-    {
-        if(lhs.minor > rhs.minor)
-            return true;
-        else if(lhs.minor == rhs.minor)
-            return (lhs.patch > rhs.patch);
-        else
-            return false;
-    }
-    else
-        return false;
-}
-bool operator<(const external_tool_version_t& lhs, const external_tool_version_t& rhs)
-{
-    return rhs > lhs;
-}
-bool operator>=(const external_tool_version_t& lhs, const external_tool_version_t& rhs)
-{
-    return !(lhs < rhs);
-}
-bool operator<=(const external_tool_version_t& lhs, const external_tool_version_t& rhs)
-{
-    return !(lhs > rhs);
-}
-} // namespace online_compile