Unverified commit 12649254 authored by Chao Liu, committed by GitHub

reorganize files to prepare for MIOpen integration (#51)

* change olc cmake

* adding online compile to fwd-v4r5r2

* update scripts

* rename fwd-v4r5r2 to fwd-v6r1

* clean up
parent fbdf4332
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "transform_forward_convolution_into_gemm_v4r5r2_nchw_kcyx_nkhw.hpp"
#include "transform_forward_convolution_into_gemm_v6r1_nchw_kcyx_nkhw.hpp"
#include "driver_dynamic_contraction_v1r2.hpp"
template <typename TInWei,
@@ -14,7 +14,7 @@ template <typename TInWei,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void device_dynamic_convolution_forward_implicit_gemm_v4r5r2_nchw_kcyx_nkhw(
void device_dynamic_convolution_forward_implicit_gemm_v6r1_nchw_kcyx_nkhw(
const InLengths& in_n_c_hi_wi_lengths,
const WeiLengths& wei_k_c_y_x_lengths,
const OutLengths& out_n_k_ho_wo_lengths,
@@ -43,11 +43,11 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r5r2_nchw_kcyx_nkhw(
wei_k_c_y_x_device_buf.ToDevice(wei_k_c_y_x.mData.data());
out_n_k_ho_wo_device_buf.ToDevice(out_n_k_ho_wo.mData.data());
const auto in_n_c_hi_wi_desc =
const auto in_desc_n_c_hi_wi =
make_dynamic_naive_tensor_descriptor_packed_v2(in_n_c_hi_wi_lengths);
const auto wei_k_c_y_x_desc =
const auto wei_desc_k_c_y_x =
make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_c_y_x_lengths);
const auto out_n_k_ho_wo_desc =
const auto out_desc_n_k_ho_wo =
make_dynamic_naive_tensor_descriptor_packed_v2(out_n_k_ho_wo_lengths);
#if 1
@@ -58,32 +58,32 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r5r2_nchw_kcyx_nkhw(
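// presumably [8, 1, 128, 1] * [8, 4, 32, 1] = [1, 128, 4, 32] for fp32, by
// analogy with the fp16 branch below; cdata = 64, BlockSize = 256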
constexpr index_t GN0 = 4;
constexpr index_t GK1 = 1;
constexpr index_t GemmGM1PerBlockGM11 = 128;
constexpr index_t GemmGN1PerBlockGN11 = 32;
constexpr index_t GemmKPerBlock = 8;
constexpr index_t GM1PerBlockGM11 = 128;
constexpr index_t GN1PerBlockGN11 = 32;
constexpr index_t GK0PerBlock = 8;
constexpr index_t GemmM1PerThreadM111 = 4;
constexpr index_t GemmN1PerThreadN111 = 4;
constexpr index_t GemmKPerThread = 1;
constexpr index_t BM1PerThreadBM11 = 4;
constexpr index_t BN1PerThreadBN11 = 4;
constexpr index_t BK0PerThread = 1;
constexpr index_t GemmM11N11ThreadClusterM1101 = 2;
constexpr index_t GemmM11N11ThreadClusterN1101 = 2;
constexpr index_t GemmM11N11ThreadClusterM1100 = 8;
constexpr index_t GemmM11N11ThreadClusterN1100 = 8;
constexpr index_t BM10BN10ThreadClusterBM100 = 8;
constexpr index_t BM10BN10ThreadClusterBN100 = 8;
constexpr index_t BM10BN10ThreadClusterBM101 = 2;
constexpr index_t BM10BN10ThreadClusterBN101 = 2;
using GemmABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1 = Sequence<4, 1, 1, 1, 1>;
using GemmABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1 = Sequence<2, 1, 1, 128, 1>;
using ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1 = Sequence<4, 1, 1, 1, 1>;
using ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1 = Sequence<2, 1, 1, 128, 1>;
using GemmABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 = Sequence<4, 1, 1, 1, 1>;
using GemmABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 = Sequence<1, 1, 1, 1, 1>;
using ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 = Sequence<4, 1, 1, 1, 1>;
using ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 = Sequence<1, 1, 1, 1, 1>;
using GemmBBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1 = Sequence<1, 4, 1, 1, 1>;
using GemmBBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1 = Sequence<8, 1, 1, 32, 1>;
using BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1 = Sequence<1, 4, 1, 1, 1>;
using BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1 = Sequence<8, 1, 1, 32, 1>;
using GemmBBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 = Sequence<1, 1, 1, 1, 1>;
using GemmBBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 = Sequence<1, 1, 1, 1, 1>;
using BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 = Sequence<1, 1, 1, 1, 1>;
using BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 = Sequence<1, 1, 1, 1, 1>;
constexpr index_t GemmCThreadTransferDstScalarPerVector_BN1 = 1;
constexpr index_t CThreadTransferDstScalarPerVector_BN1 = 1;
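// naming: each Sequence<...> 5-tuple indexes the dimensions spelled out in its
// suffix: [GK0, GM0, GM10, GM11, GK1] on the A side and [GK0, GN0, GN10, GN11, GK1]
// on the B side; e.g. a thread slice of Sequence<4, 1, 1, 1, 1> means each
// thread copies 4 elements along GK0.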
#elif 1
// [8, 1, 128, 2] * [8, 4, 32, 2] = [1, 128, 4, 32] for fp16
// cdata = 64, BlockSize = 256
@@ -92,48 +92,48 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r5r2_nchw_kcyx_nkhw(
constexpr index_t GN0 = 4;
constexpr index_t GK1 = 2;
constexpr index_t GemmGM1PerBlockGM11 = 128;
constexpr index_t GemmGN1PerBlockGN11 = 32;
constexpr index_t GemmKPerBlock = 8;
constexpr index_t GM1PerBlockGM11 = 128;
constexpr index_t GN1PerBlockGN11 = 32;
constexpr index_t GK0PerBlock = 8;
constexpr index_t GemmM1PerThreadM111 = 4;
constexpr index_t GemmN1PerThreadN111 = 4;
constexpr index_t GemmKPerThread = 1;
constexpr index_t BM1PerThreadBM11 = 4;
constexpr index_t BN1PerThreadBN11 = 4;
constexpr index_t BK0PerThread = 1;
constexpr index_t GemmM11N11ThreadClusterM1101 = 2;
constexpr index_t GemmM11N11ThreadClusterN1101 = 2;
constexpr index_t GemmM11N11ThreadClusterM1100 = 8;
constexpr index_t GemmM11N11ThreadClusterN1100 = 8;
constexpr index_t BM10BN10ThreadClusterBM100 = 8;
constexpr index_t BM10BN10ThreadClusterBN100 = 8;
constexpr index_t BM10BN10ThreadClusterBM101 = 2;
constexpr index_t BM10BN10ThreadClusterBN101 = 2;
using GemmABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1 = Sequence<4, 1, 1, 1, 2>;
using GemmABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1 = Sequence<2, 1, 1, 128, 1>;
using ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1 = Sequence<4, 1, 1, 1, 2>;
using ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1 = Sequence<2, 1, 1, 128, 1>;
using GemmABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 = Sequence<4, 1, 1, 1, 1>;
using GemmABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 = Sequence<1, 1, 1, 1, 2>;
using ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 = Sequence<4, 1, 1, 1, 1>;
using ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 = Sequence<1, 1, 1, 1, 2>;
using GemmBBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1 = Sequence<1, 4, 1, 1, 2>;
using GemmBBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1 = Sequence<8, 1, 1, 32, 1>;
using BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1 = Sequence<1, 4, 1, 1, 2>;
using BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1 = Sequence<8, 1, 1, 32, 1>;
using GemmBBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 = Sequence<1, 1, 1, 1, 1>;
using GemmBBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 = Sequence<1, 1, 1, 1, 2>;
using BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 = Sequence<1, 1, 1, 1, 1>;
using BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 = Sequence<1, 1, 1, 1, 2>;
constexpr index_t GemmCThreadTransferDstScalarPerVector_BN1 = 1;
constexpr index_t CThreadTransferDstScalarPerVector_BN1 = 1;
#endif
const auto descs =
transform_forward_convolution_into_contraction_v4r5r2_nchw_kcyx_nkhw_pad(wei_k_c_y_x_desc,
in_n_c_hi_wi_desc,
out_n_k_ho_wo_desc,
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads,
Number<GN0>{},
Number<GK1>{});
const auto wei_gk0_gm0_gm1_gk1_grid_desc = descs[I0];
const auto in_gk0_gn0_gn1_gk1_grid_desc = descs[I1];
const auto out_gm0_gm1_gn0_gn1_grid_desc = descs[I2];
transform_forward_convolution_into_contraction_v6r1_nchw_kcyx_nkhw_pad(wei_desc_k_c_y_x,
in_desc_n_c_hi_wi,
out_desc_n_k_ho_wo,
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads,
Number<GN0>{},
Number<GK1>{});
const auto wei_grid_desc_gk0_gm0_gm1_gk1 = descs[I0];
const auto in_grid_desc_gk0_gn0_gn1_gk1 = descs[I1];
const auto out_grid_desc_gm0_gm1_gn0_gn1 = descs[I2];
// HACK: hacks that control index calculation when iterating over the A, B, and C matrices
constexpr auto wei_grid_iterator_hacks =
@@ -189,36 +189,36 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r5r2_nchw_kcyx_nkhw(
TAcc,
TOut,
InMemoryDataOperation::Set,
decltype(wei_gk0_gm0_gm1_gk1_grid_desc),
decltype(in_gk0_gn0_gn1_gk1_grid_desc),
decltype(out_gm0_gm1_gn0_gn1_grid_desc),
GemmGM1PerBlockGM11,
GemmGN1PerBlockGN11,
GemmKPerBlock,
GemmM1PerThreadM111,
GemmN1PerThreadN111,
GemmKPerThread,
GemmM11N11ThreadClusterM1100,
GemmM11N11ThreadClusterN1100,
GemmM11N11ThreadClusterM1101,
GemmM11N11ThreadClusterN1101,
GemmABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1,
GemmABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1,
decltype(wei_grid_desc_gk0_gm0_gm1_gk1),
decltype(in_grid_desc_gk0_gn0_gn1_gk1),
decltype(out_grid_desc_gm0_gm1_gn0_gn1),
GM1PerBlockGM11,
GN1PerBlockGN11,
GK0PerBlock,
BM1PerThreadBM11,
BN1PerThreadBN11,
BK0PerThread,
BM10BN10ThreadClusterBM100,
BM10BN10ThreadClusterBN100,
BM10BN10ThreadClusterBM101,
BM10BN10ThreadClusterBN101,
ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1,
ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1,
Sequence<1, 2, 3, 0, 4>, // ABlockTransferThreadClusterArrangeOrder
Sequence<3, 2, 1, 0, 4>, // ABlockTransferSrcAccessOrder
GemmABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1,
GemmABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1,
ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1,
ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1,
Sequence<0, 1, 2, 3, 4>, // ABlockTransferSrcVectorTensorContiguousDimOrder
GemmBBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1,
GemmBBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1,
BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1,
BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1,
Sequence<0, 4, 1, 2, 3>, // BBlockTransferThreadClusterArrangeOrder
Sequence<4, 3, 2, 0, 1>, // BBlockTransferSrcAccessOrder
GemmBBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1,
GemmBBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1,
BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1,
BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1,
Sequence<0, 1, 2, 3, 4>, // BBlockTransferSrcVectorTensorContiguousDimOrder
Sequence<3, 4, 5, 0, 1, 2>, // CThreadTransferSrcDstAccessOrder
5, // CThreadTransferSrcDstVectorDim
GemmCThreadTransferDstScalarPerVector_BN1,
CThreadTransferDstScalarPerVector_BN1,
decltype(wei_grid_iterator_hacks),
decltype(in_grid_iterator_hacks),
decltype(out_grid_iterator_hacks),
@@ -227,9 +227,9 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r5r2_nchw_kcyx_nkhw(
static_cast<TInWei*>(wei_k_c_y_x_device_buf.GetDeviceBuffer()),
static_cast<TInWei*>(in_n_c_hi_wi_device_buf.GetDeviceBuffer()),
static_cast<TOut*>(out_n_k_ho_wo_device_buf.GetDeviceBuffer()),
wei_gk0_gm0_gm1_gk1_grid_desc,
in_gk0_gn0_gn1_gk1_grid_desc,
out_gm0_gm1_gn0_gn1_grid_desc,
wei_grid_desc_gk0_gm0_gm1_gk1,
in_grid_desc_gk0_gn0_gn1_gk1,
out_grid_desc_gm0_gm1_gn0_gn1,
wei_grid_iterator_hacks,
in_grid_iterator_hacks,
out_grid_iterator_hacks,
@@ -238,7 +238,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r5r2_nchw_kcyx_nkhw(
nrepeat);
float perf = (float)calculate_convolution_flops(
in_n_c_hi_wi_desc, wei_k_c_y_x_desc, out_n_k_ho_wo_desc) /
in_desc_n_c_hi_wi, wei_desc_k_c_y_x, out_desc_n_k_ho_wo) /
(std::size_t(1000) * 1000 * 1000) / ave_time;
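// unit check: flops / (1000^3) gives GFlop, and GFlop per millisecond equals
// TFlop/s, matching the unit printed below.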
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" << std::endl;
...
include_directories(BEFORE
include
${PROJECT_BINARY_DIR}/host/online_compilation/include
${PROJECT_SOURCE_DIR}/host/online_compilation/include
${PROJECT_SOURCE_DIR}/host/host_tensor/include
${PROJECT_SOURCE_DIR}/composable_kernel/include
${PROJECT_SOURCE_DIR}/composable_kernel/include/utility
${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_description
${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_operation
${PROJECT_SOURCE_DIR}/composable_kernel/include/problem_transform
${PROJECT_SOURCE_DIR}/composable_kernel/include/driver
${PROJECT_SOURCE_DIR}/external/rocm/include
${PROJECT_SOURCE_DIR}/external/half/include
)
set(CONV_FWD_DRIVER_ONLINE_SOURCE conv_fwd_driver_online.cpp)
add_executable(conv_fwd_driver_online ${CONV_FWD_DRIVER_ONLINE_SOURCE})
target_link_libraries(conv_fwd_driver_online PRIVATE host_tensor)
target_link_libraries(conv_fwd_driver_online PRIVATE online_compilation)
@@ -12,26 +12,22 @@
#include "conv_common.hpp"
#include "host_conv.hpp"
#include "device_tensor.hpp"
#include "olc_device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
#include "olc_device_dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw.hpp"
#include "olc_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp"
#include "olc_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.hpp"
#include "handle.hpp"
#include "hipCheck.hpp"
#include "online_device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
#include "online_device_dynamic_convolution_forward_implicit_gemm_v6r1_nchw_kcyx_nkhw.hpp"
#include "online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp"
#include "online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.hpp"
#define USE_CONV_FWD_V4R4_NCHW 1
#define USE_CONV_FWD_V4R5_NCHW 1
#define USE_CONV_FWD_V6R1_NCHW 1
#define USE_CONV_FWD_V4R4_XDLOPS_NCHW 1
#define USE_CONV_FWD_V4R4_XDLOPS_NHWC 1
#include "conv_tunables.hpp"
#include "handle.hpp"
#include "hipCheck.hpp"
enum ConvForwardAlgo
{
V4R4NCHW, // 0
V4R5NCHW, // 1
V6R1NCHW, // 1
V4R4XDLNCHW, // 2
V4R4XDLNHWC // 3
};
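In the driver, the algorithm is chosen at run time from an integer command-line argument that maps onto this enum; a minimal sketch of that mapping (the exact argv position is an assumption, not taken from this commit):

    // hypothetical: argv[1] carries the algorithm index (0..3, per the enum above)
    const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::atoi(argv[1]));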
@@ -94,15 +90,17 @@ int main(int argc, char* argv[])
const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
#if 1
constexpr index_t in_vector_size = 1;
using in_data_t = float;
using acc_data_t = float;
using out_data_t = float;
using in_data_t = float;
using acc_data_t = float;
using out_data_t = float;
#elif 1
constexpr index_t in_vector_size = 16;
using in_data_t = int8_t;
using acc_data_t = int32_t;
using out_data_t = int8_t;
using in_data_t = half_t;
using acc_data_t = float;
using out_data_t = half_t;
#elif 1
using in_data_t = int8_t;
using acc_data_t = int32_t;
using out_data_t = int8_t;
#endif
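// note: each branch pairs the element type with a suitably wide accumulator
// (float accumulates in float, half_t in float, int8_t in int32_t) so the
// GEMM reduction does not overflow or lose precision.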
std::vector<std::size_t> in_lengths_host(4), wei_lengths_host(4), out_lengths_host(4);
@@ -230,9 +228,9 @@ int main(int argc, char* argv[])
tunable_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw* tunable =
&default_tunable_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw;
device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw_olc<in_data_t,
acc_data_t,
out_data_t>(
online_device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw<in_data_t,
acc_data_t,
out_data_t>(
handle,
tmp[I0],
tmp[I1],
@@ -249,8 +247,8 @@ int main(int argc, char* argv[])
}
#endif
#if USE_CONV_FWD_V4R5_NCHW
if(algo == ConvForwardAlgo::V4R5NCHW)
#if USE_CONV_FWD_V6R1_NCHW
if(algo == ConvForwardAlgo::V6R1NCHW)
{
if(layout != ConvTensorLayout::NCHW)
{
@@ -259,12 +257,11 @@
const auto tmp = f_make_for_device_nchw();
tunable_dyn_conv_fwd_v4r5_nchw_kcyx_nkhw* tunable =
&default_tunable_dyn_conv_fwd_v4r5_nchw_kcyx_nkhw;
const auto tunable = tunable_dyn_conv_fwd_v6r1_nchw_kcyx_nkhw{};
device_dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw_olc<in_data_t,
acc_data_t,
out_data_t>(
online_device_dynamic_convolution_forward_implicit_gemm_v6r1_nchw_kcyx_nkhw<in_data_t,
acc_data_t,
out_data_t>(
handle,
tmp[I0],
tmp[I1],
@@ -294,22 +291,22 @@ int main(int argc, char* argv[])
tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw* tunable =
&default_tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw;
device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw_olc<in_data_t,
acc_data_t,
out_data_t>(
handle,
tmp[I0],
tmp[I1],
tmp[I2],
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads,
in,
wei,
out_device,
tunable,
nrepeat);
online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw<
in_data_t,
acc_data_t,
out_data_t>(handle,
tmp[I0],
tmp[I1],
tmp[I2],
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads,
in,
wei,
out_device,
tunable,
nrepeat);
}
#endif
@@ -326,22 +323,22 @@ int main(int argc, char* argv[])
tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk* tunable =
&default_tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk;
device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk_olc<in_data_t,
acc_data_t,
out_data_t>(
handle,
tmp[I0],
tmp[I1],
tmp[I2],
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads,
in,
wei,
out_device,
tunable,
nrepeat);
online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk<
in_data_t,
acc_data_t,
out_data_t>(handle,
tmp[I0],
tmp[I1],
tmp[I2],
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads,
in,
wei,
out_device,
tunable,
nrepeat);
}
#endif
...
#ifndef CONV_TUNABLE_FWD_V4R4_NCHW_KCYX_NKHW_HPP
#define CONV_TUNABLE_FWD_V4R4_NCHW_KCYX_NKHW_HPP
struct tunable_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw
{
int32_t BlockSize;
int32_t MPerBlock;
int32_t NPerBlock;
int32_t KPerBlock;
int32_t M1PerThread;
int32_t N1PerThread;
int32_t KPerThread;
int32_t M1N1ThreadClusterM10;
int32_t M1N1ThreadClusterN10;
int32_t M1N1ThreadClusterM11;
int32_t M1N1ThreadClusterN11;
std::array<int32_t, 3> ABlockTransferThreadSliceLengths_K_M0_M1;
std::array<int32_t, 3> ABlockTransferThreadClusterLengths_K_M0_M1;
std::array<int32_t, 3> ABlockTransferThreadClusterArrangeOrder;
std::array<int32_t, 3> ABlockTransferSrcAccessOrder;
int32_t ABlockTransferSrcVectorDim;
int32_t ABlockTransferSrcScalarPerVector;
int32_t ABlockTransferDstScalarPerVector_M1;
bool AThreadTransferSrcResetCoordinateAfterRun;
std::array<int32_t, 3> BBlockTransferThreadSliceLengths_K_N0_N1;
std::array<int32_t, 3> BBlockTransferThreadClusterLengths_K_N0_N1;
std::array<int32_t, 3> BBlockTransferThreadClusterArrangeOrder;
std::array<int32_t, 3> BBlockTransferSrcAccessOrder;
int32_t BBlockTransferSrcVectorDim;
int32_t BBlockTransferSrcScalarPerVector;
int32_t BBlockTransferDstScalarPerVector_N1;
bool BThreadTransferSrcResetCoordinateAfterRun;
std::array<int32_t, 6> CThreadTransferSrcDstAccessOrder;
int32_t CThreadTransferSrcDstVectorDim;
int32_t CThreadTransferDstScalarPerVector;
};
static tunable_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw default_tunable_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw = {
    256,                    // BlockSize
    128, 128, 8,            // MPerBlock, NPerBlock, KPerBlock
    4, 4, 1,                // M1PerThread, N1PerThread, KPerThread
    8, 8, 2, 2,             // M1N1ThreadClusterM10/N10/M11/N11
    {4, 1, 1}, {2, 1, 128}, // ABlockTransferThreadSliceLengths_K_M0_M1, ...ThreadClusterLengths_K_M0_M1
    {2, 1, 0}, {2, 1, 0},   // ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder
    0, 4, 1, false,         // ABlockTransferSrcVectorDim, ...SrcScalarPerVector, ...DstScalarPerVector_M1, AThreadTransferSrcResetCoordinateAfterRun
    {4, 1, 1}, {2, 1, 128}, // BBlockTransferThreadSliceLengths_K_N0_N1, ...ThreadClusterLengths_K_N0_N1
    {0, 1, 2}, {0, 1, 2},   // BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder
    2, 1, 1, false,         // BBlockTransferSrcVectorDim, ...SrcScalarPerVector, ...DstScalarPerVector_N1, BThreadTransferSrcResetCoordinateAfterRun
    {3, 4, 5, 0, 1, 2}, 5, 1}; // CThreadTransferSrcDstAccessOrder, ...SrcDstVectorDim, ...DstScalarPerVector
#endif
#ifndef CONV_TUNABLE_FWD_V4R4_XDLOPS_NCHW_KCYX_NKHW_HPP
#define CONV_TUNABLE_FWD_V4R4_XDLOPS_NCHW_KCYX_NKHW_HPP
struct tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw
{
int32_t BlockSize;
int32_t MPerBlock;
int32_t NPerBlock;
int32_t KPerBlock;
int32_t MPerWave;
int32_t NPerWave;
int32_t K1;
int32_t MRepeat;
int32_t NRepeat;
std::array<int32_t, 3> ABlockTransferThreadSliceLengths_K0_M_K1;
std::array<int32_t, 3> ABlockTransferThreadClusterLengths_K0_M_K1;
std::array<int32_t, 3> ABlockTransferThreadClusterArrangeOrder;
std::array<int32_t, 3> ABlockTransferSrcAccessOrder;
int32_t ABlockTransferSrcVectorDim;
int32_t ABlockTransferSrcScalarPerVector;
int32_t ABlockTransferDstScalarPerVector_K1;
bool AThreadTransferSrcResetCoordinateAfterRun;
std::array<int32_t, 3> BBlockTransferThreadSliceLengths_K0_N_K1;
std::array<int32_t, 3> BBlockTransferThreadClusterLengths_K0_N_K1;
std::array<int32_t, 3> BBlockTransferThreadClusterArrangeOrder;
std::array<int32_t, 3> BBlockTransferSrcAccessOrder;
int32_t BBlockTransferSrcVectorDim;
int32_t BBlockTransferSrcScalarPerVector;
int32_t BBlockTransferDstScalarPerVector_K1;
bool BThreadTransferSrcResetCoordinateAfterRun;
std::array<int32_t, 8> CThreadTransferSrcDstAccessOrder;
int32_t CThreadTransferSrcDstVectorDim;
int32_t CThreadTransferDstScalarPerVector;
};
static tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw
default_tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw = {
256, // BlockSize
128, // MPerBlock,
128, // NPerBlock,
4, // KPerBlock,
32, // MPerWave,
32, // NPerWave,
4, // K1,
2, // MRepeat,
2, // NRepeat,
{1, 2, 4}, // ABlockTransferThreadSliceLengths_K0_M_K1,
{4, 64, 1}, // ABlockTransferThreadClusterLengths_K0_M_K1,
{1, 0, 2}, // ABlockTransferThreadClusterArrangeOrder,
{1, 0, 2}, // ABlockTransferSrcAccessOrder,
2, // ABlockTransferSrcVectorDim
1, // ABlockTransferSrcScalarPerVector,
4, // ABlockTransferDstScalarPerVector_K1,
false, // AThreadTransferSrcResetCoordinateAfterRun,
{1, 2, 4}, // BBlockTransferThreadSliceLengths_K0_N_K1,
{4, 64, 1}, // BBlockTransferThreadClusterLengths_K0_N_K1,
{0, 2, 1}, // BBlockTransferThreadClusterArrangeOrder,
{1, 0, 2}, // BBlockTransferSrcAccessOrder,
1, // BBlockTransferSrcVectorDim
1, // BBlockTransferSrcScalarPerVector
4, // BBlockTransferDstScalarPerVector_K1
false, // BThreadTransferSrcResetCoordinateAfterRun
{3, 0, 1, 2, 7, 5, 4, 6}, // CThreadTransferSrcDstAccessOrder
7, // CThreadTransferSrcDstVectorDim,
1 // CThreadTransferDstScalarPerVector
};
#endif
#ifndef CONV_TUNABLE_FWD_V4R4_XDLOPS_NHWC_KYXC_NHWK_HPP
#define CONV_TUNABLE_FWD_V4R4_XDLOPS_NHWC_KYXC_NHWK_HPP
struct tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk
{
int32_t BlockSize;
int32_t MPerBlock;
int32_t NPerBlock;
int32_t KPerBlock;
int32_t MPerWave;
int32_t NPerWave;
int32_t K1;
int32_t MRepeat;
int32_t NRepeat;
std::array<int32_t, 3> ABlockTransferThreadSliceLengths_K0_M_K1;
std::array<int32_t, 3> ABlockTransferThreadClusterLengths_K0_M_K1;
std::array<int32_t, 3> ABlockTransferThreadClusterArrangeOrder;
std::array<int32_t, 3> ABlockTransferSrcAccessOrder;
int32_t ABlockTransferSrcVectorDim;
int32_t ABlockTransferSrcScalarPerVector;
int32_t ABlockTransferDstScalarPerVector_K1;
bool AThreadTransferSrcResetCoordinateAfterRun;
std::array<int32_t, 3> BBlockTransferThreadSliceLengths_K0_N_K1;
std::array<int32_t, 3> BBlockTransferThreadClusterLengths_K0_N_K1;
std::array<int32_t, 3> BBlockTransferThreadClusterArrangeOrder;
std::array<int32_t, 3> BBlockTransferSrcAccessOrder;
int32_t BBlockTransferSrcVectorDim;
int32_t BBlockTransferSrcScalarPerVector;
int32_t BBlockTransferDstScalarPerVector_K1;
bool BThreadTransferSrcResetCoordinateAfterRun;
std::array<int32_t, 8> CThreadTransferSrcDstAccessOrder;
int32_t CThreadTransferSrcDstVectorDim;
int32_t CThreadTransferDstScalarPerVector;
};
static tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk
default_tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk = {
256, // BlockSize
128, // MPerBlock,
128, // NPerBlock,
4, // KPerBlock,
32, // MPerWave,
32, // NPerWave,
4, // K1,
2, // MRepeat,
2, // NRepeat,
{1, 2, 4}, // ABlockTransferThreadSliceLengths_K0_M_K1,
{4, 64, 1}, // ABlockTransferThreadClusterLengths_K0_M_K1,
{1, 0, 2}, // ABlockTransferThreadClusterArrangeOrder,
{1, 0, 2}, // ABlockTransferSrcAccessOrder,
2, // ABlockTransferSrcVectorDim
4, // ABlockTransferSrcScalarPerVector,
4, // ABlockTransferDstScalarPerVector_K1,
false, // AThreadTransferSrcResetCoordinateAfterRun,
{1, 2, 4}, // BBlockTransferThreadSliceLengths_K0_N_K1,
{4, 64, 1}, // BBlockTransferThreadClusterLengths_K0_N_K1,
{1, 0, 2}, // BBlockTransferThreadClusterArrangeOrder,
{1, 0, 2}, // BBlockTransferSrcAccessOrder,
2, // BBlockTransferSrcVectorDim
4, // BBlockTransferSrcScalarPerVector
4, // BBlockTransferDstScalarPerVector_K1
false, // BThreadTransferSrcResetCoordinateAfterRun
{2, 3, 0, 1, 7, 5, 4, 6}, // CThreadTransferSrcDstAccessOrder
7, // CThreadTransferSrcDstVectorDim,
1 // CThreadTransferDstScalarPerVector
};
#endif
#ifndef CONV_TUNABLE_FWD_V6R1_NCHW_KCYX_NKHW_HPP
#define CONV_TUNABLE_FWD_V6R1_NCHW_KCYX_NKHW_HPP
struct tunable_dyn_conv_fwd_v6r1_nchw_kcyx_nkhw
{
int32_t BlockSize = 256;
int32_t GN0 = 4;
int32_t GK1 = 1;
int32_t GM1PerBlockGM11 = 128;
int32_t GN1PerBlockGN11 = 32;
int32_t GK0PerBlock = 8;
int32_t BM1PerThreadBM11 = 4;
int32_t BN1PerThreadBN11 = 4;
int32_t BK0PerThread = 1;
int32_t BM10BN10ThreadClusterBM100 = 2;
int32_t BM10BN10ThreadClusterBN100 = 2;
int32_t BM10BN10ThreadClusterBM101 = 8;
int32_t BM10BN10ThreadClusterBN101 = 8;
std::array<int32_t, 5> ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1 = {4, 1, 1, 1, 1};
std::array<int32_t, 5> ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1 = {
2, 1, 1, 128, 1};
std::array<int32_t, 5> ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 = {
4, 1, 1, 1, 1};
std::array<int32_t, 5> ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 = {
1, 1, 1, 1, 1};
std::array<int32_t, 5> BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1 = {1, 4, 1, 1, 1};
std::array<int32_t, 5> BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1 = {
8, 1, 1, 32, 1};
std::array<int32_t, 5> BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 = {
1, 1, 1, 1, 1};
std::array<int32_t, 5> BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 = {
1, 1, 1, 1, 1};
int32_t CThreadTransferDstScalarPerVector = 1;
};
#endif
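Because tunable_dyn_conv_fwd_v6r1_nchw_kcyx_nkhw relies on in-class default member initializers (unlike the v4r4-style tunables above, which ship a static default instance), callers can value-initialize it and override individual knobs. A minimal sketch, with the overridden field chosen purely for illustration:

    // sketch: start from the defaults, then switch GK1 to the fp16-friendly value
    tunable_dyn_conv_fwd_v6r1_nchw_kcyx_nkhw tunable{};
    tunable.GK1 = 2; // hypothetical override; the default is 1
    // the online driver below takes the tunable by const reference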
#include "device.hpp"
#include "host_tensor.hpp"
#include "handle.hpp"
#include "online_driver_common.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
#include "transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw.hpp"
#include "olc_driver_common.hpp"
#include "conv_tunables.hpp"
#include "handle.hpp"
#include "conv_tunable_fwd_v4r4_nchw_kcyx_nkhw.hpp"
namespace detail_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw {
@@ -211,7 +209,7 @@ template <typename TInWei,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw_olc(
void online_device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(
olCompile::Handle* handle,
const InLengths& in_n_c_hi_wi_lengths,
const WeiLengths& wei_k_c_y_x_lengths,
...
#include "device.hpp"
#include "host_tensor.hpp"
#include "handle.hpp"
#include "online_driver_common.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
#include "olc_driver_common.hpp"
#include "conv_tunables.hpp"
#include "handle.hpp"
#include "conv_tunable_fwd_v4r4_xdlops_nchw_kcyx_nkhw.hpp"
namespace detail_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw {
@@ -208,7 +206,7 @@ template <typename TInWei,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw_olc(
void online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw(
olCompile::Handle* handle,
const InLengths& in_n_c_hi_wi_lengths,
const WeiLengths& wei_k_c_y_x_lengths,
...
#include "device.hpp"
#include "host_tensor.hpp"
#include "handle.hpp"
#include "online_driver_common.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
#include "transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp"
#include "olc_driver_common.hpp"
#include "conv_tunables.hpp"
#include "handle.hpp"
#include "conv_tunable_fwd_v4r4_xdlops_nhwc_kyxc_nhwk.hpp"
namespace detail_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk {
@@ -209,7 +207,7 @@ template <typename TInWei,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk_olc(
void online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk(
olCompile::Handle* handle,
const InLengths& in_n_hi_wi_c_lengths,
const WeiLengths& wei_k_y_x_c_lengths,
...
#include "device.hpp"
#include "host_tensor.hpp"
#include "handle.hpp"
#include "online_driver_common.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
#include "transform_forward_convolution_into_gemm_v4r5_nchw_kcyx_nkhw.hpp"
#include "olc_driver_common.hpp"
#include "conv_tunables.hpp"
#include "handle.hpp"
#include "transform_forward_convolution_into_gemm_v6r1_nchw_kcyx_nkhw.hpp"
#include "conv_tunable_fwd_v6r1_nchw_kcyx_nkhw.hpp"
namespace detail_dyn_conv_fwd_v4r5_nchw_kcyx_nkhw {
namespace detail_dyn_conv_fwd_v6r1_nchw_kcyx_nkhw {
template <typename TInWei, typename TAcc, typename TOut>
static std::string get_network_config_string_from_types()
{
std::string out;
std::string out("DAT_");
out += static_cast<char>(Driver::get_typeid_from_type<TInWei>()) +
static_cast<char>(Driver::get_typeid_from_type<TAcc>()) +
@@ -24,80 +22,97 @@ static std::string get_network_config_string_from_types()
};
static std::string
get_network_config_string_from_tunable(const tunable_dyn_conv_fwd_v4r5_nchw_kcyx_nkhw* pt)
get_network_config_string_from_tunable(const tunable_dyn_conv_fwd_v6r1_nchw_kcyx_nkhw& tunable)
{
std::string out("TUN_");
out += std::to_string(pt->BlockSize) + "_";
out += std::to_string(pt->GM1PerBlockGM11) + "x" + std::to_string(pt->GN1PerBlockGN11) + "x" +
std::to_string(pt->KPerBlock) + "_";
out += std::to_string(pt->M1PerThread) + "x" + std::to_string(pt->N1PerThread) + "x" +
std::to_string(pt->KPerThread) + "_";
out += std::to_string(pt->M1N1ThreadClusterM10) + "x" +
std::to_string(pt->M1N1ThreadClusterN10) + "x" +
std::to_string(pt->M1N1ThreadClusterM11) + "x" +
std::to_string(pt->M1N1ThreadClusterN11) + "_";
out += std::to_string(pt->ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11[0]) + "x" +
std::to_string(pt->ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11[1]) + "x" +
std::to_string(pt->ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11[2]) + "x" +
std::to_string(pt->ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11[3]) + "_";
out += std::to_string(pt->ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11[0]) + "x" +
std::to_string(pt->ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11[1]) + "x" +
std::to_string(pt->ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11[2]) + "x" +
std::to_string(pt->ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11[3]) + "_";
out += std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "x" +
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "x" +
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]) + "x" +
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[3]) + "_";
out += std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "x" +
std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "x" +
std::to_string(pt->ABlockTransferSrcAccessOrder[2]) + "x" +
std::to_string(pt->ABlockTransferSrcAccessOrder[3]) + "_";
out += std::to_string(pt->ABlockTransferSrcVectorDim) + "_";
out += std::to_string(pt->ABlockTransferSrcScalarPerVector) + "_";
out += std::to_string(pt->ABlockTransferDstScalarPerVector_GM11) + "_";
out += std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun) + "_";
out += std::to_string(pt->BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11[0]) + "x" +
std::to_string(pt->BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11[1]) + "x" +
std::to_string(pt->BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11[2]) + "x" +
std::to_string(pt->BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11[3]);
out += std::to_string(pt->BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11[0]) + "x" +
std::to_string(pt->BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11[1]) + "x" +
std::to_string(pt->BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11[2]) + "x" +
std::to_string(pt->BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11[3]) + "_";
out += std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "x" +
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "x" +
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]) + "x" +
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[3]) + "_";
out += std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "x" +
std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "x" +
std::to_string(pt->BBlockTransferSrcAccessOrder[2]) + "x" +
std::to_string(pt->BBlockTransferSrcAccessOrder[3]) + "_";
out += std::to_string(pt->BBlockTransferSrcVectorDim) + "_";
out += std::to_string(pt->BBlockTransferSrcScalarPerVector) + "_";
out += std::to_string(pt->BBlockTransferDstScalarPerVector_GN11) + "_";
out += std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun) + "_";
out += std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]) + "_";
out += std::to_string(pt->CThreadTransferSrcDstVectorDim) + "_";
out += std::to_string(pt->CThreadTransferDstScalarPerVector);
out += std::to_string(tunable.BlockSize) + "_";
out += std::to_string(tunable.GN0) + "x" + std::to_string(tunable.GK1) + "_";
out += std::to_string(tunable.GM1PerBlockGM11) + "x" + std::to_string(tunable.GN1PerBlockGN11) +
"x" + std::to_string(tunable.GK0PerBlock) + "_";
out += std::to_string(tunable.BM1PerThreadBM11) + "x" +
std::to_string(tunable.BN1PerThreadBN11) + "x" + std::to_string(tunable.BK0PerThread) +
"_";
out += std::to_string(tunable.BM10BN10ThreadClusterBM100) + "x" +
std::to_string(tunable.BM10BN10ThreadClusterBN100) + "x" +
std::to_string(tunable.BM10BN10ThreadClusterBM101) + "x" +
std::to_string(tunable.BM10BN10ThreadClusterBN101) + "_";
out += std::to_string(tunable.ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[0]) + "x" +
std::to_string(tunable.ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[1]) + "x" +
std::to_string(tunable.ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[2]) + "x" +
std::to_string(tunable.ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[3]) + "x" +
std::to_string(tunable.ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[4]) + "_";
out +=
std::to_string(tunable.ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[0]) + "x" +
std::to_string(tunable.ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[1]) + "x" +
std::to_string(tunable.ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[2]) + "x" +
std::to_string(tunable.ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[3]) + "x" +
std::to_string(tunable.ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[4]) + "_";
out += std::to_string(tunable.ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[0]) +
"x" +
std::to_string(tunable.ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[1]) +
"x" +
std::to_string(tunable.ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[2]) +
"x" +
std::to_string(tunable.ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[3]) +
"x" +
std::to_string(tunable.ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[4]) +
"_";
out += std::to_string(tunable.ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[0]) +
"x" +
std::to_string(tunable.ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[1]) +
"x" +
std::to_string(tunable.ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[2]) +
"x" +
std::to_string(tunable.ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[3]) +
"x" +
std::to_string(tunable.ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[4]) +
"_";
out += std::to_string(tunable.BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[0]) + "x" +
std::to_string(tunable.BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[1]) + "x" +
std::to_string(tunable.BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[2]) + "x" +
std::to_string(tunable.BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[3]) + "x" +
std::to_string(tunable.BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[4]) + "_";
out +=
std::to_string(tunable.BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[0]) + "x" +
std::to_string(tunable.BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[1]) + "x" +
std::to_string(tunable.BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[2]) + "x" +
std::to_string(tunable.BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[3]) + "x" +
std::to_string(tunable.BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[4]) + "_";
out += std::to_string(tunable.BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[0]) +
"x" +
std::to_string(tunable.BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[1]) +
"x" +
std::to_string(tunable.BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[2]) +
"x" +
std::to_string(tunable.BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[3]) +
"x" +
std::to_string(tunable.BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[4]) +
"_";
out += std::to_string(tunable.BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[0]) +
"x" +
std::to_string(tunable.BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[1]) +
"x" +
std::to_string(tunable.BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[2]) +
"x" +
std::to_string(tunable.BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[3]) +
"x" +
std::to_string(tunable.BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[4]) +
"_";
out += std::to_string(tunable.CThreadTransferDstScalarPerVector);
return (out);
};
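For reference, feeding the default-constructed v6r1 tunable through this function produces the cache key below (derived by hand from the concatenation above, so treat it as illustrative):

    // TUN_256_4x1_128x32x8_4x4x1_2x2x8x8_4x1x1x1x1_2x1x1x128x1_4x1x1x1x1_1x1x1x1x1_1x4x1x1x1_8x1x1x32x1_1x1x1x1x1_1x1x1x1x1_1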
@@ -108,114 +123,120 @@ static std::string get_definition_string_from_types()
std::string out;
out += " -DCK_PARAM_IN_WEI_DATATYPE=" + std::to_string(Driver::get_typeid_from_type<TInWei>()) +
" -DCK_PARAM_CONV_COMPTYPE=" + std::to_string(Driver::get_typeid_from_type<TAcc>()) +
" -DCK_PARAM_ACC_DATATYPE=" + std::to_string(Driver::get_typeid_from_type<TAcc>()) +
" -DCK_PARAM_OUT_DATATYPE=" + std::to_string(Driver::get_typeid_from_type<TOut>());
return (out);
};
static std::string
get_definition_string_from_tunable(const tunable_dyn_conv_fwd_v4r5_nchw_kcyx_nkhw* pt)
get_definition_string_from_tunable(const tunable_dyn_conv_fwd_v6r1_nchw_kcyx_nkhw& tunable)
{
std::string out;
out += " -DCK_PARAM_BlockSize=" + std::to_string(pt->BlockSize);
out += " -DCK_PARAM_GM1PerBlockGM11=" + std::to_string(pt->GM1PerBlockGM11) +
" -DCK_PARAM_GN1PerBlockGN11=" + std::to_string(pt->GN1PerBlockGN11) +
" -DCK_PARAM_KPerBlock=" + std::to_string(pt->KPerBlock);
out += " -DCK_PARAM_M1PerThread=" + std::to_string(pt->M1PerThread) +
" -DCK_PARAM_N1PerThread=" + std::to_string(pt->N1PerThread) +
" -DCK_PARAM_KPerThread=" + std::to_string(pt->KPerThread);
out += " -DCK_PARAM_M1N1ThreadClusterM10=" + std::to_string(pt->M1N1ThreadClusterM10) +
" -DCK_PARAM_M1N1ThreadClusterN10=" + std::to_string(pt->M1N1ThreadClusterN10) +
" -DCK_PARAM_M1N1ThreadClusterM11=" + std::to_string(pt->M1N1ThreadClusterM11) +
" -DCK_PARAM_M1N1ThreadClusterN11=" + std::to_string(pt->M1N1ThreadClusterN11);
out += " -DCK_PARAM_ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11=" +
std::to_string(pt->ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11[0]) + "," +
std::to_string(pt->ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11[1]) + "," +
std::to_string(pt->ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11[2]) + "," +
std::to_string(pt->ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11[3]);
out += " -DCK_PARAM_ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11=" +
std::to_string(pt->ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11[0]) + "," +
std::to_string(pt->ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11[1]) + "," +
std::to_string(pt->ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11[2]) + "," +
std::to_string(pt->ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11[3]);
out += " -DCK_PARAM_ABlockTransferThreadClusterArrangeOrder=" +
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "," +
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "," +
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]) + "," +
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[3]);
out += " -DCK_PARAM_ABlockTransferSrcAccessOrder=" +
std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "," +
std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "," +
std::to_string(pt->ABlockTransferSrcAccessOrder[2]) + "," +
std::to_string(pt->ABlockTransferSrcAccessOrder[3]);
out += " -DCK_PARAM_BlockSize=" + std::to_string(tunable.BlockSize);
out += " -DCK_PARAM_GN0=" + std::to_string(tunable.GN0);
out += " -DCK_PARAM_GK1=" + std::to_string(tunable.GK1);
out += " -DCK_PARAM_GM1PerBlockGM11=" + std::to_string(tunable.GM1PerBlockGM11) +
" -DCK_PARAM_GN1PerBlockGN11=" + std::to_string(tunable.GN1PerBlockGN11) +
" -DCK_PARAM_GK0PerBlock=" + std::to_string(tunable.GK0PerBlock);
out += " -DCK_PARAM_BM1PerThreadBM11=" + std::to_string(tunable.BM1PerThreadBM11) +
" -DCK_PARAM_BN1PerThreadBN11=" + std::to_string(tunable.BN1PerThreadBN11) +
" -DCK_PARAM_BK0PerThread=" + std::to_string(tunable.BK0PerThread);
out += " -DCK_PARAM_BM10BN10ThreadClusterBM100=" +
std::to_string(tunable.BM10BN10ThreadClusterBM100) +
" -DCK_PARAM_BM10BN10ThreadClusterBN100=" +
std::to_string(tunable.BM10BN10ThreadClusterBN100) +
" -DCK_PARAM_BM10BN10ThreadClusterBM101=" +
std::to_string(tunable.BM10BN10ThreadClusterBM101) +
" -DCK_PARAM_BM10BN10ThreadClusterBN101=" +
std::to_string(tunable.BM10BN10ThreadClusterBN101);
out += " -DCK_PARAM_ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1=" +
std::to_string(tunable.ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[0]) + "," +
std::to_string(tunable.ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[1]) + "," +
std::to_string(tunable.ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[2]) + "," +
std::to_string(tunable.ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[3]) + "," +
std::to_string(tunable.ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[4]);
out +=
" -DCK_PARAM_ABlockTransferSrcVectorDim=" + std::to_string(pt->ABlockTransferSrcVectorDim);
out += " -DCK_PARAM_ABlockTransferSrcScalarPerVector=" +
std::to_string(pt->ABlockTransferSrcScalarPerVector);
out += " -DCK_PARAM_ABlockTransferDstScalarPerVector_GM11=" +
std::to_string(pt->ABlockTransferDstScalarPerVector_GM11);
out += " -DCK_PARAM_AThreadTransferSrcResetCoordinateAfterRun=" +
std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun);
out += " -DCK_PARAM_BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11=" +
std::to_string(pt->BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11[0]) + "," +
std::to_string(pt->BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11[1]) + "," +
std::to_string(pt->BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11[2]) + "," +
std::to_string(pt->BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11[3]);
out += " -DCK_PARAM_BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11=" +
std::to_string(pt->BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11[0]) + "," +
std::to_string(pt->BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11[1]) + "," +
std::to_string(pt->BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11[2]) + "," +
std::to_string(pt->BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11[3]);
out += " -DCK_PARAM_BBlockTransferThreadClusterArrangeOrder=" +
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "," +
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "," +
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]) + "," +
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[3]);
out += " -DCK_PARAM_BBlockTransferSrcAccessOrder=" +
std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "," +
std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "," +
std::to_string(pt->BBlockTransferSrcAccessOrder[2]) + "," +
std::to_string(pt->BBlockTransferSrcAccessOrder[3]);
" -DCK_PARAM_ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1=" +
std::to_string(tunable.ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[0]) + "," +
std::to_string(tunable.ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[1]) + "," +
std::to_string(tunable.ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[2]) + "," +
std::to_string(tunable.ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[3]) + "," +
std::to_string(tunable.ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[4]);
out += " -DCK_PARAM_ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1=" +
std::to_string(tunable.ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[0]) +
"," +
std::to_string(tunable.ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[1]) +
"," +
std::to_string(tunable.ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[2]) +
"," +
std::to_string(tunable.ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[3]) +
"," +
std::to_string(tunable.ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[4]);
out += " -DCK_PARAM_ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1=" +
std::to_string(tunable.ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[0]) +
"," +
std::to_string(tunable.ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[1]) +
"," +
std::to_string(tunable.ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[2]) +
"," +
std::to_string(tunable.ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[3]) +
"," +
std::to_string(tunable.ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[4]);
out += " -DCK_PARAM_BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1=" +
std::to_string(tunable.BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[0]) + "," +
std::to_string(tunable.BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[1]) + "," +
std::to_string(tunable.BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[2]) + "," +
std::to_string(tunable.BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[3]) + "," +
std::to_string(tunable.BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[4]);
out +=
" -DCK_PARAM_BBlockTransferSrcVectorDim=" + std::to_string(pt->BBlockTransferSrcVectorDim);
out += " -DCK_PARAM_BBlockTransferSrcScalarPerVector=" +
std::to_string(pt->BBlockTransferSrcScalarPerVector);
out += " -DCK_PARAM_BBlockTransferDstScalarPerVector_GN11=" +
std::to_string(pt->BBlockTransferDstScalarPerVector_GN11);
out += " -DCK_PARAM_BThreadTransferSrcResetCoordinateAfterRun=" +
std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun);
out += " -DCK_PARAM_CThreadTransferSrcDstAccessOrder=" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]);
out += " -DCK_PARAM_CThreadTransferSrcDstVectorDim=" +
std::to_string(pt->CThreadTransferSrcDstVectorDim);
" -DCK_PARAM_BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1=" +
std::to_string(tunable.BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[0]) + "," +
std::to_string(tunable.BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[1]) + "," +
std::to_string(tunable.BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[2]) + "," +
std::to_string(tunable.BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[3]) + "," +
std::to_string(tunable.BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[4]);
out += " -DCK_PARAM_BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1=" +
std::to_string(tunable.BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[0]) +
"," +
std::to_string(tunable.BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[1]) +
"," +
std::to_string(tunable.BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[2]) +
"," +
std::to_string(tunable.BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[3]) +
"," +
std::to_string(tunable.BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[4]);
out += " -DCK_PARAM_BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1=" +
std::to_string(tunable.BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[0]) +
"," +
std::to_string(tunable.BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[1]) +
"," +
std::to_string(tunable.BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[2]) +
"," +
std::to_string(tunable.BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[3]) +
"," +
std::to_string(tunable.BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[4]);
out += " -DCK_PARAM_CThreadTransferDstScalarPerVector=" +
std::to_string(pt->CThreadTransferDstScalarPerVector);
std::to_string(tunable.CThreadTransferDstScalarPerVector);
return (out);
};
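Combined with get_definition_string_from_types, the default tunable expands into a compiler option list that begins roughly as follows (reconstructed by hand from the concatenation above; illustrative, not captured from an actual run):

    // -DCK_PARAM_BlockSize=256 -DCK_PARAM_GN0=4 -DCK_PARAM_GK1=1
    // -DCK_PARAM_GM1PerBlockGM11=128 -DCK_PARAM_GN1PerBlockGN11=32 -DCK_PARAM_GK0PerBlock=8
    // -DCK_PARAM_ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1=4,1,1,1,1 ...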
} // namespace detail_dyn_conv_fwd_v4r5_nchw_kcyx_nkhw
} // namespace detail_dyn_conv_fwd_v6r1_nchw_kcyx_nkhw
template <typename TInWei,
typename TAcc,
@@ -227,7 +248,7 @@ template <typename TInWei,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void device_dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw_olc(
void online_device_dynamic_convolution_forward_implicit_gemm_v6r1_nchw_kcyx_nkhw(
olCompile::Handle* handle,
const InLengths& in_n_c_hi_wi_lengths,
const WeiLengths& wei_k_c_y_x_lengths,
@@ -239,15 +260,13 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw_olc(
const Tensor<TInWei>& in_n_c_hi_wi,
const Tensor<TInWei>& wei_k_c_y_x,
Tensor<TOut>& out_n_k_ho_wo,
const tunable_dyn_conv_fwd_v4r5_nchw_kcyx_nkhw* tunable,
const tunable_dyn_conv_fwd_v6r1_nchw_kcyx_nkhw& tunable,
ck::index_t nrepeat)
{
using namespace ck;
using namespace detail_dyn_conv_fwd_v4r5_nchw_kcyx_nkhw;
using namespace detail_dyn_conv_fwd_v6r1_nchw_kcyx_nkhw;
using size_t = std::size_t;
constexpr index_t N0 = 4; // this cannot be a tunable so far
////////////////////////////////////////////////////////////////////////////////////////////////////////////
// The following code is only used to compute grid_size, hasMainKBlockLoop, and
// hasDoubleTailKBlockLoop
@@ -264,25 +283,27 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw_olc(
const auto out_n_k_ho_wo_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(out_n_k_ho_wo_lengths);
const auto descs = transform_forward_convolution_into_contraction_v4r5_nchw_kcyx_nkhw_pad<N0>(
wei_k_c_y_x_desc,
in_n_c_hi_wi_desc,
out_n_k_ho_wo_desc,
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads);
const auto descs =
transform_forward_convolution_into_contraction_v6r1_nchw_kcyx_nkhw_pad(wei_k_c_y_x_desc,
in_n_c_hi_wi_desc,
out_n_k_ho_wo_desc,
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads,
tunable.GN0,
tunable.GK1);
const auto a_gk_gm0_gm1_grid_desc = descs[I0];
const auto c_gm0_gm1_gn0_gn1_grid_desc = descs[I2];
const auto a_grid_desc_gk0_gm0_gm1_gk1 = descs[I0];
const auto c_grid_desc_gm0_gm1_gn0_gn1 = descs[I2];
const auto GM1 = c_gm0_gm1_gn0_gn1_grid_desc.GetLength(I1);
const auto GN1 = c_gm0_gm1_gn0_gn1_grid_desc.GetLength(I3);
const auto GK = a_gk_gm0_gm1_grid_desc.GetLength(I0);
const auto GM1 = c_grid_desc_gm0_gm1_gn0_gn1.GetLength(I1);
const auto GN1 = c_grid_desc_gm0_gm1_gn0_gn1.GetLength(I3);
const auto GK = a_grid_desc_gk0_gm0_gm1_gk1.GetLength(I0);
const index_t grid_size = (GM1 / tunable->GM1PerBlockGM11) * (GN1 / tunable->GN1PerBlockGN11);
const bool hasMainKBlockLoop = ((GK + tunable->KPerBlock) / (2 * tunable->KPerBlock) > 1);
const bool hasDoubleTailKBlockLoop = ((GK / tunable->KPerBlock) % 2 == 0);
const index_t grid_size = (GM1 / tunable.GM1PerBlockGM11) * (GN1 / tunable.GN1PerBlockGN11);
const bool hasMainKBlockLoop = ((GK + tunable.GK0PerBlock) / (2 * tunable.GK0PerBlock) > 1);
const bool hasDoubleTailKBlockLoop = ((GK / tunable.GK0PerBlock) % 2 == 0);
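// worked example (hypothetical problem size): with GM1 = 1024 and GN1 = 256,
// grid_size = (1024 / 128) * (256 / 32) = 8 * 8 = 64 workgroups; with GK = 64,
// hasMainKBlockLoop = ((64 + 8) / (2 * 8) > 1) = true and
// hasDoubleTailKBlockLoop = ((64 / 8) % 2 == 0) = true.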
///////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -299,20 +320,20 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw_olc(
// workspace API
DeviceMem workspace_buf(4096);
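// the 4096-byte workspace is carved into four 1024-byte slots, one per
// device-side descriptor; the *_prepare kernel below fills them and the main
// kernel then reads them back.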
void* a_gk_gm0_gm10_gm11_grid_desc_dev_buf = workspace_buf.GetDeviceBuffer();
void* b_gk_gn0_gn10_gn11_grid_desc_dev_buf =
void* a_grid_desc_gk0_gm0_gm10_gm11_gk1_dev_buf = workspace_buf.GetDeviceBuffer();
void* b_grid_desc_gk0_gn0_gn10_gn11_gk1_dev_buf =
static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 1024);
void* c_gm10_bm0_bm1_gn10_bn0_bn1_grid_desc_dev_buf =
void* c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1_dev_buf =
static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 2048);
void* c_blockid_to_gm10_gn10_block_cluster_adaptor_dev_buf =
void* c_grid_block_cluster_blockid_to_gm10_gn10_dev_buf =
static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 3072);
const std::vector<size_t> vld = {static_cast<size_t>(tunable->BlockSize), 1, 1};
const std::vector<size_t> vgd1 = {static_cast<size_t>(tunable->BlockSize), 1, 1};
const std::vector<size_t> vgd2 = {static_cast<size_t>(grid_size * tunable->BlockSize), 1, 1};
const std::vector<size_t> vld = {static_cast<size_t>(tunable.BlockSize), 1, 1};
const std::vector<size_t> vgd1 = {static_cast<size_t>(tunable.BlockSize), 1, 1};
const std::vector<size_t> vgd2 = {static_cast<size_t>(grid_size * tunable.BlockSize), 1, 1};
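// vgd1 launches a single workgroup for the *_prepare kernel; vgd2 sizes the
// main launch in work-items (grid_size * BlockSize) with vld as the workgroup
// size, following the OpenCL-style global/local convention the
// online-compilation Handle presumably expects.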
std::string program_name = "dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw.cpp";
std::string algo_name = "implicit_gemm_conv_fwd_v4r4_nchw";
std::string program_name = "dynamic_convolution_forward_implicit_gemm_v6r1_nchw_kcyx_nkhw.cpp";
std::string algo_name = "implicit_gemm_conv_fwd_v6r1_nchw";
std::string param = " -std=c++17 ";
std::string network_config;
@@ -320,10 +341,10 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw_olc(
param += get_definition_string_from_types<TInWei, TAcc, TOut>() +
" -DCK_PARAM_HAS_MAIN_KBLOCK_LOOP=" + std::to_string(hasMainKBlockLoop) +
" -DCK_PARAM_HAS_DOUBLE_TAIL_KBLOCK_LOOP=" + std::to_string(hasDoubleTailKBlockLoop) +
" -DCK_PARAM_N0=" + std::to_string(N0) + " " +
get_definition_string_from_tunable(tunable);
network_config = get_network_config_string_from_types<TInWei, TAcc, TOut>() + "_V" +
std::to_string(hasDoubleTailKBlockLoop) + "_" + std::to_string(N0) + "_" +
network_config = get_network_config_string_from_types<TInWei, TAcc, TOut>() + "_" +
std::to_string(hasDoubleTailKBlockLoop) + "_" +
get_network_config_string_from_tunable(tunable);
std::vector<float> kernel1_times;
@@ -334,7 +355,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw_olc(
KernelTimer timer1, timer2;
std::string kernel_name;
kernel_name = "dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw_prepare";
kernel_name = "dynamic_convolution_forward_implicit_gemm_v6r1_nchw_kcyx_nkhw_prepare";
auto network_config_1 = network_config + "_1";
timer1.Start();
@@ -354,13 +375,13 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw_olc(
in_left_pads[I1],
in_right_pads[I0],
in_right_pads[I1],
a_gk_gm0_gm10_gm11_grid_desc_dev_buf,
b_gk_gn0_gn10_gn11_grid_desc_dev_buf,
c_gm10_bm0_bm1_gn10_bn0_bn1_grid_desc_dev_buf,
c_blockid_to_gm10_gn10_block_cluster_adaptor_dev_buf);
a_grid_desc_gk0_gm0_gm10_gm11_gk1_dev_buf,
b_grid_desc_gk0_gn0_gn10_gn11_gk1_dev_buf,
c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1_dev_buf,
c_grid_block_cluster_blockid_to_gm10_gn10_dev_buf);
timer2.End();
kernel_name = "dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw";
kernel_name = "dynamic_convolution_forward_implicit_gemm_v6r1_nchw_kcyx_nkhw";
auto network_config_2 = network_config + "_2";
timer2.Start();
@@ -368,10 +389,10 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw_olc(
reinterpret_cast<const TInWei*>(wei_k_c_y_x_dev_buf.GetDeviceBuffer()),
reinterpret_cast<const TInWei*>(in_n_c_hi_wi_dev_buf.GetDeviceBuffer()),
reinterpret_cast<TOut*>(out_n_k_ho_wo_dev_buf.GetDeviceBuffer()),
(const void*)(a_gk_gm0_gm10_gm11_grid_desc_dev_buf),
(const void*)(b_gk_gn0_gn10_gn11_grid_desc_dev_buf),
(const void*)(c_gm10_bm0_bm1_gn10_bn0_bn1_grid_desc_dev_buf),
(const void*)(c_blockid_to_gm10_gn10_block_cluster_adaptor_dev_buf));
(const void*)(a_grid_desc_gk0_gm0_gm10_gm11_gk1_dev_buf),
(const void*)(b_grid_desc_gk0_gn0_gn10_gn11_gk1_dev_buf),
(const void*)(c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1_dev_buf),
(const void*)(c_grid_block_cluster_blockid_to_gm10_gn10_dev_buf));
timer2.End();
kernel1_times.push_back(timer1.GetElapsedTime());
...
include_directories(BEFORE
include
)
set(HOST_TENSOR_SOURCE
src/host_tensor.cpp;
src/device.cpp;
)
## the library target
add_library(host_tensor SHARED ${HOST_TENSOR_SOURCE})
target_link_libraries(host_tensor PRIVATE hip::device)
target_link_libraries(host_tensor INTERFACE hip::host)
target_compile_features(host_tensor PUBLIC)
set_target_properties(host_tensor PROPERTIES POSITION_INDEPENDENT_CODE ON)
install(TARGETS host_tensor LIBRARY DESTINATION lib)
@@ -2,7 +2,8 @@
#define DEVICE_HPP
#include <memory>
#include "config.hpp"
#include "hip/hip_runtime.h"
#include "hip/hip_fp16.h"
struct DeviceMem
{
@@ -30,7 +31,6 @@ struct KernelTimer
std::unique_ptr<KernelTimerImpl> impl;
};
#if CK_DEVICE_BACKEND_AMD
using device_stream_t = hipStream_t;
template <typename... Args, typename F>
@@ -83,44 +83,4 @@ float launch_and_time_kernel(F kernel,
return timer.GetElapsedTime() / nrepeat;
}
#elif CK_DEVICE_BACKEND_NVIDIA
using device_stream_t = cudaStream_t;
template <typename... Args, typename F>
void launch_kernel(F kernel,
dim3 grid_dim,
dim3 block_dim,
std::size_t lds_byte,
cudaStream_t stream_id,
Args... args)
{
const void* f = reinterpret_cast<const void*>(kernel);
void* p_args[] = {&args...};
cudaError_t error = cudaLaunchKernel(f, grid_dim, block_dim, p_args, lds_byte, stream_id);
}
template <typename... Args, typename F>
float launch_and_time_kernel(F kernel,
dim3 grid_dim,
dim3 block_dim,
std::size_t lds_byte,
cudaStream_t stream_id,
Args... args)
{
KernelTimer timer;
const void* f = reinterpret_cast<const void*>(kernel);
void* p_args[] = {&args...};
timer.Start();
cudaError_t error = cudaLaunchKernel(f, grid_dim, block_dim, p_args, lds_byte, stream_id);
timer.End();
return timer.GetElapsedTime();
}
#endif
#endif