Unverified commit 31b40352, authored by Chao Liu, committed by GitHub

Merge pull request #16 from ROCmSoftwarePlatform/develop

Merge develop into master
parents 5781adf5 b62bf8c3
-#ifndef DRIVER_DYNAMIC_GEMM_DLOPS_V1R2
-#define DRIVER_DYNAMIC_GEMM_DLOPS_V1R2
+#ifndef DRIVER_GEMM_DLOPS_V1R2
+#define DRIVER_GEMM_DLOPS_V1R2
#include "common_header.hpp"
-#include "dynamic_tensor_descriptor.hpp"
-#include "dynamic_tensor_descriptor_helper.hpp"
-#include "gridwise_dynamic_gemm_dlops_v1r2.hpp"
+#include "tensor_descriptor.hpp"
+#include "tensor_descriptor_helper.hpp"
+#include "gridwise_gemm_dlops_v1r2.hpp"
template <ck::index_t BlockSize, template <ck::index_t BlockSize,
typename FloatAB, typename FloatAB,
@@ -43,23 +43,23 @@ template <ck::index_t BlockSize,
typename CThreadTransferSrcDstAccessOrder, typename CThreadTransferSrcDstAccessOrder,
ck::index_t CThreadTransferSrcDstVectorDim, ck::index_t CThreadTransferSrcDstVectorDim,
ck::index_t CThreadTransferDstScalarPerVector, ck::index_t CThreadTransferDstScalarPerVector,
typename AGridIteratorHacks, typename AGridStepHacks,
typename BGridIteratorHacks, typename BGridStepHacks,
typename CGridIteratorHacks, typename CGridStepHacks,
typename AGridMoveSliceWindowIteratorHacks, typename AGridMoveSliceWindowStepHacks,
typename BGridMoveSliceWindowIteratorHacks> typename BGridMoveSliceWindowStepHacks>
__host__ float driver_dynamic_gemm_dlops_v1r2(const FloatAB* p_a_grid, __host__ float driver_gemm_dlops_v1r2(const FloatAB* p_a_grid,
const FloatAB* p_b_grid, const FloatAB* p_b_grid,
FloatC* p_c_grid, FloatC* p_c_grid,
const AKMGridDesc& a_k_m_grid_desc, const AKMGridDesc& a_k_m_grid_desc,
const BKNGridDesc& b_k_n_grid_desc, const BKNGridDesc& b_k_n_grid_desc,
const CMNGridDesc& c_m_n_grid_desc, const CMNGridDesc& c_m_n_grid_desc,
AGridIteratorHacks, AGridStepHacks,
BGridIteratorHacks, BGridStepHacks,
CGridIteratorHacks, CGridStepHacks,
AGridMoveSliceWindowIteratorHacks, AGridMoveSliceWindowStepHacks,
BGridMoveSliceWindowIteratorHacks, BGridMoveSliceWindowStepHacks,
ck::index_t nrepeat) ck::index_t nrepeat)
{ {
using namespace ck; using namespace ck;
@@ -72,49 +72,48 @@ __host__ float driver_dynamic_gemm_dlops_v1r2(const FloatAB* p_a_grid,
constexpr auto I5 = Number<5>{}; constexpr auto I5 = Number<5>{};
// GEMM // GEMM
-    using GridwiseGemm =
-        GridwiseDynamicGemmDlops_km_kn_mn_v1r2<BlockSize,
+    using GridwiseGemm = GridwiseGemmDlops_km_kn_mn_v1r2<BlockSize,
                                               FloatAB,
                                               FloatAcc,
                                               FloatC,
                                               CGlobalMemoryDataOperation,
                                               AKMGridDesc,
                                               BKNGridDesc,
                                               CMNGridDesc,
                                               MPerBlock,
                                               NPerBlock,
                                               KPerBlock,
                                               M1PerThread,
                                               N1PerThread,
                                               KPerThread,
                                               M1N1ThreadClusterM10,
                                               M1N1ThreadClusterN10,
                                               M1N1ThreadClusterM11,
                                               M1N1ThreadClusterN11,
                                               ABlockTransferThreadSliceLengths_K_M0_M1,
                                               ABlockTransferThreadClusterLengths_K_M0_M1,
                                               ABlockTransferThreadClusterArrangeOrder,
                                               ABlockTransferSrcAccessOrder,
                                               ABlockTransferSrcVectorDim,
                                               ABlockTransferSrcScalarPerVector,
                                               ABlockTransferDstScalarPerVector_M1,
                                               AThreadTransferSrcResetCoordinateAfterRun,
                                               BBlockTransferThreadSliceLengths_K_N0_N1,
                                               BBlockTransferThreadClusterLengths_K_N0_N1,
                                               BBlockTransferThreadClusterArrangeOrder,
                                               BBlockTransferSrcAccessOrder,
                                               BBlockTransferSrcVectorDim,
                                               BBlockTransferSrcScalarPerVector,
                                               BBlockTransferDstScalarPerVector_N1,
                                               BThreadTransferSrcResetCoordinateAfterRun,
                                               CThreadTransferSrcDstAccessOrder,
                                               CThreadTransferSrcDstVectorDim,
                                               CThreadTransferDstScalarPerVector,
-                                              AGridIteratorHacks,
-                                              BGridIteratorHacks,
-                                              CGridIteratorHacks,
-                                              AGridMoveSliceWindowIteratorHacks,
-                                              BGridMoveSliceWindowIteratorHacks>;
+                                              AGridStepHacks,
+                                              BGridStepHacks,
+                                              CGridStepHacks,
+                                              AGridMoveSliceWindowStepHacks,
+                                              BGridMoveSliceWindowStepHacks>;
const auto M = a_k_m_grid_desc.GetLength(I1); const auto M = a_k_m_grid_desc.GetLength(I1);
const auto N = b_k_n_grid_desc.GetLength(I1); const auto N = b_k_n_grid_desc.GetLength(I1);
@@ -122,8 +121,7 @@ __host__ float driver_dynamic_gemm_dlops_v1r2(const FloatAB* p_a_grid,
    if(!GridwiseGemm::CheckValidity(a_k_m_grid_desc, b_k_n_grid_desc, c_m_n_grid_desc))
    {
-        throw std::runtime_error(
-            "wrong! GridwiseDynamicGemmDlops_km_kn_mn_v1r2 has invalid setting");
+        throw std::runtime_error("wrong! GridwiseGemmDlops_km_kn_mn_v1r2 has invalid setting");
    }
const auto a_k_m0_m1_grid_desc = GridwiseGemm::MakeAKM0M1GridDescriptor(a_k_m_grid_desc); const auto a_k_m0_m1_grid_desc = GridwiseGemm::MakeAKM0M1GridDescriptor(a_k_m_grid_desc);
@@ -174,22 +172,21 @@ __host__ float driver_dynamic_gemm_dlops_v1r2(const FloatAB* p_a_grid,
if(has_main_k_block_loop && has_double_tail_k_block_loop) if(has_main_k_block_loop && has_double_tail_k_block_loop)
{ {
const auto kernel = const auto kernel =
kernel_dynamic_gemm_dlops_v1r2<GridwiseGemm, kernel_gemm_dlops_v1r2<GridwiseGemm,
FloatAB, FloatAB,
FloatC, FloatC,
remove_reference_t<AKM0M1GridDesc>, remove_reference_t<AKM0M1GridDesc>,
remove_reference_t<BKN0N1GridDesc>, remove_reference_t<BKN0N1GridDesc>,
remove_reference_t<CM0M10M11N0N10N11GridDesc>, remove_reference_t<CM0M10M11N0N10N11GridDesc>,
remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>, remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
true, true,
true>; true>;
ave_time = launch_and_time_kernel(kernel, ave_time = launch_and_time_kernel(kernel,
nrepeat, nrepeat,
dim3(grid_size), dim3(grid_size),
dim3(BlockSize), dim3(BlockSize),
0, 0,
0,
p_a_grid, p_a_grid,
p_b_grid, p_b_grid,
p_c_grid, p_c_grid,
@@ -201,22 +198,21 @@ __host__ float driver_dynamic_gemm_dlops_v1r2(const FloatAB* p_a_grid,
else if(has_main_k_block_loop && !has_double_tail_k_block_loop) else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
{ {
const auto kernel = const auto kernel =
kernel_dynamic_gemm_dlops_v1r2<GridwiseGemm, kernel_gemm_dlops_v1r2<GridwiseGemm,
FloatAB, FloatAB,
FloatC, FloatC,
remove_reference_t<AKM0M1GridDesc>, remove_reference_t<AKM0M1GridDesc>,
remove_reference_t<BKN0N1GridDesc>, remove_reference_t<BKN0N1GridDesc>,
remove_reference_t<CM0M10M11N0N10N11GridDesc>, remove_reference_t<CM0M10M11N0N10N11GridDesc>,
remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>, remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
true, true,
false>; false>;
ave_time = launch_and_time_kernel(kernel, ave_time = launch_and_time_kernel(kernel,
nrepeat, nrepeat,
dim3(grid_size), dim3(grid_size),
dim3(BlockSize), dim3(BlockSize),
0, 0,
0,
p_a_grid, p_a_grid,
p_b_grid, p_b_grid,
p_c_grid, p_c_grid,
@@ -228,22 +224,21 @@ __host__ float driver_dynamic_gemm_dlops_v1r2(const FloatAB* p_a_grid,
else if(!has_main_k_block_loop && has_double_tail_k_block_loop) else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
{ {
const auto kernel = const auto kernel =
kernel_dynamic_gemm_dlops_v1r2<GridwiseGemm, kernel_gemm_dlops_v1r2<GridwiseGemm,
FloatAB, FloatAB,
FloatC, FloatC,
remove_reference_t<AKM0M1GridDesc>, remove_reference_t<AKM0M1GridDesc>,
remove_reference_t<BKN0N1GridDesc>, remove_reference_t<BKN0N1GridDesc>,
remove_reference_t<CM0M10M11N0N10N11GridDesc>, remove_reference_t<CM0M10M11N0N10N11GridDesc>,
remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>, remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
false, false,
true>; true>;
ave_time = launch_and_time_kernel(kernel, ave_time = launch_and_time_kernel(kernel,
nrepeat, nrepeat,
dim3(grid_size), dim3(grid_size),
dim3(BlockSize), dim3(BlockSize),
0, 0,
0,
p_a_grid, p_a_grid,
p_b_grid, p_b_grid,
p_c_grid, p_c_grid,
@@ -255,22 +250,21 @@ __host__ float driver_dynamic_gemm_dlops_v1r2(const FloatAB* p_a_grid,
else else
{ {
const auto kernel = const auto kernel =
kernel_dynamic_gemm_dlops_v1r2<GridwiseGemm, kernel_gemm_dlops_v1r2<GridwiseGemm,
FloatAB, FloatAB,
FloatC, FloatC,
remove_reference_t<AKM0M1GridDesc>, remove_reference_t<AKM0M1GridDesc>,
remove_reference_t<BKN0N1GridDesc>, remove_reference_t<BKN0N1GridDesc>,
remove_reference_t<CM0M10M11N0N10N11GridDesc>, remove_reference_t<CM0M10M11N0N10N11GridDesc>,
remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>, remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
false, false,
false>; false>;
ave_time = launch_and_time_kernel(kernel, ave_time = launch_and_time_kernel(kernel,
nrepeat, nrepeat,
dim3(grid_size), dim3(grid_size),
dim3(BlockSize), dim3(BlockSize),
0, 0,
0,
p_a_grid, p_a_grid,
p_b_grid, p_b_grid,
p_c_grid, p_c_grid,
@@ -299,15 +293,15 @@ __host__ float driver_dynamic_gemm_dlops_v1r2(const FloatAB* p_a_grid,
if(has_main_k_block_loop && has_double_tail_k_block_loop) if(has_main_k_block_loop && has_double_tail_k_block_loop)
{ {
const auto kernel = const auto kernel =
kernel_dynamic_gemm_dlops_v1r2<GridwiseGemm, kernel_gemm_dlops_v1r2<GridwiseGemm,
FloatAB, FloatAB,
FloatC, FloatC,
remove_reference_t<AKM0M1GridDesc>, remove_reference_t<AKM0M1GridDesc>,
remove_reference_t<BKN0N1GridDesc>, remove_reference_t<BKN0N1GridDesc>,
remove_reference_t<CM0M10M11N0N10N11GridDesc>, remove_reference_t<CM0M10M11N0N10N11GridDesc>,
remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>, remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
true, true,
true>; true>;
ave_time = launch_and_time_kernel( ave_time = launch_and_time_kernel(
kernel, kernel,
@@ -315,27 +309,28 @@ __host__ float driver_dynamic_gemm_dlops_v1r2(const FloatAB* p_a_grid,
dim3(grid_size), dim3(grid_size),
dim3(BlockSize), dim3(BlockSize),
0, 0,
0,
p_a_grid, p_a_grid,
p_b_grid, p_b_grid,
p_c_grid, p_c_grid,
-            (void CONSTANT*)a_k_m0_m1_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)b_k_n0_n1_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.GetDeviceBuffer());
+            cast_pointer_to_constant_address_space(a_k_m0_m1_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(b_k_n0_n1_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(
+                c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(
+                c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.GetDeviceBuffer()));
} }
else if(has_main_k_block_loop && !has_double_tail_k_block_loop) else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
{ {
const auto kernel = const auto kernel =
kernel_dynamic_gemm_dlops_v1r2<GridwiseGemm, kernel_gemm_dlops_v1r2<GridwiseGemm,
FloatAB, FloatAB,
FloatC, FloatC,
remove_reference_t<AKM0M1GridDesc>, remove_reference_t<AKM0M1GridDesc>,
remove_reference_t<BKN0N1GridDesc>, remove_reference_t<BKN0N1GridDesc>,
remove_reference_t<CM0M10M11N0N10N11GridDesc>, remove_reference_t<CM0M10M11N0N10N11GridDesc>,
remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>, remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
true, true,
false>; false>;
ave_time = launch_and_time_kernel( ave_time = launch_and_time_kernel(
kernel, kernel,
@@ -343,27 +338,28 @@ __host__ float driver_dynamic_gemm_dlops_v1r2(const FloatAB* p_a_grid,
dim3(grid_size), dim3(grid_size),
dim3(BlockSize), dim3(BlockSize),
0, 0,
0,
p_a_grid, p_a_grid,
p_b_grid, p_b_grid,
p_c_grid, p_c_grid,
-            (void CONSTANT*)a_k_m0_m1_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)b_k_n0_n1_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.GetDeviceBuffer());
+            cast_pointer_to_constant_address_space(a_k_m0_m1_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(b_k_n0_n1_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(
+                c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(
+                c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.GetDeviceBuffer()));
} }
else if(!has_main_k_block_loop && has_double_tail_k_block_loop) else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
{ {
const auto kernel = const auto kernel =
kernel_dynamic_gemm_dlops_v1r2<GridwiseGemm, kernel_gemm_dlops_v1r2<GridwiseGemm,
FloatAB, FloatAB,
FloatC, FloatC,
remove_reference_t<AKM0M1GridDesc>, remove_reference_t<AKM0M1GridDesc>,
remove_reference_t<BKN0N1GridDesc>, remove_reference_t<BKN0N1GridDesc>,
remove_reference_t<CM0M10M11N0N10N11GridDesc>, remove_reference_t<CM0M10M11N0N10N11GridDesc>,
remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>, remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
false, false,
true>; true>;
ave_time = launch_and_time_kernel( ave_time = launch_and_time_kernel(
kernel, kernel,
@@ -371,27 +367,28 @@ __host__ float driver_dynamic_gemm_dlops_v1r2(const FloatAB* p_a_grid,
dim3(grid_size), dim3(grid_size),
dim3(BlockSize), dim3(BlockSize),
0, 0,
0,
p_a_grid, p_a_grid,
p_b_grid, p_b_grid,
p_c_grid, p_c_grid,
-            (void CONSTANT*)a_k_m0_m1_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)b_k_n0_n1_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.GetDeviceBuffer());
+            cast_pointer_to_constant_address_space(a_k_m0_m1_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(b_k_n0_n1_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(
+                c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(
+                c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.GetDeviceBuffer()));
} }
else else
{ {
const auto kernel = const auto kernel =
kernel_dynamic_gemm_dlops_v1r2<GridwiseGemm, kernel_gemm_dlops_v1r2<GridwiseGemm,
FloatAB, FloatAB,
FloatC, FloatC,
remove_reference_t<AKM0M1GridDesc>, remove_reference_t<AKM0M1GridDesc>,
remove_reference_t<BKN0N1GridDesc>, remove_reference_t<BKN0N1GridDesc>,
remove_reference_t<CM0M10M11N0N10N11GridDesc>, remove_reference_t<CM0M10M11N0N10N11GridDesc>,
remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>, remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
false, false,
false>; false>;
ave_time = launch_and_time_kernel( ave_time = launch_and_time_kernel(
kernel, kernel,
@@ -399,14 +396,15 @@ __host__ float driver_dynamic_gemm_dlops_v1r2(const FloatAB* p_a_grid,
dim3(grid_size), dim3(grid_size),
dim3(BlockSize), dim3(BlockSize),
0, 0,
0,
p_a_grid, p_a_grid,
p_b_grid, p_b_grid,
p_c_grid, p_c_grid,
-            (void CONSTANT*)a_k_m0_m1_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)b_k_n0_n1_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.GetDeviceBuffer());
+            cast_pointer_to_constant_address_space(a_k_m0_m1_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(b_k_n0_n1_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(
+                c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(
+                c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.GetDeviceBuffer()));
} }
return ave_time; return ave_time;
...
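A note on the launch pattern in the driver above (the pattern itself is unchanged by this commit): `has_main_k_block_loop` and `has_double_tail_k_block_loop` are runtime values, but the kernel receives them as compile-time `bool` template parameters, so the host code enumerates all four instantiations and selects one with a nested if/else ladder. Below is a minimal stand-alone sketch of that dispatch idiom, using hypothetical names and no composable_kernel dependencies; how the two flags are actually derived from the K dimension is not shown in the excerpt above, so the condition here is purely illustrative.

#include <iostream>

// Hypothetical stand-in for a GPU kernel: the two bools are baked in at
// compile time so each variant's body can be specialized without branching.
template <bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
void run_gemm_variant(int k_blocks)
{
    std::cout << "k_blocks=" << k_blocks << " main_loop=" << HasMainKBlockLoop
              << " double_tail=" << HasDoubleTailKBlockLoop << '\n';
}

// Host-side dispatch: two runtime flags select one of four instantiations,
// mirroring the if/else ladder used by driver_gemm_dlops_v1r2 above.
void dispatch(int k_blocks)
{
    // Illustrative flag computation only; the real driver derives these
    // from K and KPerBlock in code not shown in this excerpt.
    const bool has_main_k_block_loop        = k_blocks > 2;
    const bool has_double_tail_k_block_loop = k_blocks % 2 == 0;

    if(has_main_k_block_loop && has_double_tail_k_block_loop)
        run_gemm_variant<true, true>(k_blocks);
    else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
        run_gemm_variant<true, false>(k_blocks);
    else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
        run_gemm_variant<false, true>(k_blocks);
    else
        run_gemm_variant<false, false>(k_blocks);
}

int main()
{
    for(int k = 1; k <= 4; ++k)
        dispatch(k);
}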
-#ifndef DRIVER_DYNAMIC_GEMM_DLOPS_V1R3
-#define DRIVER_DYNAMIC_GEMM_DLOPS_V1R3
+#ifndef DRIVER_GEMM_DLOPS_V1R3
+#define DRIVER_GEMM_DLOPS_V1R3
#include "common_header.hpp"
-#include "dynamic_tensor_descriptor.hpp"
-#include "dynamic_tensor_descriptor_helper.hpp"
-#include "gridwise_dynamic_gemm_dlops_v1r3.hpp"
+#include "tensor_descriptor.hpp"
+#include "tensor_descriptor_helper.hpp"
+#include "gridwise_gemm_dlops_v1r3.hpp"
template <ck::index_t BlockSize, template <ck::index_t BlockSize,
typename FloatAB, typename FloatAB,
@@ -39,23 +39,23 @@ template <ck::index_t BlockSize,
typename CThreadTransferSrcDstAccessOrder, typename CThreadTransferSrcDstAccessOrder,
ck::index_t CThreadTransferSrcDstVectorDim, ck::index_t CThreadTransferSrcDstVectorDim,
ck::index_t CThreadTransferDstScalarPerVector, ck::index_t CThreadTransferDstScalarPerVector,
typename AGridIteratorHacks, typename AGridStepHacks,
typename BGridIteratorHacks, typename BGridStepHacks,
typename CGridIteratorHacks, typename CGridStepHacks,
typename AGridMoveSliceWindowIteratorHacks, typename AGridMoveSliceWindowStepHacks,
typename BGridMoveSliceWindowIteratorHacks> typename BGridMoveSliceWindowStepHacks>
__host__ float driver_dynamic_gemm_dlops_v1r3(const FloatAB* p_a_grid, __host__ float driver_gemm_dlops_v1r3(const FloatAB* p_a_grid,
const FloatAB* p_b_grid, const FloatAB* p_b_grid,
FloatC* p_c_grid, FloatC* p_c_grid,
const AK0MK1GridDesc& a_k0_m_k1_grid_desc, const AK0MK1GridDesc& a_k0_m_k1_grid_desc,
const BK0NK1GridDesc& b_k0_n_k1_grid_desc, const BK0NK1GridDesc& b_k0_n_k1_grid_desc,
const CMNGridDesc& c_m_n_grid_desc, const CMNGridDesc& c_m_n_grid_desc,
AGridIteratorHacks, AGridStepHacks,
BGridIteratorHacks, BGridStepHacks,
CGridIteratorHacks, CGridStepHacks,
AGridMoveSliceWindowIteratorHacks, AGridMoveSliceWindowStepHacks,
BGridMoveSliceWindowIteratorHacks, BGridMoveSliceWindowStepHacks,
ck::index_t nrepeat) ck::index_t nrepeat)
{ {
using namespace ck; using namespace ck;
@@ -69,44 +69,44 @@ __host__ float driver_dynamic_gemm_dlops_v1r3(const FloatAB* p_a_grid,
// GEMM // GEMM
using GridwiseGemm = using GridwiseGemm =
GridwiseDynamicGemmDlops_km_kn_mn_v1r3<BlockSize, GridwiseGemmDlops_km_kn_mn_v1r3<BlockSize,
FloatAB, FloatAB,
FloatAcc, FloatAcc,
FloatC, FloatC,
CGlobalMemoryDataOperation, CGlobalMemoryDataOperation,
AK0MK1GridDesc, AK0MK1GridDesc,
BK0NK1GridDesc, BK0NK1GridDesc,
CMNGridDesc, CMNGridDesc,
MPerBlock, MPerBlock,
NPerBlock, NPerBlock,
KPerBlock, KPerBlock,
M1PerThread, M1PerThread,
N1PerThread, N1PerThread,
KPerThread, KPerThread,
M1N1ThreadClusterM1Xs, M1N1ThreadClusterM1Xs,
M1N1ThreadClusterN1Xs, M1N1ThreadClusterN1Xs,
ABlockTransferThreadSliceLengths_K0_M0_M1_K1, ABlockTransferThreadSliceLengths_K0_M0_M1_K1,
ABlockTransferThreadClusterLengths_K0_M0_M1_K1, ABlockTransferThreadClusterLengths_K0_M0_M1_K1,
ABlockTransferThreadClusterArrangeOrder, ABlockTransferThreadClusterArrangeOrder,
ABlockTransferSrcAccessOrder, ABlockTransferSrcAccessOrder,
ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1, ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1,
ABlockTransferSrcVectorTensorContiguousDimOrder, ABlockTransferSrcVectorTensorContiguousDimOrder,
ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1, ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1,
BBlockTransferThreadSliceLengths_K0_N0_N1_K1, BBlockTransferThreadSliceLengths_K0_N0_N1_K1,
BBlockTransferThreadClusterLengths_K0_N0_N1_K1, BBlockTransferThreadClusterLengths_K0_N0_N1_K1,
BBlockTransferThreadClusterArrangeOrder, BBlockTransferThreadClusterArrangeOrder,
BBlockTransferSrcAccessOrder, BBlockTransferSrcAccessOrder,
BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1, BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1,
BBlockTransferSrcVectorTensorContiguousDimOrder, BBlockTransferSrcVectorTensorContiguousDimOrder,
BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1, BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1,
CThreadTransferSrcDstAccessOrder, CThreadTransferSrcDstAccessOrder,
CThreadTransferSrcDstVectorDim, CThreadTransferSrcDstVectorDim,
CThreadTransferDstScalarPerVector, CThreadTransferDstScalarPerVector,
AGridIteratorHacks, AGridStepHacks,
BGridIteratorHacks, BGridStepHacks,
CGridIteratorHacks, CGridStepHacks,
AGridMoveSliceWindowIteratorHacks, AGridMoveSliceWindowStepHacks,
BGridMoveSliceWindowIteratorHacks>; BGridMoveSliceWindowStepHacks>;
const auto M = a_k0_m_k1_grid_desc.GetLength(I1); const auto M = a_k0_m_k1_grid_desc.GetLength(I1);
const auto N = b_k0_n_k1_grid_desc.GetLength(I1); const auto N = b_k0_n_k1_grid_desc.GetLength(I1);
@@ -114,8 +114,7 @@ __host__ float driver_dynamic_gemm_dlops_v1r3(const FloatAB* p_a_grid,
    if(!GridwiseGemm::CheckValidity(a_k0_m_k1_grid_desc, b_k0_n_k1_grid_desc, c_m_n_grid_desc))
    {
-        throw std::runtime_error(
-            "wrong! GridwiseDynamicGemmDlops_km_kn_mn_v1r3 has invalid setting");
+        throw std::runtime_error("wrong! GridwiseGemmDlops_km_kn_mn_v1r3 has invalid setting");
    }
const auto a_k0_m0_m1_k1_grid_desc = const auto a_k0_m0_m1_k1_grid_desc =
@@ -170,22 +169,21 @@ __host__ float driver_dynamic_gemm_dlops_v1r3(const FloatAB* p_a_grid,
if(has_main_k_block_loop && has_double_tail_k_block_loop) if(has_main_k_block_loop && has_double_tail_k_block_loop)
{ {
const auto kernel = const auto kernel =
kernel_dynamic_gemm_dlops_v1r3<GridwiseGemm, kernel_gemm_dlops_v1r3<GridwiseGemm,
FloatAB, FloatAB,
FloatC, FloatC,
remove_reference_t<AK0M0M1K1GridDesc>, remove_reference_t<AK0M0M1K1GridDesc>,
remove_reference_t<BK0N0N1K1GridDesc>, remove_reference_t<BK0N0N1K1GridDesc>,
remove_reference_t<CM0M10M11N0N10N11GridDesc>, remove_reference_t<CM0M10M11N0N10N11GridDesc>,
remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>, remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
true, true,
true>; true>;
ave_time = launch_and_time_kernel(kernel, ave_time = launch_and_time_kernel(kernel,
nrepeat, nrepeat,
dim3(grid_size), dim3(grid_size),
dim3(BlockSize), dim3(BlockSize),
0, 0,
0,
p_a_grid, p_a_grid,
p_b_grid, p_b_grid,
p_c_grid, p_c_grid,
@@ -197,22 +195,21 @@ __host__ float driver_dynamic_gemm_dlops_v1r3(const FloatAB* p_a_grid,
else if(has_main_k_block_loop && !has_double_tail_k_block_loop) else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
{ {
const auto kernel = const auto kernel =
kernel_dynamic_gemm_dlops_v1r3<GridwiseGemm, kernel_gemm_dlops_v1r3<GridwiseGemm,
FloatAB, FloatAB,
FloatC, FloatC,
remove_reference_t<AK0M0M1K1GridDesc>, remove_reference_t<AK0M0M1K1GridDesc>,
remove_reference_t<BK0N0N1K1GridDesc>, remove_reference_t<BK0N0N1K1GridDesc>,
remove_reference_t<CM0M10M11N0N10N11GridDesc>, remove_reference_t<CM0M10M11N0N10N11GridDesc>,
remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>, remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
true, true,
false>; false>;
ave_time = launch_and_time_kernel(kernel, ave_time = launch_and_time_kernel(kernel,
nrepeat, nrepeat,
dim3(grid_size), dim3(grid_size),
dim3(BlockSize), dim3(BlockSize),
0, 0,
0,
p_a_grid, p_a_grid,
p_b_grid, p_b_grid,
p_c_grid, p_c_grid,
@@ -224,22 +221,21 @@ __host__ float driver_dynamic_gemm_dlops_v1r3(const FloatAB* p_a_grid,
else if(!has_main_k_block_loop && has_double_tail_k_block_loop) else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
{ {
const auto kernel = const auto kernel =
kernel_dynamic_gemm_dlops_v1r3<GridwiseGemm, kernel_gemm_dlops_v1r3<GridwiseGemm,
FloatAB, FloatAB,
FloatC, FloatC,
remove_reference_t<AK0M0M1K1GridDesc>, remove_reference_t<AK0M0M1K1GridDesc>,
remove_reference_t<BK0N0N1K1GridDesc>, remove_reference_t<BK0N0N1K1GridDesc>,
remove_reference_t<CM0M10M11N0N10N11GridDesc>, remove_reference_t<CM0M10M11N0N10N11GridDesc>,
remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>, remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
false, false,
true>; true>;
ave_time = launch_and_time_kernel(kernel, ave_time = launch_and_time_kernel(kernel,
nrepeat, nrepeat,
dim3(grid_size), dim3(grid_size),
dim3(BlockSize), dim3(BlockSize),
0, 0,
0,
p_a_grid, p_a_grid,
p_b_grid, p_b_grid,
p_c_grid, p_c_grid,
@@ -251,22 +247,21 @@ __host__ float driver_dynamic_gemm_dlops_v1r3(const FloatAB* p_a_grid,
else else
{ {
const auto kernel = const auto kernel =
kernel_dynamic_gemm_dlops_v1r3<GridwiseGemm, kernel_gemm_dlops_v1r3<GridwiseGemm,
FloatAB, FloatAB,
FloatC, FloatC,
remove_reference_t<AK0M0M1K1GridDesc>, remove_reference_t<AK0M0M1K1GridDesc>,
remove_reference_t<BK0N0N1K1GridDesc>, remove_reference_t<BK0N0N1K1GridDesc>,
remove_reference_t<CM0M10M11N0N10N11GridDesc>, remove_reference_t<CM0M10M11N0N10N11GridDesc>,
remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>, remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
false, false,
false>; false>;
ave_time = launch_and_time_kernel(kernel, ave_time = launch_and_time_kernel(kernel,
nrepeat, nrepeat,
dim3(grid_size), dim3(grid_size),
dim3(BlockSize), dim3(BlockSize),
0, 0,
0,
p_a_grid, p_a_grid,
p_b_grid, p_b_grid,
p_c_grid, p_c_grid,
@@ -295,15 +290,15 @@ __host__ float driver_dynamic_gemm_dlops_v1r3(const FloatAB* p_a_grid,
if(has_main_k_block_loop && has_double_tail_k_block_loop) if(has_main_k_block_loop && has_double_tail_k_block_loop)
{ {
const auto kernel = const auto kernel =
kernel_dynamic_gemm_dlops_v1r3<GridwiseGemm, kernel_gemm_dlops_v1r3<GridwiseGemm,
FloatAB, FloatAB,
FloatC, FloatC,
remove_reference_t<AK0M0M1K1GridDesc>, remove_reference_t<AK0M0M1K1GridDesc>,
remove_reference_t<BK0N0N1K1GridDesc>, remove_reference_t<BK0N0N1K1GridDesc>,
remove_reference_t<CM0M10M11N0N10N11GridDesc>, remove_reference_t<CM0M10M11N0N10N11GridDesc>,
remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>, remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
true, true,
true>; true>;
ave_time = launch_and_time_kernel( ave_time = launch_and_time_kernel(
kernel, kernel,
@@ -311,27 +306,30 @@ __host__ float driver_dynamic_gemm_dlops_v1r3(const FloatAB* p_a_grid,
dim3(grid_size), dim3(grid_size),
dim3(BlockSize), dim3(BlockSize),
0, 0,
0,
p_a_grid, p_a_grid,
p_b_grid, p_b_grid,
p_c_grid, p_c_grid,
-            (void CONSTANT*)a_k0_m0_m1_k1_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)b_k0_n0_n1_k1_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.GetDeviceBuffer());
+            cast_pointer_to_constant_address_space(
+                a_k0_m0_m1_k1_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(
+                b_k0_n0_n1_k1_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(
+                c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(
+                c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.GetDeviceBuffer()));
} }
else if(has_main_k_block_loop && !has_double_tail_k_block_loop) else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
{ {
const auto kernel = const auto kernel =
kernel_dynamic_gemm_dlops_v1r3<GridwiseGemm, kernel_gemm_dlops_v1r3<GridwiseGemm,
FloatAB, FloatAB,
FloatC, FloatC,
remove_reference_t<AK0M0M1K1GridDesc>, remove_reference_t<AK0M0M1K1GridDesc>,
remove_reference_t<BK0N0N1K1GridDesc>, remove_reference_t<BK0N0N1K1GridDesc>,
remove_reference_t<CM0M10M11N0N10N11GridDesc>, remove_reference_t<CM0M10M11N0N10N11GridDesc>,
remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>, remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
true, true,
false>; false>;
ave_time = launch_and_time_kernel( ave_time = launch_and_time_kernel(
kernel, kernel,
@@ -339,27 +337,30 @@ __host__ float driver_dynamic_gemm_dlops_v1r3(const FloatAB* p_a_grid,
dim3(grid_size), dim3(grid_size),
dim3(BlockSize), dim3(BlockSize),
0, 0,
0,
p_a_grid, p_a_grid,
p_b_grid, p_b_grid,
p_c_grid, p_c_grid,
-            (void CONSTANT*)a_k0_m0_m1_k1_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)b_k0_n0_n1_k1_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.GetDeviceBuffer());
+            cast_pointer_to_constant_address_space(
+                a_k0_m0_m1_k1_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(
+                b_k0_n0_n1_k1_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(
+                c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(
+                c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.GetDeviceBuffer()));
} }
else if(!has_main_k_block_loop && has_double_tail_k_block_loop) else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
{ {
const auto kernel = const auto kernel =
kernel_dynamic_gemm_dlops_v1r3<GridwiseGemm, kernel_gemm_dlops_v1r3<GridwiseGemm,
FloatAB, FloatAB,
FloatC, FloatC,
remove_reference_t<AK0M0M1K1GridDesc>, remove_reference_t<AK0M0M1K1GridDesc>,
remove_reference_t<BK0N0N1K1GridDesc>, remove_reference_t<BK0N0N1K1GridDesc>,
remove_reference_t<CM0M10M11N0N10N11GridDesc>, remove_reference_t<CM0M10M11N0N10N11GridDesc>,
remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>, remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
false, false,
true>; true>;
ave_time = launch_and_time_kernel( ave_time = launch_and_time_kernel(
kernel, kernel,
@@ -367,27 +368,30 @@ __host__ float driver_dynamic_gemm_dlops_v1r3(const FloatAB* p_a_grid,
dim3(grid_size), dim3(grid_size),
dim3(BlockSize), dim3(BlockSize),
0, 0,
0,
p_a_grid, p_a_grid,
p_b_grid, p_b_grid,
p_c_grid, p_c_grid,
-            (void CONSTANT*)a_k0_m0_m1_k1_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)b_k0_n0_n1_k1_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.GetDeviceBuffer());
+            cast_pointer_to_constant_address_space(
+                a_k0_m0_m1_k1_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(
+                b_k0_n0_n1_k1_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(
+                c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(
+                c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.GetDeviceBuffer()));
} }
else else
{ {
const auto kernel = const auto kernel =
kernel_dynamic_gemm_dlops_v1r3<GridwiseGemm, kernel_gemm_dlops_v1r3<GridwiseGemm,
FloatAB, FloatAB,
FloatC, FloatC,
remove_reference_t<AK0M0M1K1GridDesc>, remove_reference_t<AK0M0M1K1GridDesc>,
remove_reference_t<BK0N0N1K1GridDesc>, remove_reference_t<BK0N0N1K1GridDesc>,
remove_reference_t<CM0M10M11N0N10N11GridDesc>, remove_reference_t<CM0M10M11N0N10N11GridDesc>,
remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>, remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
false, false,
false>; false>;
ave_time = launch_and_time_kernel( ave_time = launch_and_time_kernel(
kernel, kernel,
@@ -395,14 +399,17 @@ __host__ float driver_dynamic_gemm_dlops_v1r3(const FloatAB* p_a_grid,
dim3(grid_size), dim3(grid_size),
dim3(BlockSize), dim3(BlockSize),
0, 0,
0,
p_a_grid, p_a_grid,
p_b_grid, p_b_grid,
p_c_grid, p_c_grid,
-            (void CONSTANT*)a_k0_m0_m1_k1_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)b_k0_n0_n1_k1_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.GetDeviceBuffer());
+            cast_pointer_to_constant_address_space(
+                a_k0_m0_m1_k1_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(
+                b_k0_n0_n1_k1_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(
+                c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(
+                c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.GetDeviceBuffer()));
} }
return ave_time; return ave_time;
...
-#ifndef DRIVER_DYNAMIC_GEMM_XDLOPS_V2R3
-#define DRIVER_DYNAMIC_GEMM_XDLOPS_V2R3
+#ifndef DRIVER_GEMM_XDLOPS_V2R3
+#define DRIVER_GEMM_XDLOPS_V2R3
#include "common_header.hpp"
-#include "dynamic_tensor_descriptor.hpp"
-#include "dynamic_tensor_descriptor_helper.hpp"
-#include "gridwise_dynamic_gemm_xdlops_v2r3.hpp"
+#include "tensor_descriptor.hpp"
+#include "tensor_descriptor_helper.hpp"
+#include "gridwise_gemm_xdlops_v2r3.hpp"
template <ck::index_t BlockSize, template <ck::index_t BlockSize,
typename FloatAB, typename FloatAB,
@@ -41,24 +41,24 @@ template <ck::index_t BlockSize,
typename CThreadTransferSrcDstAccessOrder, typename CThreadTransferSrcDstAccessOrder,
ck::index_t CThreadTransferSrcDstVectorDim, ck::index_t CThreadTransferSrcDstVectorDim,
ck::index_t CThreadTransferDstScalarPerVector, ck::index_t CThreadTransferDstScalarPerVector,
typename AGridIteratorHacks, typename AGridStepHacks,
typename BGridIteratorHacks, typename BGridStepHacks,
typename CGridIteratorHacks, typename CGridStepHacks,
typename AGridMoveSliceWindowIteratorHacks, typename AGridMoveSliceWindowStepHacks,
typename BGridMoveSliceWindowIteratorHacks, typename BGridMoveSliceWindowStepHacks,
bool CAccessOrderMRepeatNRepeat> bool CAccessOrderMRepeatNRepeat>
__host__ float driver_dynamic_gemm_xdlops_v2r3(const FloatAB* p_a_grid, __host__ float driver_gemm_xdlops_v2r3(const FloatAB* p_a_grid,
const FloatAB* p_b_grid, const FloatAB* p_b_grid,
FloatC* p_c_grid, FloatC* p_c_grid,
const AK0MK1GridDesc& a_k0_m_k1_grid_desc, const AK0MK1GridDesc& a_k0_m_k1_grid_desc,
const BK0NK1GridDesc& b_k0_n_k1_grid_desc, const BK0NK1GridDesc& b_k0_n_k1_grid_desc,
const CMNGridDesc& c_m_n_grid_desc, const CMNGridDesc& c_m_n_grid_desc,
AGridIteratorHacks, AGridStepHacks,
BGridIteratorHacks, BGridStepHacks,
CGridIteratorHacks, CGridStepHacks,
AGridMoveSliceWindowIteratorHacks, AGridMoveSliceWindowStepHacks,
BGridMoveSliceWindowIteratorHacks, BGridMoveSliceWindowStepHacks,
ck::index_t nrepeat) ck::index_t nrepeat)
{ {
using namespace ck; using namespace ck;
@@ -66,52 +66,49 @@ __host__ float driver_dynamic_gemm_xdlops_v2r3(const FloatAB* p_a_grid,
constexpr auto I0 = Number<0>{}; constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{}; constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{}; constexpr auto I2 = Number<2>{};
-    constexpr auto I3 = Number<3>{};
-    constexpr auto I4 = Number<4>{};
-    constexpr auto I5 = Number<5>{};
using GridwiseGemm = using GridwiseGemm =
GridwiseDynamicGemm_k0mk1_k0nk1_mn_xdlops_v2r3<BlockSize, GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3<BlockSize,
FloatAB, FloatAB,
FloatAcc, FloatAcc,
FloatC, FloatC,
CGlobalMemoryDataOperation, CGlobalMemoryDataOperation,
AK0MK1GridDesc, AK0MK1GridDesc,
BK0NK1GridDesc, BK0NK1GridDesc,
CMNGridDesc, CMNGridDesc,
MPerBlock, MPerBlock,
NPerBlock, NPerBlock,
KPerBlock, KPerBlock,
MPerWave, MPerWave,
NPerWave, NPerWave,
K1, K1,
MRepeat, MRepeat,
NRepeat, NRepeat,
ABlockTransferThreadSliceLengths_K0_M_K1, ABlockTransferThreadSliceLengths_K0_M_K1,
ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterLengths_K0_M_K1,
ABlockTransferThreadClusterArrangeOrder, ABlockTransferThreadClusterArrangeOrder,
ABlockTransferSrcAccessOrder, ABlockTransferSrcAccessOrder,
ABlockTransferSrcVectorDim, ABlockTransferSrcVectorDim,
ABlockTransferSrcScalarPerVector, ABlockTransferSrcScalarPerVector,
ABlockTransferDstScalarPerVector_K1, ABlockTransferDstScalarPerVector_K1,
AThreadTransferSrcResetCoordinateAfterRun, AThreadTransferSrcResetCoordinateAfterRun,
BBlockTransferThreadSliceLengths_K0_N_K1, BBlockTransferThreadSliceLengths_K0_N_K1,
BBlockTransferThreadClusterLengths_K0_N_K1, BBlockTransferThreadClusterLengths_K0_N_K1,
BBlockTransferThreadClusterArrangeOrder, BBlockTransferThreadClusterArrangeOrder,
BBlockTransferSrcAccessOrder, BBlockTransferSrcAccessOrder,
BBlockTransferSrcVectorDim, BBlockTransferSrcVectorDim,
BBlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector,
BBlockTransferDstScalarPerVector_K1, BBlockTransferDstScalarPerVector_K1,
BThreadTransferSrcResetCoordinateAfterRun, BThreadTransferSrcResetCoordinateAfterRun,
CThreadTransferSrcDstAccessOrder, CThreadTransferSrcDstAccessOrder,
CThreadTransferSrcDstVectorDim, CThreadTransferSrcDstVectorDim,
CThreadTransferDstScalarPerVector, CThreadTransferDstScalarPerVector,
AGridIteratorHacks, AGridStepHacks,
BGridIteratorHacks, BGridStepHacks,
CGridIteratorHacks, CGridStepHacks,
AGridMoveSliceWindowIteratorHacks, AGridMoveSliceWindowStepHacks,
BGridMoveSliceWindowIteratorHacks, BGridMoveSliceWindowStepHacks,
CAccessOrderMRepeatNRepeat>; CAccessOrderMRepeatNRepeat>;
{ {
std::cout << "a_k0_m_k1_grid_desc{" << a_k0_m_k1_grid_desc.GetLength(I0) << ", " std::cout << "a_k0_m_k1_grid_desc{" << a_k0_m_k1_grid_desc.GetLength(I0) << ", "
@@ -129,7 +126,7 @@ __host__ float driver_dynamic_gemm_xdlops_v2r3(const FloatAB* p_a_grid,
if(!GridwiseGemm::CheckValidity(a_k0_m_k1_grid_desc, b_k0_n_k1_grid_desc, c_m_n_grid_desc)) if(!GridwiseGemm::CheckValidity(a_k0_m_k1_grid_desc, b_k0_n_k1_grid_desc, c_m_n_grid_desc))
{ {
throw std::runtime_error( throw std::runtime_error(
"wrong! GridwiseDynamicGemm_km_kn_m0m1n0n1_xdlops_v2r3 has invalid setting"); "wrong! GridwiseGemm_km_kn_m0m1n0n1_xdlops_v2r3 has invalid setting");
} }
const auto c_m0_m1_m2_n_grid_desc = GridwiseGemm::MakeCM0M1M2NGridDescriptor(c_m_n_grid_desc); const auto c_m0_m1_m2_n_grid_desc = GridwiseGemm::MakeCM0M1M2NGridDescriptor(c_m_n_grid_desc);
@@ -142,13 +139,13 @@ __host__ float driver_dynamic_gemm_xdlops_v2r3(const FloatAB* p_a_grid,
const index_t grid_size = GridwiseGemm::CalculateGridSize(c_m_n_grid_desc); const index_t grid_size = GridwiseGemm::CalculateGridSize(c_m_n_grid_desc);
const auto kernel = kernel_dynamic_gemm_xdlops_v2r3<GridwiseGemm, const auto kernel = kernel_gemm_xdlops_v2r3<GridwiseGemm,
FloatAB, FloatAB,
FloatC, FloatC,
remove_reference_t<AK0MK1GridDesc>, remove_reference_t<AK0MK1GridDesc>,
remove_reference_t<BK0NK1GridDesc>, remove_reference_t<BK0NK1GridDesc>,
remove_reference_t<CM0M1M2NGridDesc>, remove_reference_t<CM0M1M2NGridDesc>,
remove_reference_t<CBlockClusterAdaptor>>; remove_reference_t<CBlockClusterAdaptor>>;
#if CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE #if CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE
float ave_time = launch_and_time_kernel(kernel, float ave_time = launch_and_time_kernel(kernel,
@@ -156,7 +153,6 @@ __host__ float driver_dynamic_gemm_xdlops_v2r3(const FloatAB* p_a_grid,
dim3(grid_size), dim3(grid_size),
dim3(BlockSize), dim3(BlockSize),
0, 0,
0,
p_a_grid, p_a_grid,
p_b_grid, p_b_grid,
p_c_grid, p_c_grid,
@@ -176,20 +172,19 @@ __host__ float driver_dynamic_gemm_xdlops_v2r3(const FloatAB* p_a_grid,
c_m0_m1_m2_n_grid_desc_dev_buf.ToDevice(&c_m0_m1_m2_n_grid_desc); c_m0_m1_m2_n_grid_desc_dev_buf.ToDevice(&c_m0_m1_m2_n_grid_desc);
c_block_cluster_adaptor_dev_buf.ToDevice(&c_block_cluster_adaptor); c_block_cluster_adaptor_dev_buf.ToDevice(&c_block_cluster_adaptor);
-    float ave_time =
-        launch_and_time_kernel(kernel,
-                               nrepeat,
-                               dim3(grid_size),
-                               dim3(BlockSize),
-                               0,
-                               0,
-                               p_a_grid,
-                               p_b_grid,
-                               p_c_grid,
-                               (void CONSTANT*)a_k0_m_k1_grid_desc_dev_buf.GetDeviceBuffer(),
-                               (void CONSTANT*)b_k0_n_k1_grid_desc_dev_buf.GetDeviceBuffer(),
-                               (void CONSTANT*)c_m0_m1_m2_n_grid_desc_dev_buf.GetDeviceBuffer(),
-                               (void CONSTANT*)c_block_cluster_adaptor_dev_buf.GetDeviceBuffer());
+    float ave_time = launch_and_time_kernel(
+        kernel,
+        nrepeat,
+        dim3(grid_size),
+        dim3(BlockSize),
+        0,
+        p_a_grid,
+        p_b_grid,
+        p_c_grid,
+        cast_pointer_to_constant_address_space(a_k0_m_k1_grid_desc_dev_buf.GetDeviceBuffer()),
+        cast_pointer_to_constant_address_space(b_k0_n_k1_grid_desc_dev_buf.GetDeviceBuffer()),
+        cast_pointer_to_constant_address_space(c_m0_m1_m2_n_grid_desc_dev_buf.GetDeviceBuffer()),
+        cast_pointer_to_constant_address_space(c_block_cluster_adaptor_dev_buf.GetDeviceBuffer()));
#endif #endif
return ave_time; return ave_time;
} }
...
@@ -12,10 +12,10 @@
#include "conv_common.hpp" #include "conv_common.hpp"
#include "host_conv_bwd_data.hpp" #include "host_conv_bwd_data.hpp"
#include "device_tensor.hpp" #include "device_tensor.hpp"
#include "device_dynamic_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk.hpp" #include "device_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk.hpp"
#include "device_dynamic_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk.hpp" #include "device_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk.hpp"
#define USE_DYNAMIC_MODE 1 #define USE_MODE 1
#define USE_CONV_BWD_V4R1_XDL_NHWC 1 #define USE_CONV_BWD_V4R1_XDL_NHWC 1
#define USE_CONV_BWD_V4R1R2_XDL_NHWC 1 #define USE_CONV_BWD_V4R1R2_XDL_NHWC 1
@@ -37,7 +37,7 @@ int main(int argc, char* argv[])
constexpr auto I5 = Number<5>{}; constexpr auto I5 = Number<5>{};
constexpr auto I6 = Number<6>{}; constexpr auto I6 = Number<6>{};
#if USE_DYNAMIC_MODE #if USE_MODE
// dynamic mode // dynamic mode
if(argc != 22) if(argc != 22)
{ {
@@ -46,29 +46,29 @@ int main(int argc, char* argv[])
exit(1); exit(1);
} }
const ConvTensorLayout layout = static_cast<ConvTensorLayout>(atoi(argv[1])); const ConvTensorLayout layout = static_cast<ConvTensorLayout>(std::stoi(argv[1]));
const ConvBackwardDataAlgo algo = static_cast<ConvBackwardDataAlgo>(atoi(argv[2])); const ConvBackwardDataAlgo algo = static_cast<ConvBackwardDataAlgo>(std::stoi(argv[2]));
const bool do_verification = atoi(argv[3]); const bool do_verification = std::stoi(argv[3]);
const int init_method = atoi(argv[4]); const int init_method = std::stoi(argv[4]);
const bool do_log = atoi(argv[5]); const bool do_log = std::stoi(argv[5]);
const int nrepeat = atoi(argv[6]); const int nrepeat = std::stoi(argv[6]);
const index_t N = atoi(argv[7]); const index_t N = std::stoi(argv[7]);
const index_t K = atoi(argv[8]); const index_t K = std::stoi(argv[8]);
const index_t C = atoi(argv[9]); const index_t C = std::stoi(argv[9]);
const index_t Y = atoi(argv[10]); const index_t Y = std::stoi(argv[10]);
const index_t X = atoi(argv[11]); const index_t X = std::stoi(argv[11]);
const index_t Hi = atoi(argv[12]); const index_t Hi = std::stoi(argv[12]);
const index_t Wi = atoi(argv[13]); const index_t Wi = std::stoi(argv[13]);
const index_t conv_stride_h = atoi(argv[14]); const index_t conv_stride_h = std::stoi(argv[14]);
const index_t conv_stride_w = atoi(argv[15]); const index_t conv_stride_w = std::stoi(argv[15]);
const index_t conv_dilation_h = atoi(argv[16]); const index_t conv_dilation_h = std::stoi(argv[16]);
const index_t conv_dilation_w = atoi(argv[17]); const index_t conv_dilation_w = std::stoi(argv[17]);
const index_t in_left_pad_h = atoi(argv[18]); const index_t in_left_pad_h = std::stoi(argv[18]);
const index_t in_left_pad_w = atoi(argv[19]); const index_t in_left_pad_w = std::stoi(argv[19]);
const index_t in_right_pad_h = atoi(argv[20]); const index_t in_right_pad_h = std::stoi(argv[20]);
const index_t in_right_pad_w = atoi(argv[21]); const index_t in_right_pad_w = std::stoi(argv[21]);
const index_t YEff = (Y - 1) * conv_dilation_h + 1; const index_t YEff = (Y - 1) * conv_dilation_h + 1;
const index_t XEff = (X - 1) * conv_dilation_w + 1; const index_t XEff = (X - 1) * conv_dilation_w + 1;
@@ -83,12 +83,12 @@ int main(int argc, char* argv[])
exit(1); exit(1);
} }
const ConvTensorLayout layout = static_cast<ConvTensorLayout>(atoi(argv[1])); const ConvTensorLayout layout = static_cast<ConvTensorLayout>(std::stoi(argv[1]));
const ConvBackwardDataAlgo algo = static_cast<ConvBackwardDataAlgo>(atoi(argv[2])); const ConvBackwardDataAlgo algo = static_cast<ConvBackwardDataAlgo>(std::stoi(argv[2]));
const bool do_verification = atoi(argv[3]); const bool do_verification = std::stoi(argv[3]);
const int init_method = atoi(argv[4]); const int init_method = std::stoi(argv[4]);
const bool do_log = atoi(argv[5]); const bool do_log = std::stoi(argv[5]);
const int nrepeat = atoi(argv[6]); const int nrepeat = std::stoi(argv[6]);
constexpr index_t N = 128; constexpr index_t N = 128;
constexpr index_t C = 192; constexpr index_t C = 192;
...@@ -115,23 +115,19 @@ int main(int argc, char* argv[]) ...@@ -115,23 +115,19 @@ int main(int argc, char* argv[])
#endif #endif
#if 0 #if 0
-constexpr index_t in_vector_size = 1;
using in_data_t  = float;
using acc_data_t = float;
using out_data_t = float;
#elif 1
-constexpr index_t in_vector_size = 1;
using in_data_t  = half_t;
using acc_data_t = float;
using out_data_t = half_t;
#endif #endif
std::vector<std::size_t> in_lengths_host(4), wei_lengths_host(4), out_lengths_host(4); std::vector<std::size_t> in_lengths_host(4), wei_lengths_host(4), out_lengths_host(4);
-    switch(layout)
-    {
-    case ConvTensorLayout::NCHW:
-        // NCHW
+    if(layout == ConvTensorLayout::NCHW)
+    {
in_lengths_host[0] = static_cast<std::size_t>(N); in_lengths_host[0] = static_cast<std::size_t>(N);
in_lengths_host[1] = static_cast<std::size_t>(C); in_lengths_host[1] = static_cast<std::size_t>(C);
in_lengths_host[2] = static_cast<std::size_t>(Hi); in_lengths_host[2] = static_cast<std::size_t>(Hi);
@@ -144,9 +140,9 @@ int main(int argc, char* argv[])
out_lengths_host[1] = static_cast<std::size_t>(K); out_lengths_host[1] = static_cast<std::size_t>(K);
out_lengths_host[2] = static_cast<std::size_t>(Ho); out_lengths_host[2] = static_cast<std::size_t>(Ho);
out_lengths_host[3] = static_cast<std::size_t>(Wo); out_lengths_host[3] = static_cast<std::size_t>(Wo);
-        break;
-    case ConvTensorLayout::NHWC:
-        // NHWC
+    }
+    else if(layout == ConvTensorLayout::NHWC)
+    {
in_lengths_host[0] = static_cast<std::size_t>(N); in_lengths_host[0] = static_cast<std::size_t>(N);
in_lengths_host[1] = static_cast<std::size_t>(Hi); in_lengths_host[1] = static_cast<std::size_t>(Hi);
in_lengths_host[2] = static_cast<std::size_t>(Wi); in_lengths_host[2] = static_cast<std::size_t>(Wi);
@@ -159,8 +155,10 @@ int main(int argc, char* argv[])
out_lengths_host[1] = static_cast<std::size_t>(Ho); out_lengths_host[1] = static_cast<std::size_t>(Ho);
out_lengths_host[2] = static_cast<std::size_t>(Wo); out_lengths_host[2] = static_cast<std::size_t>(Wo);
out_lengths_host[3] = static_cast<std::size_t>(K); out_lengths_host[3] = static_cast<std::size_t>(K);
-        break;
-    default: throw std::runtime_error("wrong! not implemented");
-    }
+    }
+    else
+    {
+        throw std::runtime_error("wrong! not implemented");
+    }
Tensor<in_data_t> in_host(in_lengths_host); Tensor<in_data_t> in_host(in_lengths_host);
@@ -213,40 +211,8 @@ int main(int argc, char* argv[])
wei.GenerateTensorValue(gen_wei, num_thread); wei.GenerateTensorValue(gen_wei, num_thread);
} }
auto f_make_for_device_nchw = [&]() {
#if USE_DYNAMIC_MODE
const auto in_lengths_dev = make_tuple(N, C, Hi, Wi);
const auto wei_lengths_dev = make_tuple(K, C, Y, X);
const auto out_lengths_dev = make_tuple(N, K, Ho, Wo);
const auto conv_strides_dev = make_tuple(conv_stride_h, conv_stride_w);
const auto conv_dilations_dev = make_tuple(conv_dilation_h, conv_dilation_w);
const auto in_left_pads_dev = make_tuple(in_left_pad_h, in_left_pad_w);
const auto in_right_pads_dev = make_tuple(in_right_pad_h, in_right_pad_w);
#else
const auto in_lengths_dev =
make_tuple(Number<N>{}, Number<C>{}, Number<Hi>{}, Number<Wi>{});
const auto wei_lengths_dev = make_tuple(Number<K>{}, Number<C>{}, Number<Y>{}, Number<X>{});
const auto out_lengths_dev =
make_tuple(Number<N>{}, Number<K>{}, Number<Ho>{}, Number<Wo>{});
const auto conv_strides_dev = make_tuple(Number<conv_stride_h>{}, Number<conv_stride_w>{});
const auto conv_dilations_dev =
make_tuple(Number<conv_dilation_h>{}, Number<conv_dilation_w>{});
const auto in_left_pads_dev = make_tuple(Number<in_left_pad_h>{}, Number<in_left_pad_w>{});
const auto in_right_pads_dev =
make_tuple(Number<in_right_pad_h>{}, Number<in_right_pad_w>{});
#endif
return make_tuple(in_lengths_dev,
wei_lengths_dev,
out_lengths_dev,
conv_strides_dev,
conv_dilations_dev,
in_left_pads_dev,
in_right_pads_dev);
};
auto f_make_for_device_nhwc = [&]() { auto f_make_for_device_nhwc = [&]() {
#if USE_DYNAMIC_MODE #if USE_MODE
const auto in_lengths_dev = make_tuple(N, Hi, Wi, C); const auto in_lengths_dev = make_tuple(N, Hi, Wi, C);
const auto wei_lengths_dev = make_tuple(K, Y, X, C); const auto wei_lengths_dev = make_tuple(K, Y, X, C);
const auto out_lengths_dev = make_tuple(N, Ho, Wo, K); const auto out_lengths_dev = make_tuple(N, Ho, Wo, K);
@@ -277,8 +243,6 @@ int main(int argc, char* argv[])
in_right_pads_dev); in_right_pads_dev);
}; };
const auto nhwc_desc = f_make_for_device_nhwc();
#if USE_CONV_BWD_V4R1_XDL_NHWC #if USE_CONV_BWD_V4R1_XDL_NHWC
if(algo == ConvBackwardDataAlgo::V4R1XDLNHWC) if(algo == ConvBackwardDataAlgo::V4R1XDLNHWC)
{ {
@@ -289,20 +253,20 @@ int main(int argc, char* argv[])
const auto tmp = f_make_for_device_nhwc(); const auto tmp = f_make_for_device_nhwc();
-        device_dynamic_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk<
-            in_data_t,
-            acc_data_t,
-            out_data_t>(tmp[I0],
+        device_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk<in_data_t,
+                                                                                  acc_data_t,
+                                                                                  out_data_t>(
+            tmp[I0],
            tmp[I1],
            tmp[I2],
            tmp[I3],
            tmp[I4],
            tmp[I5],
            tmp[I6],
            in_device,
            wei,
            out,
            nrepeat);
} }
#endif #endif
@@ -316,20 +280,20 @@ int main(int argc, char* argv[])
const auto tmp = f_make_for_device_nhwc(); const auto tmp = f_make_for_device_nhwc();
-        device_dynamic_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk<
-            in_data_t,
-            acc_data_t,
-            out_data_t>(tmp[I0],
+        device_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk<in_data_t,
+                                                                                    acc_data_t,
+                                                                                    out_data_t>(
+            tmp[I0],
            tmp[I1],
            tmp[I2],
            tmp[I3],
            tmp[I4],
            tmp[I5],
            tmp[I6],
            in_device,
            wei,
            out,
            nrepeat);
} }
#endif #endif
...
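Both driver `main()`s in this commit (the backward-data one above and the forward one below) switch argument parsing from `atoi` to `std::stoi`. Unlike `atoi`, which silently returns 0 on malformed input and has undefined behavior when the value is out of range, `std::stoi` throws `std::invalid_argument` or `std::out_of_range`, so a bad command line fails loudly. A small self-contained sketch of the difference follows; it is illustrative only and not part of the commit.

#include <cstdlib>
#include <iostream>
#include <stdexcept>
#include <string>

int main()
{
    const char* arg = "12x"; // a malformed size argument, e.g. taken from argv[]

    // atoi: parses the leading "12" and gives no indication of the trailing junk.
    std::cout << "atoi:      " << std::atoi(arg) << '\n';

    // std::stoi: also parses the leading digits, but reports how many
    // characters were consumed and throws on fully invalid input.
    try
    {
        std::size_t pos = 0;
        const int v = std::stoi(arg, &pos);
        if(pos != std::string(arg).size())
            std::cerr << "trailing characters after number: " << arg + pos << '\n';
        std::cout << "std::stoi: " << v << '\n';

        std::stoi("not-a-number"); // throws std::invalid_argument
    }
    catch(const std::exception& e)
    {
        std::cerr << "parse error: " << e.what() << '\n';
    }
}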
@@ -12,17 +12,17 @@
#include "conv_common.hpp" #include "conv_common.hpp"
#include "host_conv.hpp" #include "host_conv.hpp"
#include "device_tensor.hpp" #include "device_tensor.hpp"
#include "device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp" #include "device_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp"
#include "device_dynamic_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk.hpp" #include "device_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk.hpp"
#include "device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp" #include "device_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp"
#include "device_dynamic_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp" #include "device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp"
#include "device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp" #include "device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp"
#include "device_dynamic_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp" #include "device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp"
#define USE_DYNAMIC_MODE 1 #define USE_MODE 1
#define USE_CONV_FWD_V4R4_NCHW 1 #define USE_CONV_FWD_V4R4_NCHW 1
#define USE_CONV_FWD_V4R4R2_NHWC 1 #define USE_CONV_FWD_V4R4R2_NHWC 1
-#define USE_CONV_FWD_V6R1_NCHW 1
+#define USE_CONV_FWD_V6R1_NCHW 0
#define USE_CONV_FWD_V5R1_NCHW 0 #define USE_CONV_FWD_V5R1_NCHW 0
#define USE_CONV_FWD_V4R4R2_XDL_NCHW 0 #define USE_CONV_FWD_V4R4R2_XDL_NCHW 0
#define USE_CONV_FWD_V4R4R4_XDL_NHWC 0 #define USE_CONV_FWD_V4R4R4_XDL_NHWC 0
@@ -49,7 +49,7 @@ int main(int argc, char* argv[])
constexpr auto I5 = Number<5>{}; constexpr auto I5 = Number<5>{};
constexpr auto I6 = Number<6>{}; constexpr auto I6 = Number<6>{};
#if USE_DYNAMIC_MODE #if USE_MODE
// dynamic mode // dynamic mode
if(argc != 22) if(argc != 22)
{ {
@@ -58,29 +58,29 @@ int main(int argc, char* argv[])
exit(1); exit(1);
} }
const ConvTensorLayout layout = static_cast<ConvTensorLayout>(atoi(argv[1])); const ConvTensorLayout layout = static_cast<ConvTensorLayout>(std::stoi(argv[1]));
const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(atoi(argv[2])); const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::stoi(argv[2]));
const bool do_verification = atoi(argv[3]); const bool do_verification = std::stoi(argv[3]);
const int init_method = atoi(argv[4]); const int init_method = std::stoi(argv[4]);
const bool do_log = atoi(argv[5]); const bool do_log = std::stoi(argv[5]);
const int nrepeat = atoi(argv[6]); const int nrepeat = std::stoi(argv[6]);
const index_t N = atoi(argv[7]); const index_t N = std::stoi(argv[7]);
const index_t K = atoi(argv[8]); const index_t K = std::stoi(argv[8]);
const index_t C = atoi(argv[9]); const index_t C = std::stoi(argv[9]);
const index_t Y = atoi(argv[10]); const index_t Y = std::stoi(argv[10]);
const index_t X = atoi(argv[11]); const index_t X = std::stoi(argv[11]);
const index_t Hi = atoi(argv[12]); const index_t Hi = std::stoi(argv[12]);
const index_t Wi = atoi(argv[13]); const index_t Wi = std::stoi(argv[13]);
const index_t conv_stride_h = atoi(argv[14]); const index_t conv_stride_h = std::stoi(argv[14]);
const index_t conv_stride_w = atoi(argv[15]); const index_t conv_stride_w = std::stoi(argv[15]);
const index_t conv_dilation_h = atoi(argv[16]); const index_t conv_dilation_h = std::stoi(argv[16]);
const index_t conv_dilation_w = atoi(argv[17]); const index_t conv_dilation_w = std::stoi(argv[17]);
const index_t in_left_pad_h = atoi(argv[18]); const index_t in_left_pad_h = std::stoi(argv[18]);
const index_t in_left_pad_w = atoi(argv[19]); const index_t in_left_pad_w = std::stoi(argv[19]);
const index_t in_right_pad_h = atoi(argv[20]); const index_t in_right_pad_h = std::stoi(argv[20]);
const index_t in_right_pad_w = atoi(argv[21]); const index_t in_right_pad_w = std::stoi(argv[21]);
const index_t YEff = (Y - 1) * conv_dilation_h + 1; const index_t YEff = (Y - 1) * conv_dilation_h + 1;
const index_t XEff = (X - 1) * conv_dilation_w + 1; const index_t XEff = (X - 1) * conv_dilation_w + 1;
...@@ -95,12 +95,12 @@ int main(int argc, char* argv[]) ...@@ -95,12 +95,12 @@ int main(int argc, char* argv[])
exit(1); exit(1);
} }
const ConvTensorLayout layout = static_cast<ConvTensorLayout>(atoi(argv[1])); const ConvTensorLayout layout = static_cast<ConvTensorLayout>(std::stoi(argv[1]));
const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(atoi(argv[2])); const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::stoi(argv[2]));
const bool do_verification = atoi(argv[3]); const bool do_verification = std::stoi(argv[3]);
const int init_method = atoi(argv[4]); const int init_method = std::stoi(argv[4]);
const bool do_log = atoi(argv[5]); const bool do_log = std::stoi(argv[5]);
const int nrepeat = atoi(argv[6]); const int nrepeat = std::stoi(argv[6]);
constexpr index_t N = 128; constexpr index_t N = 128;
constexpr index_t C = 192; constexpr index_t C = 192;
...@@ -142,10 +142,8 @@ int main(int argc, char* argv[]) ...@@ -142,10 +142,8 @@ int main(int argc, char* argv[])
std::vector<std::size_t> in_lengths_host(4), wei_lengths_host(4), out_lengths_host(4); std::vector<std::size_t> in_lengths_host(4), wei_lengths_host(4), out_lengths_host(4);
switch(layout) if(layout == ConvTensorLayout::NCHW)
{ {
case ConvTensorLayout::NCHW:
// NCHW
in_lengths_host[0] = static_cast<std::size_t>(N); in_lengths_host[0] = static_cast<std::size_t>(N);
in_lengths_host[1] = static_cast<std::size_t>(C); in_lengths_host[1] = static_cast<std::size_t>(C);
in_lengths_host[2] = static_cast<std::size_t>(Hi); in_lengths_host[2] = static_cast<std::size_t>(Hi);
...@@ -158,9 +156,9 @@ int main(int argc, char* argv[]) ...@@ -158,9 +156,9 @@ int main(int argc, char* argv[])
out_lengths_host[1] = static_cast<std::size_t>(K); out_lengths_host[1] = static_cast<std::size_t>(K);
out_lengths_host[2] = static_cast<std::size_t>(Ho); out_lengths_host[2] = static_cast<std::size_t>(Ho);
out_lengths_host[3] = static_cast<std::size_t>(Wo); out_lengths_host[3] = static_cast<std::size_t>(Wo);
break; }
case ConvTensorLayout::NHWC: else if(layout == ConvTensorLayout::NHWC)
// NHWC {
in_lengths_host[0] = static_cast<std::size_t>(N); in_lengths_host[0] = static_cast<std::size_t>(N);
in_lengths_host[1] = static_cast<std::size_t>(Hi); in_lengths_host[1] = static_cast<std::size_t>(Hi);
in_lengths_host[2] = static_cast<std::size_t>(Wi); in_lengths_host[2] = static_cast<std::size_t>(Wi);
...@@ -173,8 +171,10 @@ int main(int argc, char* argv[]) ...@@ -173,8 +171,10 @@ int main(int argc, char* argv[])
out_lengths_host[1] = static_cast<std::size_t>(Ho); out_lengths_host[1] = static_cast<std::size_t>(Ho);
out_lengths_host[2] = static_cast<std::size_t>(Wo); out_lengths_host[2] = static_cast<std::size_t>(Wo);
out_lengths_host[3] = static_cast<std::size_t>(K); out_lengths_host[3] = static_cast<std::size_t>(K);
break; }
default: throw std::runtime_error("wrong! not implemented"); else
{
throw std::runtime_error("wrong! not implemented");
} }
Tensor<in_data_t> in(in_lengths_host); Tensor<in_data_t> in(in_lengths_host);
...@@ -228,7 +228,7 @@ int main(int argc, char* argv[]) ...@@ -228,7 +228,7 @@ int main(int argc, char* argv[])
} }
auto f_make_for_device_nchw = [&]() { auto f_make_for_device_nchw = [&]() {
#if USE_DYNAMIC_MODE #if USE_MODE
const auto in_lengths_dev = make_tuple(N, C, Hi, Wi); const auto in_lengths_dev = make_tuple(N, C, Hi, Wi);
const auto wei_lengths_dev = make_tuple(K, C, Y, X); const auto wei_lengths_dev = make_tuple(K, C, Y, X);
const auto out_lengths_dev = make_tuple(N, K, Ho, Wo); const auto out_lengths_dev = make_tuple(N, K, Ho, Wo);
...@@ -260,7 +260,7 @@ int main(int argc, char* argv[]) ...@@ -260,7 +260,7 @@ int main(int argc, char* argv[])
}; };
auto f_make_for_device_nhwc = [&]() { auto f_make_for_device_nhwc = [&]() {
#if USE_DYNAMIC_MODE #if USE_MODE
const auto in_lengths_dev = make_tuple(N, Hi, Wi, C); const auto in_lengths_dev = make_tuple(N, Hi, Wi, C);
const auto wei_lengths_dev = make_tuple(K, Y, X, C); const auto wei_lengths_dev = make_tuple(K, Y, X, C);
const auto out_lengths_dev = make_tuple(N, Ho, Wo, K); const auto out_lengths_dev = make_tuple(N, Ho, Wo, K);
...@@ -301,20 +301,19 @@ int main(int argc, char* argv[]) ...@@ -301,20 +301,19 @@ int main(int argc, char* argv[])
const auto tmp = f_make_for_device_nchw(); const auto tmp = f_make_for_device_nchw();
device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw<in_data_t, device_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw<in_data_t,
acc_data_t, acc_data_t,
out_data_t>( out_data_t>(tmp[I0],
tmp[I0], tmp[I1],
tmp[I1], tmp[I2],
tmp[I2], tmp[I3],
tmp[I3], tmp[I4],
tmp[I4], tmp[I5],
tmp[I5], tmp[I6],
tmp[I6], in,
in, wei,
wei, out_device,
out_device, nrepeat);
nrepeat);
} }
#endif #endif
...@@ -328,20 +327,19 @@ int main(int argc, char* argv[]) ...@@ -328,20 +327,19 @@ int main(int argc, char* argv[])
const auto tmp = f_make_for_device_nhwc(); const auto tmp = f_make_for_device_nhwc();
device_dynamic_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk<in_data_t, device_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk<in_data_t,
acc_data_t, acc_data_t,
out_data_t>( out_data_t>(tmp[I0],
tmp[I0], tmp[I1],
tmp[I1], tmp[I2],
tmp[I2], tmp[I3],
tmp[I3], tmp[I4],
tmp[I4], tmp[I5],
tmp[I5], tmp[I6],
tmp[I6], in,
in, wei,
wei, out_device,
out_device, nrepeat);
nrepeat);
} }
#endif #endif
...@@ -355,20 +353,19 @@ int main(int argc, char* argv[]) ...@@ -355,20 +353,19 @@ int main(int argc, char* argv[])
const auto tmp = f_make_for_device_nchw(); const auto tmp = f_make_for_device_nchw();
device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw<in_data_t, device_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw<in_data_t,
acc_data_t, acc_data_t,
out_data_t>( out_data_t>(tmp[I0],
tmp[I0], tmp[I1],
tmp[I1], tmp[I2],
tmp[I2], tmp[I3],
tmp[I3], tmp[I4],
tmp[I4], tmp[I5],
tmp[I5], tmp[I6],
tmp[I6], in,
in, wei,
wei, out_device,
out_device, nrepeat);
nrepeat);
} }
#endif #endif
...@@ -382,21 +379,20 @@ int main(int argc, char* argv[]) ...@@ -382,21 +379,20 @@ int main(int argc, char* argv[])
const auto tmp = f_make_for_device_nchw(); const auto tmp = f_make_for_device_nchw();
device_dynamic_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw<in_data_t, device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw<in_data_t,
16, 16,
acc_data_t, acc_data_t,
out_data_t>( out_data_t>(tmp[I0],
tmp[I0], tmp[I1],
tmp[I1], tmp[I2],
tmp[I2], tmp[I3],
tmp[I3], tmp[I4],
tmp[I4], tmp[I5],
tmp[I5], tmp[I6],
tmp[I6], in,
in, wei,
wei, out_device,
out_device, nrepeat);
nrepeat);
} }
#endif #endif
...@@ -410,9 +406,9 @@ int main(int argc, char* argv[]) ...@@ -410,9 +406,9 @@ int main(int argc, char* argv[])
const auto tmp = f_make_for_device_nchw(); const auto tmp = f_make_for_device_nchw();
device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw<in_data_t, device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw<in_data_t,
acc_data_t, acc_data_t,
out_data_t>( out_data_t>(
tmp[I0], tmp[I0],
tmp[I1], tmp[I1],
tmp[I2], tmp[I2],
...@@ -437,9 +433,9 @@ int main(int argc, char* argv[]) ...@@ -437,9 +433,9 @@ int main(int argc, char* argv[])
const auto tmp = f_make_for_device_nhwc(); const auto tmp = f_make_for_device_nhwc();
device_dynamic_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk<in_data_t, device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk<in_data_t,
acc_data_t, acc_data_t,
out_data_t>( out_data_t>(
tmp[I0], tmp[I0],
tmp[I1], tmp[I1],
tmp[I2], tmp[I2],
...@@ -467,7 +463,6 @@ int main(int argc, char* argv[]) ...@@ -467,7 +463,6 @@ int main(int argc, char* argv[])
check_error(out_host, out_device); check_error(out_host, out_device);
#if 0
if(do_log) if(do_log)
{ {
LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl; LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl;
...@@ -475,6 +470,5 @@ int main(int argc, char* argv[]) ...@@ -475,6 +470,5 @@ int main(int argc, char* argv[])
LogRangeAsType<float>(std::cout << "out_host : ", out_host.mData, ",") << std::endl; LogRangeAsType<float>(std::cout << "out_host : ", out_host.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "out_device: ", out_device.mData, ",") << std::endl; LogRangeAsType<float>(std::cout << "out_device: ", out_device.mData, ",") << std::endl;
} }
#endif
} }
} }
include_directories(BEFORE
include
${PROJECT_BINARY_DIR}/host/online_compile/include
${PROJECT_SOURCE_DIR}/host/online_compile/include
${PROJECT_SOURCE_DIR}/host/host_tensor/include
${PROJECT_SOURCE_DIR}/host/solver/include
${PROJECT_SOURCE_DIR}/composable_kernel/include
${PROJECT_SOURCE_DIR}/composable_kernel/include/utility
${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_description
${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_operation
${PROJECT_SOURCE_DIR}/composable_kernel/include/problem_transform
${PROJECT_SOURCE_DIR}/composable_kernel/include/driver
${PROJECT_SOURCE_DIR}/external/rocm/include
${PROJECT_SOURCE_DIR}/external/half/include
)
set(CONV_FWD_DRIVER_ONLINE_SOURCE conv_fwd_driver_online.cpp)
add_executable(conv_fwd_driver_online ${CONV_FWD_DRIVER_ONLINE_SOURCE})
target_link_libraries(conv_fwd_driver_online PRIVATE host_tensor)
target_link_libraries(conv_fwd_driver_online PRIVATE online_compile)
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <thread> // for std::thread::hardware_concurrency used below
#include <half.hpp>
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "conv_common.hpp"
#include "host_conv.hpp"
#include "device_tensor.hpp"
#include "handle.hpp"
#include "hipCheck.hpp"
#include "online_device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp"
#include "online_device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp"
#include "online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp"
#include "online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.hpp"
#define USE_CONV_FWD_V4R4_NCHW 1
#define USE_CONV_FWD_V6R1_NCHW 1
#define USE_CONV_FWD_V4R4_XDLOPS_NCHW 1
#define USE_CONV_FWD_V4R4_XDLOPS_NHWC 1
enum ConvForwardAlgo
{
V4R4NCHW, // 0
V6R1NCHW, // 1
V4R4XDLNCHW, // 2
V4R4XDLNHWC // 3
};
int main(int argc, char* argv[])
{
using namespace ck;
using namespace ck_driver;
using size_t = std::size_t;
hipStream_t stream;
online_compile::Handle* handle;
MY_HIP_CHECK(hipStreamCreate(&stream));
handle = new online_compile::Handle(stream);
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto I4 = Number<4>{};
constexpr auto I5 = Number<5>{};
constexpr auto I6 = Number<6>{};
if(argc != 22)
{
printf("arg1 to 5: layout, algo, do_verification, init_method, do_log, nrepeat\n");
printf("rest: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx\n");
exit(1);
}
const ConvTensorLayout layout = static_cast<ConvTensorLayout>(atoi(argv[1]));
const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(atoi(argv[2]));
const bool do_verification = atoi(argv[3]);
const int init_method = atoi(argv[4]);
const bool do_log = atoi(argv[5]);
const int nrepeat = atoi(argv[6]);
const index_t N = atoi(argv[7]);
const index_t K = atoi(argv[8]);
const index_t C = atoi(argv[9]);
const index_t Y = atoi(argv[10]);
const index_t X = atoi(argv[11]);
const index_t Hi = atoi(argv[12]);
const index_t Wi = atoi(argv[13]);
const index_t conv_stride_h = atoi(argv[14]);
const index_t conv_stride_w = atoi(argv[15]);
const index_t conv_dilation_h = atoi(argv[16]);
const index_t conv_dilation_w = atoi(argv[17]);
const index_t in_left_pad_h = atoi(argv[18]);
const index_t in_left_pad_w = atoi(argv[19]);
const index_t in_right_pad_h = atoi(argv[20]);
const index_t in_right_pad_w = atoi(argv[21]);
const index_t YEff = (Y - 1) * conv_dilation_h + 1;
const index_t XEff = (X - 1) * conv_dilation_w + 1;
const index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
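// For illustration: a 3x3 filter with dilation 1 gives YEff = XEff = 3; with Hi = Wi = 28,
// stride 1 and 1-pixel padding on each side, Ho = (28 + 1 + 1 - 3) / 1 + 1 = 28, i.e. a
// "same"-size output (the integer division truncates when the window does not tile evenly).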
#if 1
using in_data_t = float;
using acc_data_t = float;
using out_data_t = float;
#elif 0
using in_data_t = half_t;
using acc_data_t = float;
using out_data_t = half_t;
#elif 1
using in_data_t = int8_t;
using acc_data_t = int32_t;
using out_data_t = int8_t;
#endif
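// Only one of the data-type triples above is compiled in; as written the first branch (float)
// is active, and the later "#elif 0" (half) and "#elif 1" (int8) branches stay dead until the
// leading "#if 1" is changed.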
std::vector<std::size_t> in_lengths_host(4), wei_lengths_host(4), out_lengths_host(4);
switch(layout)
{
case ConvTensorLayout::NCHW:
// NCHW
in_lengths_host[0] = static_cast<std::size_t>(N);
in_lengths_host[1] = static_cast<std::size_t>(C);
in_lengths_host[2] = static_cast<std::size_t>(Hi);
in_lengths_host[3] = static_cast<std::size_t>(Wi);
wei_lengths_host[0] = static_cast<std::size_t>(K);
wei_lengths_host[1] = static_cast<std::size_t>(C);
wei_lengths_host[2] = static_cast<std::size_t>(Y);
wei_lengths_host[3] = static_cast<std::size_t>(X);
out_lengths_host[0] = static_cast<std::size_t>(N);
out_lengths_host[1] = static_cast<std::size_t>(K);
out_lengths_host[2] = static_cast<std::size_t>(Ho);
out_lengths_host[3] = static_cast<std::size_t>(Wo);
break;
case ConvTensorLayout::NHWC:
// NHWC
in_lengths_host[0] = static_cast<std::size_t>(N);
in_lengths_host[1] = static_cast<std::size_t>(Hi);
in_lengths_host[2] = static_cast<std::size_t>(Wi);
in_lengths_host[3] = static_cast<std::size_t>(C);
wei_lengths_host[0] = static_cast<std::size_t>(K);
wei_lengths_host[1] = static_cast<std::size_t>(Y);
wei_lengths_host[2] = static_cast<std::size_t>(X);
wei_lengths_host[3] = static_cast<std::size_t>(C);
out_lengths_host[0] = static_cast<std::size_t>(N);
out_lengths_host[1] = static_cast<std::size_t>(Ho);
out_lengths_host[2] = static_cast<std::size_t>(Wo);
out_lengths_host[3] = static_cast<std::size_t>(K);
break;
default: throw std::runtime_error("wrong! not implemented");
}
Tensor<in_data_t> in(in_lengths_host);
Tensor<in_data_t> wei(wei_lengths_host);
Tensor<out_data_t> out_host(out_lengths_host);
Tensor<out_data_t> out_device(out_lengths_host);
std::cout << "layout: " << layout << std::endl;
ostream_HostTensorDescriptor(in.mDesc, std::cout << "in: ");
ostream_HostTensorDescriptor(wei.mDesc, std::cout << "wei: ");
ostream_HostTensorDescriptor(out_host.mDesc, std::cout << "out: ");
print_array("InLeftPads", make_tuple(in_left_pad_h, in_left_pad_w));
print_array("InRightPads", make_tuple(in_right_pad_h, in_right_pad_w));
print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));
std::size_t num_thread = std::thread::hardware_concurrency();
switch(init_method)
{
case 0:
// no initialization
break;
case 1:
in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
break;
case 2:
in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
break;
case 3:
in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
break;
case 4:
in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
break;
case 5:
in.GenerateTensorValue(GeneratorTensor_3<float>{0.0, 1.0}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_3<float>{-0.5, 0.5}, num_thread);
break;
default:
in.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread);
auto gen_wei = [](auto... is) {
return GeneratorTensor_2{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...);
};
wei.GenerateTensorValue(gen_wei, num_thread);
}
auto f_make_for_device_nchw = [&]() {
const auto in_lengths_dev = make_tuple(N, C, Hi, Wi);
const auto wei_lengths_dev = make_tuple(K, C, Y, X);
const auto out_lengths_dev = make_tuple(N, K, Ho, Wo);
return make_tuple(in_lengths_dev, wei_lengths_dev, out_lengths_dev);
};
auto f_make_for_device_nhwc = [&]() {
const auto in_lengths_dev = make_tuple(N, Hi, Wi, C);
const auto wei_lengths_dev = make_tuple(K, Y, X, C);
const auto out_lengths_dev = make_tuple(N, Ho, Wo, K);
return make_tuple(in_lengths_dev, wei_lengths_dev, out_lengths_dev);
};
const auto conv_strides = make_tuple(conv_stride_h, conv_stride_w);
const auto conv_dilations = make_tuple(conv_dilation_h, conv_dilation_w);
const auto in_left_pads = make_tuple(in_left_pad_h, in_left_pad_w);
const auto in_right_pads = make_tuple(in_right_pad_h, in_right_pad_w);
#if USE_CONV_FWD_V4R4_NCHW
if(algo == ConvForwardAlgo::V4R4NCHW)
{
if(layout != ConvTensorLayout::NCHW)
{
throw std::runtime_error("wrong! layout");
}
const auto tmp = f_make_for_device_nchw();
tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw* tunable =
&default_tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw;
online_device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw<
in_data_t,
acc_data_t,
out_data_t>(handle,
tmp[I0],
tmp[I1],
tmp[I2],
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads,
in,
wei,
out_device,
tunable,
nrepeat);
}
#endif
#if USE_CONV_FWD_V6R1_NCHW
if(algo == ConvForwardAlgo::V6R1NCHW)
{
if(layout != ConvTensorLayout::NCHW)
{
throw std::runtime_error("wrong! layout");
}
const auto tmp = f_make_for_device_nchw();
#if 1
const CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw compile_param = {
get_datatype_enum_from_type<in_data_t>::value,
get_datatype_enum_from_type<acc_data_t>::value,
get_datatype_enum_from_type<out_data_t>::value,
256,
4,
1,
128,
32,
8,
4,
4,
1,
{8, 2},
{8, 2},
{4, 1, 1, 1, 1},
{2, 1, 1, 128, 1},
{4, 1, 1, 1, 1},
{1, 1, 1, 1, 1},
{1, 4, 1, 1, 1},
{8, 1, 1, 32, 1},
{1, 1, 1, 1, 1},
{1, 1, 1, 1, 1},
4,
true,
true};
#elif 0
const CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw compile_param = {
get_datatype_enum_from_type<in_data_t>::value,
get_datatype_enum_from_type<acc_data_t>::value,
get_datatype_enum_from_type<out_data_t>::value,
256,
4,
2,
128,
32,
8,
4,
4,
1,
{8, 2},
{8, 2},
{4, 1, 1, 1, 2},
{2, 1, 1, 128, 1},
{4, 1, 1, 1, 1},
{1, 1, 1, 1, 1},
{1, 4, 1, 1, 2},
{8, 1, 1, 32, 1},
{1, 1, 1, 1, 1},
{1, 1, 1, 1, 1},
4,
true,
true};
#elif 1
const CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw compile_param = {
get_datatype_enum_from_type<in_data_t>::value,
get_datatype_enum_from_type<acc_data_t>::value,
get_datatype_enum_from_type<out_data_t>::value,
256,
4,
4,
128,
32,
8,
4,
4,
1,
{8, 2},
{8, 2},
{4, 1, 1, 1, 4},
{2, 1, 1, 128, 1},
{4, 1, 1, 1, 1},
{1, 1, 1, 1, 1},
{1, 4, 1, 1, 4},
{8, 1, 1, 32, 1},
{1, 1, 1, 1, 1},
{1, 1, 1, 1, 1},
4,
true,
true};
#endif
online_device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw<
in_data_t,
acc_data_t,
out_data_t>(handle,
tmp[I0],
tmp[I1],
tmp[I2],
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads,
in,
wei,
out_device,
compile_param,
nrepeat);
}
#endif
#if USE_CONV_FWD_V4R4_XDLOPS_NCHW
if(algo == ConvForwardAlgo::V4R4XDLNCHW)
{
if(layout != ConvTensorLayout::NCHW)
{
throw std::runtime_error("wrong! layout");
}
const auto tmp = f_make_for_device_nchw();
tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw* tunable =
&default_tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw;
online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw<
in_data_t,
acc_data_t,
out_data_t>(handle,
tmp[I0],
tmp[I1],
tmp[I2],
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads,
in,
wei,
out_device,
tunable,
nrepeat);
}
#endif
#if USE_CONV_FWD_V4R4_XDLOPS_NHWC
if(algo == ConvForwardAlgo::V4R4XDLNHWC)
{
if(layout != ConvTensorLayout::NHWC)
{
throw std::runtime_error("wrong! layout");
}
const auto tmp = f_make_for_device_nhwc();
tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk* tunable =
&default_tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk;
online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk<
in_data_t,
acc_data_t,
out_data_t>(handle,
tmp[I0],
tmp[I1],
tmp[I2],
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads,
in,
wei,
out_device,
tunable,
nrepeat);
}
#endif
if(do_verification)
{
host_direct_convolution(in,
wei,
out_host,
make_tuple(conv_stride_h, conv_stride_w),
make_tuple(conv_dilation_h, conv_dilation_w),
make_tuple(in_left_pad_h, in_left_pad_w),
make_tuple(in_right_pad_h, in_right_pad_w),
layout);
check_error(out_host, out_device);
#if 0
if(do_log)
{
LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "wei: ", wei.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "out_host : ", out_host.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "out_device: ", out_device.mData, ",") << std::endl;
}
#endif
}
delete handle;
MY_HIP_CHECK(hipStreamDestroy(stream));
}
#pragma once
#include "device.hpp"
#include "host_tensor.hpp"
#include "handle.hpp"
#include "online_driver_common.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
#include "transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw.hpp"
#include "conv_tunable_fwd_v4r4_dlops_nchw_kcyx_nkhw.hpp"
namespace detail_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw {
template <typename TInWei, typename TAcc, typename TOut>
static std::string get_network_config_string_from_types()
{
using namespace ck;
std::string out;
out += std::to_string(get_datatype_enum_from_type<TInWei>::value) + "_" +
std::to_string(get_datatype_enum_from_type<TAcc>::value) + "_" +
std::to_string(get_datatype_enum_from_type<TOut>::value);
return (out);
};
static std::string
get_network_config_string_from_tunable(const tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw* pt)
{
std::string out("TUN_");
out += std::to_string(pt->BlockSize) + "_";
out += std::to_string(pt->MPerBlock) + "x" + std::to_string(pt->NPerBlock) + "x" +
std::to_string(pt->KPerBlock) + "_";
out += std::to_string(pt->M1PerThread) + "x" + std::to_string(pt->N1PerThread) + "x" +
std::to_string(pt->KPerThread) + "_";
out += std::to_string(pt->M1N1ThreadClusterM10) + "x" +
std::to_string(pt->M1N1ThreadClusterN10) + "x" +
std::to_string(pt->M1N1ThreadClusterM11) + "x" +
std::to_string(pt->M1N1ThreadClusterN11) + "_";
out += std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[0]) + "x" +
std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[1]) + "x" +
std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[2]) + "_";
out += std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[0]) + "x" +
std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[1]) + "x" +
std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[2]) + "_";
out += std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "x" +
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "x" +
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]) + "_";
out += std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "x" +
std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "x" +
std::to_string(pt->ABlockTransferSrcAccessOrder[2]) + "_";
out += std::to_string(pt->ABlockTransferSrcVectorDim) + "_";
out += std::to_string(pt->ABlockTransferSrcScalarPerVector) + "_";
out += std::to_string(pt->ABlockTransferDstScalarPerVector_M1) + "_";
out += std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun) + "_";
out += std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[0]) + "x" +
std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[1]) + "x" +
std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[2]) + "_";
out += std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[0]) + "x" +
std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[1]) + "x" +
std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[2]) + "_";
out += std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "x" +
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "x" +
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]) + "_";
out += std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "x" +
std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "x" +
std::to_string(pt->BBlockTransferSrcAccessOrder[2]) + "_";
out += std::to_string(pt->BBlockTransferSrcVectorDim) + "_";
out += std::to_string(pt->BBlockTransferSrcScalarPerVector) + "_";
out += std::to_string(pt->BBlockTransferDstScalarPerVector_N1) + "_";
out += std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun) + "_";
out += std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]) + "_";
out += std::to_string(pt->CThreadTransferSrcDstVectorDim) + "_";
out += std::to_string(pt->CThreadTransferDstScalarPerVector);
return (out);
};
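// The string produced above has the shape (values are taken from the tunable, not fixed here):
//   TUN_<BlockSize>_<MPerBlock>x<NPerBlock>x<KPerBlock>_<M1PerThread>x<N1PerThread>x<KPerThread>_...
// and, together with the data-type string, uniquely names one kernel configuration.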
template <typename TInWei, typename TAcc, typename TOut>
static std::string get_definition_string_from_types()
{
using namespace ck;
std::string out;
out +=
" -DCK_PARAM_ABDataTypeEnum=" + std::to_string(get_datatype_enum_from_type<TInWei>::value) +
" -DCK_PARAM_AccDataTypeEnum=" + std::to_string(get_datatype_enum_from_type<TAcc>::value) +
" -DCK_PARAM_CDataTypeEnum=" + std::to_string(get_datatype_enum_from_type<TOut>::value);
return (out);
};
static std::string
get_definition_string_from_tunable(const tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw* pt)
{
std::string out;
out += " -DCK_PARAM_BlockSize=" + std::to_string(pt->BlockSize);
out += " -DCK_PARAM_MPerBlock=" + std::to_string(pt->MPerBlock) +
" -DCK_PARAM_NPerBlock=" + std::to_string(pt->NPerBlock) +
" -DCK_PARAM_KPerBlock=" + std::to_string(pt->KPerBlock);
out += " -DCK_PARAM_M1PerThread=" + std::to_string(pt->M1PerThread) +
" -DCK_PARAM_N1PerThread=" + std::to_string(pt->N1PerThread) +
" -DCK_PARAM_KPerThread=" + std::to_string(pt->KPerThread);
out += " -DCK_PARAM_M1N1ThreadClusterM10=" + std::to_string(pt->M1N1ThreadClusterM10) +
" -DCK_PARAM_M1N1ThreadClusterN10=" + std::to_string(pt->M1N1ThreadClusterN10) +
" -DCK_PARAM_M1N1ThreadClusterM11=" + std::to_string(pt->M1N1ThreadClusterM11) +
" -DCK_PARAM_M1N1ThreadClusterN11=" + std::to_string(pt->M1N1ThreadClusterN11);
out += " -DCK_PARAM_ABlockTransferThreadSliceLengths_K_M0_M1=" +
std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[0]) + "," +
std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[1]) + "," +
std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[2]);
out += " -DCK_PARAM_ABlockTransferThreadClusterLengths_K_M0_M1=" +
std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[0]) + "," +
std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[1]) + "," +
std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[2]);
out += " -DCK_PARAM_ABlockTransferThreadClusterArrangeOrder=" +
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "," +
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "," +
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]);
out += " -DCK_PARAM_ABlockTransferSrcAccessOrder=" +
std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "," +
std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "," +
std::to_string(pt->ABlockTransferSrcAccessOrder[2]);
out +=
" -DCK_PARAM_ABlockTransferSrcVectorDim=" + std::to_string(pt->ABlockTransferSrcVectorDim);
out += " -DCK_PARAM_ABlockTransferSrcScalarPerVector=" +
std::to_string(pt->ABlockTransferSrcScalarPerVector);
out += " -DCK_PARAM_ABlockTransferDstScalarPerVector_M1=" +
std::to_string(pt->ABlockTransferDstScalarPerVector_M1);
out += " -DCK_PARAM_AThreadTransferSrcResetCoordinateAfterRun=" +
std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun);
out += " -DCK_PARAM_BBlockTransferThreadSliceLengths_K_N0_N1=" +
std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[0]) + "," +
std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[1]) + "," +
std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[2]);
out += " -DCK_PARAM_BBlockTransferThreadClusterLengths_K_N0_N1=" +
std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[0]) + "," +
std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[1]) + "," +
std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[2]);
out += " -DCK_PARAM_BBlockTransferThreadClusterArrangeOrder=" +
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "," +
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "," +
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]);
out += " -DCK_PARAM_BBlockTransferSrcAccessOrder=" +
std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "," +
std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "," +
std::to_string(pt->BBlockTransferSrcAccessOrder[2]);
out +=
" -DCK_PARAM_BBlockTransferSrcVectorDim=" + std::to_string(pt->BBlockTransferSrcVectorDim);
out += " -DCK_PARAM_BBlockTransferSrcScalarPerVector=" +
std::to_string(pt->BBlockTransferSrcScalarPerVector);
out += " -DCK_PARAM_BBlockTransferDstScalarPerVector_N1=" +
std::to_string(pt->BBlockTransferDstScalarPerVector_N1);
out += " -DCK_PARAM_BThreadTransferSrcResetCoordinateAfterRun=" +
std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun);
out += " -DCK_PARAM_CThreadTransferSrcDstAccessOrder=" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]);
out += " -DCK_PARAM_CThreadTransferSrcDstVectorDim=" +
std::to_string(pt->CThreadTransferSrcDstVectorDim);
out += " -DCK_PARAM_CThreadTransferDstScalarPerVector=" +
std::to_string(pt->CThreadTransferDstScalarPerVector);
return (out);
};
} // namespace detail_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw
template <typename TInWei,
typename TAcc,
typename TOut,
typename InLengths,
typename WeiLengths,
typename OutLengths,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void online_device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw(
online_compile::Handle* handle,
const InLengths& in_n_c_hi_wi_lengths,
const WeiLengths& wei_k_c_y_x_lengths,
const OutLengths& out_n_k_ho_wo_lengths,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads& in_right_pads,
const Tensor<TInWei>& in_n_c_hi_wi,
const Tensor<TInWei>& wei_k_c_y_x,
Tensor<TOut>& out_n_k_ho_wo,
const tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw* tunable,
ck::index_t nrepeat)
{
using namespace ck;
using namespace ck_driver;
using namespace detail_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw;
using size_t = std::size_t;
/////////////////////////////////////////////////////////////////////////////////////////////////////////////
// The following code is only used to compute grid_size, hasMainKBlockLoop and
// hasDoubleTailKBlockLoop
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
const auto in_n_c_hi_wi_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(in_n_c_hi_wi_lengths);
const auto wei_k_c_y_x_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_c_y_x_lengths);
const auto out_n_k_ho_wo_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(out_n_k_ho_wo_lengths);
const auto descs =
transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw_pad(wei_k_c_y_x_desc,
in_n_c_hi_wi_desc,
out_n_k_ho_wo_desc,
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads);
const auto a_k_m_grid_desc = descs[I0];
const auto c_m_n_grid_desc = descs[I2];
const auto M = c_m_n_grid_desc.GetLength(I0);
const auto N = c_m_n_grid_desc.GetLength(I1);
const auto K = a_k_m_grid_desc.GetLength(I0);
const index_t grid_size = (M / tunable->MPerBlock) * (N / tunable->NPerBlock);
const bool hasMainKBlockLoop = ((K + tunable->KPerBlock) / (2 * tunable->KPerBlock) > 1);
const bool hasDoubleTailKBlockLoop = ((K / tunable->KPerBlock) % 2 == 0);
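// For illustration (numbers are examples, not from this file): with MPerBlock = NPerBlock = 128,
// KPerBlock = 8 and a GEMM of M = 256, N = 100352, K = 1728, grid_size = 2 * 784 = 1568
// workgroups; hasMainKBlockLoop holds because more than one double-buffered 2*KPerBlock step
// fits in K, and hasDoubleTailKBlockLoop holds because K / KPerBlock = 216 is even.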
/////////////////////////////////////////////////////////////////////////////////////////////////////////////
// these buffers are usually provided by the user application
DeviceMem in_n_c_hi_wi_dev_buf(sizeof(TInWei) * in_n_c_hi_wi.mDesc.GetElementSpace());
DeviceMem wei_k_c_y_x_dev_buf(sizeof(TInWei) * wei_k_c_y_x.mDesc.GetElementSpace());
DeviceMem out_n_k_ho_wo_dev_buf(sizeof(TOut) * out_n_k_ho_wo.mDesc.GetElementSpace());
in_n_c_hi_wi_dev_buf.ToDevice(in_n_c_hi_wi.mData.data());
wei_k_c_y_x_dev_buf.ToDevice(wei_k_c_y_x.mData.data());
out_n_k_ho_wo_dev_buf.ToDevice(out_n_k_ho_wo.mData.data());
// these are workspace buffers that should be exposed to the user through the corresponding
// workspace API
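// The *_prepare kernel writes the device-side descriptors into this 4096-byte scratch buffer at
// fixed 1024-byte offsets: the A (k_m0_m1) descriptor at 0, the B (k_n0_n1) descriptor at 1024,
// the C (m0_m10_m11_n0_n10_n11) descriptor at 2048 and the block-id-to-tile adaptor at 3072;
// the main GEMM kernel then reads them back from the same addresses.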
DeviceMem workspace_buf(4096);
void* a_k_m0_m1_grid_desc_dev_buf = workspace_buf.GetDeviceBuffer();
void* b_k_n0_n1_grid_desc_dev_buf =
static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 1024);
void* c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf =
static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 2048);
void* c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf =
static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 3072);
const std::vector<size_t> vld = {static_cast<size_t>(tunable->BlockSize), 1, 1};
const std::vector<size_t> vgd1 = {static_cast<size_t>(tunable->BlockSize), 1, 1};
const std::vector<size_t> vgd2 = {static_cast<size_t>(grid_size * tunable->BlockSize), 1, 1};
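// vld is the workgroup (local) size; vgd1 launches a single workgroup for the prepare kernel,
// while vgd2 launches grid_size workgroups for the main GEMM kernel.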
std::string program_name =
"dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.cpp";
std::string algo_name = "implicit_gemm_conv_fwd_v4r4_dlops_nchw";
std::string param = " -std=c++17 ";
std::string network_config;
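// param collects the -DCK_PARAM_* macro definitions handed to the online compiler when it builds
// program_name; network_config tags this exact configuration and is presumably used by
// Handle::AddKernel as the cache key for the compiled kernel.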
param += get_definition_string_from_types<TInWei, TAcc, TOut>() + " " +
get_definition_string_from_tunable(tunable) +
" -DCK_PARAM_HAS_MAIN_KBLOCK_LOOP=" + std::to_string(hasMainKBlockLoop) +
" -DCK_PARAM_HAS_DOUBLE_TAIL_KBLOCK_LOOP=" + std::to_string(hasDoubleTailKBlockLoop);
network_config = get_network_config_string_from_types<TInWei, TAcc, TOut>() + "_" +
get_network_config_string_from_tunable(tunable) + "_" +
std::to_string(hasMainKBlockLoop) + "_" +
std::to_string(hasDoubleTailKBlockLoop);
std::vector<float> kernel1_times;
std::vector<float> kernel2_times;
for(index_t i = 0; i < nrepeat; ++i)
{
KernelTimer timer1, timer2;
std::string kernel_name;
kernel_name = "dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw_prepare";
auto network_config_1 = network_config + "_1";
timer1.Start();
handle->AddKernel(algo_name, network_config_1, program_name, kernel_name, vld, vgd1, param)(
static_cast<index_t>(in_n_c_hi_wi_lengths[I0]),
static_cast<index_t>(in_n_c_hi_wi_lengths[I1]),
static_cast<index_t>(in_n_c_hi_wi_lengths[I2]),
static_cast<index_t>(in_n_c_hi_wi_lengths[I3]),
static_cast<index_t>(wei_k_c_y_x_lengths[I0]),
static_cast<index_t>(wei_k_c_y_x_lengths[I2]),
static_cast<index_t>(wei_k_c_y_x_lengths[I3]),
conv_strides[I0],
conv_strides[I1],
conv_dilations[I0],
conv_dilations[I1],
in_left_pads[I0],
in_left_pads[I1],
in_right_pads[I0],
in_right_pads[I1],
a_k_m0_m1_grid_desc_dev_buf,
b_k_n0_n1_grid_desc_dev_buf,
c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf,
c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf);
timer1.End();
kernel_name = "dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw";
auto network_config_2 = network_config + "_2";
timer2.Start();
handle->AddKernel(algo_name, network_config_2, program_name, kernel_name, vld, vgd2, param)(
reinterpret_cast<const TInWei*>(wei_k_c_y_x_dev_buf.GetDeviceBuffer()),
reinterpret_cast<const TInWei*>(in_n_c_hi_wi_dev_buf.GetDeviceBuffer()),
reinterpret_cast<TOut*>(out_n_k_ho_wo_dev_buf.GetDeviceBuffer()),
(const void*)(a_k_m0_m1_grid_desc_dev_buf),
(const void*)(b_k_n0_n1_grid_desc_dev_buf),
(const void*)(c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf),
(const void*)(c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf));
timer2.End();
kernel1_times.push_back(timer1.GetElapsedTime());
kernel2_times.push_back(timer2.GetElapsedTime());
}
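// The first timed iteration is dropped from the averages below (std::next skips it), treating it
// as warm-up that also absorbs the online-compilation cost; this assumes nrepeat > 1, otherwise
// the division by (nrepeat - 1) is a division by zero.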
{
auto ave_time1 =
std::accumulate(
std::next(kernel1_times.begin()), kernel1_times.end(), 0., std::plus<float>{}) /
(nrepeat - 1);
auto ave_time2 =
std::accumulate(
std::next(kernel2_times.begin()), kernel2_times.end(), 0., std::plus<float>{}) /
(nrepeat - 1);
const auto N = in_n_c_hi_wi_lengths[I0];
const auto C = in_n_c_hi_wi_lengths[I1];
const auto K = out_n_k_ho_wo_lengths[I1];
const auto Ho = out_n_k_ho_wo_lengths[I2];
const auto Wo = out_n_k_ho_wo_lengths[I3];
const auto Y = wei_k_c_y_x_lengths[I2];
const auto X = wei_k_c_y_x_lengths[I3];
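// Forward convolution performs 2 * N * K * Ho * Wo * C * Y * X flops (one multiply and one add
// per filter tap and output element); dividing that by 1e9 gives GFLOP, and GFLOP per
// millisecond equals TFlop/s, which is what is printed below.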
float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) /
(std::size_t(1000) * 1000 * 1000) / (ave_time1 + ave_time2);
std::cout << "Average time : " << ave_time1 + ave_time2 << " ms(" << ave_time1 << ", "
<< ave_time2 << "), " << perf << " TFlop/s" << std::endl;
};
// copy result back to host
out_n_k_ho_wo_dev_buf.FromDevice(out_n_k_ho_wo.mData.data());
}
#include "device.hpp"
#include "host_tensor.hpp"
#include "handle.hpp"
#include "online_driver_common.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
#include "conv_tunable_fwd_v4r4_xdlops_nchw_kcyx_nkhw.hpp"
namespace detail_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw {
template <typename TInWei, typename TAcc, typename TOut>
static std::string get_network_config_string_from_types()
{
using namespace ck;
std::string out;
out += std::to_string(get_datatype_enum_from_type<TInWei>::value) + "_" +
std::to_string(get_datatype_enum_from_type<TAcc>::value) + "_" +
std::to_string(get_datatype_enum_from_type<TOut>::value);
return (out);
};
static std::string
get_network_config_string_from_tunable(const tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw* pt)
{
std::string out("TUN_");
out += std::to_string(pt->BlockSize) + "_";
out += std::to_string(pt->MPerBlock) + "x" + std::to_string(pt->NPerBlock) + "x" +
std::to_string(pt->KPerBlock) + "_";
out += std::to_string(pt->MPerWave) + "x" + std::to_string(pt->NPerWave) + "x" +
std::to_string(pt->MRepeat) + "x" + std::to_string(pt->NRepeat) + "x" +
std::to_string(pt->K1) + "_";
out += std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[0]) + "x" +
std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[1]) + "x" +
std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[2]) + "_";
out += std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[0]) + "x" +
std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[1]) + "x" +
std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[2]) + "_";
out += std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "x" +
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "x" +
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]) + "_";
out += std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "x" +
std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "x" +
std::to_string(pt->ABlockTransferSrcAccessOrder[2]) + "_";
out += std::to_string(pt->ABlockTransferSrcVectorDim) + "_";
out += std::to_string(pt->ABlockTransferSrcScalarPerVector) + "_";
out += std::to_string(pt->ABlockTransferDstScalarPerVector_K1) + "_";
out += std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun) + "_";
out += std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[0]) + "x" +
std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[1]) + "x" +
std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[2]) + "_";
out += std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[0]) + "x" +
std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[1]) + "x" +
std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[2]) + "_";
out += std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "x" +
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "x" +
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]) + "_";
out += std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "x" +
std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "x" +
std::to_string(pt->BBlockTransferSrcAccessOrder[2]) + "_";
out += std::to_string(pt->BBlockTransferSrcVectorDim) + "_";
out += std::to_string(pt->BBlockTransferSrcScalarPerVector) + "_";
out += std::to_string(pt->BBlockTransferDstScalarPerVector_K1) + "_";
out += std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun) + "_";
out += std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[6]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[7]) + "_";
out += std::to_string(pt->CThreadTransferSrcDstVectorDim) + "_";
out += std::to_string(pt->CThreadTransferDstScalarPerVector);
return (out);
};
template <typename TInWei, typename TAcc, typename TOut>
static std::string get_definition_string_from_types()
{
using namespace ck;
std::string out;
out +=
" -DCK_PARAM_ABDataTypeEnum=" + std::to_string(get_datatype_enum_from_type<TInWei>::value) +
" -DCK_PARAM_AccDataTypeEnum=" + std::to_string(get_datatype_enum_from_type<TAcc>::value) +
" -DCK_PARAM_CDataTypeEnum=" + std::to_string(get_datatype_enum_from_type<TOut>::value);
return (out);
};
static std::string
get_definition_string_from_tunable(const tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw* pt)
{
std::string out;
out += " -DCK_PARAM_BlockSize=" + std::to_string(pt->BlockSize);
out += " -DCK_PARAM_MPerBlock=" + std::to_string(pt->MPerBlock) +
" -DCK_PARAM_NPerBlock=" + std::to_string(pt->NPerBlock) +
" -DCK_PARAM_KPerBlock=" + std::to_string(pt->KPerBlock);
out += " -DCK_PARAM_MPerWave=" + std::to_string(pt->MPerWave) +
" -DCK_PARAM_NPerWave=" + std::to_string(pt->NPerWave) +
" -DCK_PARAM_K1=" + std::to_string(pt->K1) +
" -DCK_PARAM_MRepeat=" + std::to_string(pt->MRepeat) +
" -DCK_PARAM_NRepeat=" + std::to_string(pt->NRepeat);
out += " -DCK_PARAM_ABlockTransferThreadSliceLengths_K0_M_K1=" +
std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[0]) + "," +
std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[1]) + "," +
std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[2]);
out += " -DCK_PARAM_ABlockTransferThreadClusterLengths_K0_M_K1=" +
std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[0]) + "," +
std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[1]) + "," +
std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[2]);
out += " -DCK_PARAM_ABlockTransferThreadClusterArrangeOrder=" +
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "," +
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "," +
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]);
out += " -DCK_PARAM_ABlockTransferSrcAccessOrder=" +
std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "," +
std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "," +
std::to_string(pt->ABlockTransferSrcAccessOrder[2]);
out +=
" -DCK_PARAM_ABlockTransferSrcVectorDim=" + std::to_string(pt->ABlockTransferSrcVectorDim);
out += " -DCK_PARAM_ABlockTransferSrcScalarPerVector=" +
std::to_string(pt->ABlockTransferSrcScalarPerVector);
out += " -DCK_PARAM_ABlockTransferDstScalarPerVector_K1=" +
std::to_string(pt->ABlockTransferDstScalarPerVector_K1);
out += " -DCK_PARAM_AThreadTransferSrcResetCoordinateAfterRun=" +
std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun);
out += " -DCK_PARAM_BBlockTransferThreadSliceLengths_K0_N_K1=" +
std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[0]) + "," +
std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[1]) + "," +
std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[2]);
out += " -DCK_PARAM_BBlockTransferThreadClusterLengths_K0_N_K1=" +
std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[0]) + "," +
std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[1]) + "," +
std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[2]);
out += " -DCK_PARAM_BBlockTransferThreadClusterArrangeOrder=" +
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "," +
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "," +
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]);
out += " -DCK_PARAM_BBlockTransferSrcAccessOrder=" +
std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "," +
std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "," +
std::to_string(pt->BBlockTransferSrcAccessOrder[2]);
out +=
" -DCK_PARAM_BBlockTransferSrcVectorDim=" + std::to_string(pt->BBlockTransferSrcVectorDim);
out += " -DCK_PARAM_BBlockTransferSrcScalarPerVector=" +
std::to_string(pt->BBlockTransferSrcScalarPerVector);
out += " -DCK_PARAM_BBlockTransferDstScalarPerVector_K1=" +
std::to_string(pt->BBlockTransferDstScalarPerVector_K1);
out += " -DCK_PARAM_BThreadTransferSrcResetCoordinateAfterRun=" +
std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun);
out += " -DCK_PARAM_CThreadTransferSrcDstAccessOrder=" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[6]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[7]);
out += " -DCK_PARAM_CThreadTransferSrcDstVectorDim=" +
std::to_string(pt->CThreadTransferSrcDstVectorDim);
out += " -DCK_PARAM_CThreadTransferDstScalarPerVector=" +
std::to_string(pt->CThreadTransferDstScalarPerVector);
return (out);
};
} // namespace detail_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw
template <typename TInWei,
typename TAcc,
typename TOut,
typename InLengths,
typename WeiLengths,
typename OutLengths,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw(
online_compile::Handle* handle,
const InLengths& in_n_c_hi_wi_lengths,
const WeiLengths& wei_k_c_y_x_lengths,
const OutLengths& out_n_k_ho_wo_lengths,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads& in_right_pads,
const Tensor<TInWei>& in_n_c_hi_wi,
const Tensor<TInWei>& wei_k_c_y_x,
Tensor<TOut>& out_n_k_ho_wo,
const tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw* tunable,
ck::index_t nrepeat)
{
using namespace ck;
using namespace ck_driver;
using namespace detail_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw;
using size_t = std::size_t;
/////////////////////////////////////////////////////////////////////////////////////////////////////////////
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
const auto in_n_c_hi_wi_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(in_n_c_hi_wi_lengths);
const auto wei_k_c_y_x_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_c_y_x_lengths);
const auto out_n_k_ho_wo_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(out_n_k_ho_wo_lengths);
const auto n = in_n_c_hi_wi_desc.GetLength(I0);
const auto c = in_n_c_hi_wi_desc.GetLength(I1);
const auto hi = in_n_c_hi_wi_desc.GetLength(I2);
const auto wi = in_n_c_hi_wi_desc.GetLength(I3);
const auto k = wei_k_c_y_x_desc.GetLength(I0);
const auto y = wei_k_c_y_x_desc.GetLength(I2);
const auto x = wei_k_c_y_x_desc.GetLength(I3);
const auto ho = out_n_k_ho_wo_desc.GetLength(I2);
const auto wo = out_n_k_ho_wo_desc.GetLength(I3);
const auto M = k;
const auto N = n * ho * wo;
const auto K = c * y * x;
const auto K0 = K / tunable->K1;
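// Implicit-GEMM view of the forward convolution: M = k (output channels), N = n * ho * wo (one
// column per output pixel in the batch) and K = c * y * x (one reduction entry per input channel
// and filter tap); K1 is the inner split of the reduction dimension used by the xdlops layout
// (K0 x M x K1), so K0 = K / K1 counts the outer reduction steps.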
const index_t grid_size = (M / tunable->MPerBlock) * (N / tunable->NPerBlock);
/////////////////////////////////////////////////////////////////////////////////////////////////////////////
// these buffers are usually provided by the user application
DeviceMem in_n_c_hi_wi_dev_buf(sizeof(TInWei) * in_n_c_hi_wi.mDesc.GetElementSpace());
DeviceMem wei_k_c_y_x_dev_buf(sizeof(TInWei) * wei_k_c_y_x.mDesc.GetElementSpace());
DeviceMem out_n_k_ho_wo_dev_buf(sizeof(TOut) * out_n_k_ho_wo.mDesc.GetElementSpace());
in_n_c_hi_wi_dev_buf.ToDevice(in_n_c_hi_wi.mData.data());
wei_k_c_y_x_dev_buf.ToDevice(wei_k_c_y_x.mData.data());
out_n_k_ho_wo_dev_buf.ToDevice(out_n_k_ho_wo.mData.data());
// these are workspace buffers that should be exposed to the user through the corresponding
// workspace API
DeviceMem workspace_buf(4096);
void* a_k_m0_m1_grid_desc_dev_buf = workspace_buf.GetDeviceBuffer();
void* b_k_n0_n1_grid_desc_dev_buf =
static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 1024);
void* c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf =
static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 2048);
void* c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf =
static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 3072);
const std::vector<size_t> vld = {static_cast<size_t>(tunable->BlockSize), 1, 1};
const std::vector<size_t> vgd1 = {static_cast<size_t>(tunable->BlockSize), 1, 1};
const std::vector<size_t> vgd2 = {static_cast<size_t>(grid_size * tunable->BlockSize), 1, 1};
std::string program_name =
"dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.cpp";
std::string algo_name = "implicit_gemm_conv_fwd_v4r4_xdlops_nchw";
std::string param = " -std=c++17 ";
std::string network_config;
param += get_definition_string_from_types<TInWei, TAcc, TOut>() + " " + " -DCK_USE_AMD_XDLOPS" +
get_definition_string_from_tunable(tunable);
network_config = get_network_config_string_from_types<TInWei, TAcc, TOut>() + "_" +
get_network_config_string_from_tunable(tunable);
std::vector<float> kernel1_times;
std::vector<float> kernel2_times;
for(index_t i = 0; i < nrepeat; ++i)
{
KernelTimer timer1, timer2;
std::string kernel_name;
kernel_name =
"dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw_prepare";
auto network_config_1 = network_config + "_1";
timer1.Start();
handle->AddKernel(algo_name, network_config_1, program_name, kernel_name, vld, vgd1, param)(
static_cast<index_t>(in_n_c_hi_wi_lengths[I0]),
static_cast<index_t>(in_n_c_hi_wi_lengths[I1]),
static_cast<index_t>(in_n_c_hi_wi_lengths[I2]),
static_cast<index_t>(in_n_c_hi_wi_lengths[I3]),
static_cast<index_t>(wei_k_c_y_x_lengths[I0]),
static_cast<index_t>(wei_k_c_y_x_lengths[I2]),
static_cast<index_t>(wei_k_c_y_x_lengths[I3]),
conv_strides[I0],
conv_strides[I1],
conv_dilations[I0],
conv_dilations[I1],
in_left_pads[I0],
in_left_pads[I1],
in_right_pads[I0],
in_right_pads[I1],
a_k_m0_m1_grid_desc_dev_buf,
b_k_n0_n1_grid_desc_dev_buf,
c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf,
c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf);
timer1.End();
kernel_name = "dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw";
auto network_config_2 = network_config + "_2";
timer2.Start();
handle->AddKernel(algo_name, network_config_2, program_name, kernel_name, vld, vgd2, param)(
reinterpret_cast<const TInWei*>(wei_k_c_y_x_dev_buf.GetDeviceBuffer()),
reinterpret_cast<const TInWei*>(in_n_c_hi_wi_dev_buf.GetDeviceBuffer()),
reinterpret_cast<TOut*>(out_n_k_ho_wo_dev_buf.GetDeviceBuffer()),
(const void*)(a_k_m0_m1_grid_desc_dev_buf),
(const void*)(b_k_n0_n1_grid_desc_dev_buf),
(const void*)(c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf),
(const void*)(c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf));
timer2.End();
kernel1_times.push_back(timer1.GetElapsedTime());
kernel2_times.push_back(timer2.GetElapsedTime());
}
{
auto ave_time1 =
std::accumulate(
std::next(kernel1_times.begin()), kernel1_times.end(), 0., std::plus<float>{}) /
(nrepeat - 1);
auto ave_time2 =
std::accumulate(
std::next(kernel2_times.begin()), kernel2_times.end(), 0., std::plus<float>{}) /
(nrepeat - 1);
const auto N = in_n_c_hi_wi_lengths[I0];
const auto C = in_n_c_hi_wi_lengths[I1];
const auto K = out_n_k_ho_wo_lengths[I1];
const auto Ho = out_n_k_ho_wo_lengths[I2];
const auto Wo = out_n_k_ho_wo_lengths[I3];
const auto Y = wei_k_c_y_x_lengths[I2];
const auto X = wei_k_c_y_x_lengths[I3];
float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) /
(std::size_t(1000) * 1000 * 1000) / (ave_time1 + ave_time2);
std::cout << "Average time : " << ave_time1 + ave_time2 << " ms(" << ave_time1 << ", "
<< ave_time2 << "), " << perf << " TFlop/s" << std::endl;
};
// copy result back to host
out_n_k_ho_wo_dev_buf.FromDevice(out_n_k_ho_wo.mData.data());
}
#include "device.hpp"
#include "host_tensor.hpp"
#include "handle.hpp"
#include "online_driver_common.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
#include "transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp"
#include "conv_tunable_fwd_v4r4_xdlops_nhwc_kyxc_nhwk.hpp"
namespace detail_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk {
template <typename TInWei, typename TAcc, typename TOut>
static std::string get_network_config_string_from_types()
{
using namespace ck;
std::string out;
out += std::to_string(get_datatype_enum_from_type<TInWei>::value) + "_" +
std::to_string(get_datatype_enum_from_type<TAcc>::value) + "_" +
std::to_string(get_datatype_enum_from_type<TOut>::value);
return (out);
};
static std::string
get_network_config_string_from_tunable(const tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk* pt)
{
std::string out("TUN_");
out += std::to_string(pt->BlockSize) + "_";
out += std::to_string(pt->MPerBlock) + "x" + std::to_string(pt->NPerBlock) + "x" +
std::to_string(pt->KPerBlock) + "_";
out += std::to_string(pt->MPerWave) + "x" + std::to_string(pt->NPerWave) + "x" +
std::to_string(pt->MRepeat) + "x" + std::to_string(pt->NRepeat) + "x" +
std::to_string(pt->K1) + "_";
out += std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[0]) + "x" +
std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[1]) + "x" +
std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[2]) + "_";
out += std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[0]) + "x" +
std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[1]) + "x" +
std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[2]) + "_";
out += std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "x" +
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "x" +
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]) + "_";
out += std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "x" +
std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "x" +
std::to_string(pt->ABlockTransferSrcAccessOrder[2]) + "_";
out += std::to_string(pt->ABlockTransferSrcVectorDim) + "_";
out += std::to_string(pt->ABlockTransferSrcScalarPerVector) + "_";
out += std::to_string(pt->ABlockTransferDstScalarPerVector_K1) + "_";
out += std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun) + "_";
out += std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[0]) + "x" +
std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[1]) + "x" +
std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[2]) + "_";
out += std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[0]) + "x" +
std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[1]) + "x" +
std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[2]) + "_";
out += std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "x" +
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "x" +
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]) + "_";
out += std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "x" +
std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "x" +
std::to_string(pt->BBlockTransferSrcAccessOrder[2]) + "_";
out += std::to_string(pt->BBlockTransferSrcVectorDim) + "_";
out += std::to_string(pt->BBlockTransferSrcScalarPerVector) + "_";
out += std::to_string(pt->BBlockTransferDstScalarPerVector_K1) + "_";
out += std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun) + "_";
out += std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[6]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[7]) + "_";
out += std::to_string(pt->CThreadTransferSrcDstVectorDim) + "_";
out += std::to_string(pt->CThreadTransferDstScalarPerVector);
return (out);
}
template <typename TInWei, typename TAcc, typename TOut>
static std::string get_definition_string_from_types()
{
using namespace ck;
std::string out;
out +=
" -DCK_PARAM_ABDataTypeEnum=" + std::to_string(get_datatype_enum_from_type<TInWei>::value) +
" -DCK_PARAM_AccDataTypeEnum=" + std::to_string(get_datatype_enum_from_type<TAcc>::value) +
" -DCK_PARAM_CDataTypeEnum=" + std::to_string(get_datatype_enum_from_type<TOut>::value);
return (out);
}
static std::string
get_definition_string_from_tunable(const tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk* pt)
{
std::string out;
out += " -DCK_PARAM_BlockSize=" + std::to_string(pt->BlockSize);
out += " -DCK_PARAM_MPerBlock=" + std::to_string(pt->MPerBlock) +
" -DCK_PARAM_NPerBlock=" + std::to_string(pt->NPerBlock) +
" -DCK_PARAM_KPerBlock=" + std::to_string(pt->KPerBlock);
out += " -DCK_PARAM_MPerWave=" + std::to_string(pt->MPerWave) +
" -DCK_PARAM_NPerWave=" + std::to_string(pt->NPerWave) +
" -DCK_PARAM_K1=" + std::to_string(pt->K1) +
" -DCK_PARAM_MRepeat=" + std::to_string(pt->MRepeat) +
" -DCK_PARAM_NRepeat=" + std::to_string(pt->NRepeat);
out += " -DCK_PARAM_ABlockTransferThreadSliceLengths_K0_M_K1=" +
std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[0]) + "," +
std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[1]) + "," +
std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[2]);
out += " -DCK_PARAM_ABlockTransferThreadClusterLengths_K0_M_K1=" +
std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[0]) + "," +
std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[1]) + "," +
std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[2]);
out += " -DCK_PARAM_ABlockTransferThreadClusterArrangeOrder=" +
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "," +
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "," +
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]);
out += " -DCK_PARAM_ABlockTransferSrcAccessOrder=" +
std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "," +
std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "," +
std::to_string(pt->ABlockTransferSrcAccessOrder[2]);
out +=
" -DCK_PARAM_ABlockTransferSrcVectorDim=" + std::to_string(pt->ABlockTransferSrcVectorDim);
out += " -DCK_PARAM_ABlockTransferSrcScalarPerVector=" +
std::to_string(pt->ABlockTransferSrcScalarPerVector);
out += " -DCK_PARAM_ABlockTransferDstScalarPerVector_K1=" +
std::to_string(pt->ABlockTransferDstScalarPerVector_K1);
out += " -DCK_PARAM_AThreadTransferSrcResetCoordinateAfterRun=" +
std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun);
out += " -DCK_PARAM_BBlockTransferThreadSliceLengths_K0_N_K1=" +
std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[0]) + "," +
std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[1]) + "," +
std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[2]);
out += " -DCK_PARAM_BBlockTransferThreadClusterLengths_K0_N_K1=" +
std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[0]) + "," +
std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[1]) + "," +
std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[2]);
out += " -DCK_PARAM_BBlockTransferThreadClusterArrangeOrder=" +
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "," +
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "," +
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]);
out += " -DCK_PARAM_BBlockTransferSrcAccessOrder=" +
std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "," +
std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "," +
std::to_string(pt->BBlockTransferSrcAccessOrder[2]);
out +=
" -DCK_PARAM_BBlockTransferSrcVectorDim=" + std::to_string(pt->BBlockTransferSrcVectorDim);
out += " -DCK_PARAM_BBlockTransferSrcScalarPerVector=" +
std::to_string(pt->BBlockTransferSrcScalarPerVector);
out += " -DCK_PARAM_BBlockTransferDstScalarPerVector_K1=" +
std::to_string(pt->BBlockTransferDstScalarPerVector_K1);
out += " -DCK_PARAM_BThreadTransferSrcResetCoordinateAfterRun=" +
std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun);
out += " -DCK_PARAM_CThreadTransferSrcDstAccessOrder=" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[6]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[7]);
out += " -DCK_PARAM_CThreadTransferSrcDstVectorDim=" +
std::to_string(pt->CThreadTransferSrcDstVectorDim);
out += " -DCK_PARAM_CThreadTransferDstScalarPerVector=" +
std::to_string(pt->CThreadTransferDstScalarPerVector);
return (out);
}
} // namespace detail_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk
template <typename TInWei,
typename TAcc,
typename TOut,
typename InLengths,
typename WeiLengths,
typename OutLengths,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk(
online_compile::Handle* handle,
const InLengths& in_n_hi_wi_c_lengths,
const WeiLengths& wei_k_y_x_c_lengths,
const OutLengths& out_n_ho_wo_k_lengths,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads& in_right_pads,
const Tensor<TInWei>& in_n_hi_wi_c,
const Tensor<TInWei>& wei_k_y_x_c,
Tensor<TOut>& out_n_ho_wo_k,
const tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk* tunable,
ck::index_t nrepeat)
{
using namespace ck;
using namespace detail_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk;
using size_t = std::size_t;
/////////////////////////////////////////////////////////////////////////////////////////////////////////////
    // The following code is only used to compute grid_size, hasMainKBlockLoop and
    // hasDoubleTailKBlockLoop
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
const auto in_n_hi_wi_c_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(in_n_hi_wi_c_lengths);
const auto wei_k_y_x_c_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_y_x_c_lengths);
const auto out_n_ho_wo_k_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(out_n_ho_wo_k_lengths);
const auto n = in_n_hi_wi_c_desc.GetLength(I0);
const auto hi = in_n_hi_wi_c_desc.GetLength(I1);
const auto wi = in_n_hi_wi_c_desc.GetLength(I2);
const auto c = in_n_hi_wi_c_desc.GetLength(I3);
const auto k = wei_k_y_x_c_desc.GetLength(I0);
const auto y = wei_k_y_x_c_desc.GetLength(I1);
const auto x = wei_k_y_x_c_desc.GetLength(I2);
const auto ho = out_n_ho_wo_k_desc.GetLength(I1);
const auto wo = out_n_ho_wo_k_desc.GetLength(I2);
const auto M = k;
const auto N = n * ho * wo;
const auto K = c * y * x;
const auto K0 = K / tunable->K1;
const index_t grid_size = (M / tunable->MPerBlock) * (N / tunable->NPerBlock);
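    // hypothetical example: n=128, ho=wo=28, k=256, c=192, y=x=3 with MPerBlock=NPerBlock=128
    // gives M=256, N=128*28*28=100352, K=192*3*3=1728, so grid_size = (256/128)*(100352/128) = 1568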
// these buffers are usually provided by the user application
DeviceMem in_n_hi_wi_c_dev_buf(sizeof(TInWei) * in_n_hi_wi_c.mDesc.GetElementSpace());
DeviceMem wei_k_y_x_c_dev_buf(sizeof(TInWei) * wei_k_y_x_c.mDesc.GetElementSpace());
DeviceMem out_n_ho_wo_k_dev_buf(sizeof(TOut) * out_n_ho_wo_k.mDesc.GetElementSpace());
in_n_hi_wi_c_dev_buf.ToDevice(in_n_hi_wi_c.mData.data());
wei_k_y_x_c_dev_buf.ToDevice(wei_k_y_x_c.mData.data());
out_n_ho_wo_k_dev_buf.ToDevice(out_n_ho_wo_k.mData.data());
    // these are workspace buffers that should be exposed to the user through the corresponding
    // workspace API
DeviceMem workspace_buf(4096);
void* a_k0_m_k1_grid_desc_dev_buf = workspace_buf.GetDeviceBuffer();
void* b_k0_n_k1_grid_desc_dev_buf =
static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 1024);
void* c_m0_m1_m2_n_grid_desc_dev_buf =
static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 2048);
void* c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf =
static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 3072);
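    // the 4 KiB workspace is carved into four 1 KiB slots, one for each transformed descriptor /
    // adaptor written out by the prepare kernel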
const std::vector<size_t> vld = {static_cast<size_t>(tunable->BlockSize), 1, 1};
const std::vector<size_t> vgd1 = {static_cast<size_t>(tunable->BlockSize), 1, 1};
const std::vector<size_t> vgd2 = {static_cast<size_t>(grid_size * tunable->BlockSize), 1, 1};
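    // vld is the workgroup (local) size and vgd the total number of work-items: the prepare kernel
    // presumably runs as a single workgroup, while the main GEMM kernel launches grid_size workgroups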
std::string program_name =
"dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.cpp";
std::string algo_name = "implicit_gemm_conv_fwd_v4r4_xdlops_nhwc";
std::string param = " -std=c++17 ";
std::string network_config;
param += get_definition_string_from_types<TInWei, TAcc, TOut>() + " -DCK_USE_AMD_XDLOPS ";
param += get_definition_string_from_tunable(tunable);
network_config = get_network_config_string_from_types<TInWei, TAcc, TOut>() + "_" +
get_network_config_string_from_tunable(tunable);
std::vector<float> kernel1_times;
std::vector<float> kernel2_times;
for(index_t i = 0; i < nrepeat; ++i)
{
KernelTimer timer1, timer2;
std::string kernel_name;
kernel_name =
"dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk_prepare";
auto network_config_1 = network_config + "_1";
timer1.Start();
handle->AddKernel(algo_name, network_config_1, program_name, kernel_name, vld, vgd1, param)(
static_cast<index_t>(in_n_hi_wi_c_lengths[I0]),
static_cast<index_t>(in_n_hi_wi_c_lengths[I1]),
static_cast<index_t>(in_n_hi_wi_c_lengths[I2]),
static_cast<index_t>(in_n_hi_wi_c_lengths[I3]),
static_cast<index_t>(wei_k_y_x_c_lengths[I0]),
static_cast<index_t>(wei_k_y_x_c_lengths[I1]),
static_cast<index_t>(wei_k_y_x_c_lengths[I2]),
conv_strides[I0],
conv_strides[I1],
conv_dilations[I0],
conv_dilations[I1],
in_left_pads[I0],
in_left_pads[I1],
in_right_pads[I0],
in_right_pads[I1],
a_k0_m_k1_grid_desc_dev_buf,
b_k0_n_k1_grid_desc_dev_buf,
c_m0_m1_m2_n_grid_desc_dev_buf,
c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf);
timer1.End();
kernel_name = "dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk";
auto network_config_2 = network_config + "_2";
timer2.Start();
handle->AddKernel(algo_name, network_config_2, program_name, kernel_name, vld, vgd2, param)(
reinterpret_cast<const TInWei*>(in_n_hi_wi_c_dev_buf.GetDeviceBuffer()),
reinterpret_cast<const TInWei*>(wei_k_y_x_c_dev_buf.GetDeviceBuffer()),
reinterpret_cast<TOut*>(out_n_ho_wo_k_dev_buf.GetDeviceBuffer()),
(const void*)(a_k0_m_k1_grid_desc_dev_buf),
(const void*)(b_k0_n_k1_grid_desc_dev_buf),
(const void*)(c_m0_m1_m2_n_grid_desc_dev_buf),
(const void*)(c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf));
timer2.End();
kernel1_times.push_back(timer1.GetElapsedTime());
kernel2_times.push_back(timer2.GetElapsedTime());
}
{
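        // the first measurement is treated as warm-up: std::next() skips it and the sum is divided
        // by nrepeat - 1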
auto ave_time1 =
std::accumulate(
std::next(kernel1_times.begin()), kernel1_times.end(), 0., std::plus<float>{}) /
(nrepeat - 1);
auto ave_time2 =
std::accumulate(
std::next(kernel2_times.begin()), kernel2_times.end(), 0., std::plus<float>{}) /
(nrepeat - 1);
const auto N = in_n_hi_wi_c_lengths[I0];
const auto C = in_n_hi_wi_c_lengths[I3];
const auto Ho = out_n_ho_wo_k_lengths[I1];
const auto Wo = out_n_ho_wo_k_lengths[I2];
const auto K = out_n_ho_wo_k_lengths[I3];
const auto Y = wei_k_y_x_c_lengths[I1];
const auto X = wei_k_y_x_c_lengths[I2];
float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) /
(std::size_t(1000) * 1000 * 1000) / ave_time2;
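        // units check: flop / (1e9 * time_in_ms) = flop / (1e12 * time_in_s), so perf is in TFlop/s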
std::cout << "Average time : " << ave_time1 + ave_time2 << " ms(" << ave_time1 << ", "
<< ave_time2 << "), " << perf << " TFlop/s" << std::endl;
    }
// copy result back to host
out_n_ho_wo_k_dev_buf.FromDevice(out_n_ho_wo_k.mData.data());
}
#pragma once
#include "device.hpp"
#include "host_tensor.hpp"
#include "handle.hpp"
#include "online_driver_common.hpp"
#include "convolution_problem_descriptor.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
#include "transform_forward_convolution_into_gemm_v6r1_nchw_kcyx_nkhw.hpp"
#include "conv_igemm_fwd_v6r1_dlops_nchw_kcyx_nkhw.hpp"
template <typename TInWei,
typename TAcc,
typename TOut,
typename InLengths,
typename WeiLengths,
typename OutLengths,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void online_device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw(
online_compile::Handle* handle,
const InLengths& in_n_c_hi_wi_lengths,
const WeiLengths& wei_k_c_y_x_lengths,
const OutLengths& out_n_k_ho_wo_lengths,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads& in_right_pads,
const Tensor<TInWei>& in_n_c_hi_wi,
const Tensor<TInWei>& wei_k_c_y_x,
Tensor<TOut>& out_n_k_ho_wo,
const ck_driver::CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw& compile_param,
ck::index_t nrepeat)
{
using namespace ck;
using namespace ck_driver;
using size_t = std::size_t;
std::cout << __func__ << std::endl;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
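    // the problem descriptor is aggregate-initialized as {N, K, C, Y, X, Hi, Wi, Ho, Wo,
    // conv strides, dilations, left/right pads, in/wei/out datatype enums} -- assumed to match
    // ConvolutionProblemDescriptor's member declaration order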
ConvolutionProblemDescriptor conv_problem_desc{in_n_c_hi_wi_lengths[I0],
out_n_k_ho_wo_lengths[I1],
in_n_c_hi_wi_lengths[I1],
wei_k_c_y_x_lengths[I2],
wei_k_c_y_x_lengths[I3],
in_n_c_hi_wi_lengths[I2],
in_n_c_hi_wi_lengths[I3],
out_n_k_ho_wo_lengths[I2],
out_n_k_ho_wo_lengths[I3],
conv_strides[I0],
conv_strides[I1],
conv_dilations[I0],
conv_dilations[I1],
in_left_pads[I0],
in_left_pads[I1],
in_right_pads[I0],
in_right_pads[I1],
get_datatype_enum_from_type<TInWei>::value,
get_datatype_enum_from_type<TInWei>::value,
get_datatype_enum_from_type<TOut>::value};
if(!ConvIgemmFwdV6r1DlopsNchwKcyxNkhw::IsValidCompileParameter(conv_problem_desc,
compile_param))
{
throw std::runtime_error("wrong! IsValidCompileParameter fail");
}
DeviceMem in_n_c_hi_wi_dev_buf(sizeof(TInWei) * in_n_c_hi_wi.mDesc.GetElementSpace());
DeviceMem wei_k_c_y_x_dev_buf(sizeof(TInWei) * wei_k_c_y_x.mDesc.GetElementSpace());
DeviceMem out_n_k_ho_wo_dev_buf(sizeof(TOut) * out_n_k_ho_wo.mDesc.GetElementSpace());
in_n_c_hi_wi_dev_buf.ToDevice(in_n_c_hi_wi.mData.data());
wei_k_c_y_x_dev_buf.ToDevice(wei_k_c_y_x.mData.data());
out_n_k_ho_wo_dev_buf.ToDevice(out_n_k_ho_wo.mData.data());
    // workspace is used to save the transformed tensor descriptors created by the prepare kernel
DeviceMem workspace_dev_buf(
ConvIgemmFwdV6r1DlopsNchwKcyxNkhw::GetWorkSpaceSize(conv_problem_desc, compile_param));
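    // unlike the fixed 4 KiB workspace used by the xdlops driver above, here the required size is
    // queried from the library for this particular problem / compile-parameter combination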
const auto block_size = std::size_t(
ConvIgemmFwdV6r1DlopsNchwKcyxNkhw::GetBlockSize(conv_problem_desc, compile_param));
const auto grid_size = std::size_t(
ConvIgemmFwdV6r1DlopsNchwKcyxNkhw::GetGridSize(conv_problem_desc, compile_param));
const std::vector<size_t> vld1 = {1, 1, 1};
const std::vector<size_t> vgd1 = {1, 1, 1};
const std::vector<size_t> vld2 = {static_cast<size_t>(block_size), 1, 1};
const std::vector<size_t> vgd2 = {static_cast<size_t>(grid_size * block_size), 1, 1};
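    // the prepare kernel is launched as a single work-item (vgd1 = {1, 1, 1}); the main kernel uses
    // block_size threads per workgroup and grid_size workgroups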
std::string program_name =
"dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.cpp";
std::string algo_name = "implicit_gemm_conv_fwd_v6r1_dlops_nchw";
    std::string compile_param_string =
        get_ck_hip_online_compile_common_flag() + compile_param.GetCompileParameterString();
std::string network_config = compile_param_string;
std::vector<float> kernel1_times;
std::vector<float> kernel2_times;
for(index_t i = 0; i < nrepeat + 1; ++i)
{
KernelTimer timer1, timer2;
std::string kernel_name;
kernel_name = "dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw_prepare";
auto network_config_1 = network_config + "_1";
timer1.Start();
handle->AddKernel(algo_name,
network_config_1,
program_name,
kernel_name,
vld1,
vgd1,
compile_param_string)(static_cast<index_t>(in_n_c_hi_wi_lengths[I0]),
static_cast<index_t>(in_n_c_hi_wi_lengths[I1]),
static_cast<index_t>(in_n_c_hi_wi_lengths[I2]),
static_cast<index_t>(in_n_c_hi_wi_lengths[I3]),
static_cast<index_t>(wei_k_c_y_x_lengths[I0]),
static_cast<index_t>(wei_k_c_y_x_lengths[I2]),
static_cast<index_t>(wei_k_c_y_x_lengths[I3]),
conv_strides[I0],
conv_strides[I1],
conv_dilations[I0],
conv_dilations[I1],
in_left_pads[I0],
in_left_pads[I1],
in_right_pads[I0],
in_right_pads[I1],
(void*)(workspace_dev_buf.GetDeviceBuffer()));
timer1.End();
kernel_name = "dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw";
auto network_config_2 = network_config + "_2";
timer2.Start();
handle->AddKernel(algo_name,
network_config_2,
program_name,
kernel_name,
vld2,
vgd2,
compile_param_string)(
reinterpret_cast<const TInWei*>(wei_k_c_y_x_dev_buf.GetDeviceBuffer()),
reinterpret_cast<const TInWei*>(in_n_c_hi_wi_dev_buf.GetDeviceBuffer()),
reinterpret_cast<TOut*>(out_n_k_ho_wo_dev_buf.GetDeviceBuffer()),
(const void*)(workspace_dev_buf.GetDeviceBuffer()));
timer2.End();
kernel1_times.push_back(timer1.GetElapsedTime());
kernel2_times.push_back(timer2.GetElapsedTime());
}
{
auto ave_time1 =
std::accumulate(
std::next(kernel1_times.begin()), kernel1_times.end(), 0., std::plus<float>{}) /
nrepeat;
auto ave_time2 =
std::accumulate(
std::next(kernel2_times.begin()), kernel2_times.end(), 0., std::plus<float>{}) /
nrepeat;
float perf = (float)(conv_problem_desc.CalculateFlop()) /
(std::size_t(1000) * 1000 * 1000) / (ave_time1 + ave_time2);
std::cout << "Average time : " << ave_time1 + ave_time2 << " ms(" << ave_time1 << ", "
<< ave_time2 << "), " << perf << " TFlop/s" << std::endl;
    }
// copy result back to host
out_n_k_ho_wo_dev_buf.FromDevice(out_n_k_ho_wo.mData.data());
}
...@@ -10,6 +10,8 @@ set(HOST_TENSOR_SOURCE ...@@ -10,6 +10,8 @@ set(HOST_TENSOR_SOURCE
## the library target ## the library target
add_library(host_tensor SHARED ${HOST_TENSOR_SOURCE}) add_library(host_tensor SHARED ${HOST_TENSOR_SOURCE})
target_include_directories(host_tensor SYSTEM PUBLIC $<BUILD_INTERFACE:${HALF_INCLUDE_DIR}>)
target_link_libraries(host_tensor PRIVATE hip::device) target_link_libraries(host_tensor PRIVATE hip::device)
target_link_libraries(host_tensor INTERFACE hip::host) target_link_libraries(host_tensor INTERFACE hip::host)
......
#ifndef CONV_COMMON_HPP #ifndef CONV_COMMON_HPP
#define CONV_COMMON_HPP #define CONV_COMMON_HPP
#include "dynamic_tensor_descriptor.hpp" #include "tensor_descriptor.hpp"
enum ConvTensorLayout enum ConvTensorLayout
{ {
...@@ -19,8 +19,8 @@ template <typename... InDesc, ...@@ -19,8 +19,8 @@ template <typename... InDesc,
typename LeftPads, typename LeftPads,
typename RightPads> typename RightPads>
constexpr auto get_convolution_output_default_4d_tensor_descriptor( constexpr auto get_convolution_output_default_4d_tensor_descriptor(
const ck::DynamicTensorDescriptor<InDesc...>& in_desc, const ck::TensorDescriptor<InDesc...>& in_desc,
const ck::DynamicTensorDescriptor<WeiDesc...>& wei_desc, const ck::TensorDescriptor<WeiDesc...>& wei_desc,
const ConvStrides& conv_strides, const ConvStrides& conv_strides,
const ConvDilations conv_dilations, const ConvDilations conv_dilations,
const LeftPads& left_pads, const LeftPads& left_pads,
...@@ -57,12 +57,12 @@ constexpr auto get_convolution_output_default_4d_tensor_descriptor( ...@@ -57,12 +57,12 @@ constexpr auto get_convolution_output_default_4d_tensor_descriptor(
const auto Ho = (Hi + LeftPadH + RightPadH - YEff) / conv_strides[I0] + I1; const auto Ho = (Hi + LeftPadH + RightPadH - YEff) / conv_strides[I0] + I1;
const auto Wo = (Wi + LeftPadW + RightPadW - XEff) / conv_strides[I1] + I1; const auto Wo = (Wi + LeftPadW + RightPadW - XEff) / conv_strides[I1] + I1;
return make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, K, Ho, Wo)); return make_naive_tensor_descriptor_packed(make_tuple(N, K, Ho, Wo));
} }
template <class InDesc, class WeiDesc, class OutDesc> template <class InDesc, class WeiDesc, class OutDesc>
constexpr std::size_t constexpr std::size_t
calculate_convolution_flops(const InDesc& in_desc, const WeiDesc& wei_desc, const OutDesc& out_desc) calculate_convolution_flops(const InDesc&, const WeiDesc& wei_desc, const OutDesc& out_desc)
{ {
using namespace ck; using namespace ck;
......
...@@ -34,24 +34,16 @@ struct KernelTimer ...@@ -34,24 +34,16 @@ struct KernelTimer
using device_stream_t = hipStream_t; using device_stream_t = hipStream_t;
template <typename... Args, typename F> template <typename... Args, typename F>
void launch_kernel(F kernel, void launch_kernel(F kernel, dim3 grid_dim, dim3 block_dim, std::size_t lds_byte, Args... args)
dim3 grid_dim,
dim3 block_dim,
std::size_t lds_byte,
hipStream_t stream_id,
Args... args)
{ {
hipStream_t stream_id = nullptr;
hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, stream_id, args...); hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, stream_id, args...);
} }
template <typename... Args, typename F> template <typename... Args, typename F>
float launch_and_time_kernel(F kernel, float launch_and_time_kernel(
int nrepeat, F kernel, int nrepeat, dim3 grid_dim, dim3 block_dim, std::size_t lds_byte, Args... args)
dim3 grid_dim,
dim3 block_dim,
std::size_t lds_byte,
hipStream_t stream_id,
Args... args)
{ {
KernelTimer timer; KernelTimer timer;
...@@ -66,6 +58,8 @@ float launch_and_time_kernel(F kernel, ...@@ -66,6 +58,8 @@ float launch_and_time_kernel(F kernel,
printf("Warm up\n"); printf("Warm up\n");
hipStream_t stream_id = nullptr;
// warm up // warm up
hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, stream_id, args...); hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, stream_id, args...);
......
...@@ -14,15 +14,13 @@ void host_direct_convolution(const Tensor<TIn>& in, ...@@ -14,15 +14,13 @@ void host_direct_convolution(const Tensor<TIn>& in,
const ConvStrides& conv_strides, const ConvStrides& conv_strides,
const ConvDilations& conv_dilations, const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads, const InLeftPads& in_left_pads,
const InRightPads& in_right_pads, const InRightPads&,
const ConvTensorLayout layout = ConvTensorLayout::NCHW) const ConvTensorLayout layout = ConvTensorLayout::NCHW)
{ {
using namespace ck; using namespace ck;
constexpr auto I0 = Number<0>{}; constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{}; constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
auto f_nchw = [&](auto n, auto k, auto ho, auto wo) { auto f_nchw = [&](auto n, auto k, auto ho, auto wo) {
double v = 0; double v = 0;
...@@ -68,23 +66,25 @@ void host_direct_convolution(const Tensor<TIn>& in, ...@@ -68,23 +66,25 @@ void host_direct_convolution(const Tensor<TIn>& in,
out(n, ho, wo, k) = v; out(n, ho, wo, k) = v;
}; };
switch(layout) if(layout == ConvTensorLayout::NCHW)
{ {
case ConvTensorLayout::NCHW:
make_ParallelTensorFunctor(f_nchw, make_ParallelTensorFunctor(f_nchw,
out.mDesc.GetLengths()[0], out.mDesc.GetLengths()[0],
out.mDesc.GetLengths()[1], out.mDesc.GetLengths()[1],
out.mDesc.GetLengths()[2], out.mDesc.GetLengths()[2],
out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency()); out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
break; }
case ConvTensorLayout::NHWC: else if(layout == ConvTensorLayout::NHWC)
{
make_ParallelTensorFunctor(f_nhwc, make_ParallelTensorFunctor(f_nhwc,
out.mDesc.GetLengths()[0], out.mDesc.GetLengths()[0],
out.mDesc.GetLengths()[1], out.mDesc.GetLengths()[1],
out.mDesc.GetLengths()[2], out.mDesc.GetLengths()[2],
out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency()); out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
break; }
default: throw std::runtime_error("wrong! not supported layout"); else
{
throw std::runtime_error("wrong! not supported layout");
} }
} }
...@@ -100,17 +100,15 @@ void host_winograd_3x3_convolution(const Tensor<TIn>& in_nchw, ...@@ -100,17 +100,15 @@ void host_winograd_3x3_convolution(const Tensor<TIn>& in_nchw,
constexpr std::size_t HoPerTile = 2; constexpr std::size_t HoPerTile = 2;
constexpr std::size_t WoPerTile = 2; constexpr std::size_t WoPerTile = 2;
std::size_t N = in_nchw.mDesc.GetLengths()[0]; std::size_t N = in_nchw.mDesc.GetLengths()[0];
std::size_t C = in_nchw.mDesc.GetLengths()[1]; std::size_t C = in_nchw.mDesc.GetLengths()[1];
std::size_t HI = in_nchw.mDesc.GetLengths()[2];
std::size_t WI = in_nchw.mDesc.GetLengths()[3];
std::size_t K = wei_kcyx.mDesc.GetLengths()[0]; std::size_t K = wei_kcyx.mDesc.GetLengths()[0];
std::size_t Y = wei_kcyx.mDesc.GetLengths()[2]; std::size_t Y = wei_kcyx.mDesc.GetLengths()[2];
std::size_t X = wei_kcyx.mDesc.GetLengths()[3]; std::size_t X = wei_kcyx.mDesc.GetLengths()[3];
std::size_t HO = out_nkhw.mDesc.GetLengths()[2]; std::size_t Ho = out_nkhw.mDesc.GetLengths()[2];
std::size_t WO = out_nkhw.mDesc.GetLengths()[3]; std::size_t Wo = out_nkhw.mDesc.GetLengths()[3];
index_t h_pad_low = InLeftPads{}.Get(Number<0>{}); index_t h_pad_low = InLeftPads{}.Get(Number<0>{});
index_t w_pad_low = InLeftPads{}.Get(Number<1>{}); index_t w_pad_low = InLeftPads{}.Get(Number<1>{});
...@@ -118,8 +116,8 @@ void host_winograd_3x3_convolution(const Tensor<TIn>& in_nchw, ...@@ -118,8 +116,8 @@ void host_winograd_3x3_convolution(const Tensor<TIn>& in_nchw,
std::size_t HiPerTile = HoPerTile + Y - 1; std::size_t HiPerTile = HoPerTile + Y - 1;
std::size_t WiPerTile = WoPerTile + X - 1; std::size_t WiPerTile = WoPerTile + X - 1;
std::size_t HTile = (HO + HoPerTile - 1) / HoPerTile; std::size_t HTile = (Ho + HoPerTile - 1) / HoPerTile;
std::size_t WTile = (WO + WoPerTile - 1) / WoPerTile; std::size_t WTile = (Wo + WoPerTile - 1) / WoPerTile;
Tensor<double> in_hold({N, C, HTile, WTile, HiPerTile, WiPerTile}); Tensor<double> in_hold({N, C, HTile, WTile, HiPerTile, WiPerTile});
Tensor<double> in_transform({N, C, HTile, WTile, HiPerTile, WiPerTile}); Tensor<double> in_transform({N, C, HTile, WTile, HiPerTile, WiPerTile});
......
...@@ -14,7 +14,7 @@ void host_direct_convolution_backward_data(Tensor<TIn>& in, ...@@ -14,7 +14,7 @@ void host_direct_convolution_backward_data(Tensor<TIn>& in,
const ConvStrides& conv_strides, const ConvStrides& conv_strides,
const ConvDilations& conv_dilations, const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads, const InLeftPads& in_left_pads,
const InRightPads& in_right_pads, const InRightPads& /* in_right_pads */,
const ConvTensorLayout layout = ConvTensorLayout::NCHW) const ConvTensorLayout layout = ConvTensorLayout::NCHW)
{ {
using namespace ck; using namespace ck;
...@@ -25,11 +25,6 @@ void host_direct_convolution_backward_data(Tensor<TIn>& in, ...@@ -25,11 +25,6 @@ void host_direct_convolution_backward_data(Tensor<TIn>& in,
constexpr auto I3 = Number<3>{}; constexpr auto I3 = Number<3>{};
auto f_nchw = [&](auto n, auto c, auto hi, auto wi) { auto f_nchw = [&](auto n, auto c, auto hi, auto wi) {
std::size_t N = in.mDesc.GetLengths()[I0];
std::size_t C = in.mDesc.GetLengths()[I1];
std::size_t Hi = in.mDesc.GetLengths()[I2];
std::size_t Wi = in.mDesc.GetLengths()[I3];
std::size_t K = wei.mDesc.GetLengths()[I0]; std::size_t K = wei.mDesc.GetLengths()[I0];
std::size_t Y = wei.mDesc.GetLengths()[I2]; std::size_t Y = wei.mDesc.GetLengths()[I2];
std::size_t X = wei.mDesc.GetLengths()[I3]; std::size_t X = wei.mDesc.GetLengths()[I3];
...@@ -74,11 +69,6 @@ void host_direct_convolution_backward_data(Tensor<TIn>& in, ...@@ -74,11 +69,6 @@ void host_direct_convolution_backward_data(Tensor<TIn>& in,
}; };
auto f_nhwc = [&](auto n, auto hi, auto wi, auto c) { auto f_nhwc = [&](auto n, auto hi, auto wi, auto c) {
std::size_t N = in.mDesc.GetLengths()[I0];
std::size_t Hi = in.mDesc.GetLengths()[I1];
std::size_t Wi = in.mDesc.GetLengths()[I2];
std::size_t C = in.mDesc.GetLengths()[I3];
std::size_t K = wei.mDesc.GetLengths()[I0]; std::size_t K = wei.mDesc.GetLengths()[I0];
std::size_t Y = wei.mDesc.GetLengths()[I1]; std::size_t Y = wei.mDesc.GetLengths()[I1];
std::size_t X = wei.mDesc.GetLengths()[I2]; std::size_t X = wei.mDesc.GetLengths()[I2];
...@@ -122,22 +112,24 @@ void host_direct_convolution_backward_data(Tensor<TIn>& in, ...@@ -122,22 +112,24 @@ void host_direct_convolution_backward_data(Tensor<TIn>& in,
in(n, hi, wi, c) = v; in(n, hi, wi, c) = v;
}; };
switch(layout) if(layout == ConvTensorLayout::NCHW)
{ {
case ConvTensorLayout::NCHW:
make_ParallelTensorFunctor(f_nchw, make_ParallelTensorFunctor(f_nchw,
in.mDesc.GetLengths()[0], in.mDesc.GetLengths()[0],
in.mDesc.GetLengths()[1], in.mDesc.GetLengths()[1],
in.mDesc.GetLengths()[2], in.mDesc.GetLengths()[2],
in.mDesc.GetLengths()[3])(std::thread::hardware_concurrency()); in.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
break; }
case ConvTensorLayout::NHWC: else if(layout == ConvTensorLayout::NHWC)
{
make_ParallelTensorFunctor(f_nhwc, make_ParallelTensorFunctor(f_nhwc,
in.mDesc.GetLengths()[0], in.mDesc.GetLengths()[0],
in.mDesc.GetLengths()[1], in.mDesc.GetLengths()[1],
in.mDesc.GetLengths()[2], in.mDesc.GetLengths()[2],
in.mDesc.GetLengths()[3])(std::thread::hardware_concurrency()); in.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
break; }
default: throw std::runtime_error("wrong! not supported layout"); else
{
throw std::runtime_error("wrong! not supported layout");
} }
} }
...@@ -34,7 +34,7 @@ std::ostream& LogRangeAsType(std::ostream& os, Range&& range, std::string delim) ...@@ -34,7 +34,7 @@ std::ostream& LogRangeAsType(std::ostream& os, Range&& range, std::string delim)
first = false; first = false;
else else
os << delim; os << delim;
os << T{v}; os << static_cast<T>(v);
} }
return os; return os;
} }
......
...@@ -9,7 +9,7 @@ struct GeneratorTensor_1 ...@@ -9,7 +9,7 @@ struct GeneratorTensor_1
int value = 1; int value = 1;
template <typename... Is> template <typename... Is>
float operator()(Is... is) float operator()(Is...)
{ {
return value; return value;
} }
......
...@@ -24,32 +24,32 @@ struct KernelTimerImpl ...@@ -24,32 +24,32 @@ struct KernelTimerImpl
{ {
KernelTimerImpl() KernelTimerImpl()
{ {
hipEventCreate(&mStart); hipGetErrorString(hipEventCreate(&mStart));
hipEventCreate(&mEnd); hipGetErrorString(hipEventCreate(&mEnd));
} }
~KernelTimerImpl() ~KernelTimerImpl()
{ {
hipEventDestroy(mStart); hipGetErrorString(hipEventDestroy(mStart));
hipEventDestroy(mEnd); hipGetErrorString(hipEventDestroy(mEnd));
} }
void Start() void Start()
{ {
hipDeviceSynchronize(); hipGetErrorString(hipDeviceSynchronize());
hipEventRecord(mStart, 0); hipGetErrorString(hipEventRecord(mStart, nullptr));
} }
void End() void End()
{ {
hipEventRecord(mEnd, 0); hipGetErrorString(hipEventRecord(mEnd, nullptr));
hipEventSynchronize(mEnd); hipGetErrorString(hipEventSynchronize(mEnd));
} }
float GetElapsedTime() const float GetElapsedTime() const
{ {
float time; float time;
hipEventElapsedTime(&time, mStart, mEnd); hipGetErrorString(hipEventElapsedTime(&time, mStart, mEnd));
return time; return time;
} }
......
set(CMAKE_CXX_COMPILER /opt/rocm/llvm/bin/clang++)
## for online-compiling of HIP kernels
set(OLC_HIP_COMPILER ${CMAKE_CXX_COMPILER} CACHE PATH "")
## reset to avoid inheriting the C++ options from the parent project
set(CMAKE_CXX_FLAGS "")
message("Compiling options for library and kernels: ${CMAKE_CXX_FLAGS}")
# look for and register clang-offload-bundler
if(OLC_HIP_COMPILER MATCHES ".*clang\\+\\+$")
find_program(OLC_OFFLOADBUNDLER_BIN clang-offload-bundler
PATH_SUFFIXES bin
PATHS
/opt/rocm/llvm
${CMAKE_INSTALL_PREFIX}/llvm
)
endif()
if(OLC_OFFLOADBUNDLER_BIN)
message(STATUS "clang-offload-bundler found: ${OLC_OFFLOADBUNDLER_BIN}")
set(OLC_OFFLOADBUNDLER_BIN "${OLC_OFFLOADBUNDLER_BIN}")
else()
# look for and register extractkernel
message(STATUS "clang-offload-bundler not found")
find_program(EXTRACTKERNEL_BIN extractkernel
PATH_SUFFIXES bin
PATHS
/opt/rocm/hip
/opt/rocm/hcc
/opt/rocm
${CMAKE_INSTALL_PREFIX}/hip
${CMAKE_INSTALL_PREFIX}/hcc
${CMAKE_INSTALL_PREFIX}
)
if(EXTRACTKERNEL_BIN)
message(STATUS "extractkernel found: ${EXTRACTKERNEL_BIN}")
set(EXTRACTKERNEL_BIN "${EXTRACTKERNEL_BIN}")
else()
message(FATAL_ERROR "extractkernel not found")
endif()
endif()
option(Boost_USE_STATIC_LIBS "Use boost static libraries" OFF)
set(BOOST_COMPONENTS filesystem)
add_definitions(-DBOOST_ALL_NO_LIB=1)
find_package(Boost REQUIRED COMPONENTS ${BOOST_COMPONENTS})
# HIP is always required
find_package(hip REQUIRED PATHS /opt/rocm)
message(STATUS "Build with HIP ${hip_VERSION}")
target_flags(HIP_COMPILER_FLAGS hip::device)
# Remove cuda arch flags
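# (assumption: the target GPU arch is supplied later, at online-compile time, which is why any
#  pre-set arch flags are stripped here)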
string(REGEX REPLACE --cuda-gpu-arch=[a-z0-9]+ "" HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}")
string(REGEX REPLACE --offload-arch=[a-z0-9]+ "" HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}")
set(OLC_hip_VERSION_MAJOR "${hip_VERSION_MAJOR}")
set(OLC_hip_VERSION_MINOR "${hip_VERSION_MINOR}")
set(OLC_hip_VERSION_PATCH "${hip_VERSION_PATCH}")
option(ENABLE_DEBUG "Build to enable debugging" ON)
if(ENABLE_DEBUG)
set(OLC_DEBUG 1)
else()
set(OLC_DEBUG 0)
endif()
configure_file("${PROJECT_SOURCE_DIR}/host/online_compile/include/config.h.in" "${PROJECT_BINARY_DIR}/host/online_compile/include/config.h")
include_directories(BEFORE
${PROJECT_BINARY_DIR}/host/online_compile/include
)
message(STATUS "Hip compiler flags: ${HIP_COMPILER_FLAGS}")
## HIP_COMPILER_FLAGS will be used for on-line compiling of the HIP kernels
set(HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS} ${HIP_ONLINE_COMPILER_FLAGS}")
add_definitions("-DHIP_COMPILER_FLAGS=${HIP_COMPILER_FLAGS}")
file(GLOB_RECURSE COMPOSABLE_KERNEL_INCLUDE_1 "${PROJECT_SOURCE_DIR}/composable_kernel/include/*/*.hpp")
file(GLOB COMPOSABLE_KERNEL_INCLUDE_2 "${PROJECT_SOURCE_DIR}/external/rocm/include/bfloat16_dev.hpp")
set(MCONV_KERNEL_INCLUDES
${COMPOSABLE_KERNEL_INCLUDE_1}
${COMPOSABLE_KERNEL_INCLUDE_2}
)
file(GLOB_RECURSE MCONV_KERNELS "${PROJECT_SOURCE_DIR}/composable_kernel/src/kernel_wrapper/*.cpp")
add_kernels(${CMAKE_CURRENT_SOURCE_DIR} "${MCONV_KERNELS}")
add_kernel_includes(${CMAKE_CURRENT_SOURCE_DIR} "${MCONV_KERNEL_INCLUDES}")
set(ONLINE_COMPILATION_SOURCE
${PROJECT_BINARY_DIR}/kernel.cpp
${PROJECT_BINARY_DIR}/kernel_includes.cpp
)
include_directories(BEFORE
${PROJECT_BINARY_DIR}/host/online_compile/include
include
)
set(OLC_HIP_UTILITY_CPPS
hip_utility/logger.cpp
hip_utility/tmp_dir.cpp
hip_utility/md5.cpp
hip_utility/exec_utils.cpp
hip_utility/target_properties.cpp
hip_utility/handlehip.cpp
hip_utility/kernel_build_params.cpp
hip_utility/hip_build_utils.cpp
hip_utility/hipoc_program.cpp
hip_utility/hipoc_kernel.cpp
hip_utility/kernel_cache.cpp
hip_utility/binary_cache.cpp
)
list(APPEND OLC_SOURCES ${OLC_HIP_UTILITY_CPPS} ${OLC_HIP_UTILITY_HEADERS})
## addkernels provides the tool to create inlined kernels in one header
add_subdirectory(addkernels)
function(inline_kernels_src KERNELS KERNEL_INCLUDES)
set(KERNEL_SRC_HPP_FILENAME batch_all.cpp.hpp)
set(KERNEL_SRC_HPP_PATH ${PROJECT_BINARY_DIR}/inlined_kernels/${KERNEL_SRC_HPP_FILENAME})
set(KERNEL_SRC_CPP_PATH ${PROJECT_BINARY_DIR}/inlined_kernels/batch_all.cpp)
add_custom_command(
OUTPUT ${KERNEL_SRC_HPP_PATH}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
DEPENDS addkernels ${KERNELS} ${KERNEL_INCLUDES}
COMMAND $<TARGET_FILE:addkernels> -target ${KERNEL_SRC_HPP_PATH} -extern -source ${KERNELS}
COMMENT "Inlining All kernels"
)
configure_file(kernels_batch.cpp.in ${KERNEL_SRC_CPP_PATH})
list(APPEND OLC_SOURCES ${KERNEL_SRC_CPP_PATH} ${KERNEL_SRC_HPP_PATH})
set(OLC_SOURCES ${OLC_SOURCES} PARENT_SCOPE)
endfunction()
inline_kernels_src("${MCONV_KERNELS}" "${MCONV_KERNEL_INCLUDES}")
list(APPEND ONLINE_COMPILATION_SOURCE ${OLC_SOURCES} ${PROJECT_BINARY_DIR}/olc_kernel_includes.h)
add_custom_command(
OUTPUT ${PROJECT_BINARY_DIR}/olc_kernel_includes.h
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
DEPENDS addkernels ${MCONV_KERNEL_INCLUDES}
COMMAND $<TARGET_FILE:addkernels> -no-recurse -guard GUARD_OLC_KERNEL_INCLUDES_HPP_ -target ${PROJECT_BINARY_DIR}/olc_kernel_includes.h -source ${MCONV_KERNEL_INCLUDES}
COMMENT "Inlining HIP kernel includes"
)
## the library target
add_library(online_compile SHARED ${ONLINE_COMPILATION_SOURCE})
target_include_directories(online_compile PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/online_compile/include/)
target_include_directories(online_compile PRIVATE ${PROJECT_BINARY_DIR})
target_include_directories(online_compile PRIVATE ${PROJECT_SOURCE_DIR}/external/half/include/)
target_link_libraries(online_compile PRIVATE hip::device)
target_link_libraries(online_compile INTERFACE hip::host)
target_link_libraries(online_compile PRIVATE Boost::filesystem)
target_compile_features(online_compile PUBLIC)
set_target_properties(online_compile PROPERTIES POSITION_INDEPENDENT_CODE ON)
install(TARGETS online_compile LIBRARY DESTINATION lib)