Unverified commit f63a23ac authored by Chao Liu, committed by GitHub
[MIOpen Downstream] Initial MIOpen integration (#52)

* update online kernel wrapper; bundle all descriptors in a tuple

* change __CONSTANT__ to CONSTANT

* rename

* adding tuning

* added IsValidCompileParameter

* reorganize

* adding tunable for fp16 and int8

* fix kernel compile warning and bug fixes

* suppress warning about casting CONSTANT (address space 4) pointers

* fix building issue
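
For reference, the CONSTANT qualifier mentioned above marks pointers into the AMDGPU constant address space (address space 4). A minimal sketch of what the renamed macro and the suppressed cast look like (an illustration inferred from the commit notes, not code taken from this change):

// hypothetical sketch: qualifier for data living in constant memory on AMDGPU
#define CONSTANT __attribute__((address_space(4)))

// casting a CONSTANT pointer back to a generic pointer is the operation whose
// compiler warning this commit suppresses, e.g.:
// const auto* desc = reinterpret_cast<const DescType*>(p_const_desc);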
#ifndef CONV_IGEMM_FWD_V6R1_DLOPS_NCHW_KCYX_NKHW_HPP
#define CONV_IGEMM_FWD_V6R1_DLOPS_NCHW_KCYX_NKHW_HPP
#include <array>
#include <functional>
#include <numeric>
#include <string>
#include <tuple>
#include <vector>
namespace ck_driver {
struct CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw
{
ck::DataTypeEnum_t ABDataTypeEnum;
ck::DataTypeEnum_t AccDataTypeEnum;
ck::DataTypeEnum_t CDataTypeEnum;
int BlockSize;
int GN0;
int GK1;
int GM1PerBlockGM11;
int GN1PerBlockGN11;
int GK0PerBlock;
int BM1PerThreadBM11;
int BN1PerThreadBN11;
int BK0PerThread;
std::array<int, 2> BM10BN10ThreadClusterBM10Xs;
std::array<int, 2> BM10BN10ThreadClusterBN10Xs;
std::array<int, 5> ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1;
std::array<int, 5> ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1;
std::array<int, 5> ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1;
std::array<int, 5> ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1;
std::array<int, 5> BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1;
std::array<int, 5> BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1;
std::array<int, 5> BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1;
std::array<int, 5> BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1;
int CThreadTransferDstScalarPerVector;
bool HasMainKBlockLoop;
bool HasDoubleTailKBlockLoop;
auto GetCompileParameterString() const
{
// clang-format off
return
" -DCK_PARAM_ABDataTypeEnum=" +
std::to_string(ABDataTypeEnum) +
" -DCK_PARAM_AccDataTypeEnum=" +
std::to_string(AccDataTypeEnum) +
" -DCK_PARAM_CDataTypeEnum=" +
std::to_string(CDataTypeEnum) +
" -DCK_PARAM_BlockSize=" +
std::to_string(BlockSize) +
" -DCK_PARAM_GN0=" +
std::to_string(GN0) +
" -DCK_PARAM_GK1=" +
std::to_string(GK1) +
" -DCK_PARAM_GM1PerBlockGM11=" +
std::to_string(GM1PerBlockGM11) +
" -DCK_PARAM_GN1PerBlockGN11=" +
std::to_string(GN1PerBlockGN11) +
" -DCK_PARAM_GK0PerBlock=" +
std::to_string(GK0PerBlock) +
" -DCK_PARAM_BM1PerThreadBM11=" +
std::to_string(BM1PerThreadBM11) +
" -DCK_PARAM_BN1PerThreadBN11=" +
std::to_string(BN1PerThreadBN11) +
" -DCK_PARAM_BK0PerThread=" +
std::to_string(BK0PerThread) +
" -DCK_PARAM_BM10BN10ThreadClusterBM10Xs=" +
std::to_string(BM10BN10ThreadClusterBM10Xs[0]) + "," +
std::to_string(BM10BN10ThreadClusterBM10Xs[1]) +
" -DCK_PARAM_BM10BN10ThreadClusterBN10Xs=" +
std::to_string(BM10BN10ThreadClusterBN10Xs[0]) + "," +
std::to_string(BM10BN10ThreadClusterBN10Xs[1]) +
" -DCK_PARAM_ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1=" +
std::to_string(ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[0]) + "," +
std::to_string(ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[1]) + "," +
std::to_string(ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[2]) + "," +
std::to_string(ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[3]) + "," +
std::to_string(ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[4]) +
" -DCK_PARAM_ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1=" +
std::to_string(ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[0]) + "," +
std::to_string(ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[1]) + "," +
std::to_string(ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[2]) + "," +
std::to_string(ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[3]) + "," +
std::to_string(ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[4]) +
" -DCK_PARAM_ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1=" +
std::to_string(ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[0]) + "," +
std::to_string(ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[1]) + "," +
std::to_string(ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[2]) + "," +
std::to_string(ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[3]) + "," +
std::to_string(ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[4]) +
" -DCK_PARAM_ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1=" +
std::to_string(ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[0]) + "," +
std::to_string(ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[1]) + "," +
std::to_string(ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[2]) + "," +
std::to_string(ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[3]) + "," +
std::to_string(ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[4]) +
" -DCK_PARAM_BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1=" +
std::to_string(BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[0]) + "," +
std::to_string(BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[1]) + "," +
std::to_string(BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[2]) + "," +
std::to_string(BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[3]) + "," +
std::to_string(BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[4]) +
" -DCK_PARAM_BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1=" +
std::to_string(BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[0]) + "," +
std::to_string(BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[1]) + "," +
std::to_string(BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[2]) + "," +
std::to_string(BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[3]) + "," +
std::to_string(BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[4]) +
" -DCK_PARAM_BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1=" +
std::to_string(BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[0]) + "," +
std::to_string(BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[1]) + "," +
std::to_string(BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[2]) + "," +
std::to_string(BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[3]) + "," +
std::to_string(BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[4]) +
" -DCK_PARAM_BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1=" +
std::to_string(BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[0]) + "," +
std::to_string(BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[1]) + "," +
std::to_string(BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[2]) + "," +
std::to_string(BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[3]) + "," +
std::to_string(BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[4]) +
" -DCK_PARAM_CThreadTransferDstScalarPerVector=" +
std::to_string(CThreadTransferDstScalarPerVector) +
" -DCK_PARAM_HasMainKBlockLoop=" +
std::to_string(HasMainKBlockLoop) +
" -DCK_PARAM_HasDoubleTailKBlockLoop=" +
std::to_string(HasDoubleTailKBlockLoop);
// clang-format on
}
};
struct TunableConvIgemmFwdV6r1DlopsNchwKcyxNkhw
{
ck::DataTypeEnum_t ABDataTypeEnum;
ck::DataTypeEnum_t CDataTypeEnum;
int BlockSize;
int GN0;
int GK1;
int GM1PerBlockGM11;
int GN1PerBlockGN11;
int GK0PerBlock;
int BM1PerThreadBM11;
int BN1PerThreadBN11;
int BK0PerThread;
std::array<int, 2> BM10BN10ThreadClusterBM10Xs;
std::array<int, 2> BM10BN10ThreadClusterBN10Xs;
std::array<int, 5> ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1;
std::array<int, 5> ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1;
std::array<int, 5> ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1;
std::array<int, 5> ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1;
std::array<int, 5> BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1;
std::array<int, 5> BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1;
std::array<int, 5> BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1;
std::array<int, 5> BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1;
};
inline static auto generate_tunable_list_conv_igemm_fwd_v6r1_dlops_nchw_kcyx_nkhw()
{
constexpr auto f32 = ck::DataTypeEnum_t::Float;
constexpr auto f16 = ck::DataTypeEnum_t::Half;
constexpr auto i8 = ck::DataTypeEnum_t::Int8;
return std::vector<TunableConvIgemmFwdV6r1DlopsNchwKcyxNkhw>{
// clang-format off
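// each row initializes, in field-declaration order: ABDataTypeEnum, CDataTypeEnum,
// BlockSize, GN0, GK1, GM1PerBlockGM11, GN1PerBlockGN11, GK0PerBlock,
// BM1PerThreadBM11, BN1PerThreadBN11, BK0PerThread, BM10BN10ThreadClusterBM10Xs,
// BM10BN10ThreadClusterBN10Xs, then the A- and B-block-transfer thread-slice,
// thread-cluster, src-vector and dst-vector lengths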
// fp32
{f32, f32, 256, 1, 1, 128, 128, 16, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 2, 1}, {4, 1, 1, 64, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {2, 1, 1, 4, 1}, { 8, 1, 1, 32, 1}, {1, 1, 1, 4, 1}, {1, 1, 1, 4, 1}},
{f32, f32, 256, 1, 1, 128, 128, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 1}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 1}, { 8, 1, 1, 32, 1}, {1, 1, 1, 4, 1}, {1, 1, 1, 4, 1}},
{f32, f32, 256, 1, 1, 128, 128, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 1}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 1}, { 8, 1, 1, 32, 1}, {1, 1, 1, 2, 1}, {1, 1, 1, 4, 1}},
{f32, f32, 256, 1, 1, 128, 128, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 1}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 1}, { 8, 1, 1, 32, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 1}},
{f32, f32, 256, 1, 1, 128, 128, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 1}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {4, 1, 1, 1, 1}, { 2, 1, 1, 128, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
{f32, f32, 256, 2, 1, 128, 64, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 1}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {2, 2, 1, 1, 1}, { 4, 1, 1, 64, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
{f32, f32, 256, 4, 1, 128, 32, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 1}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 4, 1, 1, 1}, { 8, 1, 1, 32, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
{f32, f32, 256, 8, 1, 128, 16, 16, 4, 4, 1, {8, 2}, {8, 2}, {8, 1, 1, 1, 1}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 8, 1, 1, 1}, {16, 1, 1, 16, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
{f32, f32, 128, 1, 1, 64, 128, 8, 4, 4, 1, {4, 2}, {8, 2}, {4, 1, 1, 1, 1}, {2, 1, 1, 64, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {8, 1, 1, 1, 1}, { 1, 1, 1, 128, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
// fp16
{f16, f16, 256, 1, 2, 128, 128, 16, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 2, 2}, {4, 1, 1, 64, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {2, 1, 1, 4, 2}, { 8, 1, 1, 32, 1}, {1, 1, 1, 4, 1}, {1, 1, 1, 4, 1}},
{f16, f16, 256, 1, 2, 128, 128, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 2}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 2}, { 8, 1, 1, 32, 1}, {1, 1, 1, 4, 1}, {1, 1, 1, 4, 1}},
{f16, f16, 256, 1, 2, 128, 128, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 2}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 2}, { 8, 1, 1, 32, 1}, {1, 1, 1, 2, 1}, {1, 1, 1, 4, 1}},
{f16, f16, 256, 1, 2, 128, 128, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 2}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 2}, { 8, 1, 1, 32, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 1}},
{f16, f16, 256, 1, 2, 128, 128, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 2}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {4, 1, 1, 1, 2}, { 2, 1, 1, 128, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
{f16, f16, 256, 2, 2, 128, 64, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 2}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {2, 2, 1, 1, 2}, { 4, 1, 1, 64, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
{f16, f16, 256, 4, 2, 128, 32, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 2}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 4, 1, 1, 2}, { 8, 1, 1, 32, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
{f16, f16, 256, 8, 2, 128, 16, 16, 4, 4, 1, {8, 2}, {8, 2}, {8, 1, 1, 1, 2}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 8, 1, 1, 2}, {16, 1, 1, 16, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
{f16, f16, 128, 1, 2, 64, 128, 8, 4, 4, 1, {4, 2}, {8, 2}, {4, 1, 1, 1, 2}, {2, 1, 1, 64, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {8, 1, 1, 1, 2}, { 1, 1, 1, 128, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
// i8
{ i8, i8, 256, 1, 4, 128, 128, 16, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 2, 4}, {4, 1, 1, 64, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {2, 1, 1, 4, 4}, { 8, 1, 1, 32, 1}, {1, 1, 1, 4, 1}, {1, 1, 1, 4, 1}},
{ i8, i8, 256, 1, 4, 128, 128, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 4}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 4}, { 8, 1, 1, 32, 1}, {1, 1, 1, 4, 1}, {1, 1, 1, 4, 1}},
{ i8, i8, 256, 1, 4, 128, 128, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 4}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 4}, { 8, 1, 1, 32, 1}, {1, 1, 1, 2, 1}, {1, 1, 1, 4, 1}},
{ i8, i8, 256, 1, 4, 128, 128, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 4}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 4}, { 8, 1, 1, 32, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 1}},
{ i8, i8, 256, 1, 4, 128, 128, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 4}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {4, 1, 1, 1, 4}, { 2, 1, 1, 128, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
{ i8, i8, 256, 2, 4, 128, 64, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 4}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {2, 2, 1, 1, 4}, { 4, 1, 1, 64, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
{ i8, i8, 256, 4, 4, 128, 32, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 4}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 4, 1, 1, 4}, { 8, 1, 1, 32, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
{ i8, i8, 256, 8, 4, 128, 16, 16, 4, 4, 1, {8, 2}, {8, 2}, {8, 1, 1, 1, 4}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 8, 1, 1, 4}, {16, 1, 1, 16, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
{ i8, i8, 128, 1, 4, 64, 128, 8, 4, 4, 1, {4, 2}, {8, 2}, {4, 1, 1, 1, 4}, {2, 1, 1, 64, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {8, 1, 1, 1, 4}, { 1, 1, 1, 128, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}}
// clang-format on
};
}
// TODO: make this a common interface and write specs for it
struct ConvIgemmFwdV6r1DlopsNchwKcyxNkhw
{
static auto
CalculateCompileParameterBasedOnTunable(const ConvolutionProblemDescriptor& conv_problem_desc,
const TunableConvIgemmFwdV6r1DlopsNchwKcyxNkhw& tunable)
{
using namespace ck;
const int C = conv_problem_desc.C;
const int Y = conv_problem_desc.Y;
const int X = conv_problem_desc.X;
const int Ho = conv_problem_desc.Ho;
const int Wo = conv_problem_desc.Wo;
if(!(conv_problem_desc.InDataTypeEnum == tunable.ABDataTypeEnum &&
conv_problem_desc.WeiDataTypeEnum == tunable.ABDataTypeEnum &&
conv_problem_desc.OutDataTypeEnum == tunable.CDataTypeEnum))
return std::make_tuple(CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw{}, false);
const auto ABDataTypeEnum = conv_problem_desc.InDataTypeEnum;
const auto CDataTypeEnum = conv_problem_desc.OutDataTypeEnum;
DataTypeEnum_t AccDataTypeEnum;
switch(ABDataTypeEnum)
{
case DataTypeEnum_t::Float:
case DataTypeEnum_t::Half: AccDataTypeEnum = DataTypeEnum_t::Float; break;
case DataTypeEnum_t::Int8: AccDataTypeEnum = DataTypeEnum_t::Int32; break;
default: return std::make_tuple(CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw{}, false);
}
const int BlockSize = tunable.BlockSize;
const int GN0 = tunable.GN0;
const int GK1 = tunable.GK1;
const int GM11 = tunable.GM1PerBlockGM11;
const int GN11 = tunable.GN1PerBlockGN11;
const int GK0PerBlock = tunable.GK0PerBlock;
const int BM11 = tunable.BM1PerThreadBM11;
const int BN11 = tunable.BN1PerThreadBN11;
const int BK0PerThread = tunable.BK0PerThread;
const auto BM10BN10ThreadClusterBM10Xs = tunable.BM10BN10ThreadClusterBM10Xs;
const auto BM10BN10ThreadClusterBN10Xs = tunable.BM10BN10ThreadClusterBN10Xs;
const auto ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1 =
tunable.ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1;
const auto ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1 =
tunable.ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1;
const auto ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 =
tunable.ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1;
const auto ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 =
tunable.ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1;
const auto BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1 =
tunable.BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1;
const auto BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1 =
tunable.BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1;
const auto BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 =
tunable.BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1;
const auto BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 =
tunable.BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1;
// C threadwise copy: {BN11} or {BN} or {BN1} or {GN11} is Dst vector dim
const int CThreadTransferDstScalarPerVector = gcd(4, GN11, BN11, Ho * Wo);
const int C0 = GK1;
if(!(C % C0 == 0))
return std::make_tuple(CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw{}, false);
const int C1 = C / C0;
const int GK0 = C1 * Y * X;
if(!(GK0 % GK0PerBlock == 0))
return std::make_tuple(CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw{}, false);
const bool HasMainKBlockLoop = ((GK0 + GK0PerBlock) / (2 * GK0PerBlock) > 1);
const bool HasDoubleTailKBlockLoop = ((GK0 / GK0PerBlock) % 2 == 0);
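// worked example (illustrative): GK0 = 32 with GK0PerBlock = 8 gives
// (32 + 8) / 16 = 2 > 1, so a main K-block loop exists, and (32 / 8) % 2 == 0,
// so the kernel also takes the double-tail K-block loop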
return std::make_tuple(
CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw{
ABDataTypeEnum,
AccDataTypeEnum,
CDataTypeEnum,
BlockSize,
GN0,
GK1,
GM11,
GN11,
GK0PerBlock,
BM11,
BN11,
BK0PerThread,
BM10BN10ThreadClusterBM10Xs,
BM10BN10ThreadClusterBN10Xs,
ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1,
ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1,
ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1,
ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1,
BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1,
BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1,
BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1,
BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1,
CThreadTransferDstScalarPerVector,
HasMainKBlockLoop,
HasDoubleTailKBlockLoop},
true);
}
static auto GetDefaultCompileParameter(const ConvolutionProblemDescriptor& conv_problem_desc)
{
for(const auto& tunable : generate_tunable_list_conv_igemm_fwd_v6r1_dlops_nchw_kcyx_nkhw())
{
CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw compile_param;
bool found = false;
std::tie(compile_param, found) =
CalculateCompileParameterBasedOnTunable(conv_problem_desc, tunable);
if(found && IsValidCompileParameter(conv_problem_desc, compile_param))
return std::make_tuple(compile_param, true);
}
return std::make_tuple(CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw{}, false);
}
static bool IsApplicable(const ConvolutionProblemDescriptor& conv_problem_desc)
{
bool found = false;
std::tie(std::ignore, found) = GetDefaultCompileParameter(conv_problem_desc);
return found;
}
static bool
IsValidCompileParameter(const ConvolutionProblemDescriptor& conv_problem_desc,
const CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw& compile_param)
{
using namespace ck;
const int N = conv_problem_desc.N;
const int K = conv_problem_desc.K;
const int C = conv_problem_desc.C;
const int Y = conv_problem_desc.Y;
const int X = conv_problem_desc.X;
const int Ho = conv_problem_desc.Ho;
const int Wo = conv_problem_desc.Wo;
const int GK1 = compile_param.GK1;
const int GN0 = compile_param.GN0;
const int GM11 = compile_param.GM1PerBlockGM11;
const int GN11 = compile_param.GN1PerBlockGN11;
const int BM11 = compile_param.BM1PerThreadBM11;
const int BN11 = compile_param.BN1PerThreadBN11;
const int C0 = GK1;
const int N0 = GN0;
if(!(C % C0 == 0))
return false;
const int C1 = C / C0;
if(!(N % N0 == 0))
return false;
const int N1 = N / N0;
const int GM0 = 1;
const int GM1 = K;
const int GN1 = N1 * Ho * Wo;
const int GK0 = C1 * Y * X;
// check data type
{
if(!(conv_problem_desc.InDataTypeEnum == conv_problem_desc.WeiDataTypeEnum &&
conv_problem_desc.InDataTypeEnum == compile_param.ABDataTypeEnum))
return false;
if(compile_param.ABDataTypeEnum == DataTypeEnum_t::Float ||
compile_param.ABDataTypeEnum == DataTypeEnum_t::Half)
{
if(!(compile_param.AccDataTypeEnum == DataTypeEnum_t::Float))
return false;
}
else if(compile_param.ABDataTypeEnum == DataTypeEnum_t::Int8)
{
if(!(compile_param.AccDataTypeEnum == DataTypeEnum_t::Int32))
return false;
}
}
// check gridwise contraction
{
if(!(GM1 % GM11 == 0 && GN1 % GN11 == 0 && GK0 % compile_param.GK0PerBlock == 0))
return false;
const bool has_main_k_block_loop =
((GK0 + compile_param.GK0PerBlock) / (2 * compile_param.GK0PerBlock) > 1);
const bool has_double_tail_k_block_loop = ((GK0 / compile_param.GK0PerBlock) % 2 == 0);
if(!(has_main_k_block_loop == compile_param.HasMainKBlockLoop &&
has_double_tail_k_block_loop == compile_param.HasDoubleTailKBlockLoop))
return false;
}
// check A blockwise copy
{
const auto block_slice_lengths =
std::array<int, 5>{compile_param.GK0PerBlock, GM0, 1, GM11, GK1};
const auto& cluster_lengths =
compile_param.ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1;
const auto& thread_slice_lengths =
compile_param.ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1;
const auto& src_vector_lengths =
compile_param.ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1;
const auto& dst_vector_lengths =
compile_param.ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1;
// check number of working threads
const int num_work_thread = std::accumulate(
cluster_lengths.begin(), cluster_lengths.end(), 1, std::multiplies<int>{});
if(!(compile_param.BlockSize >= num_work_thread))
return false;
// check block slice lengths vs thread slice lengths vs cluster lengths
for(int i = 0; i < 5; ++i)
{
if(!(cluster_lengths[i] * thread_slice_lengths[i] == block_slice_lengths[i]))
return false;
}
// check thread slice lengths vs vector lengths
for(int i = 0; i < 5; ++i)
{
if(!(thread_slice_lengths[i] % src_vector_lengths[i] == 0))
return false;
if(!(thread_slice_lengths[i] % dst_vector_lengths[i] == 0))
return false;
}
// check Src vectorization, GK0 is global mem vector dim
if(!(src_vector_lengths[1] == 1 && src_vector_lengths[2] == 1 &&
src_vector_lengths[3] == 1 && src_vector_lengths[4] == 1))
return false;
// check Dst vectorization, {GM11, GK1} are LDS vector dims
if(dst_vector_lengths[4] == GK1)
{ // vectorize on {GM11, GK1}
if(!(GM11 % dst_vector_lengths[3] == 0))
return false;
}
else
{ // vectorize on {GK1} only
if(!(GK1 % dst_vector_lengths[4] == 0))
return false;
if(!(dst_vector_lengths[3] == 1))
return false;
}
}
// check B blockwise copy
{
const auto block_slice_lengths =
std::array<int, 5>{compile_param.GK0PerBlock, GN0, 1, GN11, GK1};
const auto& cluster_lengths =
compile_param.BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1;
const auto& thread_slice_lengths =
compile_param.BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1;
const auto& src_vector_lengths =
compile_param.BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1;
const auto& dst_vector_lengths =
compile_param.BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1;
// check number of working threads
const int num_work_thread = std::accumulate(
cluster_lengths.begin(), cluster_lengths.end(), 1, std::multiplies<int>{});
if(!(compile_param.BlockSize >= num_work_thread))
return false;
// check block slice lengths vs thread slice lengths vs cluster lengths
for(int i = 0; i < 5; ++i)
{
if(!(cluster_lengths[i] * thread_slice_lengths[i] == block_slice_lengths[i]))
return false;
}
// check thread slice lengths vs vector lengths
for(int i = 0; i < 5; ++i)
{
if(!(thread_slice_lengths[i] % src_vector_lengths[i] == 0 &&
thread_slice_lengths[i] % dst_vector_lengths[i] == 0))
return false;
}
// check Src vectorization: {GN11} is global mem vector dim
if(!(src_vector_lengths[0] == 1 && src_vector_lengths[1] == 1 &&
src_vector_lengths[2] == 1 && src_vector_lengths[4] == 1))
return false;
// check Src tensor layout related vectorization
if(Y == 1 && X == 1 && conv_problem_desc.ConvStrideH == 1 &&
conv_problem_desc.ConvStrideW == 1 && conv_problem_desc.InLeftPadH == 0 &&
conv_problem_desc.InLeftPadW == 0 && conv_problem_desc.InRightPadH == 0 &&
conv_problem_desc.InRightPadW == 0)
{
if(!((Ho * Wo) % src_vector_lengths[3] == 0))
return false;
}
else if(conv_problem_desc.ConvStrideW == 1 && conv_problem_desc.InLeftPadW == 0 &&
conv_problem_desc.InRightPadW == 0)
{
if(!(Wo % src_vector_lengths[3] == 0))
return false;
}
else
{
if(!(src_vector_lengths[3] == 1))
return false;
}
// check Dst vectorization: {GN11, GK1} are LDS vector dims
if(dst_vector_lengths[4] == GK1)
{ // vectorize on {GN11, GK1}
if(!(GN11 % dst_vector_lengths[3] == 0))
return false;
}
else
{ // vectorize on {GK1} only
if(!(dst_vector_lengths[3] == 1))
return false;
if(!(GK1 % dst_vector_lengths[4] == 0))
return false;
}
}
// check blockwise GEMM
{
const int BM10 = std::accumulate(compile_param.BM10BN10ThreadClusterBM10Xs.begin(),
compile_param.BM10BN10ThreadClusterBM10Xs.end(),
1,
std::multiplies<int>{});
const int BN10 = std::accumulate(compile_param.BM10BN10ThreadClusterBN10Xs.begin(),
compile_param.BM10BN10ThreadClusterBN10Xs.end(),
1,
std::multiplies<int>{});
if(!(compile_param.BlockSize == BM10 * BN10))
return false;
const int BM = GM0 * GM11;
const int BN = GN0 * GN11;
const int BM1 = BM10 * BM11;
const int BN1 = BN10 * BN11;
if(!(BM % BM1 == 0 && BN % BN1 == 0))
return false;
const int BM0 = BM / BM1;
const int BN0 = BN / BN1;
// blockwise GEMM currently only supports BM0 == 2 && BN0 == 2
if(!(BM0 == 2 && BN0 == 2))
return false;
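// e.g. the first fp32 tunable: BM10 = 8 * 2 = 16, BM1 = 16 * 4 = 64 and
// BM = 1 * 128 = 128, so BM0 == 2 (and symmetrically BN0 == 2), with
// BlockSize == BM10 * BN10 == 256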
if(!(compile_param.GK0PerBlock % compile_param.BK0PerThread == 0))
return false;
}
// check C threadwise copy
{
// {BN11} or {BN} or {BN1} or {GN11} is Dst vector dim
const int dst_vector_len_gn11 = compile_param.CThreadTransferDstScalarPerVector;
// check slice length vs Dst vector length:
if(!(BN11 % dst_vector_len_gn11 == 0 && GN11 % dst_vector_len_gn11 == 0))
return false;
// check Dst memory layout related vectorization:
if(!((Ho * Wo) % compile_param.CThreadTransferDstScalarPerVector == 0))
return false;
}
return true;
}
static int GetBlockSize(const ConvolutionProblemDescriptor&,
const CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw& compile_param)
{
return compile_param.BlockSize;
}
static int GetGridSize(const ConvolutionProblemDescriptor& conv_problem_desc,
const CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw& compile_param)
{
const int N = conv_problem_desc.N;
const int K = conv_problem_desc.K;
const int Ho = conv_problem_desc.Ho;
const int Wo = conv_problem_desc.Wo;
const int N0 = compile_param.GN0;
const int N1 = N / N0;
const int GM1 = K;
const int GN1 = N1 * Ho * Wo;
const int GM11 = compile_param.GM1PerBlockGM11;
const int GN11 = compile_param.GN1PerBlockGN11;
const int GM10 = GM1 / GM11;
const int GN10 = GN1 / GN11;
return GM10 * GN10;
}
static std::size_t GetWorkSpaceSize(const ConvolutionProblemDescriptor&,
const CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw&)
{
// workspace is used to save the transformed tensor descriptors created by the prepare kernel
return 4096L;
}
static std::size_t GetMaxWorkSpaceSize(const ConvolutionProblemDescriptor&) { return 4096L; }
static auto GetTunableList()
{
return generate_tunable_list_conv_igemm_fwd_v6r1_dlops_nchw_kcyx_nkhw();
}
};
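// Hedged usage sketch (illustrative only; built solely from this header's
// declarations, and the helper name is hypothetical): pick the default compile
// parameter for a problem and turn it into the flag string handed to the
// online compiler.
//
// inline std::string GetV6r1BuildFlags(const ConvolutionProblemDescriptor& desc)
// {
//     CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw param{};
//     bool ok = false;
//     std::tie(param, ok) =
//         ConvIgemmFwdV6r1DlopsNchwKcyxNkhw::GetDefaultCompileParameter(desc);
//     return ok ? " -std=c++17 " + param.GetCompileParameterString() : std::string{};
// }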
} // namespace ck_driver
#endif
#ifndef CONV_TUNABLE_FWD_V4R4_DLOPS_NCHW_KCYX_NKHW_HPP
#define CONV_TUNABLE_FWD_V4R4_DLOPS_NCHW_KCYX_NKHW_HPP

#include <array>
struct tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw
{
int BlockSize;
int MPerBlock;
int NPerBlock;
int KPerBlock;
int M1PerThread;
int N1PerThread;
int KPerThread;
int M1N1ThreadClusterM10;
int M1N1ThreadClusterN10;
int M1N1ThreadClusterM11;
int M1N1ThreadClusterN11;
std::array<int, 3> ABlockTransferThreadSliceLengths_K_M0_M1;
std::array<int, 3> ABlockTransferThreadClusterLengths_K_M0_M1;
std::array<int, 3> ABlockTransferThreadClusterArrangeOrder;
std::array<int, 3> ABlockTransferSrcAccessOrder;
int ABlockTransferSrcVectorDim;
int ABlockTransferSrcScalarPerVector;
int ABlockTransferDstScalarPerVector_M1;
bool AThreadTransferSrcResetCoordinateAfterRun;
std::array<int, 3> BBlockTransferThreadSliceLengths_K_N0_N1;
std::array<int, 3> BBlockTransferThreadClusterLengths_K_N0_N1;
std::array<int, 3> BBlockTransferThreadClusterArrangeOrder;
std::array<int, 3> BBlockTransferSrcAccessOrder;
int BBlockTransferSrcVectorDim;
int BBlockTransferSrcScalarPerVector;
int BBlockTransferDstScalarPerVector_N1;
bool BThreadTransferSrcResetCoordinateAfterRun;
std::array<int, 6> CThreadTransferSrcDstAccessOrder;
int CThreadTransferSrcDstVectorDim;
int CThreadTransferDstScalarPerVector;
};
static tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw
default_tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw = {
256, 128, 128, 8, 4, 4, 1,
8, 8, 2, 2, {4, 1, 1}, {2, 1, 128}, {2, 1, 0},
{2, 1, 0}, 0, 4, 1, false, {4, 1, 1}, {2, 1, 128},
{0, 1, 2}, {0, 1, 2}, 2, 1, 1, false, {3, 4, 5, 0, 1, 2},
5, 1};
#endif
#ifndef CONV_TUNABLE_FWD_V4R4_NCHW_KCYX_NKHW_HPP
#define CONV_TUNABLE_FWD_V4R4_NCHW_KCYX_NKHW_HPP

#include <array>
#include <cstdint>
struct tunable_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw
{
int32_t BlockSize;
int32_t MPerBlock;
int32_t NPerBlock;
int32_t KPerBlock;
int32_t M1PerThread;
int32_t N1PerThread;
int32_t KPerThread;
int32_t M1N1ThreadClusterM10;
int32_t M1N1ThreadClusterN10;
int32_t M1N1ThreadClusterM11;
int32_t M1N1ThreadClusterN11;
std::array<int32_t, 3> ABlockTransferThreadSliceLengths_K_M0_M1;
std::array<int32_t, 3> ABlockTransferThreadClusterLengths_K_M0_M1;
std::array<int32_t, 3> ABlockTransferThreadClusterArrangeOrder;
std::array<int32_t, 3> ABlockTransferSrcAccessOrder;
int32_t ABlockTransferSrcVectorDim;
int32_t ABlockTransferSrcScalarPerVector;
int32_t ABlockTransferDstScalarPerVector_M1;
bool AThreadTransferSrcResetCoordinateAfterRun;
std::array<int32_t, 3> BBlockTransferThreadSliceLengths_K_N0_N1;
std::array<int32_t, 3> BBlockTransferThreadClusterLengths_K_N0_N1;
std::array<int32_t, 3> BBlockTransferThreadClusterArrangeOrder;
std::array<int32_t, 3> BBlockTransferSrcAccessOrder;
int32_t BBlockTransferSrcVectorDim;
int32_t BBlockTransferSrcScalarPerVector;
int32_t BBlockTransferDstScalarPerVector_N1;
bool BThreadTransferSrcResetCoordinateAfterRun;
std::array<int32_t, 6> CThreadTransferSrcDstAccessOrder;
int32_t CThreadTransferSrcDstVectorDim;
int32_t CThreadTransferDstScalarPerVector;
};
static tunable_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw default_tunable_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw = {
256, 128, 128, 8, 4, 4, 1,
8, 8, 2, 2, {4, 1, 1}, {2, 1, 128}, {2, 1, 0},
{2, 1, 0}, 0, 4, 1, false, {4, 1, 1}, {2, 1, 128},
{0, 1, 2}, {0, 1, 2}, 2, 1, 1, false, {3, 4, 5, 0, 1, 2},
5, 1};
#endif
@@ -3,40 +3,40 @@
 struct tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw
 {
-    int32_t BlockSize;
-    int32_t MPerBlock;
-    int32_t NPerBlock;
-    int32_t KPerBlock;
-    int32_t MPerWave;
-    int32_t NPerWave;
-    int32_t K1;
-    int32_t MRepeat;
-    int32_t NRepeat;
-    std::array<int32_t, 3> ABlockTransferThreadSliceLengths_K0_M_K1;
-    std::array<int32_t, 3> ABlockTransferThreadClusterLengths_K0_M_K1;
-    std::array<int32_t, 3> ABlockTransferThreadClusterArrangeOrder;
-    std::array<int32_t, 3> ABlockTransferSrcAccessOrder;
-    int32_t ABlockTransferSrcVectorDim;
-    int32_t ABlockTransferSrcScalarPerVector;
-    int32_t ABlockTransferDstScalarPerVector_K1;
+    int BlockSize;
+    int MPerBlock;
+    int NPerBlock;
+    int KPerBlock;
+    int MPerWave;
+    int NPerWave;
+    int K1;
+    int MRepeat;
+    int NRepeat;
+    std::array<int, 3> ABlockTransferThreadSliceLengths_K0_M_K1;
+    std::array<int, 3> ABlockTransferThreadClusterLengths_K0_M_K1;
+    std::array<int, 3> ABlockTransferThreadClusterArrangeOrder;
+    std::array<int, 3> ABlockTransferSrcAccessOrder;
+    int ABlockTransferSrcVectorDim;
+    int ABlockTransferSrcScalarPerVector;
+    int ABlockTransferDstScalarPerVector_K1;
     bool AThreadTransferSrcResetCoordinateAfterRun;
-    std::array<int32_t, 3> BBlockTransferThreadSliceLengths_K0_N_K1;
-    std::array<int32_t, 3> BBlockTransferThreadClusterLengths_K0_N_K1;
-    std::array<int32_t, 3> BBlockTransferThreadClusterArrangeOrder;
-    std::array<int32_t, 3> BBlockTransferSrcAccessOrder;
-    int32_t BBlockTransferSrcVectorDim;
-    int32_t BBlockTransferSrcScalarPerVector;
-    int32_t BBlockTransferDstScalarPerVector_K1;
+    std::array<int, 3> BBlockTransferThreadSliceLengths_K0_N_K1;
+    std::array<int, 3> BBlockTransferThreadClusterLengths_K0_N_K1;
+    std::array<int, 3> BBlockTransferThreadClusterArrangeOrder;
+    std::array<int, 3> BBlockTransferSrcAccessOrder;
+    int BBlockTransferSrcVectorDim;
+    int BBlockTransferSrcScalarPerVector;
+    int BBlockTransferDstScalarPerVector_K1;
     bool BThreadTransferSrcResetCoordinateAfterRun;
-    std::array<int32_t, 8> CThreadTransferSrcDstAccessOrder;
-    int32_t CThreadTransferSrcDstVectorDim;
-    int32_t CThreadTransferDstScalarPerVector;
+    std::array<int, 8> CThreadTransferSrcDstAccessOrder;
+    int CThreadTransferSrcDstVectorDim;
+    int CThreadTransferDstScalarPerVector;
 };

 static tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw
...
@@ -3,40 +3,40 @@
 struct tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk
 {
-    int32_t BlockSize;
-    int32_t MPerBlock;
-    int32_t NPerBlock;
-    int32_t KPerBlock;
-    int32_t MPerWave;
-    int32_t NPerWave;
-    int32_t K1;
-    int32_t MRepeat;
-    int32_t NRepeat;
-    std::array<int32_t, 3> ABlockTransferThreadSliceLengths_K0_M_K1;
-    std::array<int32_t, 3> ABlockTransferThreadClusterLengths_K0_M_K1;
-    std::array<int32_t, 3> ABlockTransferThreadClusterArrangeOrder;
-    std::array<int32_t, 3> ABlockTransferSrcAccessOrder;
-    int32_t ABlockTransferSrcVectorDim;
-    int32_t ABlockTransferSrcScalarPerVector;
-    int32_t ABlockTransferDstScalarPerVector_K1;
+    int BlockSize;
+    int MPerBlock;
+    int NPerBlock;
+    int KPerBlock;
+    int MPerWave;
+    int NPerWave;
+    int K1;
+    int MRepeat;
+    int NRepeat;
+    std::array<int, 3> ABlockTransferThreadSliceLengths_K0_M_K1;
+    std::array<int, 3> ABlockTransferThreadClusterLengths_K0_M_K1;
+    std::array<int, 3> ABlockTransferThreadClusterArrangeOrder;
+    std::array<int, 3> ABlockTransferSrcAccessOrder;
+    int ABlockTransferSrcVectorDim;
+    int ABlockTransferSrcScalarPerVector;
+    int ABlockTransferDstScalarPerVector_K1;
     bool AThreadTransferSrcResetCoordinateAfterRun;
-    std::array<int32_t, 3> BBlockTransferThreadSliceLengths_K0_N_K1;
-    std::array<int32_t, 3> BBlockTransferThreadClusterLengths_K0_N_K1;
-    std::array<int32_t, 3> BBlockTransferThreadClusterArrangeOrder;
-    std::array<int32_t, 3> BBlockTransferSrcAccessOrder;
-    int32_t BBlockTransferSrcVectorDim;
-    int32_t BBlockTransferSrcScalarPerVector;
-    int32_t BBlockTransferDstScalarPerVector_K1;
+    std::array<int, 3> BBlockTransferThreadSliceLengths_K0_N_K1;
+    std::array<int, 3> BBlockTransferThreadClusterLengths_K0_N_K1;
+    std::array<int, 3> BBlockTransferThreadClusterArrangeOrder;
+    std::array<int, 3> BBlockTransferSrcAccessOrder;
+    int BBlockTransferSrcVectorDim;
+    int BBlockTransferSrcScalarPerVector;
+    int BBlockTransferDstScalarPerVector_K1;
     bool BThreadTransferSrcResetCoordinateAfterRun;
-    std::array<int32_t, 8> CThreadTransferSrcDstAccessOrder;
-    int32_t CThreadTransferSrcDstVectorDim;
-    int32_t CThreadTransferDstScalarPerVector;
+    std::array<int, 8> CThreadTransferSrcDstAccessOrder;
+    int CThreadTransferSrcDstVectorDim;
+    int CThreadTransferDstScalarPerVector;
 };

 static tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk
...
#ifndef CONV_TUNABLE_FWD_V6R1_NCHW_KCYX_NKHW_HPP
#define CONV_TUNABLE_FWD_V6R1_NCHW_KCYX_NKHW_HPP

#include <array>
#include <cstdint>
struct tunable_dyn_conv_fwd_v6r1_nchw_kcyx_nkhw
{
int32_t BlockSize = 256;
int32_t GN0 = 4;
int32_t GK1 = 1;
int32_t GM1PerBlockGM11 = 128;
int32_t GN1PerBlockGN11 = 32;
int32_t GK0PerBlock = 8;
int32_t BM1PerThreadBM11 = 4;
int32_t BN1PerThreadBN11 = 4;
int32_t BK0PerThread = 1;
int32_t BM10BN10ThreadClusterBM100 = 2;
int32_t BM10BN10ThreadClusterBN100 = 2;
int32_t BM10BN10ThreadClusterBM101 = 8;
int32_t BM10BN10ThreadClusterBN101 = 8;
std::array<int32_t, 5> ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1 = {4, 1, 1, 1, 1};
std::array<int32_t, 5> ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1 = {
2, 1, 1, 128, 1};
std::array<int32_t, 5> ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 = {
4, 1, 1, 1, 1};
std::array<int32_t, 5> ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 = {
1, 1, 1, 1, 1};
std::array<int32_t, 5> BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1 = {1, 4, 1, 1, 1};
std::array<int32_t, 5> BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1 = {
8, 1, 1, 32, 1};
std::array<int32_t, 5> BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 = {
1, 1, 1, 1, 1};
std::array<int32_t, 5> BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 = {
1, 1, 1, 1, 1};
int32_t CThreadTransferDstScalarPerVector = 1;
};
#endif
#ifndef CONVOLUTION_PROBLEM_DESCRIPTOR
#define CONVOLUTION_PROBLEM_DESCRIPTOR

#include <cstddef>
namespace ck_driver {
struct ConvolutionProblemDescriptor
{
ConvolutionProblemDescriptor() = default;
ConvolutionProblemDescriptor(int N_,
int K_,
int C_,
int Y_,
int X_,
int Hi_,
int Wi_,
int Ho_,
int Wo_,
int ConvStrideH_,
int ConvStrideW_,
int ConvDilationH_,
int ConvDilationW_,
int InLeftPadH_,
int InLeftPadW_,
int InRightPadH_,
int InRightPadW_,
ck::DataTypeEnum_t InDataTypeEnum_,
ck::DataTypeEnum_t WeiDataTypeEnum_,
ck::DataTypeEnum_t OutDataTypeEnum_)
: N{N_},
K{K_},
C{C_},
Y{Y_},
X{X_},
Hi{Hi_},
Wi{Wi_},
Ho{Ho_},
Wo{Wo_},
ConvStrideH{ConvStrideH_},
ConvStrideW{ConvStrideW_},
ConvDilationH{ConvDilationH_},
ConvDilationW{ConvDilationW_},
InLeftPadH{InLeftPadH_},
InLeftPadW{InLeftPadW_},
InRightPadH{InRightPadH_},
InRightPadW{InRightPadW_},
InDataTypeEnum{InDataTypeEnum_},
WeiDataTypeEnum{WeiDataTypeEnum_},
OutDataTypeEnum{OutDataTypeEnum_}
{
}
int N;
int K;
int C;
int Y;
int X;
int Hi;
int Wi;
int Ho;
int Wo;
int ConvStrideH;
int ConvStrideW;
int ConvDilationH;
int ConvDilationW;
int InLeftPadH;
int InLeftPadW;
int InRightPadH;
int InRightPadW;
ck::DataTypeEnum_t InDataTypeEnum;
ck::DataTypeEnum_t WeiDataTypeEnum;
ck::DataTypeEnum_t OutDataTypeEnum;
std::size_t CalculateFlop() const { return 2L * N * K * C * Y * X * Ho * Wo; }
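// the factor 2 counts one multiply plus one add per MAC of the direct convolution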
};
} // namespace ck_driver
#endif
+#pragma once
 #include "device.hpp"
 #include "host_tensor.hpp"
 #include "handle.hpp"
@@ -5,24 +6,26 @@
 #include "dynamic_tensor_descriptor.hpp"
 #include "dynamic_tensor_descriptor_helper.hpp"
 #include "transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw.hpp"
-#include "conv_tunable_fwd_v4r4_nchw_kcyx_nkhw.hpp"
+#include "conv_tunable_fwd_v4r4_dlops_nchw_kcyx_nkhw.hpp"

 namespace detail_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw {

 template <typename TInWei, typename TAcc, typename TOut>
 static std::string get_network_config_string_from_types()
 {
+    using namespace ck;
+
     std::string out;

-    out += static_cast<char>(Driver::get_typeid_from_type<TInWei>()) +
-           static_cast<char>(Driver::get_typeid_from_type<TAcc>()) +
-           static_cast<char>(Driver::get_typeid_from_type<TOut>());
+    out += std::to_string(get_datatype_enum_from_type<TInWei>::value) + "_" +
+           std::to_string(get_datatype_enum_from_type<TAcc>::value) + "_" +
+           std::to_string(get_datatype_enum_from_type<TOut>::value);

     return (out);
 };

 static std::string
-get_network_config_string_from_tunable(const tunable_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw* pt)
+get_network_config_string_from_tunable(const tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw* pt)
 {
     std::string out("TUN_");
@@ -95,17 +98,20 @@
 template <typename TInWei, typename TAcc, typename TOut>
 static std::string get_definition_string_from_types()
 {
+    using namespace ck;
+
     std::string out;

-    out += " -DCK_PARAM_IN_WEI_DATATYPE=" + std::to_string(Driver::get_typeid_from_type<TInWei>()) +
-           " -DCK_PARAM_CONV_COMPTYPE=" + std::to_string(Driver::get_typeid_from_type<TAcc>()) +
-           " -DCK_PARAM_OUT_DATATYPE=" + std::to_string(Driver::get_typeid_from_type<TOut>());
+    out +=
+        " -DCK_PARAM_ABDataTypeEnum=" + std::to_string(get_datatype_enum_from_type<TInWei>::value) +
+        " -DCK_PARAM_AccDataTypeEnum=" + std::to_string(get_datatype_enum_from_type<TAcc>::value) +
+        " -DCK_PARAM_CDataTypeEnum=" + std::to_string(get_datatype_enum_from_type<TOut>::value);

     return (out);
 };

 static std::string
-get_definition_string_from_tunable(const tunable_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw* pt)
+get_definition_string_from_tunable(const tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw* pt)
 {
     std::string out;
@@ -209,7 +215,7 @@
           typename ConvDilations,
           typename InLeftPads,
           typename InRightPads>
-void online_device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(
+void online_device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw(
     olCompile::Handle* handle,
     const InLengths& in_n_c_hi_wi_lengths,
     const WeiLengths& wei_k_c_y_x_lengths,
@@ -221,10 +227,11 @@
     const Tensor<TInWei>& in_n_c_hi_wi,
     const Tensor<TInWei>& wei_k_c_y_x,
     Tensor<TOut>& out_n_k_ho_wo,
-    const tunable_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw* tunable,
+    const tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw* tunable,
     ck::index_t nrepeat)
 {
     using namespace ck;
+    using namespace ck_driver;
     using namespace detail_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw;

     using size_t = std::size_t;
@@ -288,8 +295,9 @@
     const std::vector<size_t> vgd1 = {static_cast<size_t>(tunable->BlockSize), 1, 1};
     const std::vector<size_t> vgd2 = {static_cast<size_t>(grid_size * tunable->BlockSize), 1, 1};

-    std::string program_name = "dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.cpp";
-    std::string algo_name    = "implicit_gemm_conv_fwd_v4r4_nchw";
+    std::string program_name =
+        "dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.cpp";
+    std::string algo_name = "implicit_gemm_conv_fwd_v4r4_dlops_nchw";

     std::string param = " -std=c++17 ";
     std::string network_config;
@@ -311,7 +319,7 @@
     KernelTimer timer1, timer2;
     std::string kernel_name;

-    kernel_name = "dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw_prepare";
+    kernel_name = "dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw_prepare";
     auto network_config_1 = network_config + "_1";

     timer1.Start();
@@ -337,7 +345,7 @@
         c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf);
     timer1.End();

-    kernel_name = "dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw";
+    kernel_name = "dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw";
     auto network_config_2 = network_config + "_2";

     timer2.Start();
@@ -356,8 +364,14 @@
     }

     {
-        auto ave_time1 = Driver::get_effective_average(kernel1_times);
-        auto ave_time2 = Driver::get_effective_average(kernel2_times);
+        auto ave_time1 =
+            std::accumulate(
+                std::next(kernel1_times.begin()), kernel1_times.end(), 0., std::plus<float>{}) /
+            (nrepeat - 1);
+        auto ave_time2 =
+            std::accumulate(
+                std::next(kernel2_times.begin()), kernel2_times.end(), 0., std::plus<float>{}) /
+            (nrepeat - 1);

         const auto N = in_n_c_hi_wi_lengths[I0];
         const auto C = in_n_c_hi_wi_lengths[I1];
...
@@ -11,11 +11,13 @@ namespace detail_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw {
 template <typename TInWei, typename TAcc, typename TOut>
 static std::string get_network_config_string_from_types()
 {
+    using namespace ck;
+
     std::string out;

-    out += static_cast<char>(Driver::get_typeid_from_type<TInWei>()) +
-           static_cast<char>(Driver::get_typeid_from_type<TAcc>()) +
-           static_cast<char>(Driver::get_typeid_from_type<TOut>());
+    out += std::to_string(get_datatype_enum_from_type<TInWei>::value) + "_" +
+           std::to_string(get_datatype_enum_from_type<TAcc>::value) + "_" +
+           std::to_string(get_datatype_enum_from_type<TOut>::value);

     return (out);
 };
@@ -93,11 +95,14 @@
 template <typename TInWei, typename TAcc, typename TOut>
 static std::string get_definition_string_from_types()
 {
+    using namespace ck;
+
     std::string out;

-    out += " -DCK_PARAM_IN_WEI_DATATYPE=" + std::to_string(Driver::get_typeid_from_type<TInWei>()) +
-           " -DCK_PARAM_CONV_COMPTYPE=" + std::to_string(Driver::get_typeid_from_type<TAcc>()) +
-           " -DCK_PARAM_OUT_DATATYPE=" + std::to_string(Driver::get_typeid_from_type<TOut>());
+    out +=
+        " -DCK_PARAM_ABDataTypeEnum=" + std::to_string(get_datatype_enum_from_type<TInWei>::value) +
+        " -DCK_PARAM_AccDataTypeEnum=" + std::to_string(get_datatype_enum_from_type<TAcc>::value) +
+        " -DCK_PARAM_CDataTypeEnum=" + std::to_string(get_datatype_enum_from_type<TOut>::value);

     return (out);
 };
@@ -222,6 +227,7 @@
     ck::index_t nrepeat)
 {
     using namespace ck;
+    using namespace ck_driver;
     using namespace detail_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw;

     using size_t = std::size_t;
@@ -349,8 +355,14 @@
     }

     {
-        auto ave_time1 = Driver::get_effective_average(kernel1_times);
-        auto ave_time2 = Driver::get_effective_average(kernel2_times);
+        auto ave_time1 =
+            std::accumulate(
+                std::next(kernel1_times.begin()), kernel1_times.end(), 0., std::plus<float>{}) /
+            (nrepeat - 1);
+        auto ave_time2 =
+            std::accumulate(
+                std::next(kernel2_times.begin()), kernel2_times.end(), 0., std::plus<float>{}) /
+            (nrepeat - 1);

         const auto N = in_n_c_hi_wi_lengths[I0];
         const auto C = in_n_c_hi_wi_lengths[I1];
...
@@ -12,11 +12,13 @@ namespace detail_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk {
 template <typename TInWei, typename TAcc, typename TOut>
 static std::string get_network_config_string_from_types()
 {
+    using namespace ck;
+
     std::string out;

-    out += static_cast<char>(Driver::get_typeid_from_type<TInWei>()) +
-           static_cast<char>(Driver::get_typeid_from_type<TAcc>()) +
-           static_cast<char>(Driver::get_typeid_from_type<TOut>());
+    out += std::to_string(get_datatype_enum_from_type<TInWei>::value) + "_" +
+           std::to_string(get_datatype_enum_from_type<TAcc>::value) + "_" +
+           std::to_string(get_datatype_enum_from_type<TOut>::value);

     return (out);
 };
@@ -94,11 +96,14 @@
 template <typename TInWei, typename TAcc, typename TOut>
 static std::string get_definition_string_from_types()
 {
+    using namespace ck;
+
     std::string out;

-    out += " -DCK_PARAM_IN_WEI_DATATYPE=" + std::to_string(Driver::get_typeid_from_type<TInWei>()) +
-           " -DCK_PARAM_CONV_COMPTYPE=" + std::to_string(Driver::get_typeid_from_type<TAcc>()) +
-           " -DCK_PARAM_OUT_DATATYPE=" + std::to_string(Driver::get_typeid_from_type<TOut>());
+    out +=
+        " -DCK_PARAM_ABDataTypeEnum=" + std::to_string(get_datatype_enum_from_type<TInWei>::value) +
+        " -DCK_PARAM_AccDataTypeEnum=" + std::to_string(get_datatype_enum_from_type<TAcc>::value) +
+        " -DCK_PARAM_CDataTypeEnum=" + std::to_string(get_datatype_enum_from_type<TOut>::value);

     return (out);
 };
@@ -302,15 +307,16 @@
     std::vector<float> kernel1_times;
     std::vector<float> kernel2_times;

-    KernelTimer timer1, timer2;
-    std::string kernel_name;
-
-    kernel_name = "dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk_prepare";
-    auto network_config_1 = network_config + "_1";
-
-    timer1.Start();
     for(index_t i = 0; i < nrepeat; ++i)
     {
+        KernelTimer timer1, timer2;
+        std::string kernel_name;
+
+        kernel_name =
+            "dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk_prepare";
+        auto network_config_1 = network_config + "_1";
+
+        timer1.Start();
         handle->AddKernel(algo_name, network_config_1, program_name, kernel_name, vld, vgd1, param)(
             static_cast<index_t>(in_n_hi_wi_c_lengths[I0]),
             static_cast<index_t>(in_n_hi_wi_c_lengths[I1]),
@@ -331,15 +337,12 @@
             b_k0_n_k1_grid_desc_dev_buf,
             c_m0_m1_m2_n_grid_desc_dev_buf,
             c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf);
-    }
-    timer1.End();
+        timer1.End();

         kernel_name = "dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk";
         auto network_config_2 = network_config + "_2";

         timer2.Start();
-    for(index_t i = 0; i < nrepeat; ++i)
-    {
         handle->AddKernel(algo_name, network_config_2, program_name, kernel_name, vld, vgd2, param)(
             reinterpret_cast<const TInWei*>(in_n_hi_wi_c_dev_buf.GetDeviceBuffer()),
             reinterpret_cast<const TInWei*>(wei_k_y_x_c_dev_buf.GetDeviceBuffer()),
@@ -348,12 +351,21 @@
             (const void*)(b_k0_n_k1_grid_desc_dev_buf),
             (const void*)(c_m0_m1_m2_n_grid_desc_dev_buf),
             (const void*)(c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf));
+        timer2.End();
+
+        kernel1_times.push_back(timer1.GetElapsedTime());
+        kernel2_times.push_back(timer2.GetElapsedTime());
     }
-    timer2.End();

     {
-        auto ave_time1 = timer1.GetElapsedTime() / nrepeat;
-        auto ave_time2 = timer2.GetElapsedTime() / nrepeat;
+        auto ave_time1 =
+            std::accumulate(
+                std::next(kernel1_times.begin()), kernel1_times.end(), 0., std::plus<float>{}) /
+            (nrepeat - 1);
+        auto ave_time2 =
+            std::accumulate(
+                std::next(kernel2_times.begin()), kernel2_times.end(), 0., std::plus<float>{}) /
+            (nrepeat - 1);

         const auto N = in_n_hi_wi_c_lengths[I0];
         const auto C = in_n_hi_wi_c_lengths[I3];
...
#pragma once

#include <numeric>

#include "device.hpp"
#include "host_tensor.hpp"
#include "handle.hpp"
#include "online_driver_common.hpp"
#include "convolution_problem_descriptor.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
#include "transform_forward_convolution_into_gemm_v6r1_nchw_kcyx_nkhw.hpp"
#include "conv_igemm_fwd_v6r1_dlops_nchw_kcyx_nkhw.hpp"
template <typename TInWei,
typename TAcc,
typename TOut,
typename InLengths,
typename WeiLengths,
typename OutLengths,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void online_device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw(
olCompile::Handle* handle,
const InLengths& in_n_c_hi_wi_lengths,
const WeiLengths& wei_k_c_y_x_lengths,
const OutLengths& out_n_k_ho_wo_lengths,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads& in_right_pads,
const Tensor<TInWei>& in_n_c_hi_wi,
const Tensor<TInWei>& wei_k_c_y_x,
Tensor<TOut>& out_n_k_ho_wo,
const ck_driver::CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw& compile_param,
ck::index_t nrepeat)
{
using namespace ck;
using namespace ck_driver;
using size_t = std::size_t;
std::cout << __func__ << std::endl;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
ConvolutionProblemDescriptor conv_problem_desc{in_n_c_hi_wi_lengths[I0],
out_n_k_ho_wo_lengths[I1],
in_n_c_hi_wi_lengths[I1],
wei_k_c_y_x_lengths[I2],
wei_k_c_y_x_lengths[I3],
in_n_c_hi_wi_lengths[I2],
in_n_c_hi_wi_lengths[I3],
out_n_k_ho_wo_lengths[I2],
out_n_k_ho_wo_lengths[I3],
conv_strides[I0],
conv_strides[I1],
conv_dilations[I0],
conv_dilations[I1],
in_left_pads[I0],
in_left_pads[I1],
in_right_pads[I0],
in_right_pads[I1],
get_datatype_enum_from_type<TInWei>::value,
get_datatype_enum_from_type<TInWei>::value,
get_datatype_enum_from_type<TOut>::value};
if(!ConvIgemmFwdV6r1DlopsNchwKcyxNkhw::IsValidCompileParameter(conv_problem_desc,
compile_param))
{
throw std::runtime_error("wrong! IsValidCompileParameter fail");
}
DeviceMem in_n_c_hi_wi_dev_buf(sizeof(TInWei) * in_n_c_hi_wi.mDesc.GetElementSpace());
DeviceMem wei_k_c_y_x_dev_buf(sizeof(TInWei) * wei_k_c_y_x.mDesc.GetElementSpace());
DeviceMem out_n_k_ho_wo_dev_buf(sizeof(TOut) * out_n_k_ho_wo.mDesc.GetElementSpace());
in_n_c_hi_wi_dev_buf.ToDevice(in_n_c_hi_wi.mData.data());
wei_k_c_y_x_dev_buf.ToDevice(wei_k_c_y_x.mData.data());
out_n_k_ho_wo_dev_buf.ToDevice(out_n_k_ho_wo.mData.data());
// the workspace saves the transformed tensor descriptors created by the prepare kernel
DeviceMem workspace_dev_buf(
ConvIgemmFwdV6r1DlopsNchwKcyxNkhw::GetWorkSpaceSize(conv_problem_desc, compile_param));
const auto block_size = std::size_t(
ConvIgemmFwdV6r1DlopsNchwKcyxNkhw::GetBlockSize(conv_problem_desc, compile_param));
const auto grid_size = std::size_t(
ConvIgemmFwdV6r1DlopsNchwKcyxNkhw::GetGridSize(conv_problem_desc, compile_param));
const std::vector<size_t> vld1 = {1, 1, 1};
const std::vector<size_t> vgd1 = {1, 1, 1};
const std::vector<size_t> vld2 = {static_cast<size_t>(block_size), 1, 1};
const std::vector<size_t> vgd2 = {static_cast<size_t>(grid_size * block_size), 1, 1};
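// launch configs: the prepare kernel runs as a single work-item (vld1/vgd1),
// while the main kernel uses block_size work-items per block across grid_size blocks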
std::string program_name =
"dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.cpp";
std::string algo_name = "implicit_gemm_conv_fwd_v6r1_dlops_nchw";
std::string compile_param_string = " -std=c++17 " + compile_param.GetCompileParameterString();
std::string network_config = compile_param_string;
std::vector<float> kernel1_times;
std::vector<float> kernel2_times;
for(index_t i = 0; i < nrepeat; ++i)
{
KernelTimer timer1, timer2;
std::string kernel_name;
kernel_name = "dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw_prepare";
auto network_config_1 = network_config + "_1";
timer1.Start();
handle->AddKernel(algo_name,
network_config_1,
program_name,
kernel_name,
vld1,
vgd1,
compile_param_string)(static_cast<index_t>(in_n_c_hi_wi_lengths[I0]),
static_cast<index_t>(in_n_c_hi_wi_lengths[I1]),
static_cast<index_t>(in_n_c_hi_wi_lengths[I2]),
static_cast<index_t>(in_n_c_hi_wi_lengths[I3]),
static_cast<index_t>(wei_k_c_y_x_lengths[I0]),
static_cast<index_t>(wei_k_c_y_x_lengths[I2]),
static_cast<index_t>(wei_k_c_y_x_lengths[I3]),
conv_strides[I0],
conv_strides[I1],
conv_dilations[I0],
conv_dilations[I1],
in_left_pads[I0],
in_left_pads[I1],
in_right_pads[I0],
in_right_pads[I1],
(void*)(workspace_dev_buf.GetDeviceBuffer()));
timer1.End();
kernel_name = "dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw";
auto network_config_2 = network_config + "_2";
timer2.Start();
handle->AddKernel(algo_name,
network_config_2,
program_name,
kernel_name,
vld2,
vgd2,
compile_param_string)(
reinterpret_cast<const TInWei*>(wei_k_c_y_x_dev_buf.GetDeviceBuffer()),
reinterpret_cast<const TInWei*>(in_n_c_hi_wi_dev_buf.GetDeviceBuffer()),
reinterpret_cast<TOut*>(out_n_k_ho_wo_dev_buf.GetDeviceBuffer()),
(const void*)(workspace_dev_buf.GetDeviceBuffer()));
timer2.End();
kernel1_times.push_back(timer1.GetElapsedTime());
kernel2_times.push_back(timer2.GetElapsedTime());
}
{
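// the first sample is dropped: on the first iteration AddKernel typically pays
// the one-time online-compilation cost, which would skew the average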
auto ave_time1 =
std::accumulate(
std::next(kernel1_times.begin()), kernel1_times.end(), 0., std::plus<float>{}) /
(nrepeat - 1);
auto ave_time2 =
std::accumulate(
std::next(kernel2_times.begin()), kernel2_times.end(), 0., std::plus<float>{}) /
(nrepeat - 1);
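// CalculateFlop() / 1e9 / time-in-ms == TFlop/s (1 GFlop per ms is 1 TFlop/s)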
float perf = (float)(conv_problem_desc.CalculateFlop()) /
(std::size_t(1000) * 1000 * 1000) / (ave_time1 + ave_time2);
std::cout << "Average time : " << ave_time1 + ave_time2 << " ms(" << ave_time1 << ", "
<< ave_time2 << "), " << perf << " TFlop/s" << std::endl;
};
// copy result back to host
out_n_k_ho_wo_dev_buf.FromDevice(out_n_k_ho_wo.mData.data());
}
#include "device.hpp"
#include "host_tensor.hpp"
#include "handle.hpp"
#include "online_driver_common.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
#include "transform_forward_convolution_into_gemm_v6r1_nchw_kcyx_nkhw.hpp"
#include "conv_tunable_fwd_v6r1_nchw_kcyx_nkhw.hpp"
namespace detail_dyn_conv_fwd_v6r1_nchw_kcyx_nkhw {
template <typename TInWei, typename TAcc, typename TOut>
static std::string get_network_config_string_from_types()
{
std::string out("DAT_");
// append each typeid character separately: summing three chars, as before,
// collapses them into a single (overflowed) character
out += static_cast<char>(Driver::get_typeid_from_type<TInWei>());
out += static_cast<char>(Driver::get_typeid_from_type<TAcc>());
out += static_cast<char>(Driver::get_typeid_from_type<TOut>());
return (out);
};
static std::string
get_network_config_string_from_tunable(const tunable_dyn_conv_fwd_v6r1_nchw_kcyx_nkhw& tunable)
{
std::string out("TUN_");
out += std::to_string(tunable.BlockSize) + "_";
out += std::to_string(tunable.GN0) + "x" + std::to_string(tunable.GK1) + "_";
out += std::to_string(tunable.GM1PerBlockGM11) + "x" + std::to_string(tunable.GN1PerBlockGN11) +
"x" + std::to_string(tunable.GK0PerBlock) + "_";
out += std::to_string(tunable.BM1PerThreadBM11) + "x" +
std::to_string(tunable.BN1PerThreadBN11) + "x" + std::to_string(tunable.BK0PerThread) +
"_";
out += std::to_string(tunable.BM10BN10ThreadClusterBM100) + "x" +
std::to_string(tunable.BM10BN10ThreadClusterBN100) + "x" +
std::to_string(tunable.BM10BN10ThreadClusterBM101) + "x" +
std::to_string(tunable.BM10BN10ThreadClusterBN101) + "_";
out += std::to_string(tunable.ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[0]) + "x" +
std::to_string(tunable.ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[1]) + "x" +
std::to_string(tunable.ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[2]) + "x" +
std::to_string(tunable.ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[3]) + "x" +
std::to_string(tunable.ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[4]) + "_";
out +=
std::to_string(tunable.ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[0]) + "x" +
std::to_string(tunable.ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[1]) + "x" +
std::to_string(tunable.ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[2]) + "x" +
std::to_string(tunable.ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[3]) + "x" +
std::to_string(tunable.ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[4]) + "_";
out += std::to_string(tunable.ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[0]) +
"x" +
std::to_string(tunable.ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[1]) +
"x" +
std::to_string(tunable.ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[2]) +
"x" +
std::to_string(tunable.ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[3]) +
"x" +
std::to_string(tunable.ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[4]) +
"_";
out += std::to_string(tunable.ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[0]) +
"x" +
std::to_string(tunable.ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[1]) +
"x" +
std::to_string(tunable.ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[2]) +
"x" +
std::to_string(tunable.ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[3]) +
"x" +
std::to_string(tunable.ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[4]) +
"_";
out += std::to_string(tunable.BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[0]) + "x" +
std::to_string(tunable.BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[1]) + "x" +
std::to_string(tunable.BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[2]) + "x" +
std::to_string(tunable.BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[3]) + "x" +
std::to_string(tunable.BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[4]) + "_";
out +=
std::to_string(tunable.BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[0]) + "x" +
std::to_string(tunable.BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[1]) + "x" +
std::to_string(tunable.BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[2]) + "x" +
std::to_string(tunable.BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[3]) + "x" +
std::to_string(tunable.BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[4]) + "_";
out += std::to_string(tunable.BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[0]) +
"x" +
std::to_string(tunable.BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[1]) +
"x" +
std::to_string(tunable.BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[2]) +
"x" +
std::to_string(tunable.BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[3]) +
"x" +
std::to_string(tunable.BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[4]) +
"_";
out += std::to_string(tunable.BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[0]) +
"x" +
std::to_string(tunable.BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[1]) +
"x" +
std::to_string(tunable.BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[2]) +
"x" +
std::to_string(tunable.BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[3]) +
"x" +
std::to_string(tunable.BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[4]) +
"_";
out += std::to_string(tunable.CThreadTransferDstScalarPerVector);
return (out);
};
template <typename TInWei, typename TAcc, typename TOut>
static std::string get_definition_string_from_types()
{
std::string out;
out += " -DCK_PARAM_IN_WEI_DATATYPE=" + std::to_string(Driver::get_typeid_from_type<TInWei>()) +
" -DCK_PARAM_ACC_DATATYPE=" + std::to_string(Driver::get_typeid_from_type<TAcc>()) +
" -DCK_PARAM_OUT_DATATYPE=" + std::to_string(Driver::get_typeid_from_type<TOut>());
return (out);
};
static std::string
get_definition_string_from_tunable(const tunable_dyn_conv_fwd_v6r1_nchw_kcyx_nkhw& tunable)
{
std::string out;
out += " -DCK_PARAM_BlockSize=" + std::to_string(tunable.BlockSize);
out += " -DCK_PARAM_GN0=" + std::to_string(tunable.GN0);
out += " -DCK_PARAM_GK1=" + std::to_string(tunable.GK1);
out += " -DCK_PARAM_GM1PerBlockGM11=" + std::to_string(tunable.GM1PerBlockGM11) +
" -DCK_PARAM_GN1PerBlockGN11=" + std::to_string(tunable.GN1PerBlockGN11) +
" -DCK_PARAM_GK0PerBlock=" + std::to_string(tunable.GK0PerBlock);
out += " -DCK_PARAM_BM1PerThreadBM11=" + std::to_string(tunable.BM1PerThreadBM11) +
" -DCK_PARAM_BN1PerThreadBN11=" + std::to_string(tunable.BN1PerThreadBN11) +
" -DCK_PARAM_BK0PerThread=" + std::to_string(tunable.BK0PerThread);
out += " -DCK_PARAM_BM10BN10ThreadClusterBM100=" +
std::to_string(tunable.BM10BN10ThreadClusterBM100) +
" -DCK_PARAM_BM10BN10ThreadClusterBN100=" +
std::to_string(tunable.BM10BN10ThreadClusterBN100) +
" -DCK_PARAM_BM10BN10ThreadClusterBM101=" +
std::to_string(tunable.BM10BN10ThreadClusterBM101) +
" -DCK_PARAM_BM10BN10ThreadClusterBN101=" +
std::to_string(tunable.BM10BN10ThreadClusterBN101);
out += " -DCK_PARAM_ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1=" +
std::to_string(tunable.ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[0]) + "," +
std::to_string(tunable.ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[1]) + "," +
std::to_string(tunable.ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[2]) + "," +
std::to_string(tunable.ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[3]) + "," +
std::to_string(tunable.ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[4]);
out +=
" -DCK_PARAM_ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1=" +
std::to_string(tunable.ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[0]) + "," +
std::to_string(tunable.ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[1]) + "," +
std::to_string(tunable.ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[2]) + "," +
std::to_string(tunable.ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[3]) + "," +
std::to_string(tunable.ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[4]);
out += " -DCK_PARAM_ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1=" +
std::to_string(tunable.ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[0]) +
"," +
std::to_string(tunable.ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[1]) +
"," +
std::to_string(tunable.ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[2]) +
"," +
std::to_string(tunable.ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[3]) +
"," +
std::to_string(tunable.ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[4]);
out += " -DCK_PARAM_ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1=" +
std::to_string(tunable.ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[0]) +
"," +
std::to_string(tunable.ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[1]) +
"," +
std::to_string(tunable.ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[2]) +
"," +
std::to_string(tunable.ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[3]) +
"," +
std::to_string(tunable.ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[4]);
out += " -DCK_PARAM_BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1=" +
std::to_string(tunable.BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[0]) + "," +
std::to_string(tunable.BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[1]) + "," +
std::to_string(tunable.BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[2]) + "," +
std::to_string(tunable.BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[3]) + "," +
std::to_string(tunable.BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[4]);
out +=
" -DCK_PARAM_BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1=" +
std::to_string(tunable.BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[0]) + "," +
std::to_string(tunable.BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[1]) + "," +
std::to_string(tunable.BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[2]) + "," +
std::to_string(tunable.BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[3]) + "," +
std::to_string(tunable.BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[4]);
out += " -DCK_PARAM_BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1=" +
std::to_string(tunable.BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[0]) +
"," +
std::to_string(tunable.BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[1]) +
"," +
std::to_string(tunable.BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[2]) +
"," +
std::to_string(tunable.BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[3]) +
"," +
std::to_string(tunable.BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[4]);
out += " -DCK_PARAM_BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1=" +
std::to_string(tunable.BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[0]) +
"," +
std::to_string(tunable.BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[1]) +
"," +
std::to_string(tunable.BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[2]) +
"," +
std::to_string(tunable.BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[3]) +
"," +
std::to_string(tunable.BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[4]);
out += " -DCK_PARAM_CThreadTransferDstScalarPerVector=" +
std::to_string(tunable.CThreadTransferDstScalarPerVector);
return (out);
};
} // namespace detail_dyn_conv_fwd_v6r1_nchw_kcyx_nkhw
template <typename TInWei,
typename TAcc,
typename TOut,
typename InLengths,
typename WeiLengths,
typename OutLengths,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void online_device_dynamic_convolution_forward_implicit_gemm_v6r1_nchw_kcyx_nkhw(
olCompile::Handle* handle,
const InLengths& in_n_c_hi_wi_lengths,
const WeiLengths& wei_k_c_y_x_lengths,
const OutLengths& out_n_k_ho_wo_lengths,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads& in_right_pads,
const Tensor<TInWei>& in_n_c_hi_wi,
const Tensor<TInWei>& wei_k_c_y_x,
Tensor<TOut>& out_n_k_ho_wo,
const tunable_dyn_conv_fwd_v6r1_nchw_kcyx_nkhw& tunable,
ck::index_t nrepeat)
{
using namespace ck;
using namespace detail_dyn_conv_fwd_v6r1_nchw_kcyx_nkhw;
using size_t = std::size_t;
////////////////////////////////////////////////////////////////////////////////////////////////////////////
// The following code is only used to compute grid_size, hasMainKBlockLoop and
// hasDoubleTailKBlockLoop
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
const auto in_n_c_hi_wi_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(in_n_c_hi_wi_lengths);
const auto wei_k_c_y_x_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_c_y_x_lengths);
const auto out_n_k_ho_wo_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(out_n_k_ho_wo_lengths);
const auto descs =
transform_forward_convolution_into_contraction_v6r1_nchw_kcyx_nkhw_pad(wei_k_c_y_x_desc,
in_n_c_hi_wi_desc,
out_n_k_ho_wo_desc,
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads,
tunable.GN0,
tunable.GK1);
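// descs bundles the transformed contraction descriptors; only A (descs[I0], for GK)
// and C (descs[I2], for GM1/GN1) are inspected on the host here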
const auto a_grid_desc_gk0_gm0_gm1_gk1 = descs[I0];
const auto c_grid_desc_gm0_gm1_gn0_gn1 = descs[I2];
const auto GM1 = c_grid_desc_gm0_gm1_gn0_gn1.GetLength(I1);
const auto GN1 = c_grid_desc_gm0_gm1_gn0_gn1.GetLength(I3);
const auto GK = a_grid_desc_gk0_gm0_gm1_gk1.GetLength(I0);
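// one workgroup per (GM11, GN11) output tile; the two flags below describe the
// double-buffered K loop: whether a main loop runs at all, and whether
// GK / GK0PerBlock being even leaves a double-tail iteration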
const index_t grid_size = (GM1 / tunable.GM1PerBlockGM11) * (GN1 / tunable.GN1PerBlockGN11);
const bool hasMainKBlockLoop = ((GK + tunable.GK0PerBlock) / (2 * tunable.GK0PerBlock) > 1);
const bool hasDoubleTailKBlockLoop = ((GK / tunable.GK0PerBlock) % 2 == 0);
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// these buffers are usually provided by the user application
DeviceMem in_n_c_hi_wi_dev_buf(sizeof(TInWei) * in_n_c_hi_wi.mDesc.GetElementSpace());
DeviceMem wei_k_c_y_x_dev_buf(sizeof(TInWei) * wei_k_c_y_x.mDesc.GetElementSpace());
DeviceMem out_n_k_ho_wo_dev_buf(sizeof(TOut) * out_n_k_ho_wo.mDesc.GetElementSpace());
in_n_c_hi_wi_dev_buf.ToDevice(in_n_c_hi_wi.mData.data());
wei_k_c_y_x_dev_buf.ToDevice(wei_k_c_y_x.mData.data());
out_n_k_ho_wo_dev_buf.ToDevice(out_n_k_ho_wo.mData.data());
// these are workspace buffers that should be expressed to the user by the corresponding
// workspace API
DeviceMem workspace_buf(4096);
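// the 4 KiB workspace is split into four 1 KiB slots (offsets 0/1024/2048/3072),
// one per transformed descriptor written by the prepare kernel below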
void* a_grid_desc_gk0_gm0_gm10_gm11_gk1_dev_buf = workspace_buf.GetDeviceBuffer();
void* b_grid_desc_gk0_gn0_gn10_gn11_gk1_dev_buf =
static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 1024);
void* c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1_dev_buf =
static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 2048);
void* c_grid_block_cluster_blockid_to_gm10_gn10_dev_buf =
static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 3072);
const std::vector<size_t> vld = {static_cast<size_t>(tunable.BlockSize), 1, 1};
const std::vector<size_t> vgd1 = {static_cast<size_t>(tunable.BlockSize), 1, 1};
const std::vector<size_t> vgd2 = {static_cast<size_t>(grid_size * tunable.BlockSize), 1, 1};
std::string program_name = "dynamic_convolution_forward_implicit_gemm_v6r1_nchw_kcyx_nkhw.cpp";
std::string algo_name = "implicit_gemm_conv_fwd_v6r1_nchw";
std::string param = " -std=c++17 ";
std::string network_config;
param += get_definition_string_from_types<TInWei, TAcc, TOut>() +
" -DCK_PARAM_HAS_MAIN_KBLOCK_LOOP=" + std::to_string(hasMainKBlockLoop) +
" -DCK_PARAM_HAS_DOUBLE_TAIL_KBLOCK_LOOP=" + std::to_string(hasDoubleTailKBlockLoop) +
get_definition_string_from_tunable(tunable);
network_config = get_network_config_string_from_types<TInWei, TAcc, TOut>() + "_" +
std::to_string(hasDoubleTailKBlockLoop) + "_" +
get_network_config_string_from_tunable(tunable);
std::vector<float> kernel1_times;
std::vector<float> kernel2_times;
for(index_t i = 0; i < nrepeat; ++i)
{
KernelTimer timer1, timer2;
std::string kernel_name;
kernel_name = "dynamic_convolution_forward_implicit_gemm_v6r1_nchw_kcyx_nkhw_prepare";
auto network_config_1 = network_config + "_1";
timer1.Start();
handle->AddKernel(algo_name, network_config_1, program_name, kernel_name, vld, vgd1, param)(
static_cast<index_t>(in_n_c_hi_wi_lengths[I0]),
static_cast<index_t>(in_n_c_hi_wi_lengths[I1]),
static_cast<index_t>(in_n_c_hi_wi_lengths[I2]),
static_cast<index_t>(in_n_c_hi_wi_lengths[I3]),
static_cast<index_t>(wei_k_c_y_x_lengths[I0]),
static_cast<index_t>(wei_k_c_y_x_lengths[I2]),
static_cast<index_t>(wei_k_c_y_x_lengths[I3]),
conv_strides[I0],
conv_strides[I1],
conv_dilations[I0],
conv_dilations[I1],
in_left_pads[I0],
in_left_pads[I1],
in_right_pads[I0],
in_right_pads[I1],
a_grid_desc_gk0_gm0_gm10_gm11_gk1_dev_buf,
b_grid_desc_gk0_gn0_gn10_gn11_gk1_dev_buf,
c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1_dev_buf,
c_grid_block_cluster_blockid_to_gm10_gn10_dev_buf);
timer1.End(); // ends the prepare-kernel timer (timer2 has not started yet)
kernel_name = "dynamic_convolution_forward_implicit_gemm_v6r1_nchw_kcyx_nkhw";
auto network_config_2 = network_config + "_2";
timer2.Start();
handle->AddKernel(algo_name, network_config_2, program_name, kernel_name, vld, vgd2, param)(
reinterpret_cast<const TInWei*>(wei_k_c_y_x_dev_buf.GetDeviceBuffer()),
reinterpret_cast<const TInWei*>(in_n_c_hi_wi_dev_buf.GetDeviceBuffer()),
reinterpret_cast<TOut*>(out_n_k_ho_wo_dev_buf.GetDeviceBuffer()),
(const void*)(a_grid_desc_gk0_gm0_gm10_gm11_gk1_dev_buf),
(const void*)(b_grid_desc_gk0_gn0_gn10_gn11_gk1_dev_buf),
(const void*)(c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1_dev_buf),
(const void*)(c_grid_block_cluster_blockid_to_gm10_gn10_dev_buf));
timer2.End();
kernel1_times.push_back(timer1.GetElapsedTime());
kernel2_times.push_back(timer2.GetElapsedTime());
}
{
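// get_effective_average drops the largest (slowest) sample and averages the rest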
auto ave_time1 = Driver::get_effective_average(kernel1_times);
auto ave_time2 = Driver::get_effective_average(kernel2_times);
const auto N = in_n_c_hi_wi_lengths[I0];
const auto C = in_n_c_hi_wi_lengths[I1];
const auto K = out_n_k_ho_wo_lengths[I1];
const auto Ho = out_n_k_ho_wo_lengths[I2];
const auto Wo = out_n_k_ho_wo_lengths[I3];
const auto Y = wei_k_c_y_x_lengths[I2];
const auto X = wei_k_c_y_x_lengths[I3];
float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) /
(std::size_t(1000) * 1000 * 1000) / (ave_time1 + ave_time2);
std::cout << "Average time : " << ave_time1 + ave_time2 << " ms(" << ave_time1 << ", "
<< ave_time2 << "), " << perf << " TFlop/s" << std::endl;
};
// copy result back to host
out_n_k_ho_wo_dev_buf.FromDevice(out_n_k_ho_wo.mData.data());
}
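For concreteness, the helpers above compose one flag string that is handed to the online compiler through AddKernel. For float in/acc/out (typeid 'F' == 70) it begins roughly as follows; the tunable values shown are hypothetical:

 -std=c++17 -DCK_PARAM_IN_WEI_DATATYPE=70 -DCK_PARAM_ACC_DATATYPE=70 -DCK_PARAM_OUT_DATATYPE=70
 -DCK_PARAM_HAS_MAIN_KBLOCK_LOOP=1 -DCK_PARAM_HAS_DOUBLE_TAIL_KBLOCK_LOOP=0
 -DCK_PARAM_BlockSize=256 -DCK_PARAM_GN0=4 -DCK_PARAM_GK1=1
 -DCK_PARAM_GM1PerBlockGM11=128 -DCK_PARAM_GN1PerBlockGN11=128 -DCK_PARAM_GK0PerBlock=8 ...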
-#ifndef OLC_DRIVER_COMMON_HPP
-#define OLC_DRIVER_COMMON_HPP
-
-#include <half.hpp>
-#include <vector>
-#include <cassert>
-
-// this enumerate should be synchronized with include/miopen.h
-typedef enum {
-    appHalf     = 0,
-    appFloat    = 1,
-    appInt32    = 2,
-    appInt8     = 3,
-    appInt8x4   = 4,
-    appBFloat16 = 5,
-    appDouble   = 6,
-} appDataType_t;
-
-namespace Driver {
-
-template <appDataType_t typeNum>
-struct get_type_from_type_enum
-{
-    using type = float;
-};
-
-template <>
-struct get_type_from_type_enum<appHalf>
-{
-    using type = half_float::half;
-};
-
-template <>
-struct get_type_from_type_enum<appFloat>
-{
-    using type = float;
-};
-
-template <>
-struct get_type_from_type_enum<appDouble>
-{
-    using type = double;
-};
-
-template <>
-struct get_type_from_type_enum<appInt32>
-{
-    using type = int;
-};
-
-static inline int get_typeid_from_type_enum(appDataType_t t)
-{
-    switch(t)
-    {
-    case appHalf: return (static_cast<int>('H'));
-    case appFloat: return (static_cast<int>('F'));
-    case appBFloat16: return (static_cast<int>('B'));
-    case appDouble: return (static_cast<int>('D'));
-    case appInt8:
-    case appInt8x4:
-    case appInt32: return (static_cast<int>('O'));
-    default: throw std::runtime_error("Only float, half, bfloat16 data type is supported."); break;
-    };
-};
-
-template <typename T>
-static inline int get_typeid_from_type()
-{
-    throw std::runtime_error("Unsupported typeid conversion for this type!");
-};
-
-template <>
-inline int get_typeid_from_type<float>()
-{
-    return (static_cast<int>('F'));
-};
-
-template <>
-inline int get_typeid_from_type<half_float::half>()
-{
-    return (static_cast<int>('H'));
-};
-
-template <>
-inline int get_typeid_from_type<double>()
-{
-    return (static_cast<int>('D'));
-};
-
-static inline float get_effective_average(std::vector<float>& values)
-{
-    assert(!values.empty());
-
-    if(values.size() == 1)
-        return (values[0]);
-    else
-    {
-        float sum    = 0.0f;
-        float maxVal = 0.0f;
-
-        for(const auto val : values)
-        {
-            if(maxVal < val)
-                maxVal = val;
-            sum += val;
-        };
-
-        return ((sum - maxVal) / (values.size() - 1));
-    };
-};
-
-} // namespace Driver
-
-#endif
+#ifndef ONLINE_DRIVER_COMMON_HPP
+#define ONLINE_DRIVER_COMMON_HPP
+
+#include <type_traits> // added editorially: the variadic gcd below uses std::enable_if
+
+namespace ck_driver {
+
+// greatest common divisor, aka highest common factor
+inline int gcd(int x, int y)
+{
+    if(x < 0)
+    {
+        return gcd(-x, y);
+    }
+    else if(y < 0)
+    {
+        return gcd(x, -y);
+    }
+    else if(x == y || x == 0)
+    {
+        return y;
+    }
+    else if(y == 0)
+    {
+        return x;
+    }
+    else if(x > y)
+    {
+        return gcd(x % y, y);
+    }
+    else
+    {
+        return gcd(x, y % x);
+    }
+}
+
+template <typename X,
+          typename... Ys,
+          typename std::enable_if<sizeof...(Ys) >= 2, bool>::type = false>
+auto gcd(X x, Ys... ys)
+{
+    return gcd(x, gcd(ys...));
+}
+
+} // namespace ck_driver
+
+#endif
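A quick usage sketch for the gcd helpers above (standalone; assumes the header as reconstructed here):

#include <iostream>
#include "online_driver_common.hpp"

int main()
{
    std::cout << ck_driver::gcd(24, 36) << '\n';        // pairwise form: 12
    std::cout << ck_driver::gcd(24, 36, 60, 8) << '\n'; // variadic fold: 4
    return 0;
}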
@@ -77,6 +77,7 @@ message(STATUS "Hip compiler flags: ${HIP_COMPILER_FLAGS}")
 ## HIP_COMPILER_FLAGS will be used for on-line compiling of the HIP kernels
 add_definitions("-DHIP_COMPILER_FLAGS=${HIP_COMPILER_FLAGS}")
+set(HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS} ${HIP_ONLINE_COMPILER_FLAGS}")
 file(GLOB_RECURSE COMPOSABLE_KERNEL_INCLUDE_1 "${PROJECT_SOURCE_DIR}/composable_kernel/include/*/*.hpp")
 file(GLOB COMPOSABLE_KERNEL_INCLUDE_2 "${PROJECT_SOURCE_DIR}/external/rocm/include/bfloat16_dev.hpp")
...
@@ -6,21 +6,16 @@ rm -rf CMakeFiles
 MY_PROJECT_SOURCE=../../../
 MY_PROJECT_INSTALL=../install.dir

 cmake \
 -D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL} \
 -D CMAKE_BUILD_TYPE=Release \
--D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx908 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -gline-tables-only -save-temps=$PWD" \
+-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX906 -O3 --amdgpu-target=gfx906 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -gline-tables-only -save-temps=$PWD" \
+-D HIP_ONLINE_COMPILER_FLAGS="-DCK_AMD_GPU_GFX906" \
 -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
 -D CMAKE_PREFIX_PATH=/opt/rocm \
 -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
 ${MY_PROJECT_SOURCE}

-#-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-spill-vgpr-to-agpr=0" \
-#-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -save-temps=$CWD" \
-#-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-enable-global-sgpr-addr -mllvm --amdgpu-spill-vgpr-to-agpr=0" \
-#-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-enable-global-sgpr-addr -mllvm --amdgpu-spill-vgpr-to-agpr=0 -save-temps=$CWD" \
-#-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-enable-global-sgpr-addr -mllvm --amdgpu-spill-vgpr-to-agpr=0 -v -gline-tables-only -save-temps=$CWD" \
 #CXX_FLAG_TMP=-Weverything
 # -Wno-c++98-compat \
 # -Wno-c++98-compat-pedantic \
...