Merge pull request #16 from ROCmSoftwarePlatform/develop

Merge develop into master

Merge pull request #16 from ROCmSoftwarePlatform/develop
Merge develop into master
31b40352 · Chao Liu · GitHub · 5781adf5 · b62bf8c3 · 5781adf5
Unverified commit 31b40352, authored Aug 18, 2021 by Chao Liu; committed by GitHub on Aug 18, 2021
5 changed files
--- a/host/online_compile/kernels_batch.cpp.in
+++ b/host/online_compile/kernels_batch.cpp.in
-#include "${KERNEL_SRC_HPP_FILENAME}"
--- a/host/solver/include/conv_igemm_fwd_v6r1_dlops_nchw_kcyx_nkhw.hpp
+++ b/host/solver/include/conv_igemm_fwd_v6r1_dlops_nchw_kcyx_nkhw.hpp
@@ -2,136 +2,150 @@
 #define CONV_IGEMM_FWD_V6R1_DLOPS_NCHW_KCYX_NKHW_HPP

 #include <numeric>
+#include <sstream>

-namespace ck_driver {
+namespace ck {
+namespace driver {

 struct CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw
 {
-    ck::DataTypeEnum_t ABDataTypeEnum;
-    ck::DataTypeEnum_t AccDataTypeEnum;
-    ck::DataTypeEnum_t CDataTypeEnum;
+    auto GetCompileParameterString() const
+    {
+        auto param = std::stringstream();

-    int BlockSize;
+        // clang-format off
+        param <<
+            " -DCK_PARAM_ABDataTypeEnum=" << 
+                ABDataTypeEnum <<
+            " -DCK_PARAM_AccDataTypeEnum=" << 
+                AccDataTypeEnum <<
+            " -DCK_PARAM_CDataTypeEnum=" << 
+                CDataTypeEnum <<
+            " -DCK_PARAM_BlockSize=" << 
+                BlockSize <<
+            " -DCK_PARAM_GN0=" << 
+                GN0 <<
+            " -DCK_PARAM_GK1=" << 
+                GK1 <<
+            " -DCK_PARAM_GM1PerBlockGM11=" 
+                << GM1PerBlockGM11 <<
+            " -DCK_PARAM_GN1PerBlockGN11=" <<
+                GN1PerBlockGN11 <<
+            " -DCK_PARAM_GK0PerBlock=" <<
+                GK0PerBlock <<
+            " -DCK_PARAM_BM1PerThreadBM11=" <<
+                BM1PerThreadBM11 <<
+            " -DCK_PARAM_BN1PerThreadBN11=" <<
+                BN1PerThreadBN11 <<
+            " -DCK_PARAM_BK0PerThread=" <<
+                BK0PerThread <<
+            " -DCK_PARAM_BM10BN10ThreadClusterBM10Xs=" <<
+                BM10BN10ThreadClusterBM10Xs[0] << "," <<
+                BM10BN10ThreadClusterBM10Xs[1] <<
+            " -DCK_PARAM_BM10BN10ThreadClusterBN10Xs=" <<
+                BM10BN10ThreadClusterBN10Xs[0] << "," <<
+                BM10BN10ThreadClusterBN10Xs[1] <<
+            " -DCK_PARAM_ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1=" <<
+                ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[0] << "," <<
+                ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[1] << "," <<
+                ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[2] << "," <<
+                ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[3] << "," <<
+                ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[4] <<
+            " -DCK_PARAM_ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1=" <<
+                ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[0] << "," <<
+                ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[1] << "," <<
+                ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[2] << "," <<
+                ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[3] << "," <<
+                ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[4] << 
+            " -DCK_PARAM_ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1=" <<
+                ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[0] << "," <<
+                ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[1] << "," <<
+                ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[2] << "," <<
+                ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[3] << "," <<
+                ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[4] <<
+            " -DCK_PARAM_ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1=" <<
+                ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[0] << "," <<
+                ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[1] << "," <<
+                ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[2] << "," <<
+                ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[3] << "," <<
+                ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[4] <<
+            " -DCK_PARAM_BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1=" <<
+                BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[0] << "," <<
+                BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[1] << "," <<
+                BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[2] << "," <<
+                BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[3] << "," <<
+                BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[4] <<
+            " -DCK_PARAM_BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1=" <<
+                BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[0] << "," <<
+                BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[1] << "," <<
+                BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[2] << "," <<
+                BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[3] << "," <<
+                BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[4] << 
+            " -DCK_PARAM_BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1=" <<
+                BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[0] << "," <<
+                BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[1] << "," <<
+                BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[2] << "," <<
+                BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[3] << "," <<
+                BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[4] << 
+            " -DCK_PARAM_BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1=" <<
+                BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[0] << "," <<
+                BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[1] << "," <<
+                BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[2] << "," <<
+                BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[3] << "," <<
+                BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[4] << 
+            " -DCK_PARAM_CThreadTransferDstScalarPerVector=" <<
+                CThreadTransferDstScalarPerVector <<
+            " -DCK_PARAM_HasMainKBlockLoop=" <<
+                static_cast<int>(HasMainKBlockLoop) <<
+            " -DCK_PARAM_HasDoubleTailKBlockLoop=" <<
+                static_cast<int>(HasDoubleTailKBlockLoop);
+        // clang-format on

-    int GN0;
-    int GK1;
+        return param.str();
+    }

-    int GM1PerBlockGM11;
-    int GN1PerBlockGN11;
-    int GK0PerBlock;
+    ck::DataTypeEnum_t ABDataTypeEnum  = ck::DataTypeEnum_t::Unknown;
+    ck::DataTypeEnum_t AccDataTypeEnum = ck::DataTypeEnum_t::Unknown;
+    ck::DataTypeEnum_t CDataTypeEnum   = ck::DataTypeEnum_t::Unknown;

-    int BM1PerThreadBM11;
-    int BN1PerThreadBN11;
-    int BK0PerThread;
+    int BlockSize = -1;

-    std::array<int, 2> BM10BN10ThreadClusterBM10Xs;
-    std::array<int, 2> BM10BN10ThreadClusterBN10Xs;
+    int GN0 = -1;
+    int GK1 = -1;

-    std::array<int, 5> ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1;
-    std::array<int, 5> ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1;
-    std::array<int, 5> ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1;
-    std::array<int, 5> ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1;
+    int GM1PerBlockGM11 = -1;
+    int GN1PerBlockGN11 = -1;
+    int GK0PerBlock     = -1;

-    std::array<int, 5> BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1;
-    std::array<int, 5> BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1;
-    std::array<int, 5> BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1;
-    std::array<int, 5> BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1;
+    int BM1PerThreadBM11 = -1;
+    int BN1PerThreadBN11 = -1;
+    int BK0PerThread     = -1;

-    int CThreadTransferDstScalarPerVector;
+    std::array<int, 2> BM10BN10ThreadClusterBM10Xs = {-1, -1};
+    std::array<int, 2> BM10BN10ThreadClusterBN10Xs = {-1, -1};

-    bool HasMainKBlockLoop;
-    bool HasDoubleTailKBlockLoop;
+    std::array<int, 5> ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1 = {
+        -1, -1, -1, -1, -1};
+    std::array<int, 5> ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1 = {
+        -1, -1, -1, -1, -1};
+    std::array<int, 5> ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 = {
+        -1, -1, -1, -1, -1};
+    std::array<int, 5> ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 = {
+        -1, -1, -1, -1, -1};

-    auto GetCompileParameterString() const
-    {
-        // clang-format off
-        return
-            " -DCK_PARAM_ABDataTypeEnum=" + 
-                std::to_string(ABDataTypeEnum) + 
-            " -DCK_PARAM_AccDataTypeEnum=" + 
-                std::to_string(AccDataTypeEnum) +
-            " -DCK_PARAM_CDataTypeEnum=" + 
-                std::to_string(CDataTypeEnum) + 
-            " -DCK_PARAM_BlockSize=" +
-                std::to_string(BlockSize) +
-            " -DCK_PARAM_GN0=" +
-                std::to_string(GN0) +
-            " -DCK_PARAM_GK1=" +
-                std::to_string(GK1) +
-            " -DCK_PARAM_GM1PerBlockGM11=" +
-                std::to_string(GM1PerBlockGM11) +
-            " -DCK_PARAM_GN1PerBlockGN11=" +
-                std::to_string(GN1PerBlockGN11) +
-            " -DCK_PARAM_GK0PerBlock=" + 
-                std::to_string(GK0PerBlock) +
-            " -DCK_PARAM_BM1PerThreadBM11=" +
-                std::to_string(BM1PerThreadBM11) +
-            " -DCK_PARAM_BN1PerThreadBN11=" +
-                std::to_string(BN1PerThreadBN11) +
-            " -DCK_PARAM_BK0PerThread=" +
-                std::to_string(BK0PerThread) +
-            " -DCK_PARAM_BM10BN10ThreadClusterBM10Xs=" +
-                std::to_string(BM10BN10ThreadClusterBM10Xs[0]) + "," +
-                std::to_string(BM10BN10ThreadClusterBM10Xs[1]) +
-            " -DCK_PARAM_BM10BN10ThreadClusterBN10Xs=" +
-                std::to_string(BM10BN10ThreadClusterBN10Xs[0]) + "," +
-                std::to_string(BM10BN10ThreadClusterBN10Xs[1]) +
-            " -DCK_PARAM_ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1=" +
-                std::to_string(ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[0]) + "," +
-                std::to_string(ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[1]) + "," +
-                std::to_string(ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[2]) + "," +
-                std::to_string(ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[3]) + "," +
-                std::to_string(ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[4]) +
-            " -DCK_PARAM_ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1=" +
-                std::to_string(ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[0]) + "," +
-                std::to_string(ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[1]) + "," +
-                std::to_string(ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[2]) + "," +
-                std::to_string(ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[3]) + "," +
-                std::to_string(ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[4]) +
-            " -DCK_PARAM_ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1=" +
-                std::to_string(ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[0]) +  "," +
-                std::to_string(ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[1]) + "," +
-                std::to_string(ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[2]) + "," +
-                std::to_string(ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[3]) + "," +
-                std::to_string(ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[4]) +
-            " -DCK_PARAM_ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1=" +
-                std::to_string(ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[0]) + "," +
-                std::to_string(ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[1]) + "," +
-                std::to_string(ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[2]) + "," +
-                std::to_string(ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[3]) + "," +
-                std::to_string(ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[4]) +
-            " -DCK_PARAM_BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1=" +
-                std::to_string(BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[0]) + "," +
-                std::to_string(BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[1]) + "," +
-                std::to_string(BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[2]) + "," +
-                std::to_string(BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[3]) + "," +
-                std::to_string(BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[4]) +
-            " -DCK_PARAM_BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1=" +
-                std::to_string(BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[0]) + "," +
-                std::to_string(BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[1]) + "," +
-                std::to_string(BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[2]) + "," +
-                std::to_string(BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[3]) + "," +
-                std::to_string(BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[4]) +
-            " -DCK_PARAM_BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1=" +
-                std::to_string(BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[0]) + "," +
-                std::to_string(BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[1]) + "," +
-                std::to_string(BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[2]) + "," +
-                std::to_string(BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[3]) + "," +
-                std::to_string(BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[4]) +
-            " -DCK_PARAM_BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1=" +
-                std::to_string(BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[0]) + "," +
-                std::to_string(BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[1]) + "," +
-                std::to_string(BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[2]) + "," +
-                std::to_string(BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[3]) + "," +
-                std::to_string(BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[4]) +
-            " -DCK_PARAM_CThreadTransferDstScalarPerVector=" +
-                std::to_string(CThreadTransferDstScalarPerVector) +
-            " -DCK_PARAM_HasMainKBlockLoop=" +
-                std::to_string(HasMainKBlockLoop) + 
-            " -DCK_PARAM_HasDoubleTailKBlockLoop=" +
-                std::to_string(HasDoubleTailKBlockLoop);
-        // clang-format on
-    }
+    std::array<int, 5> BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1 = {
+        -1, -1, -1, -1, -1};
+    std::array<int, 5> BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1 = {
+        -1, -1, -1, -1, -1};
+    std::array<int, 5> BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 = {
+        -1, -1, -1, -1, -1};
+    std::array<int, 5> BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 = {
+        -1, -1, -1, -1, -1};
+
+    int CThreadTransferDstScalarPerVector = -1;
+
+    bool HasMainKBlockLoop       = false;
+    bool HasDoubleTailKBlockLoop = false;
 };

 struct TunableConvIgemmFwdV6r1DlopsNchwKcyxNkhw
@@ -229,8 +243,6 @@ struct ConvIgemmFwdV6r1DlopsNchwKcyxNkhw
    CalculateCompileParameterBasedOnTunable(const ConvolutionProblemDescriptor& conv_problem_desc,
                                            const TunableConvIgemmFwdV6r1DlopsNchwKcyxNkhw& tunable)
    {
-        using namespace ck;
-
        const int C  = conv_problem_desc.C;
        const int Y  = conv_problem_desc.Y;
        const int X  = conv_problem_desc.X;
@@ -247,12 +259,17 @@ struct ConvIgemmFwdV6r1DlopsNchwKcyxNkhw

        DataTypeEnum_t AccDataTypeEnum;

-        switch(ABDataTypeEnum)
+        if(ABDataTypeEnum == DataTypeEnum_t::Float || ABDataTypeEnum == DataTypeEnum_t::Half)
+        {
+            AccDataTypeEnum = DataTypeEnum_t::Float;
+        }
+        else if(ABDataTypeEnum == DataTypeEnum_t::Int8)
        {
-        case DataTypeEnum_t::Float:
-        case DataTypeEnum_t::Half: AccDataTypeEnum = DataTypeEnum_t::Float; break;
-        case DataTypeEnum_t::Int8: AccDataTypeEnum = DataTypeEnum_t::Int32; break;
-        default: return std::make_tuple(CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw{}, false);
+            AccDataTypeEnum = DataTypeEnum_t::Int32;
+        }
+        else
+        {
+            return std::make_tuple(CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw{}, false);
        }

        const int BlockSize = tunable.BlockSize;
@@ -342,7 +359,7 @@ struct ConvIgemmFwdV6r1DlopsNchwKcyxNkhw
    {
        for(const auto& tunable : generate_tunable_list_conv_igemm_fwd_v6r1_dlops_nchw_kcyx_nkhw())
        {
-            CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw compile_param;
+            CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw compile_param{};
            bool found = false;

            std::tie(compile_param, found) =
@@ -368,8 +385,6 @@ struct ConvIgemmFwdV6r1DlopsNchwKcyxNkhw
    IsValidCompileParameter(const ConvolutionProblemDescriptor& conv_problem_desc,
                            const CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw& compile_param)
    {
-        using namespace ck;
-
        const int N  = conv_problem_desc.N;
        const int K  = conv_problem_desc.K;
        const int C  = conv_problem_desc.C;
@@ -669,5 +684,6 @@ struct ConvIgemmFwdV6r1DlopsNchwKcyxNkhw
    }
 };

-} // namespace ck_driver
+} // namespace driver
+} // namespace ck
 #endif
--- a/host/solver/include/convolution_problem_descriptor.hpp
+++ b/host/solver/include/convolution_problem_descriptor.hpp
 #ifndef CONVOLUTION_PROBLEM_DESCRIPTOR
 #define CONVOLUTION_PROBLEM_DESCRIPTOR

-namespace ck_driver {
+namespace ck {
+namespace driver {

 struct ConvolutionProblemDescriptor
 {
@@ -75,5 +76,6 @@ struct ConvolutionProblemDescriptor
    std::size_t CalculateFlop() const { return 2L * N * K * C * Y * X * Ho * Wo; }
 };

-} // namespace ck_driver
+} // namespace driver
+} // namespace ck
 #endif
--- a/host/driver_online/include/online_driver_common.hpp
+++ b/host/driver_online/include/online_driver_common.hpp
-#ifndef ONLINE_DRIVER_COMMON_HPP
-#define ONLINE_DRIVER_COMMON_HPP
+#ifndef CK_SOLVER_COMMON_HPP
+#define CK_SOLVER_COMMON_HPP

-namespace ck_driver {
-
-inline auto get_ck_hip_online_compile_common_flag()
-{
-    std::string param = " -std=c++17";
-
-    return param;
-}
+namespace ck {
+namespace driver {

 // greatest common divisor, aka highest common factor
 inline int gcd(int x, int y)
@@ -47,5 +41,6 @@ auto gcd(X x, Ys... ys)
    return gcd(x, gcd(ys...));
 }

-} // namespace ck_driver
+} // namespace driver
+} // namespace ck
 #endif
--- a/script/cmake-rocm.sh
+++ b/script/cmake-rocm.sh
@@ -3,40 +3,16 @@ rm -f CMakeCache.txt
 rm -f *.cmake
 rm -rf CMakeFiles

-MY_PROJECT_SOURCE=../../../
+MY_PROJECT_SOURCE=../../..
 MY_PROJECT_INSTALL=../install.dir

 cmake                                                                                                                                          \
 -D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL}                                                                                                  \
+-D HALF_INCLUDE_DIR="/root/workspace/external/half/include"                                                                                    \
+-D BUILD_DEV=ON                                                                                                                                \
 -D CMAKE_BUILD_TYPE=Release                                                                                                                    \
--D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX906 -O3 --amdgpu-target=gfx906 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -gline-tables-only -save-temps=$PWD"   \
--D HIP_ONLINE_COMPILER_FLAGS="-DCK_AMD_GPU_GFX906"                                                                                             \
+-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 -O3 --amdgpu-target=gfx908 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -gline-tables-only -save-temps=$PWD"   \
 -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc                                                                                                      \
 -D CMAKE_PREFIX_PATH=/opt/rocm                                                                                                                 \
 -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON                                                                                                              \
 ${MY_PROJECT_SOURCE}
-
-#CXX_FLAG_TMP=-Weverything
-#            -Wno-c++98-compat \
-#            -Wno-c++98-compat-pedantic \
-#            -Wno-conversion \
-#            -Wno-double-promotion \
-#            -Wno-exit-time-destructors \
-#            -Wno-extra-semi \
-#            -Wno-float-conversion \
-#            -Wno-gnu-anonymous-struct \
-#            -Wno-gnu-zero-variadic-macro-arguments \
-#            -Wno-missing-noreturn \
-#            -Wno-missing-prototypes \
-#            -Wno-nested-anon-types \
-#            -Wno-padded \
-#            -Wno-return-std-move-in-c++11 \
-#            -Wno-shorten-64-to-32 \
-#            -Wno-sign-conversion \
-#            -Wno-unknown-warning-option \
-#            -Wno-unused-command-line-argument \
-#            -Wno-weak-vtables \
-#            -Wno-covered-switch-default \
-#            -Wno-disabled-macro-expansion \
-#            -Wno-undefined-reinterpret-cast
-