Commit 8c4e33f1 authored by Chao Liu

Merge remote-tracking branch 'origin/develop' into v5r1_add

parents 5aed38d4 3737bb03
@@ -6,22 +6,6 @@
namespace ck {
template <typename... Ts>
struct is_known_at_compile_time<Tuple<Ts...>>
{
__host__ __device__ static constexpr bool IsKnownAtCompileTime()
{
return container_reduce(
Tuple<Ts...>{},
[](auto x, bool r) {
return is_known_at_compile_time<remove_cvref_t<decltype(x)>>::value & r;
},
true);
}
static constexpr bool value = IsKnownAtCompileTime();
};
template <typename F, index_t N>
__host__ __device__ constexpr auto generate_tuple(F&& f, Number<N>)
{
@@ -29,6 +13,13 @@ __host__ __device__ constexpr auto generate_tuple(F&& f, Number<N>)
typename arithmetic_sequence_gen<0, N, 1>::type{});
}
template <typename F, index_t N>
__host__ __device__ constexpr auto generate_tie(F&& f, Number<N>)
{
return unpack([&f](auto&&... xs) { return tie(f(xs)...); },
typename arithmetic_sequence_gen<0, N, 1>::type{});
}
namespace detail {
template <typename F, typename X, index_t... Is>
......
@@ -31,21 +31,6 @@ using remove_cvref_t = remove_cv_t<std::remove_reference_t<T>>;
template <typename T>
inline constexpr bool is_pointer_v = std::is_pointer<T>::value;
template <typename T>
struct is_known_at_compile_time;
template <>
struct is_known_at_compile_time<index_t>
{
static constexpr bool value = false;
};
template <typename T, T X>
struct is_known_at_compile_time<integral_constant<T, X>>
{
static constexpr bool value = true;
};
template <typename Y, typename X, typename enable_if<sizeof(X) == sizeof(Y), bool>::type = false>
__host__ __device__ constexpr Y as_type(X x)
{
......
@@ -3,7 +3,6 @@
#include <iostream>
#include "device.hpp"
#include "gemm_common.hpp"
#include "device_base.hpp"
#include "device_gemm.hpp"
#include "common_header.hpp"
......
#ifndef GEMM_COMMON_HPP
#define GEMM_COMMON_HPP
enum GemmMatrixLayout
{
MK_KN_MN, // 0
MK_NK_MN, // 1
KM_KN_MN, // 2
KM_NK_MN, // 3
MK_KN_NM, // 4
MK_NK_NM, // 5
KM_KN_NM, // 6
KM_NK_NM, // 7
};
enum GemmDataType
{
F32_F32_F32, // 0
F16_F16_F16, // 1
};
#endif
@@ -31,7 +31,7 @@ extern "C" {
#endif
#ifdef __HIP_PLATFORM_HCC__
#define EXECUTION_SPECIFIER __device__
#define EXECUTION_SPECIFIER __device__ __host__
#else
#define EXECUTION_SPECIFIER
#endif // MIOPEN_BACKEND_HIP
......
@@ -104,7 +104,7 @@ void device_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk
constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 4;
constexpr index_t GemmCThreadTransferDstScalarPerVector = 1;
#elif 1
#elif 0
// [M, N, K0, K1] = [256, 128, 4, 8], C = 128, for fp16
constexpr index_t BlockSize = 256;
@@ -132,7 +132,7 @@ void device_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk
constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8;
constexpr index_t GemmCThreadTransferDstScalarPerVector = 1;
#elif 0
#elif 1
// [M, N, K0, K1] = [128, 256, 4, 8], C = 128, for fp16
constexpr index_t BlockSize = 256;
......
@@ -325,30 +325,30 @@ int main(int argc, char* argv[])
// no initialization
break;
case 1:
out.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
out.GenerateTensorValue(GeneratorTensor_1<out_data_t>{}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_1<in_data_t>{}, num_thread);
break;
case 2:
out.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
out.GenerateTensorValue(GeneratorTensor_1<out_data_t>{}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_2<in_data_t>{-5, 5}, num_thread);
break;
case 3:
out.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
out.GenerateTensorValue(GeneratorTensor_2<out_data_t>{-5, 5}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_1<in_data_t>{}, num_thread);
break;
case 4:
out.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
out.GenerateTensorValue(GeneratorTensor_2<out_data_t>{-5, 5}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_2<in_data_t>{-5, 5}, num_thread);
break;
case 5:
out.GenerateTensorValue(GeneratorTensor_3<float>{0.0, 1.0}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_3<float>{-0.5, 0.5}, num_thread);
out.GenerateTensorValue(GeneratorTensor_3<out_data_t>{0.0, 1.0}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_3<in_data_t>{-0.5, 0.5}, num_thread);
break;
default:
out.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread);
out.GenerateTensorValue(GeneratorTensor_2<out_data_t>{1, 5}, num_thread);
auto gen_wei = [](auto... is) {
return GeneratorTensor_2{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...);
return GeneratorTensor_2<in_data_t>{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...);
};
wei.GenerateTensorValue(gen_wei, num_thread);
}
......
@@ -77,13 +77,29 @@ void host_convolution_forward(const Tensor<TIn>& in,
if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
wi < in.mDesc.GetLengths()[3])
{
v += static_cast<const double>(in(n, c, hi, wi)) *
static_cast<const double>(wei(k, c, y, x));
if constexpr(is_same<TIn, ushort>::value)
{
v += bfloat16_to_float(in(n, c, hi, wi)) *
bfloat16_to_float(wei(k, c, y, x));
}
else
{
v += static_cast<const double>(in(n, c, hi, wi)) *
static_cast<const double>(wei(k, c, y, x));
}
}
}
}
}
out(n, k, ho, wo) = v;
if constexpr(is_same<TOut, ushort>::value)
{
out(n, k, ho, wo) = float_to_bfloat16(v);
}
else
{
out(n, k, ho, wo) = v;
}
};
auto f_nhwc = [&](auto n, auto ho, auto wo, auto k) {
@@ -99,13 +115,28 @@ void host_convolution_forward(const Tensor<TIn>& in,
if(hi >= 0 && hi < in.mDesc.GetLengths()[1] && wi >= 0 &&
wi < in.mDesc.GetLengths()[2])
{
v += static_cast<const double>(in(n, hi, wi, c)) *
static_cast<const double>(wei(k, y, x, c));
if constexpr(is_same<TIn, ushort>::value)
{
v += bfloat16_to_float(in(n, hi, wi, c)) *
bfloat16_to_float(wei(k, y, x, c));
}
else
{
v += static_cast<const double>(in(n, hi, wi, c)) *
static_cast<const double>(wei(k, y, x, c));
}
}
}
}
}
out(n, ho, wo, k) = v;
if constexpr(is_same<TOut, ushort>::value)
{
out(n, ho, wo, k) = float_to_bfloat16(v);
}
else
{
out(n, ho, wo, k) = v;
}
};
if(layout == ConvTensorLayout::NCHW)
@@ -223,10 +254,14 @@ int main(int argc, char* argv[])
using in_data_t = float;
using acc_data_t = float;
using out_data_t = float;
#elif 1
#elif 0
using in_data_t = half_t;
using acc_data_t = float;
using out_data_t = half_t;
#elif 1
using in_data_t = ushort;
using acc_data_t = float;
using out_data_t = ushort;
#elif 1
using in_data_t = int8_t;
using acc_data_t = int32_t;
@@ -292,30 +327,30 @@ int main(int argc, char* argv[])
// no initialization
break;
case 1:
in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
in.GenerateTensorValue(GeneratorTensor_1<in_data_t>{}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_1<in_data_t>{}, num_thread);
break;
case 2:
in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
in.GenerateTensorValue(GeneratorTensor_1<in_data_t>{}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_2<in_data_t>{-5, 5}, num_thread);
break;
case 3:
in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
in.GenerateTensorValue(GeneratorTensor_2<in_data_t>{-5, 5}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_1<in_data_t>{}, num_thread);
break;
case 4:
in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
in.GenerateTensorValue(GeneratorTensor_2<in_data_t>{-5, 5}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_2<in_data_t>{-5, 5}, num_thread);
break;
case 5:
in.GenerateTensorValue(GeneratorTensor_3<float>{0.0, 1.0}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_3<float>{-0.5, 0.5}, num_thread);
in.GenerateTensorValue(GeneratorTensor_3<in_data_t>{0.0, 1.0}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_3<in_data_t>{-0.5, 0.5}, num_thread);
break;
default:
in.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread);
in.GenerateTensorValue(GeneratorTensor_2<in_data_t>{1, 5}, num_thread);
auto gen_wei = [](auto... is) {
return GeneratorTensor_2{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...);
return GeneratorTensor_2<in_data_t>{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...);
};
wei.GenerateTensorValue(gen_wei, num_thread);
}
......
@@ -297,30 +297,30 @@ int main(int argc, char* argv[])
// no initialization
break;
case 1:
in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
out.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
in.GenerateTensorValue(GeneratorTensor_1<in_data_t>{}, num_thread);
out.GenerateTensorValue(GeneratorTensor_1<out_data_t>{}, num_thread);
break;
case 2:
in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
out.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
in.GenerateTensorValue(GeneratorTensor_1<in_data_t>{}, num_thread);
out.GenerateTensorValue(GeneratorTensor_2<out_data_t>{-5, 5}, num_thread);
break;
case 3:
in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
out.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
in.GenerateTensorValue(GeneratorTensor_2<in_data_t>{-5, 5}, num_thread);
out.GenerateTensorValue(GeneratorTensor_1<out_data_t>{}, num_thread);
break;
case 4:
in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
out.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
in.GenerateTensorValue(GeneratorTensor_2<in_data_t>{-5, 5}, num_thread);
out.GenerateTensorValue(GeneratorTensor_2<out_data_t>{-5, 5}, num_thread);
break;
case 5:
in.GenerateTensorValue(GeneratorTensor_3<float>{-0.1, 0.1}, num_thread);
out.GenerateTensorValue(GeneratorTensor_3<float>{-0.1, 0.1}, num_thread);
in.GenerateTensorValue(GeneratorTensor_3<in_data_t>{-0.1, 0.1}, num_thread);
out.GenerateTensorValue(GeneratorTensor_3<out_data_t>{-0.1, 0.1}, num_thread);
break;
default:
in.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread);
in.GenerateTensorValue(GeneratorTensor_2<in_data_t>{1, 5}, num_thread);
auto gen_out = [](auto... is) {
return GeneratorTensor_2{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...);
return GeneratorTensor_2<out_data_t>{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...);
};
out.GenerateTensorValue(gen_out, num_thread);
}
......
@@ -10,7 +10,6 @@
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "gemm_common.hpp"
#include "host_gemm.hpp"
#include "device_tensor.hpp"
#include "device_gemm_xdlops_mk_kn_mn.hpp"
@@ -31,6 +30,18 @@
#define USE_GEMM_XDL_KM_KN_NM 0
#define USE_GEMM_XDL_KM_NK_NM 0
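// Layout naming is <A>_<B>_<C> index order, e.g. MK_NK_MN: A is indexed as a(m, k), B as b(n, k), and C as c(m, n) (see host_gemm below).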
enum GemmMatrixLayout
{
MK_KN_MN, // 0
MK_NK_MN, // 1
KM_KN_MN, // 2
KM_NK_MN, // 3
MK_KN_NM, // 4
MK_NK_NM, // 5
KM_KN_NM, // 6
KM_NK_NM // 7
};
enum GemmAlgo
{
Xdl_MK_KN_MN, // 0
@@ -43,6 +54,161 @@ enum GemmAlgo
Xdl_KM_NK_NM, // 7
};
template <typename AType, typename BType, typename CType>
void host_gemm(const Tensor<AType>& a,
const Tensor<BType>& b,
Tensor<CType>& c,
const GemmMatrixLayout layout)
{
if(layout == GemmMatrixLayout::MK_KN_MN)
{
auto f_mk_kn_mn = [&](auto m, auto n) {
const int K = a.mDesc.GetLengths()[1];
double v = 0;
for(int k = 0; k < K; ++k)
{
v += static_cast<const double>(a(m, k)) * static_cast<const double>(b(k, n));
}
c(m, n) = v;
};
make_ParallelTensorFunctor(f_mk_kn_mn, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
std::thread::hardware_concurrency());
}
else if(layout == GemmMatrixLayout::MK_NK_MN)
{
auto f_mk_nk_mn = [&](auto m, auto n) {
const int K = a.mDesc.GetLengths()[1];
double v = 0;
for(int k = 0; k < K; ++k)
{
v += static_cast<const double>(a(m, k)) * static_cast<const double>(b(n, k));
}
c(m, n) = v;
};
make_ParallelTensorFunctor(f_mk_nk_mn, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
std::thread::hardware_concurrency());
}
else if(layout == GemmMatrixLayout::KM_KN_MN)
{
auto f_km_kn_mn = [&](auto m, auto n) {
const int K = a.mDesc.GetLengths()[0];
double v = 0;
for(int k = 0; k < K; ++k)
{
v += static_cast<const double>(a(k, m)) * static_cast<const double>(b(k, n));
}
c(m, n) = v;
};
make_ParallelTensorFunctor(f_km_kn_mn, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
std::thread::hardware_concurrency());
}
else if(layout == GemmMatrixLayout::KM_NK_MN)
{
auto f_km_nk_mn = [&](auto m, auto n) {
const int K = a.mDesc.GetLengths()[0];
double v = 0;
for(int k = 0; k < K; ++k)
{
v += static_cast<const double>(a(k, m)) * static_cast<const double>(b(n, k));
}
c(m, n) = v;
};
make_ParallelTensorFunctor(f_km_nk_mn, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
std::thread::hardware_concurrency());
}
else if(layout == GemmMatrixLayout::MK_KN_NM)
{
auto f_mk_kn_nm = [&](auto n, auto m) {
const int K = a.mDesc.GetLengths()[1];
double v = 0;
for(int k = 0; k < K; ++k)
{
v += static_cast<const double>(a(m, k)) * static_cast<const double>(b(k, n));
}
c(n, m) = v;
};
make_ParallelTensorFunctor(f_mk_kn_nm, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
std::thread::hardware_concurrency());
}
else if(layout == GemmMatrixLayout::MK_NK_NM)
{
auto f_mk_nk_nm = [&](auto n, auto m) {
const int K = a.mDesc.GetLengths()[1];
double v = 0;
for(int k = 0; k < K; ++k)
{
v += static_cast<const double>(a(m, k)) * static_cast<const double>(b(n, k));
}
c(n, m) = v;
};
make_ParallelTensorFunctor(f_mk_nk_nm, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
std::thread::hardware_concurrency());
}
else if(layout == GemmMatrixLayout::KM_KN_NM)
{
auto f_km_kn_nm = [&](auto n, auto m) {
const int K = a.mDesc.GetLengths()[0];
double v = 0;
for(int k = 0; k < K; ++k)
{
v += static_cast<const double>(a(k, m)) * static_cast<const double>(b(k, n));
}
c(n, m) = v;
};
make_ParallelTensorFunctor(f_km_kn_nm, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
std::thread::hardware_concurrency());
}
else if(layout == GemmMatrixLayout::KM_NK_NM)
{
auto f_km_nk_nm = [&](auto n, auto m) {
const int K = a.mDesc.GetLengths()[0];
double v = 0;
for(int k = 0; k < K; ++k)
{
v += static_cast<const double>(a(k, m)) * static_cast<const double>(b(n, k));
}
c(n, m) = v;
};
make_ParallelTensorFunctor(f_km_nk_nm, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
std::thread::hardware_concurrency());
}
else
{
throw std::runtime_error("wrong! not supported layout");
}
}
int main(int argc, char* argv[])
{
using namespace ck;
@@ -73,10 +239,14 @@ int main(int argc, char* argv[])
using ab_data_t = float;
using acc_data_t = float;
using c_data_t = float;
#elif 1
#elif 0
using ab_data_t = half_t;
using acc_data_t = float;
using c_data_t = half_t;
#elif 1
using ab_data_t = ushort;
using acc_data_t = float;
using c_data_t = ushort;
#elif 1
using ab_data_t = int8_t;
using acc_data_t = int32_t;
@@ -155,24 +325,24 @@ int main(int argc, char* argv[])
// no initialization
break;
case 1:
a.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
b.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
a.GenerateTensorValue(GeneratorTensor_1<ab_data_t>{}, num_thread);
b.GenerateTensorValue(GeneratorTensor_1<ab_data_t>{}, num_thread);
break;
case 2:
a.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
b.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
a.GenerateTensorValue(GeneratorTensor_1<ab_data_t>{}, num_thread);
b.GenerateTensorValue(GeneratorTensor_2<ab_data_t>{-5, 5}, num_thread);
break;
case 3:
a.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
b.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
a.GenerateTensorValue(GeneratorTensor_2<ab_data_t>{-5, 5}, num_thread);
b.GenerateTensorValue(GeneratorTensor_1<ab_data_t>{}, num_thread);
break;
case 4:
a.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
b.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
a.GenerateTensorValue(GeneratorTensor_2<ab_data_t>{-5, 5}, num_thread);
b.GenerateTensorValue(GeneratorTensor_2<ab_data_t>{-5, 5}, num_thread);
break;
default:
a.GenerateTensorValue(GeneratorTensor_3<float>{0.0, 1.0}, num_thread);
b.GenerateTensorValue(GeneratorTensor_3<float>{-0.5, 0.5}, num_thread);
a.GenerateTensorValue(GeneratorTensor_3<ab_data_t>{0.0, 1.0}, num_thread);
b.GenerateTensorValue(GeneratorTensor_3<ab_data_t>{-0.5, 0.5}, num_thread);
}
#if USE_GEMM_XDL_MK_KN_MN
......
#pragma once
#include "host_tensor.hpp"
#include "gemm_common.hpp"
template <typename AType, typename BType, typename CType>
void host_gemm(const Tensor<AType>& a,
const Tensor<BType>& b,
Tensor<CType>& c,
const GemmMatrixLayout layout)
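// bf16 specialization of host_gemm: widen each bf16 (ushort) element to fp32 with bfloat16_to_float, accumulate in double, and narrow the result back with float_to_bfloat16.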
template <>
void host_gemm<ushort, ushort, ushort>(const Tensor<ushort>& a,
const Tensor<ushort>& b,
Tensor<ushort>& c,
const GemmMatrixLayout layout)
{
if(layout == GemmMatrixLayout::MK_KN_MN)
{
@@ -17,10 +16,10 @@ void host_gemm(const Tensor<AType>& a,
for(int k = 0; k < K; ++k)
{
v += static_cast<const double>(a(m, k)) * static_cast<const double>(b(k, n));
v += bfloat16_to_float(a(m, k)) * bfloat16_to_float(b(k, n));
}
c(m, n) = v;
c(m, n) = float_to_bfloat16(v);
};
make_ParallelTensorFunctor(f_mk_kn_mn, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
@@ -35,10 +34,10 @@ void host_gemm(const Tensor<AType>& a,
for(int k = 0; k < K; ++k)
{
v += static_cast<const double>(a(m, k)) * static_cast<const double>(b(n, k));
v += bfloat16_to_float(a(m, k)) * bfloat16_to_float(b(n, k));
}
c(m, n) = v;
c(m, n) = float_to_bfloat16(v);
};
make_ParallelTensorFunctor(f_mk_nk_mn, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
@@ -53,10 +52,10 @@ void host_gemm(const Tensor<AType>& a,
for(int k = 0; k < K; ++k)
{
v += static_cast<const double>(a(k, m)) * static_cast<const double>(b(k, n));
v += bfloat16_to_float(a(k, m)) * bfloat16_to_float(b(k, n));
}
c(m, n) = v;
c(m, n) = float_to_bfloat16(v);
};
make_ParallelTensorFunctor(f_km_kn_mn, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
@@ -71,10 +70,10 @@ void host_gemm(const Tensor<AType>& a,
for(int k = 0; k < K; ++k)
{
v += static_cast<const double>(a(k, m)) * static_cast<const double>(b(n, k));
v += bfloat16_to_float(a(k, m)) * bfloat16_to_float(b(n, k));
}
c(m, n) = v;
c(m, n) = float_to_bfloat16(v);
};
make_ParallelTensorFunctor(f_km_nk_mn, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
@@ -89,10 +88,10 @@ void host_gemm(const Tensor<AType>& a,
for(int k = 0; k < K; ++k)
{
v += static_cast<const double>(a(m, k)) * static_cast<const double>(b(k, n));
v += bfloat16_to_float(a(m, k)) * bfloat16_to_float(b(k, n));
}
c(n, m) = v;
c(n, m) = float_to_bfloat16(v);
};
make_ParallelTensorFunctor(f_mk_kn_nm, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
@@ -107,10 +106,10 @@ void host_gemm(const Tensor<AType>& a,
for(int k = 0; k < K; ++k)
{
v += static_cast<const double>(a(m, k)) * static_cast<const double>(b(n, k));
v += bfloat16_to_float(a(m, k)) * bfloat16_to_float(b(n, k));
}
c(n, m) = v;
c(n, m) = float_to_bfloat16(v);
};
make_ParallelTensorFunctor(f_mk_nk_nm, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
@@ -125,10 +124,10 @@ void host_gemm(const Tensor<AType>& a,
for(int k = 0; k < K; ++k)
{
v += static_cast<const double>(a(k, m)) * static_cast<const double>(b(k, n));
v += bfloat16_to_float(a(k, m)) * bfloat16_to_float(b(k, n));
}
c(n, m) = v;
c(n, m) = float_to_bfloat16(v);
};
make_ParallelTensorFunctor(f_km_kn_nm, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
@@ -143,10 +142,10 @@ void host_gemm(const Tensor<AType>& a,
for(int k = 0; k < K; ++k)
{
v += static_cast<const double>(a(k, m)) * static_cast<const double>(b(n, k));
v += bfloat16_to_float(a(k, m)) * bfloat16_to_float(b(n, k));
}
c(n, m) = v;
c(n, m) = float_to_bfloat16(v);
};
make_ParallelTensorFunctor(f_km_nk_nm, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
......
@@ -333,4 +333,41 @@ void check_error(const Tensor<T>& ref, const Tensor<T>& result)
std::cout << "max_diff: " << max_diff << ", " << ref_value << ", " << result_value << std::endl;
}
float bf16_to_f32(ushort src_val)
{
// bf16 is the upper 16 bits of an fp32: wrap x (low half, zeroed) and y (high half,
// bf16 payload) in a struct so they do not alias each other; little-endian host assumed.
typedef union
{
    struct
    {
        ushort x, y;
    };
    float f32;
} bf16_f32_t;
bf16_f32_t v;
v.x = 0;
v.y = src_val;
return v.f32;
}
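// bf16 variant of check_error: widen every element to fp32 with bf16_to_f32 before accumulating the absolute error and tracking the maximum difference.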
template <>
void check_error<ushort>(const Tensor<ushort>& ref, const Tensor<ushort>& result)
{
float error = 0;
float max_diff = -1;
float ref_value = 0, result_value = 0;
for(int i = 0; i < ref.mData.size(); ++i)
{
error += std::abs(bf16_to_f32(ref.mData[i]) - bf16_to_f32(result.mData[i]));
float diff = std::abs(bf16_to_f32(ref.mData[i]) - bf16_to_f32(result.mData[i]));
if(max_diff < diff)
{
max_diff = diff;
ref_value = bf16_to_f32(ref.mData[i]);
result_value = bf16_to_f32(result.mData[i]);
}
}
std::cout << "error: " << error << std::endl;
std::cout << "max_diff: " << max_diff << ", ref: " << ref_value << ", res: " << result_value
<< std::endl;
}
#endif
@@ -4,6 +4,7 @@
#include <cmath>
#include "config.hpp"
template <typename T>
struct GeneratorTensor_1
{
int value = 1;
@@ -15,6 +16,30 @@ struct GeneratorTensor_1
}
};
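// bf16 tensors hold ushort on the host: generate the constant in fp32 and narrow it with float_to_bfloat16.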
template <>
struct GeneratorTensor_1<ushort>
{
float value = 1.0;
template <typename... Is>
ushort operator()(Is...)
{
return float_to_bfloat16(value);
}
};
template <>
struct GeneratorTensor_1<int8_t>
{
int8_t value = 1;
template <typename... Is>
int8_t operator()(Is...)
{
return value;
}
};
struct GeneratorTensor_0
{
int value = 0;
@@ -26,6 +51,7 @@
}
};
template <typename T>
struct GeneratorTensor_2
{
int min_value = 0;
@@ -38,6 +64,33 @@
}
};
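// bf16 specialization: draw a random integer in [min_value, max_value) and store it as bfloat16.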
template <>
struct GeneratorTensor_2<ushort>
{
int min_value = 0;
int max_value = 1;
template <typename... Is>
ushort operator()(Is...)
{
float tmp = (std::rand() % (max_value - min_value)) + min_value;
return float_to_bfloat16(tmp);
}
};
template <>
struct GeneratorTensor_2<int8_t>
{
int min_value = 0;
int max_value = 1;
template <typename... Is>
int8_t operator()(Is...)
{
return (std::rand() % (max_value - min_value)) + min_value;
}
};
template <typename T>
struct GeneratorTensor_3
{
@@ -53,6 +106,39 @@
}
};
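// bf16 specialization: sample uniformly from [min_value, max_value] in fp32, then narrow to bfloat16.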
template <>
struct GeneratorTensor_3<ushort>
{
float min_value = 0;
float max_value = 1;
template <typename... Is>
ushort operator()(Is...)
{
float tmp = float(std::rand()) / float(RAND_MAX);
float fp32_tmp = min_value + tmp * (max_value - min_value);
return float_to_bfloat16(fp32_tmp);
}
};
template <>
struct GeneratorTensor_3<int8_t>
{
float min_value = 0;
float max_value = 1;
template <typename... Is>
int8_t operator()(Is...)
{
int8_t min_tmp = static_cast<int8_t>(min_value);
int8_t max_tmp = static_cast<int8_t>(max_value);
return (std::rand() % (max_tmp - min_tmp)) + min_tmp;
}
};
struct GeneratorTensor_Checkboard
{
template <typename... Ts>
......
@@ -9,13 +9,30 @@
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "gemm_common.hpp"
#include "host_gemm.hpp"
#include "device_tensor.hpp"
#include "device_base.hpp"
#include "device_gemm_xdl.hpp"
#include "profile_gemm.hpp"
enum GemmMatrixLayout
{
MK_KN_MN, // 0
MK_NK_MN, // 1
KM_KN_MN, // 2
KM_NK_MN, // 3
MK_KN_NM, // 4
MK_NK_NM, // 5
KM_KN_NM, // 6
KM_NK_NM, // 7
};
enum GemmDataType
{
F32_F32_F32, // 0
F16_F16_F16, // 1
};
int gemm_profiler(int argc, char* argv[])
{
if(argc != 14)
......
#!/bin/bash
## GPU visibility
export HIP_VISIBLE_DEVICES=0
make -j ckProfiler
DRIVER="./profiler/ckProfiler"
OP=$1
DATATYPE=$2
IN_LAYOUT=$3
WEI_LAYOUT=$4
OUT_LAYOUT=$5
VERIFY=$6
INIT=$7
LOG=$8
REPEAT=$9
# test
######## op datatype in_layout wei_layout out_layout verify init log repeat N__ K___ C___ Y X Hi__ Wi__ Strides Dilations LeftPads RightPads Desired_grid_size__
$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 128 256 192 3 3 71 71 2 2 1 1 1 1 1 1 $DESIRED_GRID_SIZE
#N=${10}
# Resnet50
######## op datatype in_layout wei_layout out_layout verify init log repeat N__ K___ C___ Y X Hi__ Wi__ Strides Dilations LeftPads RightPads Desired_grid_size__
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 2048 1024 1 1 14 14 2 2 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 1024 1 1 14 14 1 1 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 128 3 3 28 28 1 1 1 1 1 1 1 1 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 128 1 1 28 28 1 1 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 128 3 3 58 58 2 2 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 2048 1 1 7 7 1 1 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 256 3 3 14 14 1 1 1 1 1 1 1 1 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 256 3 3 30 30 2 2 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 256 1 1 56 56 1 1 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 256 1 1 56 56 2 2 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 256 1 1 56 56 1 1 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 512 3 3 16 16 2 2 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 1024 512 1 1 28 28 2 2 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 512 1 1 28 28 1 1 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 512 1 1 28 28 1 1 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 2048 512 1 1 7 7 1 1 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 512 3 3 7 7 1 1 1 1 1 1 1 1 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 64 1 1 56 56 1 1 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 64 1 1 56 56 1 1 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 64 3 3 56 56 1 1 1 1 1 1 1 1 $DESIRED_GRID_SIZE
# SSD
######## op datatype in_layout wei_layout out_layout verify init log repeat N__ K___ C___ Y X Hi__ Wi__ Strides Dilations LeftPads RightPads Desired_grid_size__
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 64 3 7 7 300 300 2 2 1 1 3 3 3 3
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 64 64 3 3 75 75 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 64 64 3 3 75 75 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 64 64 3 3 75 75 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 64 64 3 3 75 75 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 64 64 3 3 75 75 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 64 64 3 3 75 75 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 128 64 1 1 75 75 2 2 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 128 64 3 3 75 75 2 2 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 128 128 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 128 128 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 128 128 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 128 128 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 128 128 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 128 128 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 128 128 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 128 1 1 38 38 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 128 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 256 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 256 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 256 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 256 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 256 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 256 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 256 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 256 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 256 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 256 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 256 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 256 1 1 38 38 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 512 256 3 3 38 38 2 2 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 512 1 1 19 19 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 512 256 3 3 19 19 2 2 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 128 512 1 1 10 10 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 128 3 3 10 10 2 2 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 128 256 1 1 5 5 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 128 3 3 5 5 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 128 256 1 1 3 3 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 128 3 3 3 3 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 340 256 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 510 512 3 3 19 19 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 510 512 3 3 10 10 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 510 256 3 3 5 5 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 340 256 3 3 3 3 1 1 1 1 1 1 1 1
#!/bin/bash
## GPU visibility
export HIP_VISIBLE_DEVICES=0
make -j ckProfiler
DRIVER="./profiler/ckProfiler"
OP=$1
DATATYPE=$2
LAYOUT=$3
VERIFY=$4
INIT=$5
LOG=$6
REPEAT=$7
######## op datatype layout verify init log repeat M___ N___ K___ StrideA StrideB StrideC
#$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 256 256 256 256 256 256
#$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 960 1024 1024 1024 1024 1024
#$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1024 1024 1024 1024 1024 1024
#$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1920 2048 2048 2048 2048 2048
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 3840 4096 4096 4096 4096 4096
#$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 7680 8192 8192 8192 8192 8192