"test/vscode:/vscode.git/clone" did not exist on "bac32ec1f777e4ed65a363e1c93ccd7f4beb990b"
Unverified commit b491ebf3, authored by Chao Liu, committed by GitHub

FP16 data in-register transpose (#41)

* start fixing 16bit data packing

* adding StaticTensor

* adding StaticTensor

* adding StaticTensor

* add missing constexpr

* adding static tensor

* adding static tensor

* adding transpose

* add inline asm for transpose 2x2 of half_t

* add general transpose_vectors(), but have unnecessary register initialization using v_mov

* fix unnecessary register initialization in transpose_vector by using more pass-by-reference

* add hardcoded logic for NHWC wrw

* improve asm for v_pack

* make ThreadwiseTensorSliceTransfer_v3r2 support any tensor

* tweak

* reorganize file
parent e823d518
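Illustrative sketch (not the code from this commit): the commit message refers to a 2x2 in-register transpose of half_t via inline asm and v_pack instructions. Below, a plain C++ reference shows the data movement, and an inline-asm variant sketches how it could map to the gfx9 v_pack_b32_f16 instruction; the target guards, constraint strings, and op_sel spelling are assumptions, and transpose_fp16_2x2_ref/_asm are hypothetical names.

// Illustrative sketch only, not this commit's implementation.
#include <hip/hip_runtime.h>

using half_t = _Float16;
typedef half_t half2_t __attribute__((ext_vector_type(2)));

// Rows in:  x0 = {a0, a1}, x1 = {b0, b1}
// Cols out: y0 = {a0, b0}, y1 = {a1, b1}
__device__ inline void transpose_fp16_2x2_ref(const half2_t& x0, const half2_t& x1,
                                              half2_t& y0, half2_t& y1)
{
    y0[0] = x0[0]; y0[1] = x1[0]; // low half of each input row
    y1[0] = x0[1]; y1[1] = x1[1]; // high half of each input row
}

#if defined(__gfx908__) || defined(__gfx90a__) // assumption: gfx9 device pass
__device__ inline void transpose_fp16_2x2_asm(const half2_t& x0, const half2_t& x1,
                                              half2_t& y0, half2_t& y1)
{
    // v_pack_b32_f16 packs one fp16 from each source VGPR into the destination;
    // op_sel selects the high 16 bits of both sources for the second output.
    // "=&v" (early clobber) keeps y0 from aliasing x0/x1 before the second read.
    asm volatile("v_pack_b32_f16 %0, %2, %3\n"
                 "v_pack_b32_f16 %1, %2, %3 op_sel:[1,1,0]\n"
                 : "=&v"(y0), "=&v"(y1)
                 : "v"(x0), "v"(x1));
}
#endif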
#ifndef GEMM_COMMON_HPP
#define GEMM_COMMON_HPP

enum GemmMatrixLayout
{
    MK_KN_MN, // 0
    MK_NK_MN, // 1
    KM_KN_MN, // 2
    KM_NK_MN, // 3
    MK_KN_NM, // 4
    MK_NK_NM, // 5
    KM_KN_NM, // 6
    KM_NK_NM, // 7
};

enum GemmDataType
{
    F32_F32_F32, // 0
    F16_F16_F16, // 1
};

#endif
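For readers of the layout enum above: the three tokens name the storage order of A, B, and C in turn, e.g. MK_NK_MN stores A as MxK (K contiguous), B as NxK (K contiguous), and C as MxN (N contiguous). A hypothetical helper (not part of this commit) illustrating the A-side convention:

// Hypothetical illustration of the naming convention, not repository code.
// "MK" = A stored MxK with K contiguous; "KM" = K outer, M contiguous.
template <typename T>
inline T read_A(const T* A, int m, int k, int M, int K, bool a_is_MK)
{
    return a_is_MK ? A[m * K + k]  // MK_*_*: row m, K-contiguous
                   : A[k * M + m]; // KM_*_*: row k, M-contiguous
}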
@@ -104,7 +104,7 @@ void device_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk
    constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 4;
    constexpr index_t GemmCThreadTransferDstScalarPerVector = 1;
-#elif 1
+#elif 0
    // [M, N, K0, K1] = [256, 128, 4, 8], C = 128, for fp16
    constexpr index_t BlockSize = 256;
@@ -132,7 +132,7 @@ void device_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk
    constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8;
    constexpr index_t GemmCThreadTransferDstScalarPerVector = 1;
-#elif 0
+#elif 1
    // [M, N, K0, K1] = [128, 256, 4, 8], C = 128, for fp16
    constexpr index_t BlockSize = 256;
...
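A hedged reading of the fp16 configurations above (not code from this commit): K0 and K1 split the GEMM K dimension, so one block iteration covers K0 * K1 K-elements, and a K1 slice of 8 half values is a single 16-byte (128-bit) vector access.

// Back-of-the-envelope check of the [M, N, K0, K1] = [128, 256, 4, 8] fp16 config;
// illustrative only, the names below are not taken from the repository.
#include <cstdio>
int main()
{
    constexpr int K0 = 4, K1 = 8;     // K-dimension split from the config comment
    constexpr int bytes_per_fp16 = 2;
    std::printf("K per block iteration: %d\n", K0 * K1);
    std::printf("one K1 slice: %d bytes (%d-bit access)\n",
                K1 * bytes_per_fp16, K1 * bytes_per_fp16 * 8);
    return 0;
}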
@@ -10,7 +10,6 @@
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "gemm_common.hpp"
#include "host_gemm.hpp"
#include "device_tensor.hpp"
#include "device_gemm_xdlops_mk_kn_mn.hpp"
@@ -31,6 +30,18 @@
#define USE_GEMM_XDL_KM_KN_NM 0
#define USE_GEMM_XDL_KM_NK_NM 0
enum GemmMatrixLayout
{
    MK_KN_MN, // 0
    MK_NK_MN, // 1
    KM_KN_MN, // 2
    KM_NK_MN, // 3
    MK_KN_NM, // 4
    MK_NK_NM, // 5
    KM_KN_NM, // 6
    KM_NK_NM  // 7
};
enum GemmAlgo
{
    Xdl_MK_KN_MN, // 0
@@ -43,6 +54,161 @@ enum GemmAlgo
    Xdl_KM_NK_NM, // 7
};
template <typename AType, typename BType, typename CType>
void host_gemm(const Tensor<AType>& a,
               const Tensor<BType>& b,
               Tensor<CType>& c,
               const GemmMatrixLayout layout)
{
    if(layout == GemmMatrixLayout::MK_KN_MN)
    {
        auto f_mk_kn_mn = [&](auto m, auto n) {
            const int K = a.mDesc.GetLengths()[1];
            double v = 0;
            for(int k = 0; k < K; ++k)
            {
                v += static_cast<const double>(a(m, k)) * static_cast<const double>(b(k, n));
            }
            c(m, n) = v;
        };
        make_ParallelTensorFunctor(f_mk_kn_mn, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
            std::thread::hardware_concurrency());
    }
    else if(layout == GemmMatrixLayout::MK_NK_MN)
    {
        auto f_mk_nk_mn = [&](auto m, auto n) {
            const int K = a.mDesc.GetLengths()[1];
            double v = 0;
            for(int k = 0; k < K; ++k)
            {
                v += static_cast<const double>(a(m, k)) * static_cast<const double>(b(n, k));
            }
            c(m, n) = v;
        };
        make_ParallelTensorFunctor(f_mk_nk_mn, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
            std::thread::hardware_concurrency());
    }
    else if(layout == GemmMatrixLayout::KM_KN_MN)
    {
        auto f_km_kn_mn = [&](auto m, auto n) {
            const int K = a.mDesc.GetLengths()[0];
            double v = 0;
            for(int k = 0; k < K; ++k)
            {
                v += static_cast<const double>(a(k, m)) * static_cast<const double>(b(k, n));
            }
            c(m, n) = v;
        };
        make_ParallelTensorFunctor(f_km_kn_mn, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
            std::thread::hardware_concurrency());
    }
    else if(layout == GemmMatrixLayout::KM_NK_MN)
    {
        auto f_km_nk_mn = [&](auto m, auto n) {
            const int K = a.mDesc.GetLengths()[0];
            double v = 0;
            for(int k = 0; k < K; ++k)
            {
                v += static_cast<const double>(a(k, m)) * static_cast<const double>(b(n, k));
            }
            c(m, n) = v;
        };
        make_ParallelTensorFunctor(f_km_nk_mn, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
            std::thread::hardware_concurrency());
    }
    else if(layout == GemmMatrixLayout::MK_KN_NM)
    {
        auto f_mk_kn_nm = [&](auto n, auto m) {
            const int K = a.mDesc.GetLengths()[1];
            double v = 0;
            for(int k = 0; k < K; ++k)
            {
                v += static_cast<const double>(a(m, k)) * static_cast<const double>(b(k, n));
            }
            c(n, m) = v;
        };
        make_ParallelTensorFunctor(f_mk_kn_nm, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
            std::thread::hardware_concurrency());
    }
    else if(layout == GemmMatrixLayout::MK_NK_NM)
    {
        auto f_mk_nk_nm = [&](auto n, auto m) {
            const int K = a.mDesc.GetLengths()[1];
            double v = 0;
            for(int k = 0; k < K; ++k)
            {
                v += static_cast<const double>(a(m, k)) * static_cast<const double>(b(n, k));
            }
            c(n, m) = v;
        };
        make_ParallelTensorFunctor(f_mk_nk_nm, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
            std::thread::hardware_concurrency());
    }
    else if(layout == GemmMatrixLayout::KM_KN_NM)
    {
        auto f_km_kn_nm = [&](auto n, auto m) {
            const int K = a.mDesc.GetLengths()[0];
            double v = 0;
            for(int k = 0; k < K; ++k)
            {
                v += static_cast<const double>(a(k, m)) * static_cast<const double>(b(k, n));
            }
            c(n, m) = v;
        };
        make_ParallelTensorFunctor(f_km_kn_nm, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
            std::thread::hardware_concurrency());
    }
    else if(layout == GemmMatrixLayout::KM_NK_NM)
    {
        auto f_km_nk_nm = [&](auto n, auto m) {
            const int K = a.mDesc.GetLengths()[0];
            double v = 0;
            for(int k = 0; k < K; ++k)
            {
                v += static_cast<const double>(a(k, m)) * static_cast<const double>(b(n, k));
            }
            c(n, m) = v;
        };
        make_ParallelTensorFunctor(f_km_nk_nm, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
            std::thread::hardware_concurrency());
    }
    else
    {
        throw std::runtime_error("wrong! not supported layout");
    }
}
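The host_gemm reference above relies on the repository's Tensor and make_ParallelTensorFunctor helpers. For context, a self-contained sketch of just the MK_NK_MN case with plain std::vector storage and the same double-precision accumulation (illustrative only, not the repository's API):

// Self-contained reference for the MK_NK_MN layout: A is MxK (K contiguous),
// B is NxK (K contiguous), C is MxN (N contiguous).
#include <cstddef>
#include <vector>

template <typename AType, typename BType, typename CType>
void host_gemm_mk_nk_mn_ref(const std::vector<AType>& a,
                            const std::vector<BType>& b,
                            std::vector<CType>& c,
                            std::size_t M, std::size_t N, std::size_t K)
{
    for(std::size_t m = 0; m < M; ++m)
        for(std::size_t n = 0; n < N; ++n)
        {
            double v = 0;
            for(std::size_t k = 0; k < K; ++k)
                v += static_cast<double>(a[m * K + k]) * static_cast<double>(b[n * K + k]);
            c[m * N + n] = static_cast<CType>(v);
        }
}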
int main(int argc, char* argv[])
{
    using namespace ck;
...
#pragma once
#include "host_tensor.hpp"
#include "gemm_common.hpp"
template <typename AType, typename BType, typename CType>
void host_gemm(const Tensor<AType>& a,
               const Tensor<BType>& b,
               Tensor<CType>& c,
               const GemmMatrixLayout layout)
{
    if(layout == GemmMatrixLayout::MK_KN_MN)
    {
        auto f_mk_kn_mn = [&](auto m, auto n) {
            const int K = a.mDesc.GetLengths()[1];
            double v = 0;
            for(int k = 0; k < K; ++k)
            {
                v += static_cast<const double>(a(m, k)) * static_cast<const double>(b(k, n));
            }
            c(m, n) = v;
        };
        make_ParallelTensorFunctor(f_mk_kn_mn, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
            std::thread::hardware_concurrency());
    }
    else if(layout == GemmMatrixLayout::MK_NK_MN)
    {
        auto f_mk_nk_mn = [&](auto m, auto n) {
            const int K = a.mDesc.GetLengths()[1];
            double v = 0;
            for(int k = 0; k < K; ++k)
            {
                v += static_cast<const double>(a(m, k)) * static_cast<const double>(b(n, k));
            }
            c(m, n) = v;
        };
        make_ParallelTensorFunctor(f_mk_nk_mn, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
            std::thread::hardware_concurrency());
    }
    else if(layout == GemmMatrixLayout::KM_KN_MN)
    {
        auto f_km_kn_mn = [&](auto m, auto n) {
            const int K = a.mDesc.GetLengths()[0];
            double v = 0;
            for(int k = 0; k < K; ++k)
            {
                v += static_cast<const double>(a(k, m)) * static_cast<const double>(b(k, n));
            }
            c(m, n) = v;
        };
        make_ParallelTensorFunctor(f_km_kn_mn, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
            std::thread::hardware_concurrency());
    }
    else if(layout == GemmMatrixLayout::KM_NK_MN)
    {
        auto f_km_nk_mn = [&](auto m, auto n) {
            const int K = a.mDesc.GetLengths()[0];
            double v = 0;
            for(int k = 0; k < K; ++k)
            {
                v += static_cast<const double>(a(k, m)) * static_cast<const double>(b(n, k));
            }
            c(m, n) = v;
        };
        make_ParallelTensorFunctor(f_km_nk_mn, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
            std::thread::hardware_concurrency());
    }
    else if(layout == GemmMatrixLayout::MK_KN_NM)
    {
        auto f_mk_kn_nm = [&](auto n, auto m) {
            const int K = a.mDesc.GetLengths()[1];
            double v = 0;
            for(int k = 0; k < K; ++k)
            {
                v += static_cast<const double>(a(m, k)) * static_cast<const double>(b(k, n));
            }
            c(n, m) = v;
        };
        make_ParallelTensorFunctor(f_mk_kn_nm, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
            std::thread::hardware_concurrency());
    }
    else if(layout == GemmMatrixLayout::MK_NK_NM)
    {
        auto f_mk_nk_nm = [&](auto n, auto m) {
            const int K = a.mDesc.GetLengths()[1];
            double v = 0;
            for(int k = 0; k < K; ++k)
            {
                v += static_cast<const double>(a(m, k)) * static_cast<const double>(b(n, k));
            }
            c(n, m) = v;
        };
        make_ParallelTensorFunctor(f_mk_nk_nm, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
            std::thread::hardware_concurrency());
    }
    else if(layout == GemmMatrixLayout::KM_KN_NM)
    {
        auto f_km_kn_nm = [&](auto n, auto m) {
            const int K = a.mDesc.GetLengths()[0];
            double v = 0;
            for(int k = 0; k < K; ++k)
            {
                v += static_cast<const double>(a(k, m)) * static_cast<const double>(b(k, n));
            }
            c(n, m) = v;
        };
        make_ParallelTensorFunctor(f_km_kn_nm, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
            std::thread::hardware_concurrency());
    }
    else if(layout == GemmMatrixLayout::KM_NK_NM)
    {
        auto f_km_nk_nm = [&](auto n, auto m) {
            const int K = a.mDesc.GetLengths()[0];
            double v = 0;
            for(int k = 0; k < K; ++k)
            {
                v += static_cast<const double>(a(k, m)) * static_cast<const double>(b(n, k));
            }
            c(n, m) = v;
        };
        make_ParallelTensorFunctor(f_km_nk_nm, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
            std::thread::hardware_concurrency());
    }
    else
    {
        throw std::runtime_error("wrong! not supported layout");
    }
}
template <typename AType, typename BType, typename CType>
void host_gemm_mk_kn_mn(const Tensor<AType>& a_m_k,
...
@@ -9,13 +9,30 @@
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "gemm_common.hpp"
#include "host_gemm.hpp"
#include "device_tensor.hpp"
#include "device_base.hpp"
#include "device_gemm_xdl.hpp"
#include "profile_gemm.hpp"
enum GemmMatrixLayout
{
    MK_KN_MN, // 0
    MK_NK_MN, // 1
    KM_KN_MN, // 2
    KM_NK_MN, // 3
    MK_KN_NM, // 4
    MK_NK_NM, // 5
    KM_KN_NM, // 6
    KM_NK_NM, // 7
};

enum GemmDataType
{
    F32_F32_F32, // 0
    F16_F16_F16, // 1
};
int gemm_profiler(int argc, char* argv[])
{
    if(argc != 14)
...
#!/bin/bash
## GPU visibility
export HIP_VISIBLE_DEVICES=0
make -j ckProfiler
DRIVER="./profiler/ckProfiler"
OP=$1
DATATYPE=$2
IN_LAYOUT=$3
WEI_LAYOUT=$4
OUT_LAYOUT=$5
VERIFY=$6
INIT=$7
LOG=$8
REPEAT=$9
# test
######## op datatype in_layout wei_layout out_layout verify init log repeat N__ K___ C___ Y X Hi__ Wi__ Strides Dilations LeftPads RightPads Desired_grid_size__
$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 128 256 192 3 3 71 71 2 2 1 1 1 1 1 1 $DESIRED_GRID_SIZE
#N=${10}
# Resnet50
######## op datatype in_layout wei_layout out_layout verify init log repeat N__ K___ C___ Y X Hi__ Wi__ Strides Dilations LeftPads RightPads Desired_grid_size__
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 2048 1024 1 1 14 14 2 2 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 1024 1 1 14 14 1 1 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 128 3 3 28 28 1 1 1 1 1 1 1 1 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 128 1 1 28 28 1 1 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 128 3 3 58 58 2 2 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 2048 1 1 7 7 1 1 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 256 3 3 14 14 1 1 1 1 1 1 1 1 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 256 3 3 30 30 2 2 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 256 1 1 56 56 1 1 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 256 1 1 56 56 2 2 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 256 1 1 56 56 1 1 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 512 3 3 16 16 2 2 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 1024 512 1 1 28 28 2 2 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 512 1 1 28 28 1 1 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 512 1 1 28 28 1 1 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 2048 512 1 1 7 7 1 1 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 512 3 3 7 7 1 1 1 1 1 1 1 1 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 64 1 1 56 56 1 1 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 64 1 1 56 56 1 1 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 64 3 3 56 56 1 1 1 1 1 1 1 1 $DESIRED_GRID_SIZE
# SSD
######## op datatype in_layout wei_layout out_layout verify init log repeat N__ K___ C___ Y X Hi__ Wi__ Strides Dilations LeftPads RightPads Desired_grid_size__
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 64 3 7 7 300 300 2 2 1 1 3 3 3 3
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 64 64 3 3 75 75 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 64 64 3 3 75 75 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 64 64 3 3 75 75 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 64 64 3 3 75 75 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 64 64 3 3 75 75 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 64 64 3 3 75 75 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 128 64 1 1 75 75 2 2 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 128 64 3 3 75 75 2 2 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 128 128 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 128 128 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 128 128 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 128 128 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 128 128 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 128 128 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 128 128 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 128 1 1 38 38 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 128 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 256 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 256 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 256 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 256 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 256 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 256 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 256 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 256 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 256 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 256 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 256 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 256 1 1 38 38 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 512 256 3 3 38 38 2 2 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 512 1 1 19 19 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 512 256 3 3 19 19 2 2 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 128 512 1 1 10 10 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 128 3 3 10 10 2 2 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 128 256 1 1 5 5 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 128 3 3 5 5 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 128 256 1 1 3 3 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 128 3 3 3 3 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 340 256 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 510 512 3 3 19 19 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 510 512 3 3 10 10 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 510 256 3 3 5 5 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 340 256 3 3 3 3 1 1 1 1 1 1 1 1
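The active test line near the top of this script uses N=128, K=256, C=192 with a 3x3 filter over a 71x71 input, stride 2, dilation 1, and 1-pixel padding on each side. A quick check of the resulting output size with the standard convolution formula (illustrative helper, not part of this change):

// Output spatial size: Ho = (Hi + pad_l + pad_r - dilation*(Y - 1) - 1) / stride + 1
#include <cstdio>
int conv_out_size(int in, int pad_l, int pad_r, int filter, int stride, int dilation)
{
    return (in + pad_l + pad_r - dilation * (filter - 1) - 1) / stride + 1;
}
int main()
{
    // 71x71 input, 3x3 filter, stride 2, dilation 1, pads 1/1 -> 36x36 output
    std::printf("Ho = Wo = %d\n", conv_out_size(71, 1, 1, 3, 2, 1));
    return 0;
}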
#!/bin/bash
## GPU visibility
export HIP_VISIBLE_DEVICES=0
make -j ckProfiler
DRIVER="./profiler/ckProfiler"
OP=$1
DATATYPE=$2
LAYOUT=$3
VERIFY=$4
INIT=$5
LOG=$6
REPEAT=$7
######## op datatype layout verify init log repeat M___ N___ K___ StrideA StrideB StrideC
#$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 256 256 256 256 256 256
#$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 960 1024 1024 1024 1024 1024
#$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1024 1024 1024 1024 1024 1024
#$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1920 2048 2048 2048 2048 2048
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 3840 4096 4096 4096 4096 4096
#$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 7680 8192 8192 8192 8192 8192
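The active line above runs M=3840, N=4096, K=4096 with StrideA = StrideB = StrideC = 4096. One consistent reading (an assumption, not verified against the profiler) is the MK_KN_MN layout, where each stride equals the length of the matrix's contiguous inner dimension:

// For MK_KN_MN: A is MxK (stride K), B is KxN (stride N), C is MxN (stride N).
#include <cstdio>
int main()
{
    const int M = 3840, N = 4096, K = 4096;
    std::printf("MK_KN_MN expects StrideA=%d StrideB=%d StrideC=%d (M=%d)\n", K, N, N, M);
    return 0;
}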