add profiler

e730aeb7 · carlushuang · 2f5ae075 · e730aeb7 · e730aeb7 · e730aeb7
Commit e730aeb7 authored May 12, 2023 by carlushuang
10 changed files
--- a/include/ck/tensor_operation/gpu/device/device_gemm_streamk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_streamk.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include <iostream>
+#include <vector>
+#include "device_base.hpp"
+namespace ck {
+namespace tensor_operation {
+namespace device {
+template <typename ALayout,
+          typename BLayout,
+          typename CLayout,
+          typename ADataType,
+          typename BDataType,
+          typename CDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CElementwiseOperation>
+struct DeviceGemmStreamK : public BaseOperator
+{
+    virtual std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_a,
+                                                              const void* p_b,
+                                                              void* p_c,
+                                                              ck::index_t M,
+                                                              ck::index_t N,
+                                                              ck::index_t K,
+                                                              ck::index_t StrideA,
+                                                              ck::index_t StrideB,
+                                                              ck::index_t StrideC,
+                                                              AElementwiseOperation a_element_op,
+                                                              BElementwiseOperation b_element_op,
+                                                              CElementwiseOperation c_element_op,
+                                                              ck::index_t NumSKBlocks = 0) = 0;
+    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
+};
+template <typename ALayout,
+          typename BLayout,
+          typename CLayout,
+          typename ADataType,
+          typename BDataType,
+          typename CDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CElementwiseOperation>
+using DeviceGemmStreamKPtr = std::unique_ptr<DeviceGemmStreamK<ALayout,
+                                                               BLayout,
+                                                               CLayout,
+                                                               ADataType,
+                                                               BDataType,
+                                                               CDataType,
+                                                               AElementwiseOperation,
+                                                               BElementwiseOperation,
+                                                               CElementwiseOperation>>;
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_streamk.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_streamk.hpp
@@ -10,7 +10,7 @@
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/device_gemm.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_streamk.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_streamk.hpp"
 #include "ck/host_utility/device_prop.hpp"
@@ -58,7 +58,7 @@ template <typename ADataType,
          index_t CShuffleNRepeatPerShuffle,
          typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
          index_t CBlockTransferScalarPerVector_NWaveNPerXDL>
-struct DeviceGemmXdlStreamK : public DeviceGemm<ALayout,
+struct DeviceGemmXdlStreamK : public DeviceGemmStreamK<ALayout,
                                                       BLayout,
                                                       CLayout,
                                                       ADataType,
@@ -185,7 +185,8 @@ struct DeviceGemmXdlStreamK : public DeviceGemm<ALayout,
                             index_t StrideC,
                             AElementwiseOperation,
                             BElementwiseOperation,
-                             CElementwiseOperation)
+                             CElementwiseOperation,
+                             index_t NumSKBlocks = 0)
    {
        const auto kernel = kernel_gemm_xdlops_streamk<GridwiseGemm>;
        int occupancy, num_cu;
@@ -212,7 +213,8 @@ struct DeviceGemmXdlStreamK : public DeviceGemm<ALayout,
                        StrideB,
                        StrideC,
                        static_cast<uint32_t>(num_cu),
-                        static_cast<uint32_t>(occupancy)};
+                        static_cast<uint32_t>(occupancy),
+                        static_cast<uint32_t>(NumSKBlocks)};
    }
    static auto MakeInvoker() { return Invoker{}; }
@@ -229,7 +231,8 @@ struct DeviceGemmXdlStreamK : public DeviceGemm<ALayout,
                                                      index_t StrideC,
                                                      AElementwiseOperation,
                                                      BElementwiseOperation,
-                                                      CElementwiseOperation) override
+                                                      CElementwiseOperation,
+                                                      index_t NumSKBlocks = 0) override
    {
        const auto kernel = kernel_gemm_xdlops_streamk<GridwiseGemm>;
        int occupancy, num_cu;
@@ -256,7 +259,8 @@ struct DeviceGemmXdlStreamK : public DeviceGemm<ALayout,
                                          StrideB,
                                          StrideC,
                                          static_cast<uint32_t>(num_cu),
-                                          static_cast<uint32_t>(occupancy));
+                                          static_cast<uint32_t>(occupancy),
+                                          static_cast<uint32_t>(NumSKBlocks));
    }
    // polymorphic

--- a/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
+++ b/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
@@ -671,8 +671,13 @@ struct BlockToCTileMap_GemmStreamK
    }
    // prefer construct on host
-    BlockToCTileMap_GemmStreamK(
+    BlockToCTileMap_GemmStreamK(uint32_t m,
-        uint32_t m, uint32_t n, uint32_t k, uint32_t num_cu, uint32_t occupancy, uint32_t tile_swizzle_sub_m_factor = 8)
+                                uint32_t n,
+                                uint32_t k,
+                                uint32_t num_cu,
+                                uint32_t occupancy,
+                                uint32_t sk_blocks                 = 0,
+                                uint32_t tile_swizzle_sub_m_factor = 8)
    {
        uint32_t num_tiles =
            math::integer_divide_ceil(m, MPerBlock) * math::integer_divide_ceil(n, NPerBlock);
@@ -771,6 +776,8 @@ struct BlockToCTileMap_GemmStreamK
                sk_num_blocks = 0;
            }
+            // give a chance to control num of sk blocks
+            sk_num_blocks = sk_blocks != 0 ? sk_blocks : sk_num_blocks;
            sk_num_blocks = env_get_int("sk_num_blocks", sk_num_blocks);
            if(sk_num_blocks == 0)
@@ -804,10 +811,15 @@ struct BlockToCTileMap_GemmStreamK
            }
        }
        n_tiles = MDiv(math::integer_divide_ceil(n, NPerBlock));
+        tile_swizzle_sub_m_factor =
+            env_get_int("tile_swizzle_sub_m_factor", tile_swizzle_sub_m_factor);
        tile_swizzle_sub_m = MDiv(tile_swizzle_sub_m_factor);
-        tile_swizzle_sub_m_rem = MDiv(math::integer_divide_ceil(m, MPerBlock) % tile_swizzle_sub_m_factor);
+        tile_swizzle_sub_m_rem =
+            MDiv(math::integer_divide_ceil(m, MPerBlock) % tile_swizzle_sub_m_factor);
-        printf("cu:%d, occupancy:%d, grids:%d, num_tiles:%d, dp_tiles:%d, sk_num_big_blocks:%d, sk_num_blocks:%d, "
+        printf("cu:%d, occupancy:%d, grids:%d, num_tiles:%d, dp_tiles:%d, sk_num_big_blocks:%d, "
+               "sk_num_blocks:%d, "
               "sk_total_iters:%d, dp_start_block_idx:%d, dp_iters_per_block:%d, dp_num_blocks:%d, "
               "k_iters_per_tile:%d, k_iters_per_big_block:%d\n",
               num_cu,
@@ -889,8 +901,9 @@ struct BlockToCTileMap_GemmStreamK
        uint32_t quo_sub_m, rem_sub_m;
        tile_swizzle_sub_m.divmod(m_tile_idx, quo_sub_m, rem_sub_m);
-        const auto sub_m_adapt = (m_tile_idx < (m_tiles - tile_swizzle_sub_m_rem.get())) ?
+        const auto sub_m_adapt = (m_tile_idx < (m_tiles - tile_swizzle_sub_m_rem.get()))
-            tile_swizzle_sub_m : tile_swizzle_sub_m_rem;
+                                     ? tile_swizzle_sub_m
+                                     : tile_swizzle_sub_m_rem;
        uint32_t m_tile_idx_sub0, m_tile_idx_sub1;
        tile_swizzle_sub_m.divmod(m_tile_idx, m_tile_idx_sub0, m_tile_idx_sub1);

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_streamk.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_streamk.hpp
@@ -121,7 +121,8 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_streamk
                 index_t StrideB_,
                 index_t StrideC_,
                 uint32_t num_cu,
-                 uint32_t occupancy)
+                 uint32_t occupancy,
+                 uint32_t num_sk_blocks_)
            : p_a_grid(p_a_grid_),
              p_b_grid(p_b_grid_),
              p_c_grid(p_c_grid_),
@@ -131,7 +132,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_streamk
              StrideA(StrideA_),
              StrideB(StrideB_),
              StrideC(StrideC_),
-              block_mapping(M, N, K, num_cu, occupancy)
+              block_mapping(M, N, K, num_cu, occupancy, num_sk_blocks_)
        {
        }
@@ -384,13 +385,13 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_streamk
    }
    // return block_id to C matrix tile idx (m0, n0, k_split) mapping
-    __host__ __device__ static constexpr auto MakeDefaultBlock2CTileMap()
+    // __host__ __device__ static constexpr auto MakeDefaultBlock2CTileMap()
-    {
+    // {
-        return BlockToCTileMap_3DGrid_KSplit<MPerBlock, NPerBlock>();
+    //     return BlockToCTileMap_3DGrid_KSplit<MPerBlock, NPerBlock>();
-    }
+    // }
    using CGridDesc_M_N = remove_cvref_t<decltype(MakeCGridDescriptor_M_N(1, 1, 1, 1, 1))>;
-    using DefaultBlock2CTileMap = remove_cvref_t<decltype(MakeDefaultBlock2CTileMap())>;
+    // using DefaultBlock2CTileMap = remove_cvref_t<decltype(MakeDefaultBlock2CTileMap())>;
    __device__ static void Run(const Argument& karg, void* __restrict__ p_shared_block)
    {
@@ -474,7 +475,8 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_streamk
        block_mapping.get_block_itr(block_idx, iter_start, iter_end);
        uint32_t total_iter_length = iter_end - iter_start;
        // if(threadIdx.x == 0)
-        //     printf("xxx bid:%d, is_sk_block:%d, is_dp_block:%d\n", static_cast<int>(blockIdx.x), is_sk_block, is_dp_block);
+        //     printf("xxx bid:%d, is_sk_block:%d, is_dp_block:%d\n", static_cast<int>(blockIdx.x),
+        //     is_sk_block, is_dp_block);
        if(!is_sk_block && !is_dp_block)
            return;
@@ -497,7 +499,8 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_streamk
                __builtin_amdgcn_readfirstlane(iter_offset * K0PerBlock);
            // if(threadIdx.x == 0)
-        //     printf("[%s], bid:%d, block_idx:%d, tile_idx:%d(%d, %d, %d), iter_start:%d(%d | %d), iter_end:%d, len:%d\n",
+            //     printf("[%s], bid:%d, block_idx:%d, tile_idx:%d(%d, %d, %d), iter_start:%d(%d |
+            //     %d), iter_end:%d, len:%d\n",
            //         is_sk_block ? "sk_block" : (is_dp_block ? "dp_block" : "other "),
            //         static_cast<int>(blockIdx.x), block_idx, tile_idx, m_block_data_idx_on_grid,
            //         n_block_data_idx_on_grid, k0_block_data_idx_on_grid, iter_end -

--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_streamk.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_streamk.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include <cstdlib>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_streamk.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+void add_device_gemm_xdl_streamk_f16_f16_f16_mk_kn_mn_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmStreamK<Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+template <typename ADataType,
+          typename BDataType,
+          typename CDataType,
+          typename ALayout,
+          typename BLayout,
+          typename CLayout>
+struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmStreamK<
+    ALayout,
+    BLayout,
+    CLayout,
+    ADataType,
+    BDataType,
+    CDataType,
+    ck::tensor_operation::element_wise::PassThrough,
+    ck::tensor_operation::element_wise::PassThrough,
+    ck::tensor_operation::element_wise::PassThrough>>
+{
+    using DeviceOp = DeviceGemmStreamK<ALayout,
+                                       BLayout,
+                                       CLayout,
+                                       ADataType,
+                                       BDataType,
+                                       CDataType,
+                                       ck::tensor_operation::element_wise::PassThrough,
+                                       ck::tensor_operation::element_wise::PassThrough,
+                                       ck::tensor_operation::element_wise::PassThrough>;
+    static auto GetInstances()
+    {
+        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+#if 0
+        if constexpr(is_same_v<ADataType, float> && is_same_v<BDataType, float> &&
+                     is_same_v<CDataType, float>)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
+                         is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instances(op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
+                              is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instances(op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Row> &&
+                              is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instances(op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Col> &&
+                              is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instances(op_ptrs);
+            }
+        }
+        else if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, half_t> &&
+                          is_same_v<CDataType, half_t>)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
+                         is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances(op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
+                              is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances(op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Row> &&
+                              is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instances(op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Col> &&
+                              is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instances(op_ptrs);
+            }
+        }
+#endif
+        if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, half_t> &&
+                     is_same_v<CDataType, half_t>)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
+                         is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_xdl_streamk_f16_f16_f16_mk_kn_mn_instances(op_ptrs);
+            }
+        }
+        return op_ptrs;
+    }
+};
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/gemm_streamk/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/gemm_streamk/CMakeLists.txt
+add_instance_library(device_gemm_streamk_instance
+   # device_gemm_xdl_streamk_f32_f32_f32_mk_kn_mn_instance.cpp
+   # device_gemm_xdl_streamk_f32_f32_f32_mk_nk_mn_instance.cpp
+   # device_gemm_xdl_streamk_f32_f32_f32_km_kn_mn_instance.cpp
+   # device_gemm_xdl_streamk_f32_f32_f32_km_nk_mn_instance.cpp
+   device_gemm_xdl_streamk_f16_f16_f16_mk_kn_mn_instance.cpp
+   # device_gemm_xdl_streamk_f16_f16_f16_mk_nk_mn_instance.cpp
+   # device_gemm_xdl_streamk_f16_f16_f16_km_kn_mn_instance.cpp
+   # device_gemm_xdl_streamk_f16_f16_f16_km_nk_mn_instance.cpp
+)
--- a/library/src/tensor_operation_instance/gpu/gemm_streamk/device_gemm_xdl_streamk_f16_f16_f16_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_streamk/device_gemm_xdl_streamk_f16_f16_f16_mk_kn_mn_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include <cstdlib>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_streamk.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+using F16 = ck::half_t;
+using F32 = float;
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+// static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+// static constexpr auto GemmMNPadding =
+// ck::tensor_operation::device::GemmSpecialization::MNPadding;
+// Compilation parameters for a[m, k] * b[k, n] = c[m, n]
+using device_gemm_xdl_streamk_f16_f16_f16_mk_kn_mn_instances = std::tuple<
+    // clang-format off
+        //##################|AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|
+        //##################| Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
+        //##################|     |      |      |        |        |        |        |   Operation|   Operation|   Operation|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
+        //##################|     |      |      |        |        |        |        |            |            |            |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
+        DeviceGemmXdlStreamK<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   256,   256,   128,     4,  8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,           1,           1,                   S<1, 32, 1, 8>,               8>,
+        DeviceGemmXdlStreamK<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   256,   128,   256,     4,  8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,         1,           1,           1,                   S<1, 32, 1, 8>,               8>,
+        DeviceGemmXdlStreamK<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   128,   128,   128,     4,  8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,         1,           1,           1,                   S<1, 16, 1, 8>,               8>,
+        DeviceGemmXdlStreamK<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   256,    64,   192,     4,  8,   32,   32,    1,    3,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 48, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,           1,           1,                   S<1, 32, 1, 8>,               8>,
+        DeviceGemmXdlStreamK<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   256,   192,    64,     4,  8,   32,   32,    3,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,           1,           1,                   S<1, 32, 1, 8>,               8>,
+        DeviceGemmXdlStreamK<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   256,   128,   128,     4,  8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,           1,           1,                   S<1, 32, 1, 8>,               8>,
+        DeviceGemmXdlStreamK<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   128,   128,    64,     4,  8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,           1,           1,                   S<1, 32, 1, 4>,               8>,
+        DeviceGemmXdlStreamK<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   128,    64,   128,     4,  8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,         1,           1,           1,                   S<1, 16, 1, 8>,               8>,
+        DeviceGemmXdlStreamK<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   256,   128,    64,     4,  8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,                   S<1, 16, 1, 4>,               8>,
+        DeviceGemmXdlStreamK<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   256,    64,   128,     4,  8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,           1,           1,                   S<1, 32, 1, 8>,               8>,
+        DeviceGemmXdlStreamK<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   128,    32,   192,     4,  8,   32,   32,    1,    3,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 24, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              8,         1,           1,           1,                   S<1, 16, 1, 8>,               8>,
+        DeviceGemmXdlStreamK<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   128,   192,    32,     4,  8,   32,   32,    3,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,                   S<1, 32, 1, 4>,               8>,
+        DeviceGemmXdlStreamK<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   128,    32,    64,     4,  8,   32,   32,    1,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,           1,           1,                   S<1, 16, 1, 8>,               8>,
+        DeviceGemmXdlStreamK<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   128,    64,    32,     4,  8,   32,   32,    1,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,                   S<1, 32, 1, 4>,               8>,
+        DeviceGemmXdlStreamK<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   128,    32,   128,     4,  8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,         1,           1,           1,                   S<1, 16, 1, 8>,               8>,
+        DeviceGemmXdlStreamK<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   128,   128,    32,     4,  8,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,                   S<1, 32, 1, 4>,               8>
+    // clang-format on
+    >;
+void add_device_gemm_xdl_streamk_f16_f16_f16_mk_kn_mn_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmStreamK<Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(instances,
+                                   device_gemm_xdl_streamk_f16_f16_f16_mk_kn_mn_instances{});
+}
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/profiler/include/profiler/profile_gemm_streamk_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_streamk_impl.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include <iomanip>
+#include <iostream>
+#include <typeinfo>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_streamk.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/tensor_operation_instance/gpu/gemm_streamk.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+namespace ck {
+namespace profiler {
+template <typename ADataType,
+          typename BDataType,
+          typename AccDataType,
+          typename CDataType,
+          typename ALayout,
+          typename BLayout,
+          typename CLayout>
+bool profile_gemm_streamk_impl(int do_verification,
+                               int init_method,
+                               bool do_log,
+                               bool time_kernel,
+                               int M,
+                               int N,
+                               int K,
+                               int StrideA,
+                               int StrideB,
+                               int StrideC,
+                               int NumSKBlocks = 0)
+{
+    bool pass = true;
+    auto f_host_tensor_descriptor =
+        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            using namespace ck::literals;
+            if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
+            {
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
+            }
+            else
+            {
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
+            }
+        };
+    Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
+    Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
+    Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+    Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
+    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
+    std::cout << "c_m_n: " << c_m_n_device_result.mDesc << std::endl;
+    switch(init_method)
+    {
+    case 0: break;
+    case 1:
+        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{0, 1});
+        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-1, 1});
+        break;
+    default:
+        a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+        b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+    }
+    using AElementOp = ck::tensor_operation::element_wise::PassThrough;
+    using BElementOp = ck::tensor_operation::element_wise::PassThrough;
+    using CElementOp = ck::tensor_operation::element_wise::PassThrough;
+    const auto a_element_op = AElementOp{};
+    const auto b_element_op = BElementOp{};
+    const auto c_element_op = CElementOp{};
+    DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem c_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
+    a_device_buf.ToDevice(a_m_k.mData.data());
+    b_device_buf.ToDevice(b_k_n.mData.data());
+    c_device_buf.ToDevice(c_m_n_device_result.mData.data());
+    using DeviceOp = ck::tensor_operation::device::DeviceGemmStreamK<ALayout,
+                                                                     BLayout,
+                                                                     CLayout,
+                                                                     ADataType,
+                                                                     BDataType,
+                                                                     CDataType,
+                                                                     AElementOp,
+                                                                     BElementOp,
+                                                                     CElementOp>;
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+    // Run reference GEMM
+    if(do_verification)
+    {
+        using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
+                                                                                BDataType,
+                                                                                CDataType,
+                                                                                AccDataType,
+                                                                                AElementOp,
+                                                                                BElementOp,
+                                                                                CElementOp>;
+        auto ref_gemm    = ReferenceGemmInstance{};
+        auto ref_invoker = ref_gemm.MakeInvoker();
+        auto ref_argument = ref_gemm.MakeArgument(
+            a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op);
+        ref_invoker.Run(ref_argument);
+    }
+    std::string best_op_name;
+    float best_ave_time   = 0;
+    float best_tflops     = 0;
+    float best_gb_per_sec = 0;
+    // profile device GEMM instances
+    for(auto& op_ptr : op_ptrs)
+    {
+        auto argument_ptr =
+            op_ptr->MakeArgumentPointer(static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
+                                        static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
+                                        static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
+                                        M,
+                                        N,
+                                        K,
+                                        StrideA,
+                                        StrideB,
+                                        StrideC,
+                                        a_element_op,
+                                        b_element_op,
+                                        c_element_op,
+                                        NumSKBlocks);
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            // re-init C to zero before profiling next kernel
+            c_device_buf.SetZero();
+            std::string op_name = op_ptr->GetTypeString();
+            float ave_time =
+                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+            std::size_t flop = std::size_t(2) * M * N * K;
+            std::size_t num_btype =
+                sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
+            float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+            float gb_per_sec = num_btype / 1.E6 / ave_time;
+            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
+                      << gb_per_sec << " GB/s, " << op_name << std::endl;
+            if(tflops > best_tflops)
+            {
+                best_op_name    = op_name;
+                best_tflops     = tflops;
+                best_ave_time   = ave_time;
+                best_gb_per_sec = gb_per_sec;
+            }
+            if(do_verification)
+            {
+                c_device_buf.FromDevice(c_m_n_device_result.mData.data());
+                pass = pass & ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
+                if(do_log)
+                {
+                    LogRangeAsType<float>(std::cout << "a : ", a_m_k.mData, ",") << std::endl;
+                    LogRangeAsType<float>(std::cout << "b: ", b_k_n.mData, ",") << std::endl;
+                    LogRangeAsType<float>(std::cout << "c_host  : ", c_m_n_host_result.mData, ",")
+                        << std::endl;
+                    LogRangeAsType<float>(std::cout << "c_device: ", c_m_n_device_result.mData, ",")
+                        << std::endl;
+                }
+            }
+        }
+        else
+        {
+            std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl;
+        }
+    }
+    if constexpr(is_same<CDataType, float>::value)
+    {
+        std::cout << "Best Perf for datatype = f32";
+    }
+    else if constexpr(is_same<CDataType, half_t>::value)
+    {
+        std::cout << "Best Perf for datatype = f16";
+    }
+    else if constexpr(is_same<CDataType, bhalf_t>::value)
+    {
+        std::cout << "Best Perf for datatype = bf16";
+    }
+    else if constexpr(is_same<CDataType, int8_t>::value)
+    {
+        std::cout << "Best Perf for datatype = int8";
+    }
+    if constexpr(is_same<ALayout, tensor_layout::gemm::RowMajor>::value)
+    {
+        std::cout << " ALayout =  RowMajor";
+    }
+    else if constexpr(is_same<ALayout, tensor_layout::gemm::ColumnMajor>::value)
+    {
+        std::cout << " ALayout =  ColumnMajor";
+    }
+    if constexpr(is_same<BLayout, tensor_layout::gemm::RowMajor>::value)
+    {
+        std::cout << " BLayout =  RowMajor";
+    }
+    else if constexpr(is_same<BLayout, tensor_layout::gemm::ColumnMajor>::value)
+    {
+        std::cout << " BLayout =  ColumnMajor";
+    }
+    std::cout << " M = " << M << " N = " << N << " K = " << K << " StrideA = " << StrideA
+              << " StrideB = " << StrideB << " StrideC = " << StrideC << " : " << best_ave_time
+              << " ms, " << best_tflops << " TFlops, " << best_gb_per_sec << " GB/s, "
+              << best_op_name << std::endl;
+    return pass;
+}
+} // namespace profiler
+} // namespace ck
--- a/profiler/src/CMakeLists.txt
+++ b/profiler/src/CMakeLists.txt
@@ -3,6 +3,7 @@ set(PROFILER_SOURCES
    profiler.cpp
    profile_gemm.cpp
    profile_gemm_splitk.cpp
+    profile_gemm_streamk.cpp
    profile_gemm_bilinear.cpp
    profile_gemm_bias_add_reduce.cpp
    profile_gemm_add_add_fastgelu.cpp
@@ -40,6 +41,7 @@ target_compile_options(${PROFILER_EXECUTABLE} PRIVATE -Wno-global-constructors)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE utility)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_splitk_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_streamk_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_bilinear_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_add_fastgelu_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_multiply_instance)

--- a/profiler/src/profile_gemm_streamk.cpp
+++ b/profiler/src/profile_gemm_streamk.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+#include "profiler/profile_gemm_streamk_impl.hpp"
+#include "profiler_operation_registry.hpp"
+enum struct GemmMatrixLayout
+{
+    MK_KN_MN, // 0
+    MK_NK_MN, // 1
+    KM_KN_MN, // 2
+    KM_NK_MN, // 3
+};
+enum struct GemmDataType
+{
+    F32_F32_F32,    // 0
+    F16_F16_F16,    // 1
+    BF16_BF16_BF16, // 2
+    INT8_INT8_INT8, // 3
+};
+#define OP_NAME "gemm_streamk"
+#define OP_DESC "StreamK GEMM"
+int profile_gemm_streamk(int argc, char* argv[])
+{
+    if(argc < 14)
+    {
+        printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
+        printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n");
+        printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
+        printf("                     1: A[m, k] * B[n, k] = C[m, n];\n");
+        printf("                     2: A[k, m] * B[k, n] = C[m, n];\n");
+        printf("                     3: A[k, m] * B[n, k] = C[m, n])\n");
+        printf("arg4: verification (0: no; 1: yes)\n");
+        printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
+        printf("arg6: print tensor value (0: no; 1: yes)\n");
+        printf("arg7: time kernel (0=no, 1=yes)\n");
+        printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n");
+        printf("arg14: num_sk_blocks (optional)\n");
+        exit(1);
+    }
+    const auto data_type       = static_cast<GemmDataType>(std::stoi(argv[2]));
+    const auto layout          = static_cast<GemmMatrixLayout>(std::stoi(argv[3]));
+    const bool do_verification = std::stoi(argv[4]);
+    const int init_method      = std::stoi(argv[5]);
+    const bool do_log          = std::stoi(argv[6]);
+    const bool time_kernel     = std::stoi(argv[7]);
+    const int M = std::stoi(argv[8]);
+    const int N = std::stoi(argv[9]);
+    const int K = std::stoi(argv[10]);
+    const int StrideA     = std::stoi(argv[11]);
+    const int StrideB     = std::stoi(argv[12]);
+    const int StrideC     = std::stoi(argv[13]);
+    const int NumSKBlocks = argc >= 15 ? std::stoi(argv[14]) : 0;
+    using F32 = float;
+    using F16 = ck::half_t;
+    using Row = ck::tensor_layout::gemm::RowMajor;
+    using Col = ck::tensor_layout::gemm::ColumnMajor;
+    auto profile = [&](auto a_type,
+                       auto b_type,
+                       auto acc_type,
+                       auto c_type,
+                       auto a_layout,
+                       auto b_layout,
+                       auto c_layout) {
+        using ADataType   = decltype(a_type);
+        using BDataType   = decltype(b_type);
+        using AccDataType = decltype(acc_type);
+        using CDataType   = decltype(c_type);
+        using ALayout = decltype(a_layout);
+        using BLayout = decltype(b_layout);
+        using CLayout = decltype(c_layout);
+        const int DefaultStrideA = ck::is_same_v<ALayout, Row> ? K : M;
+        const int DefaultStrideB = ck::is_same_v<BLayout, Row> ? N : K;
+        const int DefaultStrideC = ck::is_same_v<CLayout, Row> ? N : M;
+        bool pass = ck::profiler::profile_gemm_streamk_impl<ADataType,
+                                                            BDataType,
+                                                            AccDataType,
+                                                            CDataType,
+                                                            ALayout,
+                                                            BLayout,
+                                                            CLayout>(
+            do_verification,
+            init_method,
+            do_log,
+            time_kernel,
+            M,
+            N,
+            K,
+            (StrideA <= 0) ? DefaultStrideA : StrideA,
+            (StrideB <= 0) ? DefaultStrideB : StrideB,
+            (StrideC <= 0) ? DefaultStrideC : StrideC,
+            NumSKBlocks);
+        return pass ? 0 : 1;
+    };
+    if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::MK_KN_MN)
+    {
+        return profile(F32{}, F32{}, F32{}, F32{}, Row{}, Row{}, Row{});
+    }
+    else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::MK_NK_MN)
+    {
+        return profile(F32{}, F32{}, F32{}, F32{}, Row{}, Col{}, Row{});
+    }
+    else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::KM_KN_MN)
+    {
+        return profile(F32{}, F32{}, F32{}, F32{}, Col{}, Row{}, Row{});
+    }
+    else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::KM_NK_MN)
+    {
+        return profile(F32{}, F32{}, F32{}, F32{}, Col{}, Col{}, Row{});
+    }
+    else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
+    {
+        return profile(F16{}, F16{}, F32{}, F16{}, Row{}, Row{}, Row{});
+    }
+    else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_NK_MN)
+    {
+        return profile(F16{}, F16{}, F32{}, F16{}, Row{}, Col{}, Row{});
+    }
+    else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_KN_MN)
+    {
+        return profile(F16{}, F16{}, F32{}, F16{}, Col{}, Row{}, Row{});
+    }
+    else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_NK_MN)
+    {
+        return profile(F16{}, F16{}, F32{}, F16{}, Col{}, Col{}, Row{});
+    }
+    else
+    {
+        std::cout << "this data_type & layout is not implemented" << std::endl;
+        return 1;
+    }
+}
+REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_gemm_streamk);