initial commit

7e689d57 · aska-0096 · 7e689d57 · 7e689d57 · 7e689d57 · 7e689d57
Commit 7e689d57 authored Jul 18, 2024 by aska-0096
20 changed files
--- a/client_example/17_grouped_gemm_fastgelu/grouped_gemm_fastgelu.cpp
+++ b/client_example/17_grouped_gemm_fastgelu/grouped_gemm_fastgelu.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iomanip>
+#include <iostream>
+#include <vector>
+#include <random>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_grouped_gemm.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_gemm_fastgelu.hpp"
+
+using F16 = ck::half_t; // element type used for A, B and E below
+using F32 = float;      // not referenced elsewhere in this example
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using FastGelu    = ck::tensor_operation::element_wise::FastGelu;
+
+using ADataType  = F16;
+using BDataType  = F16;
+using DsDataType = ck::Tuple<>; // empty tuple: this example passes no D tensors
+using EDataType  = F16;
+
+using ALayout  = Row;
+using BLayout  = Col;
+using DsLayout = ck::Tuple<>; // empty, matching DsDataType
+using ELayout  = Row;
+
+using AElementOp   = PassThrough;
+using BElementOp   = PassThrough;
+using CDEElementOp = FastGelu; // element-wise op handed to the device op for the E output
+
+// Minimal owning wrapper around one HIP device allocation.
+//
+// The constructor requests mem_size bytes with hipMalloc and the destructor releases them with
+// hipFree. HIP return codes are deliberately discarded (best effort): if the allocation fails,
+// GetDeviceBuffer() returns whatever hipMalloc left in p_mem_, which starts out as nullptr.
+//
+// The type is move-only. It is stored in std::vector by this example, and an implicit copy
+// would make two objects call hipFree on the same pointer.
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+
+    // Allocates mem_size bytes of device memory; errors are ignored.
+    explicit SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+    {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+    }
+
+    SimpleDeviceMem(const SimpleDeviceMem&) = delete;
+    SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;
+
+    // Takes over the allocation held by other and leaves other empty.
+    SimpleDeviceMem(SimpleDeviceMem&& other) noexcept : p_mem_{other.p_mem_}
+    {
+        other.p_mem_ = nullptr;
+    }
+
+    // Releases the current allocation, then takes over the one held by other.
+    SimpleDeviceMem& operator=(SimpleDeviceMem&& other) noexcept
+    {
+        if(this != &other)
+        {
+            (void)hipFree(p_mem_);
+            p_mem_       = other.p_mem_;
+            other.p_mem_ = nullptr;
+        }
+        return *this;
+    }
+
+    // Returns the raw device pointer (may be nullptr after a failed allocation or a move).
+    void* GetDeviceBuffer() const { return p_mem_; }
+
+    // NOTE(review): relies on hipFree(nullptr) being harmless for moved-from objects - confirm
+    // against the HIP runtime documentation.
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+    void* p_mem_;
+};
+
+// Client example for grouped GEMM with FastGelu as the output element-wise operation.
+//
+// Generates a reproducible random set of GEMM problems, allocates device buffers for them
+// (their contents are never initialised), times every DeviceGroupedGemm instance returned by
+// the instance factory, reports the one with the highest TFlops and finally runs that instance
+// once more without timing. Always returns 0; HIP errors are not checked.
+int main()
+{
+    // Fixed seed: the group count and every problem size are the same on each run.
+    std::mt19937 gen(19391);
+    std::uniform_int_distribution<> distrib(1, 10);
+    int group_count = distrib(gen);
+
+    std::vector<int> Ms, Ns, Ks, StrideAs, StrideBs, StrideEs;
+
+    for(int i = 0; i < group_count; ++i)
+    {
+        Ms.push_back(256 + 256 * distrib(gen));
+        Ns.push_back(256 + 256 * distrib(gen));
+        Ks.push_back(128 + 128 * distrib(gen));
+
+        // The stride is the length of the contiguous axis, i.e. the matrices are packed.
+        StrideAs.push_back(std::is_same<Row, ALayout>::value ? Ks[i] : Ms[i]);
+        StrideBs.push_back(std::is_same<Row, BLayout>::value ? Ns[i] : Ks[i]);
+        StrideEs.push_back(std::is_same<Row, ELayout>::value ? Ns[i] : Ms[i]);
+    }
+
+    // Number of elements spanned by an nRow x nCol matrix stored with the given stride.
+    auto f_matrix_space_size =
+        [](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
+            using Layout = decltype(layout);
+
+            if constexpr(std::is_same<Layout, ck::tensor_layout::gemm::RowMajor>::value)
+            {
+                return (nRow - 1) * stride + nCol;
+            }
+            else
+            {
+                return (nCol - 1) * stride + nRow;
+            }
+        };
+
+    std::vector<SimpleDeviceMem> a_dev_bufs, b_dev_bufs, e_dev_bufs;
+
+    a_dev_bufs.reserve(group_count);
+    b_dev_bufs.reserve(group_count);
+    e_dev_bufs.reserve(group_count);
+
+    std::vector<const void*> p_a, p_b;
+    std::vector<void*> p_e;
+
+    p_a.reserve(group_count);
+    p_b.reserve(group_count);
+    p_e.reserve(group_count);
+
+    std::vector<ck::tensor_operation::device::GemmDesc> gemm_descs;
+
+    gemm_descs.reserve(group_count);
+
+    // One A/B/E buffer triple and one problem descriptor per group.
+    for(int i = 0; i < group_count; ++i)
+    {
+        a_dev_bufs.emplace_back(sizeof(ADataType) *
+                                f_matrix_space_size(Ms[i], Ks[i], StrideAs[i], ALayout{}));
+        b_dev_bufs.emplace_back(sizeof(BDataType) *
+                                f_matrix_space_size(Ks[i], Ns[i], StrideBs[i], BLayout{}));
+        e_dev_bufs.emplace_back(sizeof(EDataType) *
+                                f_matrix_space_size(Ms[i], Ns[i], StrideEs[i], ELayout{}));
+
+        gemm_descs.push_back({Ms[i], Ns[i], Ks[i], StrideAs[i], StrideBs[i], StrideEs[i], {}});
+
+        p_a.push_back(a_dev_bufs[i].GetDeviceBuffer());
+        p_b.push_back(b_dev_bufs[i].GetDeviceBuffer());
+        p_e.push_back(e_dev_bufs[i].GetDeviceBuffer());
+    }
+
+    using DeviceOp = ck::tensor_operation::device::DeviceGroupedGemm<ALayout,
+                                                                     BLayout,
+                                                                     DsLayout,
+                                                                     ELayout,
+                                                                     ADataType,
+                                                                     BDataType,
+                                                                     DsDataType,
+                                                                     EDataType,
+                                                                     AElementOp,
+                                                                     BElementOp,
+                                                                     CDEElementOp>;
+
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+    const auto a_element_op   = AElementOp{};
+    const auto b_element_op   = BElementOp{};
+    const auto cde_element_op = CDEElementOp{};
+
+    std::string best_op_name;
+    bool found            = false;
+    int best_op_id        = -1;
+    float best_ave_time   = 0;
+    float best_tflops     = 0;
+    float best_gb_per_sec = 0;
+
+    // No D tensors are used, so the per-group D pointer list stays empty.
+    auto p_ds = std::vector<std::array<const void*, 0>>{};
+
+    // profile device operation instances
+    std::cout << "Run all instances and do timing" << std::endl;
+
+    // size() is unsigned; convert once so the loop index and best_op_id can both stay int.
+    const int num_instances = static_cast<int>(op_ptrs.size());
+
+    for(int i = 0; i < num_instances; ++i)
+    {
+        auto& op_ptr = op_ptrs[i];
+
+        auto argument_ptr = op_ptr->MakeArgumentPointer(
+            p_a, p_b, p_ds, p_e, gemm_descs, a_element_op, b_element_op, cde_element_op);
+
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+        // The workspace only has to outlive this iteration; it is freed at the end of the scope.
+        SimpleDeviceMem gemm_desc_workspace(op_ptr->GetWorkSpaceSize(argument_ptr.get()));
+        op_ptr->SetWorkSpacePointer(argument_ptr.get(), gemm_desc_workspace.GetDeviceBuffer());
+        std::string op_name = op_ptr->GetTypeString();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+
+            // Timed run; the result is reported below as milliseconds.
+            float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+
+            std::size_t flop = 0, num_btype = 0;
+            for(std::size_t j = 0; j < gemm_descs.size(); ++j)
+            {
+                flop += std::size_t(2) * Ms[j] * Ns[j] * Ks[j];
+
+                num_btype += sizeof(ADataType) * Ms[j] * Ks[j] + sizeof(BDataType) * Ks[j] * Ns[j] +
+                             sizeof(EDataType) * Ms[j] * Ns[j];
+            }
+
+            // With ave_time in ms: flop / 1e9 / ms = TFlops and bytes / 1e6 / ms = GB/s.
+            float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
+            float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
+                      << gb_per_sec << " GB/s, " << op_name << std::endl;
+
+            if(tflops > best_tflops)
+            {
+                found           = true;
+                best_op_id      = i;
+                best_op_name    = op_name;
+                best_tflops     = tflops;
+                best_ave_time   = ave_time;
+                best_gb_per_sec = gb_per_sec;
+            }
+        }
+        else
+        {
+            std::cout << op_name << " does not support this problem" << std::endl;
+        }
+    }
+
+    // run the best instance
+    if(found)
+    {
+        // Only report a winner when at least one instance actually ran; otherwise the
+        // best_* values are still their zero placeholders.
+        std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
+                  << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
+
+        auto& op_ptr = op_ptrs[best_op_id];
+
+        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
+                  << std::endl;
+
+        auto argument_ptr = op_ptr->MakeArgumentPointer(
+            p_a, p_b, p_ds, p_e, gemm_descs, a_element_op, b_element_op, cde_element_op);
+
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+        SimpleDeviceMem gemm_desc_workspace(op_ptr->GetWorkSpaceSize(argument_ptr.get()));
+        op_ptr->SetWorkSpacePointer(argument_ptr.get(), gemm_desc_workspace.GetDeviceBuffer());
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+        }
+
+        std::cout << "Done" << std::endl;
+    }
+
+    return 0;
+}
--- a/client_example/18_groupnorm/CMakeLists.txt
+++ b/client_example/18_groupnorm/CMakeLists.txt
+add_executable(client_groupnorm_swish groupnorm_swish.cpp) # single-source client example
+target_link_libraries(client_groupnorm_swish PRIVATE composable_kernel::device_operations) # links the composable_kernel::device_operations target
--- a/client_example/18_groupnorm/groupnorm_swish.cpp
+++ b/client_example/18_groupnorm/groupnorm_swish.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iomanip>
+#include <vector>
+#include <iostream>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_normalization.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/normalization_swish.hpp"
+
+using XDataType       = ck::half_t; // element type of the x buffer
+using GammaDataType   = float;
+using BetaDataType    = float;
+using YDataType       = ck::half_t; // element type of the y buffer
+using ComputeDataType = float;
+using Swish           = ck::tensor_operation::element_wise::Swish;
+
+constexpr int Rank         = 5; // main() passes five lengths: {N, H, W, G, C}
+constexpr int NumReduceDim = 3; // main() passes three reduce dims: {1, 2, 4}
+
+// Minimal owning wrapper around one HIP device allocation.
+//
+// The constructor requests mem_size bytes with hipMalloc and the destructor releases them with
+// hipFree. HIP return codes are deliberately discarded (best effort): if the allocation fails,
+// GetDeviceBuffer() returns whatever hipMalloc left in p_mem_, which starts out as nullptr.
+//
+// The type is move-only: an implicit copy would make two objects call hipFree on the same
+// pointer.
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+
+    // Allocates mem_size bytes of device memory; errors are ignored.
+    explicit SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+    {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+    }
+
+    SimpleDeviceMem(const SimpleDeviceMem&) = delete;
+    SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;
+
+    // Takes over the allocation held by other and leaves other empty.
+    SimpleDeviceMem(SimpleDeviceMem&& other) noexcept : p_mem_{other.p_mem_}
+    {
+        other.p_mem_ = nullptr;
+    }
+
+    // Releases the current allocation, then takes over the one held by other.
+    SimpleDeviceMem& operator=(SimpleDeviceMem&& other) noexcept
+    {
+        if(this != &other)
+        {
+            (void)hipFree(p_mem_);
+            p_mem_       = other.p_mem_;
+            other.p_mem_ = nullptr;
+        }
+        return *this;
+    }
+
+    // Returns the raw device pointer (may be nullptr after a failed allocation or a move).
+    void* GetDeviceBuffer() const { return p_mem_; }
+
+    // NOTE(review): relies on hipFree(nullptr) being harmless for moved-from objects - confirm
+    // against the HIP runtime documentation.
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+    void* p_mem_;
+};
+
+// Client example for a rank-5 normalization with a Swish element-wise op.
+//
+// Allocates device buffers for x, gamma, beta and y (contents are never initialised), checks
+// that the first instance returned by the factory accepts the problem, times every supported
+// DeviceNormalization instance, reports the fastest one and runs it once more without timing.
+// Returns 0 on success; throws std::runtime_error when no instance exists or when the first
+// instance rejects the problem. The command-line arguments are not used.
+int main(int argc, char* argv[])
+{
+    ck::index_t N = 32;
+    ck::index_t H = 16;
+    ck::index_t W = 16;
+    ck::index_t G = 64;
+    ck::index_t C = 128;
+
+    // Widen before multiplying so the products are computed in std::size_t, not ck::index_t.
+    std::size_t xy_size         = static_cast<std::size_t>(N) * H * W * G * C;
+    std::size_t gamma_beta_size = static_cast<std::size_t>(G) * C;
+
+    // Strides follow the {N, H, W, G, C} length order; gamma/beta use stride 0 on N, H and W.
+    std::vector<ck::index_t> xy_strides         = {H * W * G * C, W * G * C, G * C, C, 1};
+    std::vector<ck::index_t> gamma_beta_strides = {0, 0, 0, C, 1};
+
+    SimpleDeviceMem x_device_buf(sizeof(XDataType) * xy_size);
+    SimpleDeviceMem gamma_device_buf(sizeof(GammaDataType) * gamma_beta_size);
+    SimpleDeviceMem beta_device_buf(sizeof(BetaDataType) * gamma_beta_size);
+    SimpleDeviceMem y_device_buf(sizeof(YDataType) * xy_size);
+
+    using DeviceOp = ck::tensor_operation::device::DeviceNormalization<XDataType,
+                                                                       GammaDataType,
+                                                                       BetaDataType,
+                                                                       ComputeDataType,
+                                                                       YDataType,
+                                                                       Swish,
+                                                                       Rank,
+                                                                       NumReduceDim>;
+
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+    // Every instance is queried with the same problem description, so build it in one place.
+    // NOTE(review): the two nullptr arguments look like optional extra outputs - confirm
+    // against the DeviceNormalization interface.
+    const auto make_argument = [&](const auto& op) {
+        return op->MakeArgumentPointer({N, H, W, G, C},    // lengths
+                                       xy_strides,         // xStrides
+                                       gamma_beta_strides, // gammaStrides
+                                       gamma_beta_strides, // betaStrides
+                                       xy_strides,         // yStrides
+                                       {1, 2, 4},          // reduceDims
+                                       1e-6,
+                                       x_device_buf.GetDeviceBuffer(),
+                                       gamma_device_buf.GetDeviceBuffer(),
+                                       beta_device_buf.GetDeviceBuffer(),
+                                       y_device_buf.GetDeviceBuffer(),
+                                       nullptr,
+                                       nullptr,
+                                       Swish{});
+    };
+
+    // Indexing op_ptrs[0] on an empty container is undefined behaviour, so fail explicitly.
+    if(op_ptrs.empty())
+    {
+        throw std::runtime_error("No device operation instance was found for this problem");
+    }
+
+    // The first instance is treated as the generic fallback and must accept the problem.
+    const auto& generic_op_ptr = op_ptrs[0];
+
+    auto generic_argument_ptr = make_argument(generic_op_ptr);
+
+    if(!generic_op_ptr->IsSupportedArgument(generic_argument_ptr.get()))
+    {
+        throw std::runtime_error(
+            "The generic kernel instance should be able to support any input shapes");
+    }
+
+    std::string best_op_name;
+    bool found            = false;
+    int best_op_id        = -1;
+    float best_ave_time   = std::numeric_limits<float>::max();
+    float best_gb_per_sec = 0;
+
+    // profile device operation instances
+    std::cout << "Run all instances and do timing" << std::endl;
+
+    // size() is unsigned; convert once so the loop index and best_op_id can both stay int.
+    const int num_instances = static_cast<int>(op_ptrs.size());
+
+    for(int i = 0; i < num_instances; ++i)
+    {
+        auto& op_ptr      = op_ptrs[i];
+        auto argument_ptr = make_argument(op_ptr);
+
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+        std::string op_name = op_ptr->GetTypeString();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            // The workspace only has to live for this run; it is freed at the end of the scope.
+            size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
+            SimpleDeviceMem workspace(workspace_sz);
+            op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
+
+            // Timed run; the result is reported below as milliseconds.
+            float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+
+            std::size_t num_byte =
+                sizeof(XDataType) * xy_size + sizeof(GammaDataType) * gamma_beta_size +
+                sizeof(BetaDataType) * gamma_beta_size + sizeof(YDataType) * xy_size;
+
+            // With ave_time in ms: bytes / 1e6 / ms = GB/s.
+            float gb_per_sec = num_byte / 1.E6 / ave_time;
+
+            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
+                      << op_name << std::endl;
+
+            if(ave_time < best_ave_time)
+            {
+                found           = true;
+                best_op_id      = i;
+                best_op_name    = op_name;
+                best_ave_time   = ave_time;
+                best_gb_per_sec = gb_per_sec;
+            }
+        }
+        else
+        {
+            std::cout << op_name << " does not support this problem" << std::endl;
+        }
+    }
+
+    // run the best instance
+    if(found)
+    {
+        std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
+                  << best_op_name << std::endl;
+
+        auto& op_ptr = op_ptrs[best_op_id];
+        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
+                  << std::endl;
+
+        auto argument_ptr = make_argument(op_ptr);
+
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
+            SimpleDeviceMem workspace(workspace_sz);
+            op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
+
+            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+        }
+
+        std::cout << "Done" << std::endl;
+    }
+
+    return 0;
+}
--- a/client_example/19_pool/CMakeLists.txt
+++ b/client_example/19_pool/CMakeLists.txt
+add_executable(client_max_pool2d_fwd max_pool2d_fwd.cpp) # one executable per pooling example source
+target_link_libraries(client_max_pool2d_fwd PRIVATE composable_kernel::device_operations)
+
+add_executable(client_max_pool2d_bwd max_pool2d_bwd.cpp)
+target_link_libraries(client_max_pool2d_bwd PRIVATE composable_kernel::device_operations)
+
+add_executable(client_avg_pool3d_fwd avg_pool3d_fwd.cpp)
+target_link_libraries(client_avg_pool3d_fwd PRIVATE composable_kernel::device_operations)
+
+add_executable(client_avg_pool3d_bwd avg_pool3d_bwd.cpp)
+target_link_libraries(client_avg_pool3d_bwd PRIVATE composable_kernel::device_operations) # all four link the same target
--- a/client_example/19_pool/avg_pool3d_bwd.cpp
+++ b/client_example/19_pool/avg_pool3d_bwd.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iomanip>
+#include <vector>
+#include <iostream>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/avg_pool3d_bwd.hpp"
+
+using DOutDataType = ck::half_t; // element type of the dout buffer
+using DInDataType  = ck::half_t; // element type of the din buffer
+
+using DOutLayout = ck::tensor_layout::convolution::NDHWC;
+using DInLayout  = ck::tensor_layout::convolution::NDHWC;
+
+// Minimal owning wrapper around one HIP device allocation that also remembers its size.
+//
+// The constructor requests mem_size bytes with hipMalloc and the destructor releases them with
+// hipFree. HIP return codes are deliberately discarded (best effort): if the allocation fails,
+// GetDeviceBuffer() returns whatever hipMalloc left in p_mem_, which starts out as nullptr.
+//
+// The type is move-only: an implicit copy would make two objects call hipFree on the same
+// pointer.
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+
+    // Allocates mem_size bytes of device memory; errors are ignored.
+    explicit SimpleDeviceMem(std::size_t mem_size) : p_mem_{}, mMemSize_(mem_size)
+    {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+    }
+
+    SimpleDeviceMem(const SimpleDeviceMem&) = delete;
+    SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;
+
+    // Takes over the allocation held by other and leaves other empty (null pointer, size 0).
+    SimpleDeviceMem(SimpleDeviceMem&& other) noexcept
+        : p_mem_{other.p_mem_}, mMemSize_(other.mMemSize_)
+    {
+        other.p_mem_    = nullptr;
+        other.mMemSize_ = 0;
+    }
+
+    // Releases the current allocation, then takes over the one held by other.
+    SimpleDeviceMem& operator=(SimpleDeviceMem&& other) noexcept
+    {
+        if(this != &other)
+        {
+            (void)hipFree(p_mem_);
+            p_mem_          = other.p_mem_;
+            mMemSize_       = other.mMemSize_;
+            other.p_mem_    = nullptr;
+            other.mMemSize_ = 0;
+        }
+        return *this;
+    }
+
+    // Returns the raw device pointer (may be nullptr after a failed allocation or a move).
+    void* GetDeviceBuffer() const { return p_mem_; }
+
+    // Fills the whole allocation with zero bytes via hipMemset; errors are ignored.
+    void SetZero() const { (void)hipMemset(p_mem_, 0, mMemSize_); }
+
+    // NOTE(review): relies on hipFree(nullptr) being harmless for moved-from objects - confirm
+    // against the HIP runtime documentation.
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+    void* p_mem_;
+    std::size_t mMemSize_;
+};
+
+// Client example for the 3D average-pooling backward pass.
+//
+// Derives the output extents from the input extents, window, stride, dilation and padding,
+// allocates the dout/din device buffers (dout is never initialised), times every supported
+// DeviceAvgPoolBwd instance, reports the fastest one and runs it once more without timing.
+// din is zeroed before every run. Always returns 0; HIP errors are not checked and the
+// command-line arguments are not used.
+int main(int argc, char* argv[])
+{
+    ck::index_t N                 = 2;
+    ck::index_t C                 = 32;
+    ck::index_t Z                 = 2;
+    ck::index_t Y                 = 2;
+    ck::index_t X                 = 2;
+    ck::index_t Di                = 30;
+    ck::index_t Hi                = 30;
+    ck::index_t Wi                = 30;
+    ck::index_t window_stride_d   = 2;
+    ck::index_t window_stride_h   = 2;
+    ck::index_t window_stride_w   = 2;
+    ck::index_t window_dilation_d = 1;
+    ck::index_t window_dilation_h = 1;
+    ck::index_t window_dilation_w = 1;
+    ck::index_t in_left_pad_d     = 1;
+    ck::index_t in_left_pad_h     = 1;
+    ck::index_t in_left_pad_w     = 1;
+    ck::index_t in_right_pad_d    = 1;
+    ck::index_t in_right_pad_h    = 1;
+    ck::index_t in_right_pad_w    = 1;
+
+    // Effective (dilated) window extents, then the resulting output extents.
+    const ck::index_t Zs = (Z - 1) * window_dilation_d + 1;
+    const ck::index_t Ys = (Y - 1) * window_dilation_h + 1;
+    const ck::index_t Xs = (X - 1) * window_dilation_w + 1;
+    ck::index_t Do       = (Di + in_left_pad_d + in_right_pad_d - Zs) / window_stride_d + 1;
+    ck::index_t Ho       = (Hi + in_left_pad_h + in_right_pad_h - Ys) / window_stride_h + 1;
+    ck::index_t Wo       = (Wi + in_left_pad_w + in_right_pad_w - Xs) / window_stride_w + 1;
+
+    // The pool API only supports lengths given in NCDHW order
+    std::vector<ck::index_t> in_length              = {N, C, Di, Hi, Wi};
+    std::vector<ck::index_t> out_length             = {N, C, Do, Ho, Wo};
+    std::vector<ck::index_t> window_spatial_lengths = {Z, Y, X};
+    std::vector<ck::index_t> window_strides = {window_stride_d, window_stride_h, window_stride_w};
+    std::vector<ck::index_t> window_dilations{
+        window_dilation_d, window_dilation_h, window_dilation_w};
+    std::vector<ck::index_t> input_left_pads  = {in_left_pad_d, in_left_pad_h, in_left_pad_w};
+    std::vector<ck::index_t> input_right_pads = {in_right_pad_d, in_right_pad_h, in_right_pad_w};
+
+    // Widen before multiplying so the products are computed in std::size_t, not ck::index_t.
+    std::size_t in_tensor_size  = static_cast<std::size_t>(N) * C * Di * Hi * Wi;
+    std::size_t out_tensor_size = static_cast<std::size_t>(N) * C * Do * Ho * Wo;
+
+    // tensor layout = NDHWC (strides are listed in the NCDHW order of the lengths above)
+    std::vector<ck::index_t> in_tensor_stride  = {Di * C * Hi * Wi, 1, C * Hi * Wi, Wi * C, C};
+    std::vector<ck::index_t> out_tensor_stride = {Do * C * Ho * Wo, 1, C * Ho * Wo, Wo * C, C};
+
+    SimpleDeviceMem dout_device_buf(sizeof(DOutDataType) * out_tensor_size);
+    SimpleDeviceMem din_device_buf(sizeof(DInDataType) * in_tensor_size);
+
+    using DeviceOp = ck::tensor_operation::device::
+        DeviceAvgPoolBwd<3, DOutDataType, DInDataType, DOutLayout, DInLayout>;
+
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+    // Every instance is queried with the same problem description, so build it in one place.
+    const auto make_argument = [&](const auto& op) {
+        return op->MakeArgumentPointer(
+            static_cast<DOutDataType*>(dout_device_buf.GetDeviceBuffer()),
+            static_cast<DInDataType*>(din_device_buf.GetDeviceBuffer()),
+            out_length,
+            in_length,
+            out_tensor_stride,
+            in_tensor_stride,
+            window_spatial_lengths,
+            window_strides,
+            window_dilations,
+            input_left_pads,
+            input_right_pads);
+    };
+
+    std::string best_op_name;
+    bool found            = false;
+    int best_op_id        = -1;
+    float best_ave_time   = std::numeric_limits<float>::max();
+    float best_gb_per_sec = 0;
+
+    // profile device operation instances
+    std::cout << "Run all instances and do timing" << std::endl;
+
+    // size() is unsigned; convert once so the loop index and best_op_id can both stay int.
+    const int num_instances = static_cast<int>(op_ptrs.size());
+
+    for(int i = 0; i < num_instances; ++i)
+    {
+        auto& op_ptr      = op_ptrs[i];
+        auto argument_ptr = make_argument(op_ptr);
+
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+        std::string op_name = op_ptr->GetTypeString();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            // NOTE(review): din is cleared first, presumably because the kernel accumulates
+            // into it - confirm against the DeviceAvgPoolBwd implementation.
+            din_device_buf.SetZero();
+
+            // Timed run; the result is reported below as milliseconds.
+            float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+
+            std::size_t num_bytes =
+                in_tensor_size * sizeof(DInDataType) + out_tensor_size * sizeof(DOutDataType);
+
+            // With ave_time in ms: bytes / 1e6 / ms = GB/s.
+            float gb_per_sec = num_bytes / 1.E6 / ave_time;
+
+            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
+                      << op_name << std::endl;
+
+            if(ave_time < best_ave_time)
+            {
+                found           = true;
+                best_op_id      = i;
+                best_op_name    = op_name;
+                best_ave_time   = ave_time;
+                best_gb_per_sec = gb_per_sec;
+            }
+        }
+        else
+        {
+            std::cout << op_name << " does not support this problem" << std::endl;
+        }
+    }
+
+    // run the best instance
+    if(found)
+    {
+        std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
+                  << best_op_name << std::endl;
+
+        auto& op_ptr = op_ptrs[best_op_id];
+        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
+                  << std::endl;
+
+        auto argument_ptr = make_argument(op_ptr);
+
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            din_device_buf.SetZero();
+            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+        }
+
+        std::cout << "Done" << std::endl;
+    }
+
+    return 0;
+}
--- a/client_example/19_pool/avg_pool3d_fwd.cpp
+++ b/client_example/19_pool/avg_pool3d_fwd.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iomanip>
+#include <vector>
+#include <iostream>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_pool_fwd.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/pool3d_fwd.hpp"
+
+using InDataType    = ck::half_t; // element type of the input buffer
+using OutDataType   = ck::half_t; // element type of the output buffer
+using IndexDataType = int32_t;    // element type of the output-indices buffer
+
+using InLayout  = ck::tensor_layout::convolution::NDHWC;
+using OutLayout = ck::tensor_layout::convolution::NDHWC;
+
+constexpr ck::index_t InOutRank  = 5; // main() passes five lengths: {N, C, D, H, W}
+constexpr ck::index_t WindowRank = 3; // main() passes three window lengths: {Z, Y, X}
+#if 0 // disabled alternative: MAX reduction instead of AVG
+constexpr auto ReduceOpId  = ck::ReduceTensorOp::MAX;
+constexpr bool OutputIndex = false;
+#else // active configuration: AVG reduction, no index output
+constexpr auto ReduceOpId  = ck::ReduceTensorOp::AVG;
+constexpr bool OutputIndex = false;
+#endif
+
+// Minimal owning wrapper around one HIP device allocation.
+//
+// The constructor requests mem_size bytes with hipMalloc and the destructor releases them with
+// hipFree. HIP return codes are deliberately discarded (best effort): if the allocation fails,
+// GetDeviceBuffer() returns whatever hipMalloc left in p_mem_, which starts out as nullptr.
+//
+// The type is move-only: an implicit copy would make two objects call hipFree on the same
+// pointer.
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+
+    // Allocates mem_size bytes of device memory; errors are ignored.
+    explicit SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+    {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+    }
+
+    SimpleDeviceMem(const SimpleDeviceMem&) = delete;
+    SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;
+
+    // Takes over the allocation held by other and leaves other empty.
+    SimpleDeviceMem(SimpleDeviceMem&& other) noexcept : p_mem_{other.p_mem_}
+    {
+        other.p_mem_ = nullptr;
+    }
+
+    // Releases the current allocation, then takes over the one held by other.
+    SimpleDeviceMem& operator=(SimpleDeviceMem&& other) noexcept
+    {
+        if(this != &other)
+        {
+            (void)hipFree(p_mem_);
+            p_mem_       = other.p_mem_;
+            other.p_mem_ = nullptr;
+        }
+        return *this;
+    }
+
+    // Returns the raw device pointer (may be nullptr after a failed allocation or a move).
+    void* GetDeviceBuffer() const { return p_mem_; }
+
+    // NOTE(review): relies on hipFree(nullptr) being harmless for moved-from objects - confirm
+    // against the HIP runtime documentation.
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+    void* p_mem_;
+};
+
+int main(int argc, char* argv[])
+{
+    ck::index_t N                 = 2;
+    ck::index_t C                 = 32;
+    ck::index_t Z                 = 2;
+    ck::index_t Y                 = 2;
+    ck::index_t X                 = 2;
+    ck::index_t Di                = 30;
+    ck::index_t Hi                = 30;
+    ck::index_t Wi                = 30;
+    ck::index_t window_stride_d   = 2;
+    ck::index_t window_stride_h   = 2;
+    ck::index_t window_stride_w   = 2;
+    ck::index_t window_dilation_d = 1;
+    ck::index_t window_dilation_h = 1;
+    ck::index_t window_dilation_w = 1;
+    ck::index_t in_left_pad_d     = 1;
+    ck::index_t in_left_pad_h     = 1;
+    ck::index_t in_left_pad_w     = 1;
+    ck::index_t in_right_pad_d    = 1;
+    ck::index_t in_right_pad_h    = 1;
+    ck::index_t in_right_pad_w    = 1;
+
+    const ck::index_t Zs = (Z - 1) * window_dilation_d + 1;
+    const ck::index_t Ys = (Y - 1) * window_dilation_h + 1;
+    const ck::index_t Xs = (X - 1) * window_dilation_w + 1;
+    ck::index_t Do       = (Di + in_left_pad_d + in_right_pad_d - Zs) / window_stride_d + 1;
+    ck::index_t Ho       = (Hi + in_left_pad_h + in_right_pad_h - Ys) / window_stride_h + 1;
+    ck::index_t Wo       = (Wi + in_left_pad_w + in_right_pad_w - Xs) / window_stride_w + 1;
+
+    // Pool API only support the order of NCDHW
+    std::vector<ck::index_t> in_length              = {N, C, Di, Hi, Wi};
+    std::vector<ck::index_t> out_length             = {N, C, Do, Ho, Wo};
+    std::vector<ck::index_t> window_spatial_lengths = {Z, Y, X};
+    std::vector<ck::index_t> window_strides = {window_stride_d, window_stride_h, window_stride_w};
+    std::vector<ck::index_t> window_dilations{
+        window_dilation_d, window_dilation_h, window_dilation_w};
+    std::vector<ck::index_t> input_left_pads  = {in_left_pad_d, in_left_pad_h, in_left_pad_w};
+    std::vector<ck::index_t> input_right_pads = {in_right_pad_d, in_right_pad_h, in_right_pad_w};
+
+    std::size_t in_tensor_size  = N * C * Di * Hi * Wi;
+    std::size_t out_tensor_size = N * C * Do * Ho * Wo;
+
+    // tensor layout = NDHWC
+    std::vector<ck::index_t> in_tensor_stride  = {Di * C * Hi * Wi, 1, C * Hi * Wi, Wi * C, C};
+    std::vector<ck::index_t> out_tensor_stride = {Do * C * Ho * Wo, 1, C * Ho * Wo, Wo * C, C};
+
+    SimpleDeviceMem in_device_buf(sizeof(InDataType) * in_tensor_size);
+    SimpleDeviceMem out_device_buf(sizeof(OutDataType) * out_tensor_size);
+    SimpleDeviceMem out_indices_device_buf(sizeof(IndexDataType) * out_tensor_size);
+
+    using DeviceOp = ck::tensor_operation::device::DevicePoolFwd<InOutRank,
+                                                                 WindowRank,
+                                                                 InDataType,
+                                                                 OutDataType,
+                                                                 IndexDataType,
+                                                                 InLayout,
+                                                                 OutLayout,
+                                                                 ReduceOpId,
+                                                                 OutputIndex>;
+
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+    std::string best_op_name;
+    bool found            = false;
+    int best_op_id        = -1;
+    float best_ave_time   = std::numeric_limits<float>::max();
+    float best_gb_per_sec = 0;
+
+    // profile device operation instances
+    std::cout << "Run all instances and do timing" << std::endl;
+
+    for(int i = 0; i < op_ptrs.size(); ++i)
+    {
+        auto& op_ptr      = op_ptrs[i];
+        auto argument_ptr = op_ptr->MakeArgumentPointer(
+            static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
+            static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
+            static_cast<IndexDataType*>(out_indices_device_buf.GetDeviceBuffer()),
+            in_length,
+            window_spatial_lengths,
+            out_length,
+            in_tensor_stride,
+            out_tensor_stride,
+            out_tensor_stride,
+            window_strides,
+            window_dilations,
+            input_left_pads,
+            input_right_pads,
+            {2, 3, 4});
+
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+        std::string op_name = op_ptr->GetTypeString();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+
+            std::size_t num_bytes =
+                in_tensor_size * sizeof(InDataType) + out_tensor_size * sizeof(OutDataType);
+
+            if constexpr(OutputIndex)
+                num_bytes += out_tensor_size * sizeof(IndexDataType);
+
+            float gb_per_sec = num_bytes / 1.E6 / ave_time;
+
+            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
+                      << op_name << std::endl;
+
+            if(ave_time < best_ave_time)
+            {
+                found           = true;
+                best_op_id      = i;
+                best_op_name    = op_name;
+                best_ave_time   = ave_time;
+                best_gb_per_sec = gb_per_sec;
+            }
+        }
+        else
+        {
+            std::cout << op_name << " does not support this problem" << std::endl;
+        }
+    }
+
+    // run the best instance
+    if(found)
+    {
+        std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
+                  << best_op_name << std::endl;
+
+        auto& op_ptr = op_ptrs[best_op_id];
+        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
+                  << std::endl;
+
+        auto argument_ptr = op_ptr->MakeArgumentPointer(
+            static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
+            static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
+            static_cast<IndexDataType*>(out_indices_device_buf.GetDeviceBuffer()),
+            in_length,
+            window_spatial_lengths,
+            out_length,
+            in_tensor_stride,
+            out_tensor_stride,
+            out_tensor_stride,
+            window_strides,
+            window_dilations,
+            input_left_pads,
+            input_right_pads,
+            {2, 3, 4});
+
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+        }
+
+        std::cout << "Done" << std::endl;
+    }
+
+    return 0;
+}
--- a/client_example/19_pool/max_pool2d_bwd.cpp
+++ b/client_example/19_pool/max_pool2d_bwd.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iomanip>
+#include <vector>
+#include <iostream>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_pool_fwd.hpp"
+#include "ck/tensor_operation/gpu/device/device_max_pool_bwd.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/pool3d_fwd.hpp"
+#include "ck/library/tensor_operation_instance/gpu/max_pool_bwd.hpp"
+
+// Element types of the tensors used by this example.
+using InDataType    = ck::half_t;
+using OutDataType   = ck::half_t;
+using DOutDataType  = ck::half_t;
+using DInDataType   = ck::half_t;
+using IndexDataType = int32_t;
+
+// We use pool3d to implement pool2d in this example
+using InLayout  = ck::tensor_layout::convolution::NDHWC;
+using OutLayout = ck::tensor_layout::convolution::NDHWC;
+
+// Ranks handed to the DevicePoolFwd template below: 5 tensor dimensions and
+// 3 window dimensions, i.e. the 3D form of the 2D problem.
+constexpr ck::index_t InOutRank  = 5;
+constexpr ck::index_t WindowRank = 3;
+
+// Minimal owning wrapper around a buffer obtained from hipMalloc.
+// The buffer is handed back to hipFree when the wrapper is destroyed.
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+
+    // Requests mem_size bytes from hipMalloc. p_mem_ is null-initialized first and the
+    // hipMalloc status is deliberately ignored.
+    explicit SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+    {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+    }
+
+    // The destructor frees p_mem_, so a copy would free the same pointer twice.
+    SimpleDeviceMem(const SimpleDeviceMem&) = delete;
+    SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;
+
+    // Returns the stored pointer; ownership stays with this object.
+    void* GetDeviceBuffer() { return p_mem_; }
+
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+    void* p_mem_;
+};
+
+// Rewrites the description of a 2D pooling problem, in place, into the 3D form
+// with a depth dimension of length 1.
+//
+// Every length/stride vector (4 entries, NCHW order) gets one entry inserted at
+// index 2; every window/pad vector (2 entries, YX order) gets one entry prepended.
+// pooling_dims is overwritten with {2, 3, 4}.
+//
+// indices_stride may refer to the same vector as output_stride or input_stride;
+// in that case the shared vector is extended only once.
+void TransformPool2dparamToPool3d(std::vector<ck::index_t>& input_lengths,
+                                  std::vector<ck::index_t>& window_lengths,
+                                  std::vector<ck::index_t>& output_lengths,
+                                  std::vector<ck::index_t>& input_stride,
+                                  std::vector<ck::index_t>& output_stride,
+                                  std::vector<ck::index_t>& indices_stride,
+                                  std::vector<ck::index_t>& window_strides,
+                                  std::vector<ck::index_t>& window_dilations,
+                                  std::vector<ck::index_t>& input_left_pads,
+                                  std::vector<ck::index_t>& input_right_pads,
+                                  std::vector<ck::index_t>& pooling_dims)
+{
+    // NCHW to NCDHW
+    input_lengths.insert(input_lengths.begin() + 2, 1);
+    output_lengths.insert(output_lengths.begin() + 2, 1);
+    input_stride.insert(input_stride.begin() + 2, 0);
+    output_stride.insert(output_stride.begin() + 2, 0);
+
+    // Without this check an aliased vector would receive the depth entry twice and
+    // end up with six strides, shifting the H and W strides by one position.
+    const bool indices_already_extended =
+        &indices_stride == &output_stride || &indices_stride == &input_stride;
+    if(!indices_already_extended)
+    {
+        indices_stride.insert(indices_stride.begin() + 2, 0);
+    }
+
+    // YX to ZYX
+    window_lengths.insert(window_lengths.begin(), 1);
+    window_strides.insert(window_strides.begin(), 0);
+    window_dilations.insert(window_dilations.begin(), 0);
+    input_left_pads.insert(input_left_pads.begin(), 0);
+    input_right_pads.insert(input_right_pads.begin(), 0);
+
+    pooling_dims = {2, 3, 4};
+}
+
+// Produces max-pool indices with a forward pass, then times every available max-pool
+// backward instance and finally re-runs the fastest one without timing.
+// Returns 0 on success and 1 when the forward pass cannot be run.
+int main(int argc, char* argv[])
+{
+    // This example takes no command-line arguments.
+    (void)argc;
+    (void)argv;
+
+    const ck::index_t N                 = 2;
+    const ck::index_t C                 = 32;
+    const ck::index_t Y                 = 2;
+    const ck::index_t X                 = 2;
+    const ck::index_t Hi                = 30;
+    const ck::index_t Wi                = 30;
+    const ck::index_t window_stride_h   = 2;
+    const ck::index_t window_stride_w   = 2;
+    const ck::index_t window_dilation_h = 1;
+    const ck::index_t window_dilation_w = 1;
+    const ck::index_t in_left_pad_h     = 1;
+    const ck::index_t in_left_pad_w     = 1;
+    const ck::index_t in_right_pad_h    = 1;
+    const ck::index_t in_right_pad_w    = 1;
+
+    // Window extent once dilation is applied, and the resulting output size.
+    const ck::index_t Ys = (Y - 1) * window_dilation_h + 1;
+    const ck::index_t Xs = (X - 1) * window_dilation_w + 1;
+    const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - Ys) / window_stride_h + 1;
+    const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - Xs) / window_stride_w + 1;
+
+    // The pool API only supports the NCHW order
+    std::vector<ck::index_t> in_length              = {N, C, Hi, Wi};
+    std::vector<ck::index_t> out_length             = {N, C, Ho, Wo};
+    std::vector<ck::index_t> window_spatial_lengths = {Y, X};
+    std::vector<ck::index_t> window_strides         = {window_stride_h, window_stride_w};
+    std::vector<ck::index_t> window_dilations       = {window_dilation_h, window_dilation_w};
+    std::vector<ck::index_t> input_left_pads        = {in_left_pad_h, in_left_pad_w};
+    std::vector<ck::index_t> input_right_pads       = {in_right_pad_h, in_right_pad_w};
+    std::vector<ck::index_t> pooling_dims           = {2, 3};
+
+    // Widen before multiplying so the element counts are computed in std::size_t.
+    const std::size_t in_tensor_size  = static_cast<std::size_t>(N) * C * Hi * Wi;
+    const std::size_t out_tensor_size = static_cast<std::size_t>(N) * C * Ho * Wo;
+
+    // tensor layout = NHWC
+    std::vector<ck::index_t> in_tensor_stride  = {C * Hi * Wi, 1, Wi * C, C};
+    std::vector<ck::index_t> out_tensor_stride = {C * Ho * Wo, 1, Wo * C, C};
+    // The index tensor uses the same strides as the output, but gets its own vector:
+    // the transform below extends the output and index strides through separate
+    // in-out parameters, so sharing one vector would extend it twice.
+    std::vector<ck::index_t> indices_tensor_stride = out_tensor_stride;
+
+    TransformPool2dparamToPool3d(in_length,
+                                 window_spatial_lengths,
+                                 out_length,
+                                 in_tensor_stride,
+                                 out_tensor_stride,
+                                 indices_tensor_stride,
+                                 window_strides,
+                                 window_dilations,
+                                 input_left_pads,
+                                 input_right_pads,
+                                 pooling_dims);
+
+    SimpleDeviceMem in_device_buf(sizeof(InDataType) * in_tensor_size);
+    SimpleDeviceMem out_device_buf(sizeof(OutDataType) * out_tensor_size);
+    SimpleDeviceMem indices_device_buf(sizeof(IndexDataType) * out_tensor_size);
+    SimpleDeviceMem dout_device_buf(sizeof(DOutDataType) * out_tensor_size);
+    SimpleDeviceMem din_device_buf(sizeof(DInDataType) * in_tensor_size);
+
+    // Generate index data from max pool forward
+    {
+        using MaxPoolFwdDeviceOp =
+            ck::tensor_operation::device::DevicePoolFwd<InOutRank,
+                                                        WindowRank,
+                                                        InDataType,
+                                                        OutDataType,
+                                                        IndexDataType,
+                                                        InLayout,
+                                                        OutLayout,
+                                                        ck::ReduceTensorOp::MAX,
+                                                        true>;
+
+        const auto fwd_op_ptrs =
+            ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+                MaxPoolFwdDeviceOp>::GetInstances();
+
+        // Indexing an empty instance list would be undefined behavior.
+        if(fwd_op_ptrs.empty())
+        {
+            std::cerr << "no max pool forward instance found" << std::endl;
+            return 1;
+        }
+
+        auto& fwd_op_ptr = fwd_op_ptrs[0];
+
+        auto fwd_argument_ptr = fwd_op_ptr->MakeArgumentPointer(
+            static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
+            static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
+            static_cast<IndexDataType*>(indices_device_buf.GetDeviceBuffer()),
+            in_length,
+            window_spatial_lengths,
+            out_length,
+            in_tensor_stride,
+            out_tensor_stride,
+            indices_tensor_stride,
+            window_strides,
+            window_dilations,
+            input_left_pads,
+            input_right_pads,
+            pooling_dims);
+
+        auto fwd_invoker_ptr = fwd_op_ptr->MakeInvokerPointer();
+
+        // The backward pass reads the index buffer, so do not continue without it.
+        if(!fwd_op_ptr->IsSupportedArgument(fwd_argument_ptr.get()))
+        {
+            std::cerr << fwd_op_ptr->GetTypeString() << " does not support this problem"
+                      << std::endl;
+            return 1;
+        }
+
+        // Only the indices are needed here, so run once without timing.
+        fwd_invoker_ptr->Run(fwd_argument_ptr.get(), StreamConfig{nullptr, false});
+    }
+
+    // Run MaxPool bwd
+    using MaxPoolBwdDeviceOp =
+        ck::tensor_operation::device::DeviceMaxPoolBwd<DOutDataType, IndexDataType, DInDataType>;
+
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        MaxPoolBwdDeviceOp>::GetInstances();
+
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+    // Builds the backward argument for one instance; shared by the profiling loop and
+    // the final run so both always describe the same problem.
+    const auto make_bwd_argument = [&](const auto& op) {
+        return op->MakeArgumentPointer(
+            static_cast<DOutDataType*>(dout_device_buf.GetDeviceBuffer()),
+            static_cast<IndexDataType*>(indices_device_buf.GetDeviceBuffer()),
+            static_cast<DInDataType*>(din_device_buf.GetDeviceBuffer()),
+            out_tensor_size,
+            in_tensor_size,
+            window_spatial_lengths,
+            window_strides,
+            window_dilations);
+    };
+
+    // Bytes counted per run: din, the indices and dout.
+    const std::size_t num_bytes = in_tensor_size * sizeof(DInDataType) +
+                                  out_tensor_size * sizeof(IndexDataType) +
+                                  out_tensor_size * sizeof(DOutDataType);
+
+    std::string best_op_name;
+    bool found            = false;
+    int best_op_id        = -1;
+    float best_ave_time   = std::numeric_limits<float>::max();
+    float best_gb_per_sec = 0;
+
+    // profile device operation instances
+    std::cout << "Run all instances and do timing" << std::endl;
+
+    const int num_ops = static_cast<int>(op_ptrs.size());
+    for(int i = 0; i < num_ops; ++i)
+    {
+        auto& op_ptr      = op_ptrs[i];
+        auto argument_ptr = make_bwd_argument(op_ptr);
+        auto invoker_ptr  = op_ptr->MakeInvokerPointer();
+
+        const std::string op_name = op_ptr->GetTypeString();
+
+        if(!op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            std::cout << op_name << " does not support this problem" << std::endl;
+            continue;
+        }
+
+        // The workspace buffer stays allocated until the timed run has returned.
+        const size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
+        SimpleDeviceMem workspace(workspace_sz);
+        op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
+
+        const float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+
+        const float gb_per_sec = num_bytes / 1.E6 / ave_time;
+
+        std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
+                  << op_name << std::endl;
+
+        if(ave_time < best_ave_time)
+        {
+            found           = true;
+            best_op_id      = i;
+            best_op_name    = op_name;
+            best_ave_time   = ave_time;
+            best_gb_per_sec = gb_per_sec;
+        }
+    }
+
+    // run the best instance
+    if(found)
+    {
+        std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
+                  << best_op_name << std::endl;
+
+        auto& op_ptr = op_ptrs[best_op_id];
+        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
+                  << std::endl;
+
+        auto argument_ptr = make_bwd_argument(op_ptr);
+        auto invoker_ptr  = op_ptr->MakeInvokerPointer();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            const size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
+
+            SimpleDeviceMem workspace(workspace_sz);
+
+            op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
+
+            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+        }
+
+        std::cout << "Done" << std::endl;
+    }
+
+    return 0;
+}
--- a/client_example/19_pool/max_pool2d_fwd.cpp
+++ b/client_example/19_pool/max_pool2d_fwd.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iomanip>
+#include <vector>
+#include <iostream>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_pool_fwd.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/pool3d_fwd.hpp"
+
+// Element types of the input, output and index tensors.
+using InDataType    = ck::half_t;
+using OutDataType   = ck::half_t;
+using IndexDataType = int32_t;
+
+// We use pool3d to implement pool2d in this example
+using InLayout  = ck::tensor_layout::convolution::NDHWC;
+using OutLayout = ck::tensor_layout::convolution::NDHWC;
+
+// Ranks handed to the DevicePoolFwd template below: 5 tensor dimensions and
+// 3 window dimensions, i.e. the 3D form of the 2D problem.
+constexpr ck::index_t InOutRank  = 5;
+constexpr ck::index_t WindowRank = 3;
+// Compile-time choice of the pooling variant: MAX with index output (enabled),
+// or AVG without index output (flip the condition to select it).
+#if 1
+constexpr auto ReduceOpId  = ck::ReduceTensorOp::MAX;
+constexpr bool OutputIndex = true;
+#else
+constexpr auto ReduceOpId  = ck::ReduceTensorOp::AVG;
+constexpr bool OutputIndex = false;
+#endif
+
+// Minimal owning wrapper around a buffer obtained from hipMalloc.
+// The buffer is handed back to hipFree when the wrapper is destroyed.
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+
+    // Requests mem_size bytes from hipMalloc. p_mem_ is null-initialized first and the
+    // hipMalloc status is deliberately ignored.
+    explicit SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+    {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+    }
+
+    // The destructor frees p_mem_, so a copy would free the same pointer twice.
+    SimpleDeviceMem(const SimpleDeviceMem&) = delete;
+    SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;
+
+    // Returns the stored pointer; ownership stays with this object.
+    void* GetDeviceBuffer() { return p_mem_; }
+
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+    void* p_mem_;
+};
+
+// Rewrites the description of a 2D pooling problem, in place, into the 3D form
+// with a depth dimension of length 1.
+//
+// Every length/stride vector (4 entries, NCHW order) gets one entry inserted at
+// index 2; every window/pad vector (2 entries, YX order) gets one entry prepended.
+// pooling_dims is overwritten with {2, 3, 4}.
+//
+// indices_stride may refer to the same vector as output_stride or input_stride;
+// in that case the shared vector is extended only once.
+void TransformPool2dparamToPool3d(std::vector<ck::index_t>& input_lengths,
+                                  std::vector<ck::index_t>& window_lengths,
+                                  std::vector<ck::index_t>& output_lengths,
+                                  std::vector<ck::index_t>& input_stride,
+                                  std::vector<ck::index_t>& output_stride,
+                                  std::vector<ck::index_t>& indices_stride,
+                                  std::vector<ck::index_t>& window_strides,
+                                  std::vector<ck::index_t>& window_dilations,
+                                  std::vector<ck::index_t>& input_left_pads,
+                                  std::vector<ck::index_t>& input_right_pads,
+                                  std::vector<ck::index_t>& pooling_dims)
+{
+    // NCHW to NCDHW
+    input_lengths.insert(input_lengths.begin() + 2, 1);
+    output_lengths.insert(output_lengths.begin() + 2, 1);
+    input_stride.insert(input_stride.begin() + 2, 0);
+    output_stride.insert(output_stride.begin() + 2, 0);
+
+    // Without this check an aliased vector would receive the depth entry twice and
+    // end up with six strides, shifting the H and W strides by one position.
+    const bool indices_already_extended =
+        &indices_stride == &output_stride || &indices_stride == &input_stride;
+    if(!indices_already_extended)
+    {
+        indices_stride.insert(indices_stride.begin() + 2, 0);
+    }
+
+    // YX to ZYX
+    window_lengths.insert(window_lengths.begin(), 1);
+    window_strides.insert(window_strides.begin(), 0);
+    window_dilations.insert(window_dilations.begin(), 0);
+    input_left_pads.insert(input_left_pads.begin(), 0);
+    input_right_pads.insert(input_right_pads.begin(), 0);
+
+    pooling_dims = {2, 3, 4};
+}
+
+// Times every available pool-forward instance on a fixed 2D pooling problem
+// (expressed in 3D form) and re-runs the fastest one without timing.
+int main(int argc, char* argv[])
+{
+    // This example takes no command-line arguments.
+    (void)argc;
+    (void)argv;
+
+    const ck::index_t N                 = 2;
+    const ck::index_t C                 = 32;
+    const ck::index_t Y                 = 2;
+    const ck::index_t X                 = 2;
+    const ck::index_t Hi                = 30;
+    const ck::index_t Wi                = 30;
+    const ck::index_t window_stride_h   = 2;
+    const ck::index_t window_stride_w   = 2;
+    const ck::index_t window_dilation_h = 1;
+    const ck::index_t window_dilation_w = 1;
+    const ck::index_t in_left_pad_h     = 1;
+    const ck::index_t in_left_pad_w     = 1;
+    const ck::index_t in_right_pad_h    = 1;
+    const ck::index_t in_right_pad_w    = 1;
+
+    // Window extent once dilation is applied, and the resulting output size.
+    const ck::index_t Ys = (Y - 1) * window_dilation_h + 1;
+    const ck::index_t Xs = (X - 1) * window_dilation_w + 1;
+    const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - Ys) / window_stride_h + 1;
+    const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - Xs) / window_stride_w + 1;
+
+    // The pool API only supports the NCHW order
+    std::vector<ck::index_t> in_length              = {N, C, Hi, Wi};
+    std::vector<ck::index_t> out_length             = {N, C, Ho, Wo};
+    std::vector<ck::index_t> window_spatial_lengths = {Y, X};
+    std::vector<ck::index_t> window_strides         = {window_stride_h, window_stride_w};
+    std::vector<ck::index_t> window_dilations       = {window_dilation_h, window_dilation_w};
+    std::vector<ck::index_t> input_left_pads        = {in_left_pad_h, in_left_pad_w};
+    std::vector<ck::index_t> input_right_pads       = {in_right_pad_h, in_right_pad_w};
+    std::vector<ck::index_t> pooling_dims           = {2, 3};
+
+    // Widen before multiplying so the element counts are computed in std::size_t.
+    const std::size_t in_tensor_size  = static_cast<std::size_t>(N) * C * Hi * Wi;
+    const std::size_t out_tensor_size = static_cast<std::size_t>(N) * C * Ho * Wo;
+
+    // tensor layout = NHWC
+    std::vector<ck::index_t> in_tensor_stride  = {C * Hi * Wi, 1, Wi * C, C};
+    std::vector<ck::index_t> out_tensor_stride = {C * Ho * Wo, 1, Wo * C, C};
+    // The index tensor uses the same strides as the output, but gets its own vector:
+    // the transform below extends the output and index strides through separate
+    // in-out parameters, so sharing one vector would extend it twice.
+    std::vector<ck::index_t> indices_tensor_stride = out_tensor_stride;
+
+    TransformPool2dparamToPool3d(in_length,
+                                 window_spatial_lengths,
+                                 out_length,
+                                 in_tensor_stride,
+                                 out_tensor_stride,
+                                 indices_tensor_stride,
+                                 window_strides,
+                                 window_dilations,
+                                 input_left_pads,
+                                 input_right_pads,
+                                 pooling_dims);
+
+    SimpleDeviceMem in_device_buf(sizeof(InDataType) * in_tensor_size);
+    SimpleDeviceMem out_device_buf(sizeof(OutDataType) * out_tensor_size);
+    SimpleDeviceMem out_indices_device_buf(sizeof(IndexDataType) * out_tensor_size);
+
+    using DeviceOp = ck::tensor_operation::device::DevicePoolFwd<InOutRank,
+                                                                 WindowRank,
+                                                                 InDataType,
+                                                                 OutDataType,
+                                                                 IndexDataType,
+                                                                 InLayout,
+                                                                 OutLayout,
+                                                                 ReduceOpId,
+                                                                 OutputIndex>;
+
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+    // Builds the argument for one instance; shared by the profiling loop and the final
+    // run so both always describe the same problem.
+    const auto make_argument = [&](const auto& op) {
+        return op->MakeArgumentPointer(
+            static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
+            static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
+            static_cast<IndexDataType*>(out_indices_device_buf.GetDeviceBuffer()),
+            in_length,
+            window_spatial_lengths,
+            out_length,
+            in_tensor_stride,
+            out_tensor_stride,
+            indices_tensor_stride,
+            window_strides,
+            window_dilations,
+            input_left_pads,
+            input_right_pads,
+            pooling_dims);
+    };
+
+    // Bytes counted per run: input, output and, when produced, the indices.
+    std::size_t num_bytes =
+        in_tensor_size * sizeof(InDataType) + out_tensor_size * sizeof(OutDataType);
+
+    if constexpr(OutputIndex)
+        num_bytes += out_tensor_size * sizeof(IndexDataType);
+
+    std::string best_op_name;
+    bool found            = false;
+    int best_op_id        = -1;
+    float best_ave_time   = std::numeric_limits<float>::max();
+    float best_gb_per_sec = 0;
+
+    // profile device operation instances
+    std::cout << "Run all instances and do timing" << std::endl;
+
+    const int num_ops = static_cast<int>(op_ptrs.size());
+    for(int i = 0; i < num_ops; ++i)
+    {
+        auto& op_ptr      = op_ptrs[i];
+        auto argument_ptr = make_argument(op_ptr);
+        auto invoker_ptr  = op_ptr->MakeInvokerPointer();
+
+        const std::string op_name = op_ptr->GetTypeString();
+
+        if(!op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            std::cout << op_name << " does not support this problem" << std::endl;
+            continue;
+        }
+
+        const float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+
+        const float gb_per_sec = num_bytes / 1.E6 / ave_time;
+
+        std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
+                  << op_name << std::endl;
+
+        if(ave_time < best_ave_time)
+        {
+            found           = true;
+            best_op_id      = i;
+            best_op_name    = op_name;
+            best_ave_time   = ave_time;
+            best_gb_per_sec = gb_per_sec;
+        }
+    }
+
+    // run the best instance
+    if(found)
+    {
+        std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
+                  << best_op_name << std::endl;
+
+        auto& op_ptr = op_ptrs[best_op_id];
+        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
+                  << std::endl;
+
+        auto argument_ptr = make_argument(op_ptr);
+        auto invoker_ptr  = op_ptr->MakeInvokerPointer();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+        }
+
+        std::cout << "Done" << std::endl;
+    }
+
+    return 0;
+}
--- a/client_example/20_image_to_column/CMakeLists.txt
+++ b/client_example/20_image_to_column/CMakeLists.txt
+# Build the image_to_column client example and link it against the
+# composable_kernel::device_operations target.
+add_executable(client_image_to_column image_to_column.cpp)
+target_link_libraries(client_image_to_column PRIVATE composable_kernel::device_operations)
--- a/client_example/20_image_to_column/image_to_column.cpp
+++ b/client_example/20_image_to_column/image_to_column.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+#include <iomanip>
+#include <iostream>
+#include <iterator>
+#include <numeric>
+#include <vector>
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/gpu/image_to_column.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+
+// Element types of the input image and of the output column matrix.
+using InDataType  = ck::half_t;
+using OutDataType = ck::half_t;
+
+using InLayout = ck::tensor_layout::convolution::GNHWC;
+
+// Fixed problem size used by main() below.
+static constexpr ck::index_t NumDimSpatial = 2;
+static constexpr ck::index_t G             = 1;
+static constexpr ck::index_t N             = 32; // batch size
+static constexpr ck::index_t C             = 32; // input channel (per group)
+static constexpr ck::index_t Y             = 3;  // filter H
+static constexpr ck::index_t X             = 3;  // filter W
+static constexpr ck::index_t Hi            = 28; // input H
+static constexpr ck::index_t Wi            = 28; // input W
+static constexpr ck::index_t Ho            = 28; // output H
+static constexpr ck::index_t Wo            = 28; // output W
+
+// Minimal owning wrapper around a buffer obtained from hipMalloc.
+// The buffer is handed back to hipFree when the wrapper is destroyed.
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+
+    // Requests mem_size bytes from hipMalloc. p_mem_ is null-initialized first and the
+    // hipMalloc status is deliberately ignored.
+    explicit SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+    {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+    }
+
+    // The destructor frees p_mem_, so a copy would free the same pointer twice.
+    SimpleDeviceMem(const SimpleDeviceMem&) = delete;
+    SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;
+
+    // Returns the stored pointer; ownership stays with this object.
+    void* GetDeviceBuffer() { return p_mem_; }
+
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+    void* p_mem_;
+};
+
+// Times every available image-to-column instance on the fixed problem defined by the
+// constants above and re-runs the fastest one without timing.
+// Returns EXIT_FAILURE when no instance supports the problem.
+int main()
+{
+    std::array<ck::index_t, NumDimSpatial> in_spatial_lengths{Hi, Wi};
+    std::array<ck::index_t, NumDimSpatial> wei_spatial_lengths{Y, X};
+    std::array<ck::index_t, NumDimSpatial> out_spatial_lengths{Ho, Wo};
+
+    // We have NHWGC in memory space (G is dummy)
+    // However, CK's API only accepts lengths and strides in GNCHW order
+    // Hence, we need to adjust the order of the strides
+    std::array<ck::index_t, 5> in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
+    std::array<ck::index_t, 2> out_strides{Y * X * C, 1};
+
+    std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1};
+    std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1};
+    std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1};
+    std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1};
+
+    // Buffer sizes in bytes; sizeof comes first so the products are formed in std::size_t.
+    const std::size_t in_bytes  = sizeof(InDataType) * N * Hi * Wi * G * C;
+    const std::size_t out_bytes = sizeof(OutDataType) * N * Ho * Wo * Y * X * C;
+
+    SimpleDeviceMem in(in_bytes);
+    SimpleDeviceMem out(out_bytes);
+
+    using DeviceOp = ck::tensor_operation::device::
+        DeviceImageToColumn<NumDimSpatial, InLayout, InDataType, OutDataType>;
+
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+    // Builds the argument for one instance; shared by the profiling loop and the final
+    // run so both always describe the same problem.
+    const auto make_argument = [&](const auto& op) {
+        return op->MakeArgumentPointer(in.GetDeviceBuffer(),
+                                       out.GetDeviceBuffer(),
+                                       N,
+                                       C,
+                                       in_spatial_lengths,
+                                       out_spatial_lengths,
+                                       wei_spatial_lengths,
+                                       in_strides,
+                                       out_strides,
+                                       filter_strides,
+                                       filter_dilations,
+                                       input_left_pads,
+                                       input_right_pads);
+    };
+
+    // Bytes counted per run: the whole input plus the whole output.
+    const std::size_t num_bytes = in_bytes + out_bytes;
+
+    std::string best_op_name;
+    int best_op_id        = -1;
+    float best_avg_time   = std::numeric_limits<float>::max();
+    float best_gb_per_sec = 0;
+
+    // profile device operation instances
+    std::cout << "Run all instances and do timing" << std::endl;
+
+    const int num_ops = static_cast<int>(op_ptrs.size());
+    for(int i = 0; i < num_ops; ++i)
+    {
+        auto& op_ptr              = op_ptrs[i];
+        auto argument_ptr         = make_argument(op_ptr);
+        auto invoker_ptr          = op_ptr->MakeInvokerPointer();
+        const std::string op_name = op_ptr->GetTypeString();
+
+        if(!op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            std::cerr << op_name << " does not support this problem" << std::endl;
+            continue;
+        }
+
+        const float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+
+        const float gb_per_sec = num_bytes / 1.E6 / avg_time;
+
+        std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, "
+                  << op_name << std::endl;
+
+        if(avg_time < best_avg_time)
+        {
+            best_op_id      = i;
+            best_op_name    = op_name;
+            best_avg_time   = avg_time;
+            best_gb_per_sec = gb_per_sec;
+        }
+    }
+
+    if(best_op_id < 0)
+    {
+        std::cerr << "no suitable instance" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_gb_per_sec
+              << " GB/s, " << best_op_name << std::endl;
+
+    // run the best instance
+    {
+        auto& op_ptr = op_ptrs[best_op_id];
+        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
+                  << std::endl;
+
+        auto argument_ptr = make_argument(op_ptr);
+        auto invoker_ptr  = op_ptr->MakeInvokerPointer();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+        }
+
+        std::cout << "Done" << std::endl;
+    }
+
+    return EXIT_SUCCESS;
+}
--- a/client_example/20_splitk_gemm/CMakeLists.txt
+++ b/client_example/20_splitk_gemm/CMakeLists.txt
+# Build the split-K GEMM client example and link it against the
+# composable_kernel::device_operations target.
+add_executable(client_splitK_gemm splitK_gemm_fp16_f8.cpp)
+target_link_libraries(client_splitK_gemm PRIVATE composable_kernel::device_operations)
--- a/client_example/20_splitk_gemm/splitK_gemm_fp16_f8.cpp
+++ b/client_example/20_splitk_gemm/splitK_gemm_fp16_f8.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iomanip>
+#include <vector>
+#include <iostream>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_splitk.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/gemm_splitk.hpp"
+
+// Short aliases for the element types used below.
+using F8  = ck::f8_t;
+using F16 = ck::half_t;
+using F32 = float;
+
+// Short aliases for the two GEMM matrix layouts.
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+// All three element-wise operations (A, B and C) are PassThrough.
+using AElementOp = PassThrough;
+using BElementOp = PassThrough;
+using CElementOp = PassThrough;
+
+// Mixed-precision problem: A is f8, B and C are fp16.
+using ADataType = F8;
+using BDataType = F16;
+using CDataType = F16;
+
+// A and C are row-major, B is column-major.
+using ALayout = Row;
+using BLayout = Col;
+using CLayout = Row;
+
+/// Minimal owning wrapper around one hipMalloc allocation.
+///
+/// The buffer is allocated in the constructor and released with hipFree in the
+/// destructor. The type is move-only: the implicitly generated copy operations
+/// would leave two objects calling hipFree on the same pointer.
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+
+    /// Requests mem_size bytes from hipMalloc. The returned status is ignored
+    /// (best effort); p_mem_ is value-initialised to nullptr before the call.
+    explicit SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+    {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+    }
+
+    SimpleDeviceMem(const SimpleDeviceMem&)            = delete;
+    SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;
+
+    /// Takes over the allocation held by other and leaves other holding nullptr.
+    SimpleDeviceMem(SimpleDeviceMem&& other) noexcept : p_mem_{other.p_mem_}
+    {
+        other.p_mem_ = nullptr;
+    }
+
+    /// Releases the current allocation, then takes over the one held by other.
+    SimpleDeviceMem& operator=(SimpleDeviceMem&& other) noexcept
+    {
+        if(this != &other)
+        {
+            (void)hipFree(p_mem_);
+            p_mem_       = other.p_mem_;
+            other.p_mem_ = nullptr;
+        }
+        return *this;
+    }
+
+    /// Returns the raw pointer written by hipMalloc.
+    void* GetDeviceBuffer() { return p_mem_; }
+
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+    void* p_mem_;
+};
+
+/// Client example for the split-K GEMM device operations (A: F8, B and C: F16).
+///
+/// Without arguments the default problem is used; with exactly seven arguments
+/// M, N, K, StrideA, StrideB, StrideC and KBatch are read from the command line.
+/// Every instance returned by the instance factory that supports the problem is
+/// timed, and the fastest one is finally run once more without timing.
+///
+/// @return 0 in all cases, including after printing the usage line.
+int main(int argc, char* argv[])
+{
+    // GEMM shape
+    ck::index_t M = 3840;
+    ck::index_t N = 4096;
+    ck::index_t K = 4096;
+
+    ck::index_t StrideA = 4096;
+    ck::index_t StrideB = 4096;
+    ck::index_t StrideC = 4096;
+
+    ck::index_t KBatch = 1;
+
+    if(argc == 8)
+    {
+        M = std::stoi(argv[1]);
+        N = std::stoi(argv[2]);
+        K = std::stoi(argv[3]);
+
+        StrideA = std::stoi(argv[4]);
+        StrideB = std::stoi(argv[5]);
+        StrideC = std::stoi(argv[6]);
+
+        KBatch = std::stoi(argv[7]);
+    }
+    else if(argc != 1)
+    {
+        // argc == 1 keeps the defaults above; any other count is a usage error.
+        // <iostream> is already included, so the usage line goes through std::cout
+        // instead of printf/exit, whose headers this file does not include.
+        std::cout << "arg1 to 7: M, N, K, StrideA, StrideB, StrideC, KBatch\n";
+        return 0;
+    }
+
+    // Number of elements spanned by an nRow x nCol matrix with the given leading
+    // stride: the last row (or column, for column-major) only needs its payload.
+    auto f_matrix_space_size =
+        [](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
+            using Layout = decltype(layout);
+
+            if constexpr(std::is_same<Layout, ck::tensor_layout::gemm::RowMajor>::value)
+            {
+                return (nRow - 1) * stride + nCol;
+            }
+            else
+            {
+                return (nCol - 1) * stride + nRow;
+            }
+        };
+
+    SimpleDeviceMem a_device_buf(sizeof(ADataType) * f_matrix_space_size(M, K, StrideA, ALayout{}));
+    SimpleDeviceMem b_device_buf(sizeof(BDataType) * f_matrix_space_size(K, N, StrideB, BLayout{}));
+    SimpleDeviceMem c_device_buf(sizeof(CDataType) * f_matrix_space_size(M, N, StrideC, CLayout{}));
+
+    using DeviceOp = ck::tensor_operation::device::DeviceGemmSplitK<
+        ALayout,
+        BLayout,
+        CLayout,
+        ADataType,
+        BDataType,
+        CDataType,
+        ck::tensor_operation::element_wise::PassThrough,
+        ck::tensor_operation::element_wise::PassThrough,
+        ck::tensor_operation::element_wise::PassThrough>;
+
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+    const auto a_element_op = AElementOp{};
+    const auto b_element_op = BElementOp{};
+    const auto c_element_op = CElementOp{};
+
+    std::string best_op_name;
+    bool found             = false; // set once any instance has been timed
+    std::size_t best_op_id = 0;     // only meaningful while found is true
+    float best_ave_time    = 0;
+    float best_tflops      = 0;
+    float best_gb_per_sec  = 0;
+
+    // profile device operation instances
+    std::cout << "Run all instances and do timing" << std::endl;
+
+    for(std::size_t i = 0; i < op_ptrs.size(); ++i)
+    {
+        auto& op_ptr = op_ptrs[i];
+
+        auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
+                                                        b_device_buf.GetDeviceBuffer(),
+                                                        c_device_buf.GetDeviceBuffer(),
+                                                        M,
+                                                        N,
+                                                        K,
+                                                        StrideA,
+                                                        StrideB,
+                                                        StrideC,
+                                                        a_element_op,
+                                                        b_element_op,
+                                                        c_element_op,
+                                                        KBatch);
+
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+        std::string op_name = op_ptr->GetTypeString();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            // The value returned by Run is reported below in milliseconds.
+            float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+
+            std::size_t flop = std::size_t(2) * M * N * K;
+
+            std::size_t num_btype =
+                sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
+
+            // flop / 1e9 / ms == TFLOP/s, bytes / 1e6 / ms == GB/s
+            float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+            float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
+                      << gb_per_sec << " GB/s, " << op_name << std::endl;
+
+            if(tflops > best_tflops)
+            {
+                found           = true;
+                best_op_id      = i;
+                best_op_name    = op_name;
+                best_tflops     = tflops;
+                best_ave_time   = ave_time;
+                best_gb_per_sec = gb_per_sec;
+            }
+        }
+        else
+        {
+            std::cout << op_name << " does not support this problem" << std::endl;
+        }
+    }
+
+    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
+              << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
+
+    // run the best instance; skipped when no instance was selected above, because
+    // indexing op_ptrs without a valid winner would read out of bounds
+    if(found)
+    {
+        auto& op_ptr = op_ptrs[best_op_id];
+
+        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
+                  << std::endl;
+
+        auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
+                                                        b_device_buf.GetDeviceBuffer(),
+                                                        c_device_buf.GetDeviceBuffer(),
+                                                        M,
+                                                        N,
+                                                        K,
+                                                        StrideA,
+                                                        StrideB,
+                                                        StrideC,
+                                                        a_element_op,
+                                                        b_element_op,
+                                                        c_element_op,
+                                                        KBatch);
+
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+        }
+
+        std::cout << "Done" << std::endl;
+    }
+
+    return 0;
+}
--- a/client_example/21_grouped_gemm_bias/CMakeLists.txt
+++ b/client_example/21_grouped_gemm_bias/CMakeLists.txt
+add_executable(client_grouped_gemm_fixed_nk_bias_fp16 grouped_gemm_fixed_nk_bias_fp16.cpp)  # client example executable built from the single grouped-GEMM-with-bias source file
+target_link_libraries(client_grouped_gemm_fixed_nk_bias_fp16 PRIVATE composable_kernel::device_operations)  # links the imported composable_kernel::device_operations target
--- a/client_example/21_grouped_gemm_bias/grouped_gemm_fixed_nk_bias_fp16.cpp
+++ b/client_example/21_grouped_gemm_bias/grouped_gemm_fixed_nk_bias_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iomanip>
+#include <iostream>
+#include <vector>
+#include <random>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_grouped_gemm_fixed_nk.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_gemm_bias.hpp"
+
+using F16 = ck::half_t; // short names for the element types used by this example
+using F32 = float;
+// Matrix layout tags taken from ck::tensor_layout::gemm.
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor; // not referenced anywhere else in this file
+// Element-wise operator types used for the A/B operands and for the output stage.
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using Add         = ck::tensor_operation::element_wise::Add;
+// Element types: A and B are F16; the single D tensor (D0) and the output E are F32.
+using ADataType  = F16;
+using BDataType  = F16;
+using D0DataType = F32;
+using DsDataType = ck::Tuple<D0DataType>;
+using EDataType  = F32;
+// Layouts: every tensor in this example uses Row.
+using ALayout  = Row;
+using BLayout  = Row;
+using D0Layout = Row;
+using DsLayout = ck::Tuple<D0Layout>;
+using ELayout  = Row;
+// Element-wise operators handed to DeviceGroupedGemmFixedNK and instantiated in main().
+using AElementOp   = PassThrough;
+using BElementOp   = PassThrough;
+using CDEElementOp = Add;
+
+/// Minimal owning wrapper around one hipMalloc allocation.
+///
+/// The buffer is allocated in the constructor and released with hipFree in the
+/// destructor. The type is move-only: it is stored in std::vector in this file,
+/// and the implicitly generated copy operations would leave two objects calling
+/// hipFree on the same pointer. The noexcept moves keep it usable as a vector
+/// element.
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+
+    /// Requests mem_size bytes from hipMalloc. The returned status is ignored
+    /// (best effort); p_mem_ is value-initialised to nullptr before the call.
+    explicit SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+    {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+    }
+
+    SimpleDeviceMem(const SimpleDeviceMem&)            = delete;
+    SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;
+
+    /// Takes over the allocation held by other and leaves other holding nullptr.
+    SimpleDeviceMem(SimpleDeviceMem&& other) noexcept : p_mem_{other.p_mem_}
+    {
+        other.p_mem_ = nullptr;
+    }
+
+    /// Releases the current allocation, then takes over the one held by other.
+    SimpleDeviceMem& operator=(SimpleDeviceMem&& other) noexcept
+    {
+        if(this != &other)
+        {
+            (void)hipFree(p_mem_);
+            p_mem_       = other.p_mem_;
+            other.p_mem_ = nullptr;
+        }
+        return *this;
+    }
+
+    /// Returns the raw pointer written by hipMalloc.
+    void* GetDeviceBuffer() { return p_mem_; }
+
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+    void* p_mem_;
+};
+
+/// Client example: grouped GEMM with fixed N/K plus an Add output stage
+/// (A and B: F16, D0 and E: F32).
+///
+/// Builds 16 groups that differ only in M, allocates the per-group device buffers,
+/// then times every DeviceGroupedGemmFixedNK instance returned by the instance
+/// factory (with KBatch set to 2) and prints the best result.
+///
+/// @return always 0.
+int main()
+{
+    std::vector<int> Ms, Ns, Ks, StrideAs, StrideBs, StrideEs;
+
+    int sum_of_m = 0;
+
+    Ms = {167, 183, 177, 181, 153, 139, 156, 173, 163, 150, 204, 184, 168, 156, 168, 148};
+
+    const int group_count = static_cast<int>(Ms.size());
+
+    for(int i = 0; i < group_count; ++i)
+    {
+        Ns.push_back(768);
+        Ks.push_back(4608);
+
+        // Packed strides: the leading dimension equals the contiguous extent.
+        StrideAs.push_back(std::is_same<Row, ALayout>::value ? Ks[i] : Ms[i]);
+        StrideBs.push_back(std::is_same<Row, BLayout>::value ? Ns[i] : Ks[i]);
+        StrideEs.push_back(std::is_same<Row, ELayout>::value ? Ns[i] : Ms[i]);
+
+        sum_of_m += Ms[i];
+    }
+
+    // Number of elements spanned by an nRow x nCol matrix with the given leading
+    // stride: the last row (or column, for column-major) only needs its payload.
+    auto f_matrix_space_size =
+        [](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
+            using Layout = decltype(layout);
+
+            if constexpr(std::is_same<Layout, ck::tensor_layout::gemm::RowMajor>::value)
+            {
+                return (nRow - 1) * stride + nCol;
+            }
+            else
+            {
+                return (nCol - 1) * stride + nRow;
+            }
+        };
+
+    std::vector<SimpleDeviceMem> a_dev_bufs, b_dev_bufs, d0_dev_bufs, e_dev_bufs;
+
+    // Reserve up front so the emplace_back calls below never reallocate.
+    a_dev_bufs.reserve(group_count);
+    b_dev_bufs.reserve(group_count);
+    d0_dev_bufs.reserve(group_count);
+    e_dev_bufs.reserve(group_count);
+
+    std::vector<void*> p_e;
+
+    p_e.reserve(group_count);
+
+    std::vector<ck::tensor_operation::device::GemmDesc> gemm_descs;
+
+    gemm_descs.reserve(group_count);
+
+    std::vector<ck::tensor_operation::device::GroupedGemmKernelArgument<1>>
+        grouped_gemm_kernel_args_;
+    grouped_gemm_kernel_args_.reserve(group_count);
+
+    for(int i = 0; i < group_count; ++i)
+    {
+        a_dev_bufs.emplace_back(sizeof(ADataType) *
+                                f_matrix_space_size(Ms[i], Ks[i], StrideAs[i], ALayout{}));
+        b_dev_bufs.emplace_back(sizeof(BDataType) *
+                                f_matrix_space_size(Ks[i], Ns[i], StrideBs[i], BLayout{}));
+        // Stride 0 makes the helper return Ns[i] elements, i.e. one row of D0.
+        d0_dev_bufs.emplace_back(sizeof(D0DataType) *
+                                 f_matrix_space_size(Ms[i], Ns[i], 0, D0Layout{}));
+        e_dev_bufs.emplace_back(sizeof(EDataType) *
+                                f_matrix_space_size(Ms[i], Ns[i], StrideEs[i], ELayout{}));
+
+        // NOTE(review): every descriptor receives sum_of_m together with the literal
+        // 1 / {0} entries; presumably the real per-group M and strides come from the
+        // kernel arguments below - confirm against the GemmDesc definition.
+        gemm_descs.push_back({sum_of_m, Ns[i], Ks[i], 1, StrideBs[i], 1, {0}});
+
+        p_e.push_back(e_dev_bufs[i].GetDeviceBuffer());
+
+        grouped_gemm_kernel_args_.push_back(
+            {a_dev_bufs[i].GetDeviceBuffer(),
+             b_dev_bufs[i].GetDeviceBuffer(),
+             std::array<const void*, 1>{d0_dev_bufs[i].GetDeviceBuffer()},
+             e_dev_bufs[i].GetDeviceBuffer(),
+             Ms[i],
+             Ns[i],
+             Ks[i],
+             StrideAs[i],
+             StrideBs[i],
+             std::array<ck::index_t, 1>{0},
+             StrideEs[i]});
+    }
+
+    using DeviceOp = ck::tensor_operation::device::DeviceGroupedGemmFixedNK<ALayout,
+                                                                            BLayout,
+                                                                            DsLayout,
+                                                                            ELayout,
+                                                                            ADataType,
+                                                                            BDataType,
+                                                                            DsDataType,
+                                                                            EDataType,
+                                                                            AElementOp,
+                                                                            BElementOp,
+                                                                            CDEElementOp>;
+
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+    const auto a_element_op   = AElementOp{};
+    const auto b_element_op   = BElementOp{};
+    const auto cde_element_op = CDEElementOp{};
+
+    std::string best_op_name;
+    float best_ave_time   = 0;
+    float best_tflops     = 0;
+    float best_gb_per_sec = 0;
+
+    // profile device operation instances
+    std::cout << "Run all instances and do timing" << std::endl;
+
+    // Passed empty: the A/B/D0/E pointers of each group are part of
+    // grouped_gemm_kernel_args_, which is copied to the device and handed over
+    // through SetDeviceKernelArgs below.
+    std::vector<const void*> p_a = {}, p_b = {};
+    std::vector<std::array<const void*, 1>> p_ds = {};
+
+    for(std::size_t i = 0; i < op_ptrs.size(); ++i)
+    {
+        auto& op_ptr = op_ptrs[i];
+
+        auto argument_ptr = op_ptr->MakeArgumentPointer(
+            p_a, p_b, p_ds, p_e, gemm_descs, a_element_op, b_element_op, cde_element_op);
+
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+        // Queried once; the same size is used for the allocation and for the copy.
+        const auto kernel_arg_size = op_ptr->GetDeviceKernelArgSize(argument_ptr.get());
+
+        SimpleDeviceMem grouped_gemm_kernel_args_dev(kernel_arg_size);
+
+        SimpleDeviceMem grouped_gemm_workspace_dev(op_ptr->GetWorkSpaceSize(argument_ptr.get()));
+
+        std::string op_name = op_ptr->GetTypeString();
+
+        // NOTE(review): the error string is computed and then discarded, so a failed
+        // copy goes unnoticed; the copy also assumes kernel_arg_size does not exceed
+        // the byte size of grouped_gemm_kernel_args_ - confirm.
+        hipGetErrorString(hipMemcpy(grouped_gemm_kernel_args_dev.GetDeviceBuffer(),
+                                    grouped_gemm_kernel_args_.data(),
+                                    kernel_arg_size,
+                                    hipMemcpyHostToDevice));
+
+        op_ptr->SetWorkSpacePointer(argument_ptr.get(),
+                                    grouped_gemm_workspace_dev.GetDeviceBuffer());
+
+        op_ptr->SetDeviceKernelArgs(argument_ptr.get(),
+                                    grouped_gemm_kernel_args_dev.GetDeviceBuffer());
+
+        op_ptr->SetKBatch(argument_ptr.get(), 2);
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            // The value returned by Run is reported below in milliseconds.
+            float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+
+            std::size_t flop = 0, num_btype = 0;
+            for(std::size_t j = 0; j < gemm_descs.size(); ++j)
+            {
+                flop += std::size_t(2) * Ms[j] * Ns[j] * Ks[j];
+
+                num_btype += sizeof(ADataType) * Ms[j] * Ks[j] + sizeof(BDataType) * Ks[j] * Ns[j] +
+                             sizeof(EDataType) * Ms[j] * Ns[j];
+            }
+
+            // flop / 1e9 / ms == TFLOP/s, bytes / 1e6 / ms == GB/s
+            float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
+            float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
+                      << gb_per_sec << " GB/s, " << op_name << std::endl;
+
+            if(tflops > best_tflops)
+            {
+                best_op_name    = op_name;
+                best_tflops     = tflops;
+                best_ave_time   = ave_time;
+                best_gb_per_sec = gb_per_sec;
+            }
+        }
+        else
+        {
+            std::cout << op_name << " does not support this problem" << std::endl;
+        }
+    }
+
+    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
+              << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
+
+    return 0;
+}
--- a/client_example/CMakeLists.txt
+++ b/client_example/CMakeLists.txt
+cmake_minimum_required(VERSION 3.15)
+project(ck_app)
+add_compile_options(-std=c++17)
+
+# Data-type selection.
+# With DTYPES set, only the listed types get their CK_ENABLE_<TYPE> compile
+# definition; otherwise all of them are enabled. The matching CMake variable is
+# set to ON only when the user has not defined it already.
+# if(DEFINED ...) takes a variable *name*; passing ${VAR} would test whatever the
+# variable expands to and so overwrite a value supplied by the user.
+if (DTYPES)
+    add_definitions(-DDTYPES)
+    foreach(ck_dtype int8 fp8 fp16 fp32 fp64 bf16)
+        if (DTYPES MATCHES "${ck_dtype}")
+            string(TOUPPER "${ck_dtype}" ck_dtype_upper)
+            add_definitions(-DCK_ENABLE_${ck_dtype_upper})
+            if(NOT DEFINED CK_ENABLE_${ck_dtype_upper})
+                set(CK_ENABLE_${ck_dtype_upper} "ON")
+            endif()
+        endif()
+    endforeach()
+    message("DTYPES macro set to ${DTYPES}")
+else()
+    add_definitions(-DCK_ENABLE_INT8 -DCK_ENABLE_FP8 -DCK_ENABLE_FP16 -DCK_ENABLE_FP32 -DCK_ENABLE_FP64 -DCK_ENABLE_BF16)
+    if(NOT DEFINED CK_ENABLE_ALL_DTYPES)
+        set(CK_ENABLE_ALL_DTYPES "ON")
+    endif()
+endif()
+
+# Every example links composable_kernel::device_operations, so stop right here
+# with a clear message when the installed CK package cannot be found.
+find_package(composable_kernel REQUIRED COMPONENTS device_operations)
+find_package(hip REQUIRED PATHS /opt/rocm)
+message(STATUS "Build with HIP ${hip_VERSION}")
+
+# add all example subdir
+# The glob is made relative so that the "build" filter only looks at the entry
+# name; with absolute paths a checkout located below any directory whose path
+# contains "build" would silently add no example at all.
+file(GLOB dir_list LIST_DIRECTORIES true RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" *)
+FOREACH(subdir ${dir_list})
+    IF(IS_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/${subdir}" AND (NOT "${subdir}" MATCHES "build"))
+        add_subdirectory(${subdir})
+    ENDIF()
+ENDFOREACH()
--- a/client_example/README.md
+++ b/client_example/README.md
+## Prerequisites
+Client applications link against the CK library, so the CK library must be installed before building them.
+
+
+## Build
+```bash
+mkdir -p client_example/build
+cd client_example/build
+```
+
+```bash
+cmake                                                                 \
+-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc                             \
+-D CMAKE_PREFIX_PATH="/opt/rocm;${PATH_TO_CK_INSTALL_DIRECTORY}"      \
+..
+```
+
+### Build client example
+```bash
+ make -j 
+```
--- a/cmake/Analyzers.cmake
+++ b/cmake/Analyzers.cmake
+################################################################################
+# 
+# MIT License
+# 
+# Copyright (c) 2017 Advanced Micro Devices, Inc.
+# 
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+# 
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+# 
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+# 
+################################################################################
+
+if(NOT TARGET analyze)  # create the umbrella `analyze` target only once; this file is included by both ClangTidy.cmake and CppCheck.cmake
+    add_custom_target(analyze)
+endif()
+# mark_as_analyzer(<target>...): makes `analyze` depend on every target passed in ARGN.
+function(mark_as_analyzer)
+    add_dependencies(analyze ${ARGN})
+endfunction()
+
--- a/cmake/ClangTidy.cmake
+++ b/cmake/ClangTidy.cmake
+################################################################################
+#
+# MIT License
+#
+# Copyright (c) 2017 Advanced Micro Devices, Inc.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+################################################################################
+include(CMakeParseArguments)
+include(Analyzers)
+# Directory part of the C++ compiler path, used below as a search hint for clang-tidy.
+get_filename_component(CLANG_TIDY_EXE_HINT "${CMAKE_CXX_COMPILER}" PATH)
+# Locate clang-tidy; candidates are the unversioned name plus versioned names from 5.0 down to 3.5.
+find_program(CLANG_TIDY_EXE
+    NAMES
+        clang-tidy
+        clang-tidy-5.0
+        clang-tidy-4.0
+        clang-tidy-3.9
+        clang-tidy-3.8
+        clang-tidy-3.7
+        clang-tidy-3.6
+        clang-tidy-3.5
+    HINTS
+        ${CLANG_TIDY_EXE_HINT}
+    PATH_SUFFIXES
+        compiler/bin
+    PATHS
+        /opt/rocm/llvm/bin
+        /opt/rocm/hcc
+        /usr/local/opt/llvm/bin
+)
+
+# find_clang_tidy_version(<var>)
+# Runs `${CLANG_TIDY_EXE} -version`, splits the output into tokens and stores the
+# token that follows the word "version" in <var> in the caller's scope.
+# Stores "0.0" when the word is absent or is the last token of the output.
+function(find_clang_tidy_version VAR)
+    execute_process(COMMAND ${CLANG_TIDY_EXE} -version OUTPUT_VARIABLE VERSION_OUTPUT)
+    separate_arguments(VERSION_OUTPUT_LIST UNIX_COMMAND "${VERSION_OUTPUT}")
+    list(FIND VERSION_OUTPUT_LIST "version" VERSION_INDEX)
+    list(LENGTH VERSION_OUTPUT_LIST VERSION_OUTPUT_LENGTH)
+    math(EXPR VERSION_VALUE_INDEX "${VERSION_INDEX} + 1")
+    # list(FIND) reports -1 for "not found", so index 0 is a valid hit; the second
+    # condition keeps list(GET) from indexing past the end of the list.
+    if(VERSION_INDEX GREATER -1 AND VERSION_VALUE_INDEX LESS VERSION_OUTPUT_LENGTH)
+        list(GET VERSION_OUTPUT_LIST ${VERSION_VALUE_INDEX} VERSION)
+        set(${VAR} ${VERSION} PARENT_SCOPE)
+    else()
+        set(${VAR} "0.0" PARENT_SCOPE)
+    endif()
+endfunction()
+
+if( NOT CLANG_TIDY_EXE )  # the find_program lookup found no clang-tidy
+    message( STATUS "Clang tidy not found" )
+    set(CLANG_TIDY_VERSION "0.0")  # placeholder version so the VERSION_LESS checks in enable_clang_tidy still have a value
+else()
+    find_clang_tidy_version(CLANG_TIDY_VERSION)
+    message( STATUS "Clang tidy found: ${CLANG_TIDY_VERSION}")
+endif()
+# clang-tidy is invoked with `-p ${CMAKE_BINARY_DIR}`, so ask CMake to export the compile commands.
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+# Directory that receives the -export-fixes YAML files; created at configure time and listed in ADDITIONAL_MAKE_CLEAN_FILES.
+set(CLANG_TIDY_FIXIT_DIR ${CMAKE_BINARY_DIR}/fixits)
+file(MAKE_DIRECTORY ${CLANG_TIDY_FIXIT_DIR})
+set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${CLANG_TIDY_FIXIT_DIR})
+
+# enable_clang_tidy([ALL] [ANALYZE_TEMPORARY_DTORS] [HEADER_FILTER <regex>]
+#                   [CHECKS <check>...] [ERRORS <check>...] [EXTRA_ARGS <arg>...])
+#
+# Assembles CLANG_TIDY_COMMAND from the given options and creates the `tidy`
+# target (registered with mark_as_analyzer) plus the helper targets that recreate
+# the fix-it directory. Being a macro, every variable set here stays visible in
+# the caller's scope.
+macro(enable_clang_tidy)
+    set(options ANALYZE_TEMPORARY_DTORS ALL)
+    set(oneValueArgs HEADER_FILTER)
+    set(multiValueArgs CHECKS ERRORS EXTRA_ARGS)
+    cmake_parse_arguments(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+    # clang-tidy expects comma-separated check lists.
+    string(REPLACE ";" "," CLANG_TIDY_CHECKS "${PARSE_CHECKS}")
+    string(REPLACE ";" "," CLANG_TIDY_ERRORS "${PARSE_ERRORS}")
+
+    # Each EXTRA_ARGS entry becomes its own -extra-arg=<value> option.
+    set(CLANG_TIDY_EXTRA_ARGS)
+    foreach(ARG ${PARSE_EXTRA_ARGS})
+        list(APPEND CLANG_TIDY_EXTRA_ARGS "-extra-arg=${ARG}")
+    endforeach()
+
+    # ALL is forwarded to add_custom_target(tidy ...) further down.
+    set(CLANG_TIDY_ALL)
+    if(PARSE_ALL)
+        set(CLANG_TIDY_ALL ALL)
+    endif()
+
+    message(STATUS "Clang tidy checks: ${CLANG_TIDY_CHECKS}")
+
+    if (${PARSE_ANALYZE_TEMPORARY_DTORS})
+        set(CLANG_TIDY_ANALYZE_TEMPORARY_DTORS "-analyze-temporary-dtors")
+    endif()
+
+    # -warnings-as-errors and -quiet are only passed for clang-tidy 3.9.0 or newer.
+    if (${CLANG_TIDY_VERSION} VERSION_LESS "3.9.0")
+        set(CLANG_TIDY_ERRORS_ARG "")
+        set(CLANG_TIDY_QUIET_ARG "")
+    else()
+        set(CLANG_TIDY_ERRORS_ARG "-warnings-as-errors='${CLANG_TIDY_ERRORS}'")
+        set(CLANG_TIDY_QUIET_ARG "-quiet")
+    endif()
+
+    # Without HEADER_FILTER every header matches; otherwise each `$` is doubled.
+    if(NOT PARSE_HEADER_FILTER)
+        set(CLANG_TIDY_HEADER_FILTER ".*")
+    else()
+        string(REPLACE "$" "$$" CLANG_TIDY_HEADER_FILTER "${PARSE_HEADER_FILTER}")
+    endif()
+
+    set(CLANG_TIDY_COMMAND
+        ${CLANG_TIDY_EXE}
+        ${CLANG_TIDY_QUIET_ARG}
+        -p ${CMAKE_BINARY_DIR}
+        -checks='${CLANG_TIDY_CHECKS}'
+        ${CLANG_TIDY_ERRORS_ARG}
+        ${CLANG_TIDY_EXTRA_ARGS}
+        ${CLANG_TIDY_ANALYZE_TEMPORARY_DTORS}
+        -header-filter='${CLANG_TIDY_HEADER_FILTER}'
+    )
+
+    # `tidy` is the user-facing target. `tidy-base` depends on re-creating the
+    # fix-it directory, which in turn depends on removing it first.
+    add_custom_target(tidy ${CLANG_TIDY_ALL})
+    mark_as_analyzer(tidy)
+    add_custom_target(tidy-base)
+    add_custom_target(tidy-make-fixit-dir COMMAND ${CMAKE_COMMAND} -E make_directory ${CLANG_TIDY_FIXIT_DIR})
+    add_custom_target(tidy-rm-fixit-dir COMMAND ${CMAKE_COMMAND} -E remove_directory ${CLANG_TIDY_FIXIT_DIR})
+    add_dependencies(tidy-make-fixit-dir tidy-rm-fixit-dir)
+    add_dependencies(tidy-base tidy-make-fixit-dir)
+endmacro()
+
+function(clang_tidy_check TARGET)  # adds one clang-tidy custom target per eligible source of TARGET and makes `tidy` depend on it
+    get_target_property(SOURCES ${TARGET} SOURCES)
+    # TODO: Use generator expressions instead
+    # COMMAND ${CLANG_TIDY_COMMAND} $<TARGET_PROPERTY:${TARGET},SOURCES>
+    # COMMAND ${CLANG_TIDY_COMMAND} $<JOIN:$<TARGET_PROPERTY:${TARGET},SOURCES>, >
+    foreach(SOURCE ${SOURCES})
+        if((NOT "${SOURCE}" MATCHES "(h|hpp|hxx)$") AND (NOT "${SOURCE}" MATCHES "TARGET_OBJECTS"))  # skip headers and TARGET_OBJECTS entries; NOTE(review): the regex has no leading `\.`, so any name ending in "h" is skipped too
+            string(MAKE_C_IDENTIFIER "${SOURCE}" tidy_file)  # source path turned into an identifier, used in the target name and the fix-it file name
+            set(tidy_target tidy-target-${TARGET}-${tidy_file})
+            add_custom_target(${tidy_target}
+                # For some targets clang-tidy is not able to get information from .clang-tidy (presumably why CheckOptions are passed inline via -config below).
+                DEPENDS ${SOURCE}
+                COMMAND ${CLANG_TIDY_COMMAND} "-config=\{CheckOptions: \[\{key: bugprone-reserved-identifier.AllowedIdentifiers,value: __HIP_PLATFORM_HCC__\; __HIP_ROCclr__\}\]\}" ${SOURCE} "-export-fixes=${CLANG_TIDY_FIXIT_DIR}/${TARGET}-${tidy_file}.yaml"
+                WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+                COMMENT "clang-tidy: Running clang-tidy on target ${SOURCE}..."
+            )
+            add_dependencies(${tidy_target} ${TARGET})  # the analysed target is built before its tidy target runs
+            add_dependencies(${tidy_target} tidy-base)  # tidy-base depends on the fix-it directory helper targets
+            add_dependencies(tidy ${tidy_target})
+        endif()
+    endforeach()
+endfunction()
+
--- a/cmake/CppCheck.cmake
+++ b/cmake/CppCheck.cmake
+################################################################################
+# 
+# MIT License
+# 
+# Copyright (c) 2017 Advanced Micro Devices, Inc.
+# 
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+# 
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+# 
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+# 
+################################################################################
+
+include(CMakeParseArguments)
+include(ProcessorCount)
+include(Analyzers)
+# Locate the cppcheck executable; /opt/rocm/bin is searched in addition to the default locations.
+find_program(CPPCHECK_EXE 
+    NAMES 
+        cppcheck
+    PATHS
+        /opt/rocm/bin
+)
+# Processor count, later passed to cppcheck as `-j`.
+ProcessorCount(CPPCHECK_JOBS)
+# Directory handed to --cppcheck-build-dir; created at configure time and listed in ADDITIONAL_MAKE_CLEAN_FILES.
+set(CPPCHECK_BUILD_DIR ${CMAKE_BINARY_DIR}/cppcheck-build)
+file(MAKE_DIRECTORY ${CPPCHECK_BUILD_DIR})
+set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${CPPCHECK_BUILD_DIR})
+
+macro(enable_cppcheck)  # enable_cppcheck([FORCE] [CHECKS ...] [SUPPRESS ...] [DEFINE ...] [UNDEFINE ...] [INCLUDE ...] [SOURCES ...]): creates the `cppcheck` target
+    set(options FORCE)
+    set(oneValueArgs)
+    set(multiValueArgs CHECKS SUPPRESS DEFINE UNDEFINE INCLUDE SOURCES)
+    # Parsed values are available below as PARSE_<NAME>.
+    cmake_parse_arguments(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    string(REPLACE ";" "," CPPCHECK_CHECKS "${PARSE_CHECKS}")  # comma-separated list for --enable=
+    string(REPLACE ";" "\n" CPPCHECK_SUPPRESS "${PARSE_SUPPRESS};*:/usr/*")  # one suppression per line, with `*:/usr/*` always appended
+    file(WRITE ${CMAKE_BINARY_DIR}/cppcheck-supressions "${CPPCHECK_SUPPRESS}")  # file later passed through --suppressions-list
+    set(CPPCHECK_DEFINES)
+    foreach(DEF ${PARSE_DEFINE})
+        set(CPPCHECK_DEFINES "${CPPCHECK_DEFINES} -D${DEF}")  # accumulated as one space-separated string of -D flags
+    endforeach()
+    # UNDEFINE entries are accumulated the same way, as -U flags.
+    set(CPPCHECK_UNDEFINES)
+    foreach(DEF ${PARSE_UNDEFINE})
+        set(CPPCHECK_UNDEFINES "${CPPCHECK_UNDEFINES} -U${DEF}")
+    endforeach()
+    # INCLUDE entries are accumulated the same way, as -I flags.
+    set(CPPCHECK_INCLUDES)
+    foreach(INC ${PARSE_INCLUDE})
+        set(CPPCHECK_INCLUDES "${CPPCHECK_INCLUDES} -I${INC}")
+    endforeach()
+    # By default cppcheck is pointed at compile_commands.json. NOTE(review): FORCE replaces that --project argument with --force instead of adding to it; confirm this is intended.
+    # set(CPPCHECK_FORCE)
+    set(CPPCHECK_FORCE "--project=${CMAKE_BINARY_DIR}/compile_commands.json")
+    if(PARSE_FORCE)
+        set(CPPCHECK_FORCE --force)
+    endif()
+    # Split SOURCES: existing files go to SOURCES, existing directories become per-extension glob patterns, non-existing paths are kept as glob patterns.
+    set(SOURCES)
+    set(GLOBS)
+    foreach(SOURCE ${PARSE_SOURCES})
+        get_filename_component(ABS_SOURCE ${SOURCE} ABSOLUTE)
+        if(EXISTS ${ABS_SOURCE})
+            if(IS_DIRECTORY ${ABS_SOURCE})
+                set(GLOBS "${GLOBS} ${ABS_SOURCE}/*.cpp ${ABS_SOURCE}/*.hpp ${ABS_SOURCE}/*.cxx ${ABS_SOURCE}/*.c ${ABS_SOURCE}/*.h")
+            else()
+                set(SOURCES "${SOURCES} ${ABS_SOURCE}")
+            endif()
+        else()
+            set(GLOBS "${GLOBS} ${ABS_SOURCE}")
+        endif()
+    endforeach()
+    # Write the helper script that the `cppcheck` target runs with `cmake -P`. Unescaped ${...} references are expanded now; the escaped \${...} ones are left for the script to expand when it runs.
+    file(WRITE ${CMAKE_BINARY_DIR}/cppcheck.cmake "
+        file(GLOB_RECURSE GSRCS ${GLOBS})
+        set(CPPCHECK_COMMAND
+            ${CPPCHECK_EXE}
+            -q
+            # -v
+            # --report-progress
+            ${CPPCHECK_FORCE}
+            --cppcheck-build-dir=${CPPCHECK_BUILD_DIR}
+            --platform=native
+            --template=gcc
+            --error-exitcode=1
+            -j ${CPPCHECK_JOBS}
+            ${CPPCHECK_DEFINES}
+            ${CPPCHECK_UNDEFINES}
+            ${CPPCHECK_INCLUDES}
+            --enable=${CPPCHECK_CHECKS}
+            --inline-suppr
+            --suppressions-list=${CMAKE_BINARY_DIR}/cppcheck-supressions
+            ${SOURCES} \${GSRCS}
+        )
+        string(REPLACE \";\" \" \" CPPCHECK_SHOW_COMMAND \"\${CPPCHECK_COMMAND}\")
+        message(\"\${CPPCHECK_SHOW_COMMAND}\")
+        execute_process(
+            COMMAND \${CPPCHECK_COMMAND}
+            WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+            RESULT_VARIABLE RESULT
+        )
+        if(NOT RESULT EQUAL 0)
+            message(FATAL_ERROR \"Cppcheck failed\")
+        endif()
+")
+    # The `cppcheck` target runs the generated script and is registered under `analyze`.
+    add_custom_target(cppcheck
+        COMMAND ${CMAKE_COMMAND} -P ${CMAKE_BINARY_DIR}/cppcheck.cmake
+        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+        COMMENT "cppcheck: Running cppcheck..."
+    )
+    mark_as_analyzer(cppcheck)
+endmacro()
+
+
--- a/cmake/DoxygenDoc.cmake
+++ b/cmake/DoxygenDoc.cmake
+################################################################################
+# 
+# MIT License
+# 
+# Copyright (c) 2017 Advanced Micro Devices, Inc.
+# 
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+# 
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+# 
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+# 
+################################################################################
+# Helper modules: keyword-argument parsing used by add_doxygen_doc(), and the
+# project's MainDoc module (presumably the provider of mark_as_doc() and
+# clean_doc_output() -- confirm against cmake/MainDoc.cmake).
+include(CMakeParseArguments)
+include(MainDoc)
+
+# Locate the doxygen binary; the result is cached in DOXYGEN_EXECUTABLE.
+find_program(DOXYGEN_EXECUTABLE
+    NAMES doxygen
+    DOC "Doxygen documentation generator"
+    PATH_SUFFIXES bin
+)
+
+# NOTE: find_path() yields the *directory* that contains `dot`, not the path
+# of the binary itself, so despite its name DOT_EXECUTABLE holds a directory.
+# That directory is what add_doxygen_doc() writes out as DOT_PATH.
+find_path(DOT_EXECUTABLE
+    NAMES dot
+    DOC "Graphviz"
+    PATH_SUFFIXES bin
+)
+
+# Keep both cache entries out of the default (non-advanced) GUI view.
+mark_as_advanced(DOXYGEN_EXECUTABLE DOT_EXECUTABLE)
+
+# Names of the doxygen configuration options that add_doxygen_doc() accepts as
+# multi-value keywords. Each option supplied by the caller is appended to the
+# generated configuration file as "<OPTION> = <values>".
+# DOT_PATH and HAVE_DOT are commented out of the list on purpose:
+# add_doxygen_doc() writes them itself, based on whether DOT_EXECUTABLE was found.
+set(DOXYGEN_ARGS
+ABBREVIATE_BRIEF
+ALIASES
+ALLEXTERNALS
+ALLOW_UNICODE_NAMES
+ALPHABETICAL_INDEX
+ALWAYS_DETAILED_SEC
+AUTOLINK_SUPPORT
+BINARY_TOC
+BRIEF_MEMBER_DESC
+BUILTIN_STL_SUPPORT
+CALLER_GRAPH
+CALL_GRAPH
+CASE_SENSE_NAMES
+CHM_FILE
+CHM_INDEX_ENCODING
+CITE_BIB_FILES
+CLANG_ASSISTED_PARSING
+CLANG_OPTIONS
+CLASS_DIAGRAMS
+CLASS_GRAPH
+COLLABORATION_GRAPH
+COLS_IN_ALPHA_INDEX
+COMPACT_LATEX
+COMPACT_RTF
+CPP_CLI_SUPPORT
+CREATE_SUBDIRS
+DIAFILE_DIRS
+DIA_PATH
+DIRECTORY_GRAPH
+DISABLE_INDEX
+DISTRIBUTE_GROUP_DOC
+DOCBOOK_OUTPUT
+DOCBOOK_PROGRAMLISTING
+DOCSET_BUNDLE_ID
+DOCSET_FEEDNAME
+DOCSET_PUBLISHER_ID
+DOCSET_PUBLISHER_NAME
+DOTFILE_DIRS
+DOT_CLEANUP
+DOT_FONTNAME
+DOT_FONTPATH
+DOT_FONTSIZE
+DOT_GRAPH_MAX_NODES
+DOT_IMAGE_FORMAT
+DOT_MULTI_TARGETS
+DOT_NUM_THREADS
+# DOT_PATH
+DOT_TRANSPARENT
+DOXYFILE_ENCODING
+ECLIPSE_DOC_ID
+ENABLED_SECTIONS
+ENABLE_PREPROCESSING
+ENUM_VALUES_PER_LINE
+EXAMPLE_PATH
+EXAMPLE_PATTERNS
+EXAMPLE_RECURSIVE
+EXCLUDE
+EXCLUDE_PATTERNS
+EXCLUDE_SYMBOLS
+EXCLUDE_SYMLINKS
+EXPAND_AS_DEFINED
+EXPAND_ONLY_PREDEF
+EXTENSION_MAPPING
+EXTERNAL_GROUPS
+EXTERNAL_PAGES
+EXTERNAL_SEARCH
+EXTERNAL_SEARCH_ID
+EXTRACT_ALL
+EXTRACT_ANON_NSPACES
+EXTRACT_LOCAL_CLASSES
+EXTRACT_LOCAL_METHODS
+EXTRACT_PACKAGE
+EXTRACT_PRIVATE
+EXTRACT_STATIC
+EXTRA_PACKAGES
+EXTRA_SEARCH_MAPPINGS
+EXT_LINKS_IN_WINDOW
+FILE_PATTERNS
+FILE_VERSION_FILTER
+FILTER_PATTERNS
+FILTER_SOURCE_FILES
+FILTER_SOURCE_PATTERNS
+FORCE_LOCAL_INCLUDES
+FORMULA_FONTSIZE
+FORMULA_TRANSPARENT
+FULL_PATH_NAMES
+GENERATE_AUTOGEN_DEF
+GENERATE_BUGLIST
+GENERATE_CHI
+GENERATE_DEPRECATEDLIST
+GENERATE_DOCBOOK
+GENERATE_DOCSET
+GENERATE_ECLIPSEHELP
+GENERATE_HTML
+GENERATE_HTMLHELP
+GENERATE_LATEX
+GENERATE_LEGEND
+GENERATE_MAN
+GENERATE_PERLMOD
+GENERATE_QHP
+GENERATE_RTF
+GENERATE_TAGFILE
+GENERATE_TESTLIST
+GENERATE_TODOLIST
+GENERATE_TREEVIEW
+GENERATE_XML
+GRAPHICAL_HIERARCHY
+GROUP_GRAPHS
+GROUP_NESTED_COMPOUNDS
+# HAVE_DOT
+HHC_LOCATION
+HIDE_COMPOUND_REFERENCE
+HIDE_FRIEND_COMPOUNDS
+HIDE_IN_BODY_DOCS
+HIDE_SCOPE_NAMES
+HIDE_UNDOC_CLASSES
+HIDE_UNDOC_MEMBERS
+HIDE_UNDOC_RELATIONS
+HTML_COLORSTYLE_GAMMA
+HTML_COLORSTYLE_HUE
+HTML_COLORSTYLE_SAT
+HTML_DYNAMIC_SECTIONS
+HTML_EXTRA_FILES
+HTML_EXTRA_STYLESHEET
+HTML_FILE_EXTENSION
+HTML_FOOTER
+HTML_HEADER
+HTML_INDEX_NUM_ENTRIES
+HTML_OUTPUT
+HTML_STYLESHEET
+HTML_TIMESTAMP
+IDL_PROPERTY_SUPPORT
+IGNORE_PREFIX
+IMAGE_PATH
+INCLUDED_BY_GRAPH
+INCLUDE_FILE_PATTERNS
+INCLUDE_GRAPH
+INCLUDE_PATH
+INHERIT_DOCS
+INLINE_GROUPED_CLASSES
+INLINE_INFO
+INLINE_INHERITED_MEMB
+INLINE_SIMPLE_STRUCTS
+INLINE_SOURCES
+INPUT
+INPUT_ENCODING
+INPUT_FILTER
+INTERACTIVE_SVG
+INTERNAL_DOCS
+JAVADOC_AUTOBRIEF
+LATEX_BATCHMODE
+LATEX_BIB_STYLE
+LATEX_CMD_NAME
+LATEX_EXTRA_FILES
+LATEX_EXTRA_STYLESHEET
+LATEX_FOOTER
+LATEX_HEADER
+LATEX_HIDE_INDICES
+LATEX_OUTPUT
+LATEX_SOURCE_CODE
+LATEX_TIMESTAMP
+LAYOUT_FILE
+LOOKUP_CACHE_SIZE
+MACRO_EXPANSION
+MAKEINDEX_CMD_NAME
+MAN_EXTENSION
+MAN_LINKS
+MAN_OUTPUT
+MAN_SUBDIR
+MARKDOWN_SUPPORT
+MATHJAX_CODEFILE
+MATHJAX_EXTENSIONS
+MATHJAX_FORMAT
+MATHJAX_RELPATH
+MAX_DOT_GRAPH_DEPTH
+MAX_INITIALIZER_LINES
+MSCFILE_DIRS
+MSCGEN_PATH
+MULTILINE_CPP_IS_BRIEF
+OPTIMIZE_FOR_FORTRAN
+OPTIMIZE_OUTPUT_FOR_C
+OPTIMIZE_OUTPUT_JAVA
+OPTIMIZE_OUTPUT_VHDL
+OUTPUT_DIRECTORY
+OUTPUT_LANGUAGE
+PAPER_TYPE
+PDF_HYPERLINKS
+PERLMOD_LATEX
+PERLMOD_MAKEVAR_PREFIX
+PERLMOD_PRETTY
+PERL_PATH
+PLANTUML_CFG_FILE
+PLANTUML_INCLUDE_PATH
+PLANTUML_JAR_PATH
+PREDEFINED
+PROJECT_BRIEF
+PROJECT_LOGO
+PROJECT_NAME
+PROJECT_NUMBER
+QCH_FILE
+QHG_LOCATION
+QHP_CUST_FILTER_ATTRS
+QHP_CUST_FILTER_NAME
+QHP_NAMESPACE
+QHP_SECT_FILTER_ATTRS
+QHP_VIRTUAL_FOLDER
+QT_AUTOBRIEF
+QUIET
+RECURSIVE
+REFERENCED_BY_RELATION
+REFERENCES_LINK_SOURCE
+REFERENCES_RELATION
+REPEAT_BRIEF
+RTF_EXTENSIONS_FILE
+RTF_HYPERLINKS
+RTF_OUTPUT
+RTF_SOURCE_CODE
+RTF_STYLESHEET_FILE
+SEARCHDATA_FILE
+SEARCHENGINE
+SEARCHENGINE_URL
+SEARCH_INCLUDES
+SEPARATE_MEMBER_PAGES
+SERVER_BASED_SEARCH
+SHORT_NAMES
+SHOW_FILES
+SHOW_GROUPED_MEMB_INC
+SHOW_INCLUDE_FILES
+SHOW_NAMESPACES
+SHOW_USED_FILES
+SIP_SUPPORT
+SKIP_FUNCTION_MACROS
+SORT_BRIEF_DOCS
+SORT_BY_SCOPE_NAME
+SORT_GROUP_NAMES
+SORT_MEMBERS_CTORS_1ST
+SORT_MEMBER_DOCS
+SOURCE_BROWSER
+SOURCE_TOOLTIPS
+STRICT_PROTO_MATCHING
+STRIP_CODE_COMMENTS
+STRIP_FROM_INC_PATH
+STRIP_FROM_PATH
+SUBGROUPING
+TAB_SIZE
+TAGFILES
+TCL_SUBST
+TEMPLATE_RELATIONS
+TOC_EXPAND
+TOC_INCLUDE_HEADINGS
+TREEVIEW_WIDTH
+TYPEDEF_HIDES_STRUCT
+UML_LIMIT_NUM_FIELDS
+UML_LOOK
+USE_HTAGS
+USE_MATHJAX
+USE_MDFILE_AS_MAINPAGE
+USE_PDFLATEX
+VERBATIM_HEADERS
+WARNINGS
+WARN_AS_ERROR
+WARN_FORMAT
+WARN_IF_DOC_ERROR
+WARN_IF_UNDOCUMENTED
+WARN_LOGFILE
+WARN_NO_PARAMDOC
+XML_OUTPUT
+XML_PROGRAMLISTING
+)
+
+# Location of the configuration file that add_doxygen_doc() generates.
+# Cached as FILEPATH (not PATH) because the entry names a file, not a directory.
+set(DOXYGEN_CONFIG_FILE "${CMAKE_CURRENT_BINARY_DIR}/doxygen/doxygen.conf" CACHE FILEPATH "Path to generated doxygen configuration file")
+
+# add_doxygen_doc([DEPENDS <target>...] [<DOXYGEN_OPTION> <value>...]...)
+#
+# Writes a doxygen configuration file to DOXYGEN_CONFIG_FILE and adds a
+# `doxygen` custom target that runs DOXYGEN_EXECUTABLE on it from
+# CMAKE_CURRENT_SOURCE_DIR.
+#
+# Arguments:
+#   <DOXYGEN_OPTION> - any name listed in DOXYGEN_ARGS, followed by one or more
+#                      values; written to the file as "<OPTION> = v1 v2 ...".
+#   DEPENDS          - targets the `doxygen` target should depend on.
+#
+# DOT_PATH and HAVE_DOT are not accepted as keywords; they are emitted
+# automatically depending on whether DOT_EXECUTABLE was found.
+function(add_doxygen_doc)
+    set(options)
+    set(oneValueArgs)
+    set(multiValueArgs DEPENDS ${DOXYGEN_ARGS})
+
+    cmake_parse_arguments(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+    # Start from a fresh file on every configure; options are appended below.
+    file(WRITE ${DOXYGEN_CONFIG_FILE} "# Auto-generated doxygen configuration file\n")
+
+    foreach(ARG ${DOXYGEN_ARGS})
+        # Test for DEFINED rather than truthiness: values such as NO, OFF,
+        # FALSE or 0 are legitimate doxygen settings but evaluate to false in
+        # a plain if(), which would silently drop them from the config file.
+        if(DEFINED PARSE_${ARG})
+            # The list must be quoted: unquoted, each element becomes a
+            # separate <input> and string(REPLACE) concatenates the inputs
+            # without any separator ("a;b" would be written as "ab").
+            string(REPLACE ";" " " ARG_VALUE "${PARSE_${ARG}}")
+            file(APPEND ${DOXYGEN_CONFIG_FILE} "\n${ARG} = ${ARG_VALUE}\n")
+        endif()
+    endforeach()
+
+    # Make sure the requested output directory exists before doxygen runs.
+    if(PARSE_OUTPUT_DIRECTORY)
+        if(NOT EXISTS ${PARSE_OUTPUT_DIRECTORY})
+            file(MAKE_DIRECTORY ${PARSE_OUTPUT_DIRECTORY})
+        endif()
+    endif()
+
+    # Enable graph generation only when the Graphviz lookup succeeded.
+    if(DOT_EXECUTABLE)
+        file(APPEND ${DOXYGEN_CONFIG_FILE} "\nDOT_PATH = \"${DOT_EXECUTABLE}\"\n")
+        file(APPEND ${DOXYGEN_CONFIG_FILE} "\nHAVE_DOT = YES\n")
+    else()
+        file(APPEND ${DOXYGEN_CONFIG_FILE} "\nHAVE_DOT = NO\n")
+    endif()
+
+    add_custom_target(doxygen
+        COMMAND ${DOXYGEN_EXECUTABLE} ${DOXYGEN_CONFIG_FILE}
+        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+        COMMENT "Building documentation with doxygen"
+    )
+    # clean_doc_output() and mark_as_doc() are not defined in this file;
+    # presumably they come from the MainDoc module included above -- confirm.
+    if(PARSE_OUTPUT_DIRECTORY)
+        clean_doc_output(${PARSE_OUTPUT_DIRECTORY})
+    endif()
+    mark_as_doc(doxygen)
+    if(PARSE_DEPENDS)
+        add_dependencies(doxygen ${PARSE_DEPENDS})
+    endif()
+endfunction()