Merge branch 'amd-develop' into amd-master

fa9da1a4 · Jun Liu · 4c105089 · 457308e3 · fa9da1a4 · fa9da1a4
Commit fa9da1a4 authored Jun 19, 2023 by Jun Liu
20 changed files
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -20,6 +20,8 @@ Full documentation for Composable Kernel is not yet available.
 - Added multi-embeddings support (#542).
 - Added Navi3x blockwise GEMM and real GEMM support (#541).
 - Added Navi grouped ConvBwdWeight support (#505).
+- Added pool3d forward (#697).
+- Added maxpool backward (#750).
 ### Changed
 - Changed ...
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -4,7 +4,7 @@ This is the list of developers and contributors to Composable Kernel library
 ## Developers
-[Chao Liu](https://github.com/asroy), [Jing Zhang](https://github.com/zjing14), 2018-2022
+[Chao Liu](https://github.com/asroy), [Jing Zhang](https://github.com/zjing14), 2018-2023
 [Letao Qin](https://github.com/ltqin), [Qianfeng Zhang](https://github.com/qianfengz), [Liang Huang](https://github.com/carlushuang), [Shaojie Wang](https://github.com/shaojiewang), 2019-2022

--- a/Dockerfile
+++ b/Dockerfile
 FROM ubuntu:20.04
+ARG DEBIAN_FRONTEND=noninteractive
 ARG ROCMVERSION=5.6
 ARG compiler_version=""
 ARG compiler_commit=""
@@ -9,23 +9,30 @@ RUN set -xe
 ARG DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/.apt_$ROCMVERSION/
 RUN useradd -rm -d /home/jenkins -s /bin/bash -u 1004 jenkins
 # Add rocm repository
+RUN chmod 1777 /tmp
 RUN apt-get update
-RUN apt-get install -y wget gnupg curl
+RUN apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl
-RUN --mount=type=ssh if [ "$ROCMVERSION" != "5.6"]; then \
+RUN if [ "$ROCMVERSION" != "5.6" ]; then \
-	wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \
+        wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \
        sh -c "echo deb [arch=amd64] $DEB_ROCM_REPO ubuntu main > /etc/apt/sources.list.d/rocm.list"; \
-    else sh -c "wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amd-nonfree-radeon_20.04-1_all.deb" && \
+    elif [ "$ROCMVERSION" = "5.6" ] && [ "$compiler_version" = "" ]; then \
+         sh -c "wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amd-nonfree-radeon_20.04-1_all.deb" && \
         apt update && apt-get install -y ./amd-nonfree-radeon_20.04-1_all.deb && \
         amdgpu-repo --amdgpu-build=1567752 --rocm-build=compute-rocm-dkms-no-npi-hipclang/11914 && \
-         DEBIAN_FRONTEND=noninteractive amdgpu-install -y --usecase=rocm ; \
+         amdgpu-install -y --usecase=rocm --no-dkms; \
+    elif [ "$ROCMVERSION" = "5.6" ] && [ "$compiler_version" = "rc3" ] || [ "$compiler_version" = "amd-stg-open" ]; then \
+         sh -c "wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amdgpu-install-internal_5.6-20.04-1_all.deb" && \
+         apt update && apt-get install -y ./amdgpu-install-internal_5.6-20.04-1_all.deb && \
+         sh -c 'echo deb [arch=amd64 trusted=yes] http://compute-artifactory.amd.com/artifactory/list/rocm-release-archive-20.04-deb/ 5.6 rel-45  > /etc/apt/sources.list.d/rocm-build.list' && \
+         amdgpu-repo --amdgpu-build=1602498 && amdgpu-install -y --usecase=rocm --no-dkms; \
    fi
 RUN wget --no-check-certificate -qO - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | apt-key add -
 RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list"
 RUN curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg
 # Install dependencies
 RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
-    apt-utils \
    build-essential \
    ccache \
    cmake \
@@ -38,16 +45,11 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
    libpthread-stubs0-dev \
    llvm-amdgpu \
    pkg-config \
-    python \
    python3 \
-    python-dev \
    python3-dev \
    python3-pip \
    sshpass \
    software-properties-common \
-    rocm-dev \
-    rocm-device-libs \
-    rocm-cmake \
    vim \
    nano \
    zlib1g-dev \
@@ -103,7 +105,7 @@ ENV compiler_commit=$compiler_commit
 RUN sh -c "echo compiler version = '$compiler_version'"
 RUN sh -c "echo compiler commit = '$compiler_commit'"
-RUN --mount=type=ssh if [ "$compiler_version" = "amd-stg-open" ] && [ "$compiler_commit" = "" ]; then \
+RUN if [ "$compiler_version" = "amd-stg-open" ] && [ "$compiler_commit" = "" ]; then \
        git clone -b "$compiler_version" https://github.com/RadeonOpenCompute/llvm-project.git && \
        cd llvm-project && mkdir build && cd build && \
        cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld;compiler-rt" ../llvm && \
@@ -111,7 +113,7 @@ RUN --mount=type=ssh if [ "$compiler_version" = "amd-stg-open" ] && [ "$compiler
    else echo "using the release compiler"; \
    fi
-RUN --mount=type=ssh if [ "$compiler_version" = "amd-stg-open" ] && [ "$compiler_commit" != "" ]; then \
+RUN if [ "$compiler_version" = "amd-stg-open" ] && [ "$compiler_commit" != "" ]; then \
        git clone -b "$compiler_version" https://github.com/RadeonOpenCompute/llvm-project.git && \
        cd llvm-project && git checkout "$compiler_commit" && echo "checking out commit $compiler_commit" && mkdir build && cd build && \
        cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld;compiler-rt" ../llvm && \

--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -493,10 +493,11 @@ def Build_CK(Map conf=[:]){
                {
                    cmake_build(conf)
                    dir("build"){
+                        //run tests and examples 	
+                        sh 'make -j\$(( \$(nproc) / 2 )) check'
                        if (navi_node == 0 ){
-                           //run tests and examples on all nodes except Navi
+                            //we only need the ckProfiler to run the performance tests, so we pack and stash it
-                           sh 'make -j check'
+                            //do not stash profiler on Navi nodes
-                           //we only need the ckProfiler to run the performance tests, so we pack and stash it
                           sh 'tar -zcvf ckProfiler.tar.gz bin/ckProfiler'
                           stash "ckProfiler.tar.gz"
                        }
@@ -686,12 +687,31 @@ pipeline {
        {
            parallel
            {
+                stage("Build CK and run Tests on MI100/MI200/MI300")
+                {
+                    when {
+                        beforeAgent true
+                        expression { params.RUN_FULL_QA.toBoolean() }
+                    }
+                    agent{ label rocmnode("gfx908 || gfx90a") }
+                    environment{
+                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx908;gfx90a;gfx940" """
+                        execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx908;gfx90a;gfx940;gfx941;gfx942" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """ 
+                    }
+                    steps{
+                        Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
+                    }
+                }
                stage("Build CK and run Tests on MI100/MI200")
                {
+                    when {
+                        beforeAgent true
+                        expression { !params.RUN_FULL_QA.toBoolean() }
+                    }
                    agent{ label rocmnode("gfx908 || gfx90a") }
                    environment{
                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx908;gfx90a" """
-                        execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx908,gfx90a" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """ 
+                        execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx908;gfx90a" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """ 
                    }
                    steps{
                        Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
@@ -705,8 +725,8 @@ pipeline {
                    }
                    agent{ label rocmnode("navi21") }
                    environment{
-                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install """ 
+                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1030" """ 
-                        execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx1030;gfx1100;gfx1101;gfx1102" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """
+                        execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx1030" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """
                    }
                    steps{

--- a/LICENSE
+++ b/LICENSE
@@ -7,7 +7,7 @@ Copyright (c) 2020     , Advanced Micro Devices, Inc. (Xiaoyan Zhou)
 Copyright (c) 2021-2022, Advanced Micro Devices, Inc. (Jianfeng Yan)
 SPDX-License-Identifier: MIT
-Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

--- a/client_example/01_gemm/gemm.cpp
+++ b/client_example/01_gemm/gemm.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #include <iomanip>
 #include <vector>

--- a/client_example/02_gemm_add_add_fastgelu/CMakeLists.txt
+++ b/client_example/02_gemm_add_add_fastgelu/CMakeLists.txt
@@ -11,3 +11,17 @@ target_link_libraries(client_gemm_fastgelu PRIVATE composable_kernel::device_ope
 add_dependencies(client_gemm_fastgelu_examples client_gemm_add_add_fastgelu client_gemm_add_fastgelu
                 client_gemm_fastgelu)
+add_custom_target(client_gemm_fastgelu_generic_examples)
+add_executable(client_gemm_add_add_fastgelu_generic gemm_add_add_fastgelu_generic.cpp)
+target_link_libraries(client_gemm_add_add_fastgelu_generic PRIVATE composable_kernel::device_operations)
+add_executable(client_gemm_add_fastgelu_generic gemm_add_fastgelu_generic.cpp)
+target_link_libraries(client_gemm_add_fastgelu_generic PRIVATE composable_kernel::device_operations)
+add_executable(client_gemm_fastgelu_generic gemm_fastgelu_generic.cpp)
+target_link_libraries(client_gemm_fastgelu_generic PRIVATE composable_kernel::device_operations)
+add_dependencies(client_gemm_fastgelu_generic_examples client_gemm_add_add_fastgelu_generic 
+                 client_gemm_add_fastgelu_generic client_gemm_fastgelu_generic)
--- a/client_example/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu.cpp
+++ b/client_example/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #include <iomanip>
 #include <vector>

--- a/client_example/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu_generic.cpp
+++ b/client_example/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu_generic.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+#include <iomanip>
+#include <vector>
+#include <iostream>
+#include <stdexcept>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/tensor_operation_instance/gpu/gemm_add_add_fastgelu.hpp"
+using F16 = ck::half_t;
+using F32 = float;
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+using PassThrough    = ck::tensor_operation::element_wise::PassThrough;
+using AddAddFastGelu = ck::tensor_operation::element_wise::AddAddFastGelu;
+using AElementOp   = PassThrough;
+using BElementOp   = PassThrough;
+using CDEElementOp = AddAddFastGelu;
+using ADataType  = F16;
+using BDataType  = F16;
+using D0DataType = F16;
+using D1DataType = F16;
+using EDataType  = F16;
+using ALayout  = Row;
+using BLayout  = Col;
+using D0Layout = Row;
+using D1Layout = Row;
+using ELayout  = Row;
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+    {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+    }
+    void* GetDeviceBuffer() { return p_mem_; }
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+    void* p_mem_;
+};
+int main(int argc, char* argv[])
+{
+    // GEMM shape
+    ck::index_t M = 3840;
+    ck::index_t N = 4096;
+    ck::index_t K = 4096;
+    ck::index_t StrideA  = 4096;
+    ck::index_t StrideB  = 4096;
+    ck::index_t StrideD0 = 0;
+    ck::index_t StrideD1 = 4096;
+    ck::index_t StrideE  = 4096;
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 9)
+    {
+        M = std::stoi(argv[1]);
+        N = std::stoi(argv[2]);
+        K = std::stoi(argv[3]);
+        StrideA  = std::stoi(argv[4]);
+        StrideB  = std::stoi(argv[5]);
+        StrideD0 = std::stoi(argv[6]);
+        StrideD1 = std::stoi(argv[7]);
+        StrideE  = std::stoi(argv[8]);
+    }
+    else
+    {
+        printf("arg1 to 8: M, N, K, StrideA, StrideB, StrideD0, StrideD1, StrideE\n");
+        exit(0);
+    }
+    auto f_matrix_space_size =
+        [](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
+            using Layout = decltype(layout);
+            if constexpr(std::is_same<Layout, ck::tensor_layout::gemm::RowMajor>::value)
+            {
+                return (nRow - 1) * stride + nCol;
+            }
+            else
+            {
+                return (nCol - 1) * stride + nRow;
+            }
+        };
+    SimpleDeviceMem a_device_buf(sizeof(ADataType) * f_matrix_space_size(M, K, StrideA, ALayout{}));
+    SimpleDeviceMem b_device_buf(sizeof(BDataType) * f_matrix_space_size(K, N, StrideB, BLayout{}));
+    SimpleDeviceMem d0_m_n_device_buf(sizeof(D0DataType) *
+                                      f_matrix_space_size(M, N, StrideD0, D0Layout{}));
+    SimpleDeviceMem d1_m_n_device_buf(sizeof(D1DataType) *
+                                      f_matrix_space_size(M, N, StrideD1, D1Layout{}));
+    SimpleDeviceMem e_device_buf(sizeof(EDataType) * f_matrix_space_size(M, N, StrideE, ELayout{}));
+    using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleD<
+        ALayout,
+        BLayout,
+        ck::Tuple<D0Layout, D1Layout>,
+        ELayout,
+        ADataType,
+        BDataType,
+        ck::Tuple<D0DataType, D1DataType>,
+        EDataType,
+        ck::tensor_operation::element_wise::PassThrough,
+        ck::tensor_operation::element_wise::PassThrough,
+        ck::tensor_operation::element_wise::AddAddFastGelu>;
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+    const auto a_element_op   = AElementOp{};
+    const auto b_element_op   = BElementOp{};
+    const auto cde_element_op = CDEElementOp{};
+    // get generic instance
+    auto& op_ptr = op_ptrs[0];
+    std::cout << "Run the generic instance without timing: " << op_ptr->GetTypeString()
+              << std::endl;
+    // run the generic instance
+    auto argument_ptr =
+        op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
+                                    b_device_buf.GetDeviceBuffer(),
+                                    std::array<const void*, 2>{d0_m_n_device_buf.GetDeviceBuffer(),
+                                                               d1_m_n_device_buf.GetDeviceBuffer()},
+                                    e_device_buf.GetDeviceBuffer(),
+                                    M,
+                                    N,
+                                    K,
+                                    StrideA,
+                                    StrideB,
+                                    std::array<ck::index_t, 2>{StrideD0, StrideD1},
+                                    StrideE,
+                                    a_element_op,
+                                    b_element_op,
+                                    cde_element_op);
+    auto invoker_ptr = op_ptr->MakeInvokerPointer();
+    if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+    {
+        invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+    }
+    else
+    {
+        throw std::runtime_error(
+            "Generic instance should be suitable for various input lengths/strides");
+    }
+    std::cout << "Done" << std::endl;
+    return 0;
+}
--- a/client_example/02_gemm_add_add_fastgelu/gemm_add_fastgelu.cpp
+++ b/client_example/02_gemm_add_add_fastgelu/gemm_add_fastgelu.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #include <iomanip>
 #include <vector>
@@ -76,7 +76,7 @@ int main(int argc, char* argv[])
        StrideA  = std::stoi(argv[4]);
        StrideB  = std::stoi(argv[5]);
        StrideD0 = std::stoi(argv[6]);
-        StrideE  = std::stoi(argv[8]);
+        StrideE  = std::stoi(argv[7]);
    }
    else
    {

--- a/client_example/02_gemm_add_add_fastgelu/gemm_add_fastgelu_generic.cpp
+++ b/client_example/02_gemm_add_add_fastgelu/gemm_add_fastgelu_generic.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+#include <iomanip>
+#include <vector>
+#include <iostream>
+#include <stdexcept>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/tensor_operation_instance/gpu/gemm_add_fastgelu.hpp"
+using F16 = ck::half_t;
+using F32 = float;
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using AddFastGelu = ck::tensor_operation::element_wise::AddFastGelu;
+using AElementOp   = PassThrough;
+using BElementOp   = PassThrough;
+using CDEElementOp = AddFastGelu;
+using ADataType  = F16;
+using BDataType  = F16;
+using D0DataType = F16;
+using EDataType  = F16;
+using ALayout  = Row;
+using BLayout  = Col;
+using D0Layout = Row;
+using ELayout  = Row;
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+    {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+    }
+    void* GetDeviceBuffer() { return p_mem_; }
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+    void* p_mem_;
+};
+int main(int argc, char* argv[])
+{
+    // GEMM shape
+    ck::index_t M = 3840;
+    ck::index_t N = 4096;
+    ck::index_t K = 4096;
+    ck::index_t StrideA  = 4096;
+    ck::index_t StrideB  = 4096;
+    ck::index_t StrideD0 = 0;
+    ck::index_t StrideE  = 4096;
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 8)
+    {
+        M = std::stoi(argv[1]);
+        N = std::stoi(argv[2]);
+        K = std::stoi(argv[3]);
+        StrideA  = std::stoi(argv[4]);
+        StrideB  = std::stoi(argv[5]);
+        StrideD0 = std::stoi(argv[6]);
+        StrideE  = std::stoi(argv[7]);
+    }
+    else
+    {
+        printf("arg1 to 7: M, N, K, StrideA, StrideB, StrideD0, StrideE\n");
+        exit(0);
+    }
+    auto f_matrix_space_size =
+        [](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
+            using Layout = decltype(layout);
+            if constexpr(std::is_same<Layout, ck::tensor_layout::gemm::RowMajor>::value)
+            {
+                return (nRow - 1) * stride + nCol;
+            }
+            else
+            {
+                return (nCol - 1) * stride + nRow;
+            }
+        };
+    SimpleDeviceMem a_device_buf(sizeof(ADataType) * f_matrix_space_size(M, K, StrideA, ALayout{}));
+    SimpleDeviceMem b_device_buf(sizeof(BDataType) * f_matrix_space_size(K, N, StrideB, BLayout{}));
+    SimpleDeviceMem d0_m_n_device_buf(sizeof(D0DataType) *
+                                      f_matrix_space_size(M, N, StrideD0, D0Layout{}));
+    SimpleDeviceMem e_device_buf(sizeof(EDataType) * f_matrix_space_size(M, N, StrideE, ELayout{}));
+    using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleD<
+        ALayout,
+        BLayout,
+        ck::Tuple<D0Layout>,
+        ELayout,
+        ADataType,
+        BDataType,
+        ck::Tuple<D0DataType>,
+        EDataType,
+        ck::tensor_operation::element_wise::PassThrough,
+        ck::tensor_operation::element_wise::PassThrough,
+        ck::tensor_operation::element_wise::AddFastGelu>;
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+    const auto a_element_op   = AElementOp{};
+    const auto b_element_op   = BElementOp{};
+    const auto cde_element_op = CDEElementOp{};
+    // get generic instance
+    auto& op_ptr = op_ptrs[0];
+    std::cout << "Run the generic instance without timing: " << op_ptr->GetTypeString()
+              << std::endl;
+    // run the generic instance
+    auto argument_ptr =
+        op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
+                                    b_device_buf.GetDeviceBuffer(),
+                                    std::array<const void*, 1>{d0_m_n_device_buf.GetDeviceBuffer()},
+                                    e_device_buf.GetDeviceBuffer(),
+                                    M,
+                                    N,
+                                    K,
+                                    StrideA,
+                                    StrideB,
+                                    std::array<ck::index_t, 1>{StrideD0},
+                                    StrideE,
+                                    a_element_op,
+                                    b_element_op,
+                                    cde_element_op);
+    auto invoker_ptr = op_ptr->MakeInvokerPointer();
+    if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+    {
+        invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+    }
+    else
+    {
+        throw std::runtime_error(
+            "Generic instance should be suitable for various input lengths/strides");
+    }
+    std::cout << "Done" << std::endl;
+    return 0;
+}
--- a/client_example/02_gemm_add_add_fastgelu/gemm_fastgelu.cpp
+++ b/client_example/02_gemm_add_add_fastgelu/gemm_fastgelu.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #include <iomanip>
 #include <vector>
@@ -72,7 +72,7 @@ int main(int argc, char* argv[])
        StrideA = std::stoi(argv[4]);
        StrideB = std::stoi(argv[5]);
-        StrideE = std::stoi(argv[8]);
+        StrideE = std::stoi(argv[6]);
    }
    else
    {

--- a/client_example/02_gemm_add_add_fastgelu/gemm_fastgelu_generic.cpp
+++ b/client_example/02_gemm_add_add_fastgelu/gemm_fastgelu_generic.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+#include <iomanip>
+#include <vector>
+#include <iostream>
+#include <stdexcept>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/tensor_operation_instance/gpu/gemm_fastgelu.hpp"
+using F16 = ck::half_t;
+using F32 = float;
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using FastGelu    = ck::tensor_operation::element_wise::FastGelu;
+using AElementOp   = PassThrough;
+using BElementOp   = PassThrough;
+using CDEElementOp = FastGelu;
+using ADataType = F16;
+using BDataType = F16;
+using EDataType = F16;
+using ALayout = Row;
+using BLayout = Col;
+using ELayout = Row;
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+    {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+    }
+    void* GetDeviceBuffer() { return p_mem_; }
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+    void* p_mem_;
+};
+int main(int argc, char* argv[])
+{
+    // GEMM shape
+    ck::index_t M = 3840;
+    ck::index_t N = 4096;
+    ck::index_t K = 4096;
+    ck::index_t StrideA = 4096;
+    ck::index_t StrideB = 4096;
+    ck::index_t StrideE = 4096;
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 7)
+    {
+        M = std::stoi(argv[1]);
+        N = std::stoi(argv[2]);
+        K = std::stoi(argv[3]);
+        StrideA = std::stoi(argv[4]);
+        StrideB = std::stoi(argv[5]);
+        StrideE = std::stoi(argv[6]);
+    }
+    else
+    {
+        printf("arg1 to 6: M, N, K, StrideA, StrideB, StrideE\n");
+        exit(0);
+    }
+    auto f_matrix_space_size =
+        [](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
+            using Layout = decltype(layout);
+            if constexpr(std::is_same<Layout, ck::tensor_layout::gemm::RowMajor>::value)
+            {
+                return (nRow - 1) * stride + nCol;
+            }
+            else
+            {
+                return (nCol - 1) * stride + nRow;
+            }
+        };
+    SimpleDeviceMem a_device_buf(sizeof(ADataType) * f_matrix_space_size(M, K, StrideA, ALayout{}));
+    SimpleDeviceMem b_device_buf(sizeof(BDataType) * f_matrix_space_size(K, N, StrideB, BLayout{}));
+    SimpleDeviceMem e_device_buf(sizeof(EDataType) * f_matrix_space_size(M, N, StrideE, ELayout{}));
+    using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleD<
+        ALayout,
+        BLayout,
+        ck::Tuple<>,
+        ELayout,
+        ADataType,
+        BDataType,
+        ck::Tuple<>,
+        EDataType,
+        ck::tensor_operation::element_wise::PassThrough,
+        ck::tensor_operation::element_wise::PassThrough,
+        ck::tensor_operation::element_wise::FastGelu>;
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+    const auto a_element_op   = AElementOp{};
+    const auto b_element_op   = BElementOp{};
+    const auto cde_element_op = CDEElementOp{};
+    // get generic instance
+    auto& op_ptr = op_ptrs[0];
+    std::cout << "Run the generic instance without timing: " << op_ptr->GetTypeString()
+              << std::endl;
+    // run the generic instance
+    auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
+                                                    b_device_buf.GetDeviceBuffer(),
+                                                    {},
+                                                    e_device_buf.GetDeviceBuffer(),
+                                                    M,
+                                                    N,
+                                                    K,
+                                                    StrideA,
+                                                    StrideB,
+                                                    {},
+                                                    StrideE,
+                                                    a_element_op,
+                                                    b_element_op,
+                                                    cde_element_op);
+    auto invoker_ptr = op_ptr->MakeInvokerPointer();
+    if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+    {
+        invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+    }
+    else
+    {
+        throw std::runtime_error(
+            "Generic instance should be suitable for various input lengths/strides");
+    }
+    std::cout << "Done" << std::endl;
+    return 0;
+}
--- a/client_example/03_gemm_layernorm/gemm_add_add_layernorm_naive.cpp
+++ b/client_example/03_gemm_layernorm/gemm_add_add_layernorm_naive.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #include <iomanip>
 #include <vector>
@@ -172,18 +172,19 @@ int main()
            BLayout,
            CLayout>();
-    const auto normalize_ptrs =
-        ck::tensor_operation::device::instance::get_device_normalize_from_mean_meansquare_instances<
-            CDataType,
-            ReduceDataType,
-            ReduceDataType,
-            GammaDataType,
-            BetaDataType,
-            LayerNormOutDataType>();
    std::cout << "found " << gemm_reduce_ptrs.size()
              << " gemm_reduceMean_reduceSquareMean instances" << std::endl;
+    using NormalizeDeviceOp = ck::tensor_operation::device::DeviceElementwise<
+        ck::Tuple<CDataType, ReduceDataType, ReduceDataType, GammaDataType, BetaDataType>,
+        ck::Tuple<LayerNormOutDataType>,
+        ck::tensor_operation::element_wise::Normalize,
+        2>;
+    const auto normalize_ptrs =
+        ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+            NormalizeDeviceOp>::GetInstances();
    std::cout << "found " << normalize_ptrs.size() << " normalize instances" << std::endl;
    auto f_matrix_space_size =

--- a/client_example/03_gemm_layernorm/gemm_add_relu_add_layernorm_welford.cpp
+++ b/client_example/03_gemm_layernorm/gemm_add_relu_add_layernorm_welford.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #include <iomanip>
 #include <iostream>

--- a/client_example/04_contraction/contraction_bilinear_fp32.cpp
+++ b/client_example/04_contraction/contraction_bilinear_fp32.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #include <iomanip>
 #include <numeric>

--- a/client_example/04_contraction/contraction_bilinear_fp64.cpp
+++ b/client_example/04_contraction/contraction_bilinear_fp64.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #include <iomanip>
 #include <numeric>

--- a/client_example/04_contraction/contraction_g1m2n3k1_add_xdl_fp16.cpp
+++ b/client_example/04_contraction/contraction_g1m2n3k1_add_xdl_fp16.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #include <iomanip>
 #include <numeric>

--- a/client_example/04_contraction/contraction_scale_fp32.cpp
+++ b/client_example/04_contraction/contraction_scale_fp32.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #include <iomanip>
 #include <numeric>

--- a/client_example/04_contraction/contraction_scale_fp64.cpp
+++ b/client_example/04_contraction/contraction_scale_fp64.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #include <iomanip>
 #include <numeric>