merge develop to attn-train-develop-qloop

76f2b6cd · danyao12 · 9b4c780a · 1ee99dca · 76f2b6cd · 76f2b6cd
Commit 76f2b6cd authored Jul 14, 2023 by danyao12
20 changed files
--- a/example/46_gemm_add_multiply/README.md
+++ b/example/46_gemm_add_multiply/README.md
+# Instructions for ```example_gemm_add_multiply_dl_fp16```
+
+## Run ```example_gemm_add_multiply_dl_fp16```
+```bash
+#arg1: verification (0=no, 1=yes)
+#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
+#arg3: time kernel (0=no, 1=yes)
+#arg4 to 11: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD0, StrideD1, StrideE"
+./bin/example_gemm_add_multiply_dl_fp16 1 1 1
+```
+
+Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16)
+```
+a_m_k: dim 2, lengths {3840, 4096}, strides {4096, 1}
+b_k_n: dim 2, lengths {4096, 4096}, strides {4096, 1}
+d0_m_n: dim 2, lengths {3840, 4096}, strides {0, 1}
+d1_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1}
+e_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1}
+arg.a_grid_desc_k0_m0_m1_k1_{2048, 3840, 2}
+arg.b_grid_desc_k0_n0_n1_k1_{2048, 4096, 2}
+arg.e_grid_desc_m_n_{ 3840, 4096}
+launch_and_time_kernel: grid_dim {960, 1, 1}, block_dim {256, 1, 1}
+Warm up 1 time
+Start running 10 times...
+Perf: 3.99904 ms, 32.22 TFlops, 31.9913 GB/s, DeviceGemmMultipleD_Dl<256, 128, 128, 16, 2, 4, 4, 1>
+```
--- a/example/46_gemm_add_multiply/common.hpp
+++ b/example/46_gemm_add_multiply/common.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <algorithm>
+#include <cstddef>
+#include <iostream>
+#include <stdexcept>
+#include <string>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/utility/data_type.hpp"
+
+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using AddMultiply = ck::tensor_operation::element_wise::AddMultiply;
+
+using BF16 = ck::bhalf_t;
+using F16  = ck::half_t;
+using F32  = float;
+using I8   = int8_t;
+using I32  = int32_t;
+
+struct ProblemSize final
+{
+    ck::index_t M = 3840;
+    ck::index_t N = 4096;
+    ck::index_t K = 4096;
+
+    ck::index_t StrideA  = 4096;
+    ck::index_t StrideB  = 4096;
+    ck::index_t StrideD0 = 0;
+    ck::index_t StrideD1 = 4096;
+    ck::index_t StrideE  = 4096;
+};
+
+struct ExecutionConfig final
+{
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
+};
+
+inline bool
+parse_cmd_args(int argc, char* argv[], ProblemSize& problem_size, ExecutionConfig& config)
+{
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 4)
+    {
+        config.do_verification = std::stoi(argv[1]);
+        config.init_method     = std::stoi(argv[2]);
+        config.time_kernel     = std::stoi(argv[3]);
+    }
+    else if(argc == 12)
+    {
+        config.do_verification = std::stoi(argv[1]);
+        config.init_method     = std::stoi(argv[2]);
+        config.time_kernel     = std::stoi(argv[3]);
+
+        problem_size.M = std::stoi(argv[4]);
+        problem_size.N = std::stoi(argv[5]);
+        problem_size.K = std::stoi(argv[6]);
+
+        problem_size.StrideA  = std::stoi(argv[7]);
+        problem_size.StrideB  = std::stoi(argv[8]);
+        problem_size.StrideD0 = std::stoi(argv[9]);
+        problem_size.StrideD1 = std::stoi(argv[10]);
+        problem_size.StrideE  = std::stoi(argv[11]);
+    }
+    else
+    {
+        std::cerr << "arg1: verification (0=no, 1=yes)" << std::endl
+                  << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)"
+                  << std::endl
+                  << "arg3: time kernel (0=no, 1=yes)" << std::endl
+                  << "arg4 to 10: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD0, StrideD1, "
+                     "StrideE"
+                  << std::endl;
+        return false;
+    }
+
+    return true;
+}
--- a/example/46_gemm_add_multiply/gemm_add_multiply_dl_fp16.cpp
+++ b/example/46_gemm_add_multiply/gemm_add_multiply_dl_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_dl.hpp"
+
+using ADataType   = F16;
+using BDataType   = F16;
+using AccDataType = F32;
+using D0DataType  = F16;
+using D1DataType  = F16;
+using DsDataType  = ck::Tuple<D0DataType, D1DataType>;
+using EDataType   = F16;
+
+using ALayout  = Row;
+using BLayout  = Row;
+using D0Layout = Row;
+using D1Layout = Row;
+using DsLayout = ck::Tuple<D0Layout, D1Layout>;
+using ELayout  = Row;
+
+using AElementOp   = PassThrough;
+using BElementOp   = PassThrough;
+using CDEElementOp = AddMultiply;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::MNPadding;
+
+// clang-format off
+using DeviceOpInstance = ck::tensor_operation::device::
+        //  ##################| ALayout| BLayout| DsLayout| ELayout|     AData|     BData|     AccData|     DsData|     EData|           A|           B|          CDE|           GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|       ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|       BBlockTransfer|     CThreadTransfer|  CThreadTransfer|    CThreadTransfer|
+        //  ##################|        |        |         |        |      Type|      Type|        Type|       Type|      Type| Elementwise| Elementwise|  Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor|        SrcDstAccess|  SrcDstVectorDim| DstScalarPerVector|
+        //  ##################|        |        |         |        |          |          |            |           |          |   Operation|   Operation|    Operation|               |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder|  Lengths_K0_N0_N1_K1|               Order|                 |                   |
+        //  ##################|        |        |         |        |          |          |            |           |          |            |            |             |               |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                     |                   |                     |               |               |                    |                   |                     |                    |                 |                   |
+        DeviceGemmMultipleD_Dl< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,   256,   128,   128,    16,  2,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<2, 1, 4, 2>,      S<8, 1,  32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>;
+// clang-format on
+
+using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
+                                                                        BDataType,
+                                                                        AccDataType,
+                                                                        AccDataType,
+                                                                        AElementOp,
+                                                                        BElementOp,
+                                                                        PassThrough>;
+
+#include "run_gemm_add_multiply_example.inc"
+
+int main(int argc, char* argv[]) { return !run_gemm_add_multiply_example(argc, argv); }
--- a/example/46_gemm_add_multiply/gemm_add_multiply_xdl_fp16.cpp
+++ b/example/46_gemm_add_multiply/gemm_add_multiply_xdl_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp"
+
+using ADataType   = F16;
+using BDataType   = F16;
+using AccDataType = F32;
+using D0DataType  = F16;
+using D1DataType  = F16;
+using DsDataType  = ck::Tuple<D0DataType, D1DataType>;
+using EDataType   = F16;
+
+using ALayout  = Row;
+using BLayout  = Row;
+using D0Layout = Row;
+using D1Layout = Row;
+using DsLayout = ck::Tuple<D0Layout, D1Layout>;
+using ELayout  = Row;
+
+using AElementOp   = PassThrough;
+using BElementOp   = PassThrough;
+using CDEElementOp = AddMultiply;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::MNPadding;
+
+// clang-format off
+using DeviceOpInstance = ck::tensor_operation::device::
+        //##############################|      A|      B|       Ds|      E| AData| BData| AccData| CShuffle|     DsData| EData|           A|           B|         CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //##############################| Layout| Layout|   Layout| Layout|  Type|  Type|    Type| DataType|       Type|  Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //##############################|       |       |         |       |      |      |        |         |           |      |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //##############################|       |       |         |       |      |      |        |         |           |      |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row, DsLayout,    Row,   F16,   F16,     F32,      F16, DsDataType,   F16, PassThrough, PassThrough, CDEElementOp,    GemmDefault,        1,   128,   128,   128,    32,   8,   2,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,         0,           1,           1,               S<1, 16, 1, 8>,               8>;
+// clang-format on
+
+using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
+                                                                        BDataType,
+                                                                        AccDataType,
+                                                                        AccDataType,
+                                                                        AElementOp,
+                                                                        BElementOp,
+                                                                        PassThrough>;
+
+#include "run_gemm_add_multiply_example.inc"
+
+int main(int argc, char* argv[]) { return !run_gemm_add_multiply_example(argc, argv); }
--- a/example/46_gemm_add_multiply/run_gemm_add_multiply_example.inc
+++ b/example/46_gemm_add_multiply/run_gemm_add_multiply_example.inc
+#pragma once
+
+bool run_gemm_add_multiply(const ProblemSize& problem_size, const ExecutionConfig& config)
+{
+    using namespace ck::literals;
+
+    auto& [M, N, K, StrideA, StrideB, StrideD0, StrideD1, StrideE] = problem_size;
+
+    auto f_host_tensor_descriptor =
+        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
+            {
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
+            }
+            else
+            {
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
+            }
+        };
+
+    Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
+    Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
+    Tensor<D0DataType> d0_m_n(f_host_tensor_descriptor(M, N, StrideD0, D0Layout{}));
+    Tensor<D1DataType> d1_m_n(f_host_tensor_descriptor(M, N, StrideD1, D1Layout{}));
+    Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
+    Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
+
+    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
+    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
+    std::cout << "d0_m_n: " << d0_m_n.mDesc << std::endl;
+    std::cout << "d1_m_n: " << d1_m_n.mDesc << std::endl;
+    std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;
+
+    switch(config.init_method)
+    {
+    case 0: break;
+    case 1:
+        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
+        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+        d0_m_n.GenerateTensorValue(GeneratorTensor_2<D0DataType>{-5, 5});
+        d1_m_n.GenerateTensorValue(GeneratorTensor_2<D1DataType>{-1, 1});
+        break;
+    default:
+        a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+        b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+        d0_m_n.GenerateTensorValue(GeneratorTensor_3<D0DataType>{0.0, 1.0});
+        d1_m_n.GenerateTensorValue(GeneratorTensor_3<D1DataType>{0.0, 1.0});
+    }
+
+    DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem d0_device_buf(sizeof(D0DataType) * d0_m_n.mDesc.GetElementSpaceSize());
+    DeviceMem d1_device_buf(sizeof(D1DataType) * d1_m_n.mDesc.GetElementSpaceSize());
+    DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
+
+    a_device_buf.ToDevice(a_m_k.mData.data());
+    b_device_buf.ToDevice(b_k_n.mData.data());
+    d0_device_buf.ToDevice(d0_m_n.mData.data());
+    d1_device_buf.ToDevice(d1_m_n.mData.data());
+
+    auto a_element_op   = AElementOp{};
+    auto b_element_op   = BElementOp{};
+    auto cde_element_op = CDEElementOp{};
+
+    // do GEMM
+    auto device_op = DeviceOpInstance{};
+    auto invoker   = device_op.MakeInvoker();
+    auto argument =
+        device_op.MakeArgument(a_device_buf.GetDeviceBuffer(),
+                               b_device_buf.GetDeviceBuffer(),
+                               {d0_device_buf.GetDeviceBuffer(), d1_device_buf.GetDeviceBuffer()},
+                               e_device_buf.GetDeviceBuffer(),
+                               M,
+                               N,
+                               K,
+                               StrideA,
+                               StrideB,
+                               {StrideD0, StrideD1},
+                               StrideE,
+                               a_element_op,
+                               b_element_op,
+                               cde_element_op);
+
+    if(!device_op.IsSupportedArgument(argument))
+    {
+        std::cout << "wrong! this device_op instance does not support this problem" << std::endl;
+        return true;
+    }
+
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
+
+    std::size_t flop      = 2_uz * M * N * K;
+    std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
+                            sizeof(D0DataType) * N + sizeof(D1DataType) * M * N +
+                            sizeof(EDataType) * M * N;
+
+    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << device_op.GetTypeString() << std::endl;
+
+    if(config.do_verification)
+    {
+        Tensor<AccDataType> c_m_n({M, N});
+
+        auto ref_gemm    = ReferenceGemmInstance{};
+        auto ref_invoker = ref_gemm.MakeInvoker();
+
+        auto ref_argument =
+            ref_gemm.MakeArgument(a_m_k, b_k_n, c_m_n, a_element_op, b_element_op, PassThrough{});
+
+        ref_invoker.Run(ref_argument);
+
+        for(int m = 0; m < M; ++m)
+        {
+            for(int n = 0; n < N; ++n)
+            {
+                cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n), d0_m_n(m, n), d1_m_n(m, n));
+            }
+        }
+
+        e_device_buf.FromDevice(e_m_n_device_result.mData.data());
+
+        return ck::utils::check_err(e_m_n_device_result, e_m_n_host_result);
+    }
+
+    return true;
+}
+
+bool run_gemm_add_multiply_example(int argc, char* argv[])
+{
+    ProblemSize problem_size;
+    ExecutionConfig config;
+
+    return !parse_cmd_args(argc, argv, problem_size, config) ||
+           run_gemm_add_multiply(problem_size, config);
+}
--- a/example/47_gemm_bias_softmax_gemm_permute/CMakeLists.txt
+++ b/example/47_gemm_bias_softmax_gemm_permute/CMakeLists.txt
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
+   add_example_executable(example_gemm_bias_softmax_gemm_permute gemm_bias_softmax_gemm_permute.cpp)
+   set(target 1)
+ endif()
+endforeach()
--- a/example/47_gemm_bias_softmax_gemm_permute/gemm_bias_softmax_gemm_permute.cpp
+++ b/example/47_gemm_bias_softmax_gemm_permute/gemm_bias_softmax_gemm_permute.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+#include <vector>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp"
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+using AElementOp    = ck::tensor_operation::element_wise::PassThrough;
+using B0ElementOp   = ck::tensor_operation::element_wise::PassThrough;
+using C0DEElementOp = ck::tensor_operation::element_wise::ScaleAdd;
+using Acc0ElementOp = ck::tensor_operation::element_wise::PassThrough;
+using B1ElementOp   = ck::tensor_operation::element_wise::PassThrough;
+using CElementOp    = ck::tensor_operation::element_wise::PassThrough;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
+constexpr static auto MaskingSpec =
+    ck::tensor_operation::device::MaskingSpecialization::MaskDisabled;
+static constexpr auto TensorSpecA  = ck::tensor_operation::device::TensorSpecialization::Default;
+static constexpr auto TensorSpecB0 = ck::tensor_operation::device::TensorSpecialization::Default;
+static constexpr auto TensorSpecB1 = ck::tensor_operation::device::TensorSpecialization::Default;
+static constexpr auto TensorSpecC  = ck::tensor_operation::device::TensorSpecialization::Default;
+
+using F16              = ck::half_t;
+using F32              = float;
+using ADataType        = F16;
+using B0DataType       = F16;
+using B1DataType       = F16;
+using AccDataType      = F32;
+using CShuffleDataType = F32;
+using CDataType        = F16;
+using D0DataType       = F16;
+using Acc0BiasDataType = ck::Tuple<D0DataType>;
+using Acc1BiasDataType = ck::Tuple<>;
+
+static constexpr ck::index_t NumDimG = 2;
+static constexpr ck::index_t NumDimM = 1;
+static constexpr ck::index_t NumDimN = 1;
+static constexpr ck::index_t NumDimK = 1;
+static constexpr ck::index_t NumDimO = 1;
+
+using DeviceOpInstance =
+    ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<
+        NumDimG,
+        NumDimM,
+        NumDimN,
+        NumDimK,
+        NumDimO,
+        ADataType,
+        B0DataType,
+        B1DataType,
+        CDataType,
+        Acc0BiasDataType,
+        Acc1BiasDataType,
+        AccDataType,
+        CShuffleDataType,
+        AElementOp,
+        B0ElementOp,
+        C0DEElementOp,
+        B1ElementOp,
+        CElementOp,
+        GemmSpec,
+        TensorSpecA,
+        TensorSpecB0,
+        TensorSpecB1,
+        TensorSpecC,
+        1,
+        256,
+        128,         // MPerBlock
+        128,         // NPerBlock
+        32,          // KPerBlock
+        64,          // Gemm1NPerBlock
+        32,          // Gemm1KPerBlock
+        8,           // AK1
+        8,           // BK1
+        2,           // B1K1
+        32,          // MPerXDL
+        32,          // NPerXDL
+        1,           // MXdlPerWave
+        4,           // NXdlPerWave
+        2,           // Gemm1NXdlPerWave
+        S<4, 64, 1>, // ABlockTransfer
+        S<1, 0, 2>,
+        S<1, 0, 2>,
+        2,
+        8,
+        8,
+        true,
+        S<4, 64, 1>, // BBlockTransfer
+        S<1, 0, 2>,
+        S<1, 0, 2>,
+        2,
+        8,
+        8,
+        true,
+        S<16, 16, 1>, // B1BlockTransfer
+        S<0, 2, 1>,
+        S<0, 2, 1>,
+        1,
+        4,
+        2,
+        false,
+        1,              // CShuffleMXdlPerWavePerShuffle
+        2,              // CShuffleNXdlPerWavePerShuffle
+        S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+        8,              // CShuffleBlockTransferScalarPerVector_NPerBlock
+        MaskingSpec,    // MaskingSpecialization
+        1>;
+
+// Ref Gemm0: fp16 in, fp32 out
+using ReferenceGemm0Instance = ck::tensor_operation::host::ReferenceBatchedGemm<ADataType,
+                                                                                B0DataType,
+                                                                                AccDataType,
+                                                                                AccDataType,
+                                                                                AElementOp,
+                                                                                B0ElementOp,
+                                                                                Acc0ElementOp>;
+
+// Ref Softmax: fp32 in, fp16 out
+using ReferenceSoftmaxInstance =
+    ck::tensor_operation::host::ReferenceSoftmax<AccDataType, ADataType, AccDataType>;
+
+// Ref Gemm1: fp16 in, fp16 out
+using ReferenceGemm1Instance = ck::tensor_operation::host::ReferenceBatchedGemm<ADataType,
+                                                                                B1DataType,
+                                                                                CDataType,
+                                                                                AccDataType,
+                                                                                AElementOp,
+                                                                                B1ElementOp,
+                                                                                CElementOp>;
+int main(int argc, char* argv[])
+{
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
+
+    int G0      = 3;
+    int G1      = 2;
+    int M       = 1024;
+    int N       = 1024;
+    int K       = 64;
+    int O       = 64;
+    float alpha = 1;
+
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 4)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+    }
+    else if(argc == 11)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+
+        M  = std::stoi(argv[4]);
+        N  = std::stoi(argv[5]);
+        K  = std::stoi(argv[6]);
+        O  = std::stoi(argv[7]);
+        G0 = std::stoi(argv[8]);
+        G1 = std::stoi(argv[9]);
+
+        alpha = std::stof(argv[10]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+        printf("arg3: time kernel (0=no, 1=yes)\n");
+        printf("arg4 to 11: M, N, K, O, G0, G1\n");
+        printf("arg10: scale (alpha)\n");
+        exit(0);
+    }
+
+    std::vector<ck::index_t> a_gs_ms_ks_lengths{G0, G1, M, K};
+    std::vector<ck::index_t> a_gs_ms_ks_strides{
+        M * G1 * K, K, G1 * K, 1}; // A layout [G0, M, G1, K]
+
+    std::vector<ck::index_t> b0_gs_ns_ks_lengths{G0, G1, N, K};
+    std::vector<ck::index_t> b0_gs_ns_ks_strides{
+        N * G1 * K, K, G1 * K, 1}; // B0 layout [G0, N, G1, K]
+
+    std::vector<ck::index_t> b1_gs_os_ns_lengths{G0, G1, O, N};
+    std::vector<ck::index_t> b1_gs_os_ns_strides{
+        N * G1 * O, O, 1, G1 * O}; // B1 layout [G0, N, G1, O]
+
+    std::vector<ck::index_t> c_gs_ms_os_lengths{G0, G1, M, O};
+    std::vector<ck::index_t> c_gs_ms_os_strides{
+        M * G1 * O, O, G1 * O, 1}; // C layout [G0, M, G1, O]
+
+    // D layout [G0, M, G1, N]
+    std::vector<ck::index_t> d0_gs_ms_ns_lengths{G0, G1, M, N};
+    std::vector<ck::index_t> d0_gs_ms_ns_strides{M * G1 * N, N, G1 * N, 1};
+
+    Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides);
+    Tensor<B0DataType> b0_gs_ns_ks(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides);
+    Tensor<B1DataType> b1_gs_os_ns(b1_gs_os_ns_lengths, b1_gs_os_ns_strides);
+    Tensor<D0DataType> d0_gs_ms_ns(d0_gs_ms_ns_lengths, d0_gs_ms_ns_strides);
+    Tensor<CDataType> c_gs_ms_os_host_result(c_gs_ms_os_lengths, c_gs_ms_os_strides);
+    Tensor<CDataType> c_gs_ms_os_device_result(c_gs_ms_os_lengths, c_gs_ms_os_strides);
+
+    std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl;
+    std::cout << "b0_gs_ns_ks: " << b0_gs_ns_ks.mDesc << std::endl;
+    std::cout << "b1_gs_os_ns: " << b1_gs_os_ns.mDesc << std::endl;
+    std::cout << "c_gs_ms_os: " << c_gs_ms_os_host_result.mDesc << std::endl;
+
+    switch(init_method)
+    {
+    case 0: break;
+    case 1:
+        a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
+        b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_2<B0DataType>{-2, 2});
+        b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_2<B1DataType>{-2, 2});
+        d0_gs_ms_ns.GenerateTensorValue(GeneratorTensor_2<D0DataType>{-2, 2});
+        break;
+    case 2:
+        a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+        b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_3<B0DataType>{0.0, 1.0});
+        b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_3<B1DataType>{-0.5, 0.5});
+        d0_gs_ms_ns.GenerateTensorValue(GeneratorTensor_2<D0DataType>{-1, 1});
+        break;
+    case 3:
+        a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
+        b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Diagonal<B0DataType>{});
+        b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal<B1DataType>{});
+        d0_gs_ms_ns.GenerateTensorValue(GeneratorTensor_1<D0DataType>{1});
+        break;
+    default:
+        a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_Sequential<2>{});
+        b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Diagonal<B0DataType>{});
+        b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal<B1DataType>{});
+        d0_gs_ms_ns.GenerateTensorValue(GeneratorTensor_1<D0DataType>{1});
+    }
+
+    DeviceMem a_device_buf(sizeof(ADataType) * G0 * G1 * M * K);
+    DeviceMem b0_device_buf(sizeof(B0DataType) * G0 * G1 * N * K);
+    DeviceMem d0_device_buf(sizeof(D0DataType) * G0 * G1 * M * N);
+    DeviceMem b1_device_buf(sizeof(B1DataType) * G0 * G1 * O * N);
+    DeviceMem c_device_buf(sizeof(CDataType) * G0 * G1 * M * O);
+
+    a_device_buf.ToDevice(a_gs_ms_ks.mData.data());
+    b0_device_buf.ToDevice(b0_gs_ns_ks.mData.data());
+    b1_device_buf.ToDevice(b1_gs_os_ns.mData.data());
+    d0_device_buf.ToDevice(d0_gs_ms_ns.mData.data());
+
+    auto device_op = DeviceOpInstance{};
+    auto invoker   = device_op.MakeInvoker();
+
+    auto a_element_op    = AElementOp{};
+    auto b0_element_op   = B0ElementOp{};
+    auto c0de_element_op = C0DEElementOp{alpha};
+    auto acc0_element_op = Acc0ElementOp{};
+    auto b1_element_op   = B1ElementOp{};
+    auto c_element_op    = CElementOp{};
+
+    auto argument = device_op.MakeArgument(
+        static_cast<const ADataType*>(a_device_buf.GetDeviceBuffer()),
+        static_cast<const B0DataType*>(b0_device_buf.GetDeviceBuffer()),
+        static_cast<const B1DataType*>(b1_device_buf.GetDeviceBuffer()),
+        static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
+        std::array<void*, 1>{d0_device_buf.GetDeviceBuffer()}, // p_acc0_biases
+        {},                                                    // p_acc1_biases
+        a_gs_ms_ks_lengths,
+        a_gs_ms_ks_strides,
+        b0_gs_ns_ks_lengths,
+        b0_gs_ns_ks_strides,
+        b1_gs_os_ns_lengths,
+        b1_gs_os_ns_strides,
+        c_gs_ms_os_lengths,
+        c_gs_ms_os_strides,
+        std::array<std::vector<ck::index_t>, 1>{
+            d0_gs_ms_ns_lengths}, // acc0_biases_gs_ms_ns_lengths
+        std::array<std::vector<ck::index_t>, 1>{
+            d0_gs_ms_ns_strides}, // acc0_biases_gs_ms_ns_strides
+        {},                       // acc1_biases_gs_ms_os_lengths
+        {},                       // acc1_biases_gs_ms_os_strides
+        a_element_op,
+        b0_element_op,
+        c0de_element_op,
+        b1_element_op,
+        c_element_op);
+
+    if(!device_op.IsSupportedArgument(argument))
+    {
+        throw std::runtime_error("wrong! this device_op instance does not support this problem");
+    }
+
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+
+    ck::index_t BatchCount = G0 * G1;
+    std::size_t flop       = (size_t(M) * N * K * 2 + size_t(M) * N * O * 2) * BatchCount;
+    std::size_t num_btype =
+        (sizeof(ADataType) * M * K + sizeof(B0DataType) * K * N + sizeof(B1DataType) * N * O +
+         sizeof(CDataType) * M * O + sizeof(D0DataType) * M * N) *
+        BatchCount;
+
+    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << std::endl;
+
+    if(do_verification)
+    {
+        c_device_buf.FromDevice(c_gs_ms_os_device_result.mData.data());
+
+        Tensor<ADataType> a_g_m_k({BatchCount, M, K});
+        Tensor<B0DataType> b0_g_k_n({BatchCount, K, N});
+        Tensor<B1DataType> b1_g_n_o({BatchCount, N, O});
+        Tensor<AccDataType> acc0_g_m_n({BatchCount, M, N});        // scratch object after gemm0
+        Tensor<ADataType> a1_g_m_n({BatchCount, M, N});            // scratch object after softmax
+        Tensor<CDataType> c_g_m_o_host_result({BatchCount, M, O}); // scratch object after gemm1
+        Tensor<D0DataType> d0_g_m_n({BatchCount, M, N});
+
+        // permute
+        a_gs_ms_ks.ForEach([&](auto& self, auto idx) {
+            a_g_m_k(idx[0] * G1 + idx[1], idx[2], idx[3]) = self(idx);
+        });
+        b0_gs_ns_ks.ForEach([&](auto& self, auto idx) {
+            b0_g_k_n(idx[0] * G1 + idx[1], idx[3], idx[2]) = self(idx);
+        });
+        b1_gs_os_ns.ForEach([&](auto& self, auto idx) {
+            b1_g_n_o(idx[0] * G1 + idx[1], idx[3], idx[2]) = self(idx);
+        });
+        d0_gs_ms_ns.ForEach([&](auto& self, auto idx) {
+            d0_g_m_n(idx[0] * G1 + idx[1], idx[2], idx[3]) = self(idx);
+        });
+
+        // gemm 0
+        auto ref_gemm0          = ReferenceGemm0Instance{};
+        auto ref_gemm0_invoker  = ref_gemm0.MakeInvoker();
+        auto ref_gemm0_argument = ref_gemm0.MakeArgument(
+            a_g_m_k, b0_g_k_n, acc0_g_m_n, a_element_op, b0_element_op, acc0_element_op);
+
+        ref_gemm0_invoker.Run(ref_gemm0_argument);
+
+        acc0_g_m_n.ForEach([&](auto&, auto idx) {
+            c0de_element_op(acc0_g_m_n(idx), acc0_g_m_n(idx), d0_g_m_n(idx));
+        });
+        // masking
+        const auto mask = DeviceOpInstance::C0MatrixMask(N);
+        acc0_g_m_n.ForEach([&](auto& self, auto idx) {
+            if(mask.IsMaskedElement(idx[1], idx[2]))
+                self(idx) = -ck::NumericLimits<float>::Infinity();
+        });
+
+        // softmax
+        auto ref_softmax          = ReferenceSoftmaxInstance{};
+        auto ref_softmax_invoker  = ref_softmax.MakeInvoker();
+        auto ref_softmax_argument = ref_softmax.MakeArgument(acc0_g_m_n, a1_g_m_n, 1, 0, {2});
+
+        ref_softmax_invoker.Run(ref_softmax_argument);
+
+        // gemm1
+        auto ref_gemm1          = ReferenceGemm1Instance{};
+        auto ref_gemm1_invoker  = ref_gemm1.MakeInvoker();
+        auto ref_gemm1_argument = ref_gemm1.MakeArgument(
+            a1_g_m_n, b1_g_n_o, c_g_m_o_host_result, PassThrough{}, b1_element_op, c_element_op);
+
+        ref_gemm1_invoker.Run(ref_gemm1_argument);
+
+        // permute
+        c_gs_ms_os_host_result.ForEach([&](auto& self, auto idx) {
+            const size_t& g0 = idx[0];
+            const size_t& g1 = idx[1];
+
+            const size_t g = g0 * G1 + g1;
+
+            self(idx) = c_g_m_o_host_result(g, idx[2], idx[3]);
+        });
+
+        // default absolute error and relative error is 0.001
+        double rtol = 1e-3;
+        double atol = 1e-3;
+
+        return ck::utils::check_err(c_gs_ms_os_device_result.mData,
+                                    c_gs_ms_os_host_result.mData,
+                                    "Error: Incorrect results!",
+                                    rtol,
+                                    atol)
+                   ? 0
+                   : 1;
+    }
+
+    return 0;
+}
--- a/example/48_pool3d_fwd/CMakeLists.txt
+++ b/example/48_pool3d_fwd/CMakeLists.txt
+add_example_executable(example_pool3d_fwd_fp16 pool3d_fwd_fp16.cpp)
+
--- a/example/48_pool3d_fwd/pool3d_fwd_common.hpp
+++ b/example/48_pool3d_fwd/pool3d_fwd_common.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+
+#include <iostream>
+
+#include "ck/ck.hpp"
+#include "ck/utility/reduction_enums.hpp"
+#include "ck/utility/reduction_functions_accumulate.hpp"
+#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_pool3d_fwd_ndhwc_ndhwc.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp"
+
+template <typename InDataType,
+          typename OutDataType,
+          typename ComputeDataType,
+          typename IndexDataType,
+          typename InLayout,
+          typename OutLayout,
+          ck::ReduceTensorOp ReduceOpId,
+          bool PropagateNan,
+          bool OutputIndex>
+bool pool3d_test(bool do_verification,
+                 bool time_kernel,
+                 ck::index_t N,
+                 ck::index_t C,
+                 ck::index_t Z,
+                 ck::index_t Y,
+                 ck::index_t X,
+                 ck::index_t Di,
+                 ck::index_t Hi,
+                 ck::index_t Wi,
+                 ck::index_t window_stride_d,
+                 ck::index_t window_stride_h,
+                 ck::index_t window_stride_w,
+                 ck::index_t in_left_pad_d,
+                 ck::index_t in_left_pad_h,
+                 ck::index_t in_left_pad_w,
+                 ck::index_t in_right_pad_d,
+                 ck::index_t in_right_pad_h,
+                 ck::index_t in_right_pad_w)
+{
+    using DevicePoolFwdInstance =
+        ck::tensor_operation::device::DevicePool3dFwd_Input_N_Di_Hi_Wi_C_Output_N_Do_Ho_Wo_C<
+            InDataType,      // InDataType
+            OutDataType,     // OutDataType
+            IndexDataType,   // IndexDataType
+            ComputeDataType, // ComputeDataType
+            ReduceOpId,
+            OutputIndex,
+            64, // BlockSize
+            64, // ReduceMThreadClusterSize
+            1,  // ReduceKThreadClusterSize
+            4,  // ReduceMThreadSliceSize
+            1,  // ReduceKThreadSliceSize
+            4>; // InSrcOutDstVectorSize
+
+    const ck::index_t Do = (Di + in_left_pad_d + in_right_pad_d - Z) / window_stride_d + 1;
+    const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - Y) / window_stride_h + 1;
+    const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - X) / window_stride_w + 1;
+
+    const std::vector<ck::index_t> window_spatial_lengths{Z, Y, X};
+    const std::vector<ck::index_t> window_strides{
+        window_stride_d, window_stride_h, window_stride_w};
+    const std::vector<ck::index_t> input_left_pads{in_left_pad_d, in_left_pad_h, in_left_pad_w};
+    const std::vector<ck::index_t> input_right_pads{in_right_pad_d, in_right_pad_h, in_right_pad_w};
+
+    // tensor layout
+    auto f_host_tensor_descriptor = [](std::size_t N_,
+                                       std::size_t C_,
+                                       std::size_t D,
+                                       std::size_t H,
+                                       std::size_t W,
+                                       auto layout) {
+        using namespace ck::literals;
+
+        if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NCDHW>::value)
+        {
+            return HostTensorDescriptor({N_, C_, D, H, W},
+                                        {C_ * D * H * W, D * H * W, H * W, W, 1_uz});
+        }
+        else if constexpr(ck::is_same<decltype(layout),
+                                      ck::tensor_layout::convolution::NDHWC>::value)
+        {
+            return HostTensorDescriptor({N_, C_, D, H, W},
+                                        {D * C_ * H * W, 1_uz, C_ * H * W, W * C_, C_});
+        }
+    };
+
+    Tensor<InDataType> in_n_c_di_hi_wi(f_host_tensor_descriptor(N, C, Di, Hi, Wi, InLayout{}));
+    Tensor<OutDataType> out_n_c_do_ho_wo_host(
+        f_host_tensor_descriptor(N, C, Do, Ho, Wo, OutLayout{}));
+    Tensor<IndexDataType> out_indices_n_c_do_ho_wo_host(
+        f_host_tensor_descriptor(N, C, Do, Ho, Wo, OutLayout{}));
+    Tensor<OutDataType> out_n_c_do_ho_wo_device(
+        f_host_tensor_descriptor(N, C, Do, Ho, Wo, OutLayout{}));
+    Tensor<IndexDataType> out_indices_n_c_do_ho_wo_device(
+        f_host_tensor_descriptor(N, C, Do, Ho, Wo, OutLayout{}));
+
+    std::cout << "in_n_c_di_hi_wi: " << in_n_c_di_hi_wi.mDesc << std::endl;
+    std::cout << "out_n_c_do_ho_wo: " << out_n_c_do_ho_wo_host.mDesc << std::endl;
+
+    in_n_c_di_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{-1.0, 1.0});
+
+    DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_di_hi_wi.mDesc.GetElementSpaceSize());
+    DeviceMem out_device_buf(sizeof(OutDataType) *
+                             out_n_c_do_ho_wo_device.mDesc.GetElementSpaceSize());
+    DeviceMem out_indices_device_buf(sizeof(IndexDataType) *
+                                     out_indices_n_c_do_ho_wo_device.mDesc.GetElementSpaceSize());
+
+    in_device_buf.ToDevice(in_n_c_di_hi_wi.mData.data());
+
+    auto pool         = DevicePoolFwdInstance{};
+    auto invoker_ptr  = pool.MakeInvokerPointer();
+    auto argument_ptr = pool.MakeArgumentPointer(
+        static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
+        static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
+        static_cast<IndexDataType*>(out_indices_device_buf.GetDeviceBuffer()),
+        {N, C, Di, Hi, Wi},
+        {Z, Y, X},
+        {N, C, Do, Ho, Wo},
+        {Di * C * Hi * Wi, 1, C * Hi * Wi, Wi * C, C},
+        {Do * C * Ho * Wo, 1, C * Ho * Wo, Wo * C, C},
+        {Do * C * Ho * Wo, 1, C * Ho * Wo, Wo * C, C},
+        window_strides,
+        input_left_pads,
+        input_right_pads,
+        {2, 3, 4});
+
+    if(!pool.IsSupportedArgument(argument_ptr.get()))
+    {
+        throw std::runtime_error("wrong! device_op with the specified compilation parameters does "
+                                 "not support this problem");
+    }
+
+    float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+    std::cout << "Perf: " << ave_time << std::endl;
+
+    bool pass = true;
+
+    if(do_verification)
+    {
+        using ReferencePoolingFwdInstance =
+            ck::tensor_operation::host::ReferencePoolingFwd<5,
+                                                            3,
+                                                            InDataType,
+                                                            OutDataType,
+                                                            ComputeDataType,
+                                                            IndexDataType,
+                                                            ReduceOpId,
+                                                            PropagateNan,
+                                                            OutputIndex>;
+
+        auto ref_pooling          = ReferencePoolingFwdInstance{};
+        auto ref_pooling_invoker  = ref_pooling.MakeInvoker();
+        auto ref_pooling_argument = ref_pooling.MakeArgument(in_n_c_di_hi_wi,
+                                                             out_n_c_do_ho_wo_host,
+                                                             out_indices_n_c_do_ho_wo_host,
+                                                             window_spatial_lengths,
+                                                             window_strides,
+                                                             input_left_pads,
+                                                             input_right_pads);
+
+        ref_pooling_invoker.Run(ref_pooling_argument);
+
+        out_device_buf.FromDevice(out_n_c_do_ho_wo_device.mData.data());
+
+        pass = pass && ck::utils::check_err(out_n_c_do_ho_wo_device, out_n_c_do_ho_wo_host);
+
+        if constexpr(OutputIndex)
+        {
+            out_indices_device_buf.FromDevice(out_indices_n_c_do_ho_wo_device.mData.data());
+
+            pass = pass && ck::utils::check_err(out_indices_n_c_do_ho_wo_device,
+                                                out_indices_n_c_do_ho_wo_host);
+        };
+    }
+
+    return (pass);
+};
--- a/example/48_pool3d_fwd/pool3d_fwd_fp16.cpp
+++ b/example/48_pool3d_fwd/pool3d_fwd_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/utility/reduction_enums.hpp"
+
+#include "pool3d_fwd_common.hpp"
+
+using InDataType      = ck::half_t;
+using OutDataType     = ck::half_t;
+using ComputeDataType = float;
+
+using IndexDataType = int32_t;
+
+using InLayout  = ck::tensor_layout::convolution::NDHWC;
+using OutLayout = ck::tensor_layout::convolution::NDHWC;
+
+#if 1
+static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;
+#else
+static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
+#endif
+
+static constexpr bool OutputIndex  = false;
+static constexpr bool PropagateNan = false;
+
+int main()
+{
+    bool do_verification = true;
+    bool time_kernel     = false;
+
+    // Pool shape
+    ck::index_t N               = 2;
+    ck::index_t C               = 32;
+    ck::index_t Z               = 2;
+    ck::index_t Y               = 2;
+    ck::index_t X               = 2;
+    ck::index_t Di              = 30;
+    ck::index_t Hi              = 30;
+    ck::index_t Wi              = 30;
+    ck::index_t window_stride_d = 2;
+    ck::index_t window_stride_h = 2;
+    ck::index_t window_stride_w = 2;
+    ck::index_t in_left_pad_d   = 1;
+    ck::index_t in_left_pad_h   = 1;
+    ck::index_t in_left_pad_w   = 1;
+    ck::index_t in_right_pad_d  = 1;
+    ck::index_t in_right_pad_h  = 1;
+    ck::index_t in_right_pad_w  = 1;
+
+    bool pass = pool3d_test<InDataType,
+                            OutDataType,
+                            ComputeDataType,
+                            IndexDataType,
+                            InLayout,
+                            OutLayout,
+                            ReduceOpId,
+                            PropagateNan,
+                            OutputIndex>(do_verification,
+                                         time_kernel,
+                                         N,
+                                         C,
+                                         Z,
+                                         Y,
+                                         X,
+                                         Di,
+                                         Hi,
+                                         Wi,
+                                         window_stride_d,
+                                         window_stride_h,
+                                         window_stride_w,
+                                         in_left_pad_d,
+                                         in_left_pad_h,
+                                         in_left_pad_w,
+                                         in_right_pad_d,
+                                         in_right_pad_h,
+                                         in_right_pad_w);
+
+    return (pass ? 0 : 1);
+}
--- a/example/49_maxpool2d_bwd/CMakeLists.txt
+++ b/example/49_maxpool2d_bwd/CMakeLists.txt
+add_example_executable(example_maxpool2d_bwd_bf16 maxpool2d_bwd_bf16.cpp)
+add_example_executable(example_maxpool2d_bwd_fp16 maxpool2d_bwd_fp16.cpp)
+add_example_executable(example_maxpool2d_bwd_fp32 maxpool2d_bwd_fp32.cpp)
--- a/example/49_maxpool2d_bwd/maxpool2d_bwd_bf16.cpp
+++ b/example/49_maxpool2d_bwd/maxpool2d_bwd_bf16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/utility/reduction_enums.hpp"
+
+#include "maxpool2d_bwd_common.hpp"
+
+using InDataType      = ck::bhalf_t;
+using OutDataType     = ck::bhalf_t;
+using IndexDataType   = int32_t;
+using ComputeDataType = float;
+using DInDataType     = ck::bhalf_t;
+using DOutDataType    = ck::bhalf_t;
+
+static constexpr bool PropagateNan = false;
+
+int main()
+{
+    bool do_verification = true;
+    bool time_kernel     = false;
+
+    // Pool shape
+    ck::index_t N               = 1;
+    ck::index_t C               = 1;
+    ck::index_t Y               = 3;
+    ck::index_t X               = 3;
+    ck::index_t Hi              = 32;
+    ck::index_t Wi              = 32;
+    ck::index_t window_stride_h = 1;
+    ck::index_t window_stride_w = 1;
+    ck::index_t in_left_pad_h   = 0;
+    ck::index_t in_left_pad_w   = 0;
+    ck::index_t in_right_pad_h  = 0;
+    ck::index_t in_right_pad_w  = 0;
+
+    bool pass = maxpool_bwd_test<InDataType,
+                                 OutDataType,
+                                 IndexDataType,
+                                 ComputeDataType,
+                                 DInDataType,
+                                 DOutDataType,
+                                 PropagateNan>(do_verification,
+                                               time_kernel,
+                                               N,
+                                               C,
+                                               Y,
+                                               X,
+                                               Hi,
+                                               Wi,
+                                               window_stride_h,
+                                               window_stride_w,
+                                               in_left_pad_h,
+                                               in_left_pad_w,
+                                               in_right_pad_h,
+                                               in_right_pad_w);
+
+    return (pass ? 0 : 1);
+}
--- a/example/49_maxpool2d_bwd/maxpool2d_bwd_common.hpp
+++ b/example/49_maxpool2d_bwd/maxpool2d_bwd_common.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+
+#include "ck/ck.hpp"
+#include "ck/utility/reduction_enums.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_pool2d_fwd_nhwc_nhwc.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_index_pool_bwd_impl.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_maxpool_bwd.hpp"
+
+template <typename InDataType,
+          typename OutDataType,
+          typename IndexDataType,
+          typename ComputeDataType,
+          typename DInDataType,
+          typename DOutDataType,
+          bool PropagateNan>
+bool maxpool_bwd_test(bool do_verification,
+                      bool time_kernel,
+                      ck::index_t N,
+                      ck::index_t C,
+                      ck::index_t Y,
+                      ck::index_t X,
+                      ck::index_t Hi,
+                      ck::index_t Wi,
+                      ck::index_t window_stride_h,
+                      ck::index_t window_stride_w,
+                      ck::index_t in_left_pad_h,
+                      ck::index_t in_left_pad_w,
+                      ck::index_t in_right_pad_h,
+                      ck::index_t in_right_pad_w)
+{
+    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+    using DevicePoolFwdInstance =
+        ck::tensor_operation::device::DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C<
+            InDataType,      // InDataType
+            OutDataType,     // OutDataType
+            IndexDataType,   // IndexDataType
+            ComputeDataType, // ComputeDataType
+            ck::ReduceTensorOp::MAX,
+            true, // OutputIndex
+            64,   // BlockSize
+            64,   // ReduceMThreadClusterSize
+            1,    // ReduceKThreadClusterSize
+            4,    // ReduceMThreadSliceSize
+            1,    // ReduceKThreadSliceSize
+            1>;   // InSrcOutDstVectorSize
+
+    using DeviceMaxPoolBwdInstance = ck::tensor_operation::device::
+        DeviceIndexPoolBwdImpl<DOutDataType, IndexDataType, DInDataType, 4>;
+
+    const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - Y) / window_stride_h + 1;
+    const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - X) / window_stride_w + 1;
+
+    const std::vector<ck::index_t> window_spatial_lengths{Y, X};
+    const std::vector<ck::index_t> window_strides{window_stride_h, window_stride_w};
+    const std::vector<ck::index_t> input_left_pads{in_left_pad_h, in_left_pad_w};
+    const std::vector<ck::index_t> input_right_pads{in_right_pad_h, in_right_pad_w};
+
+    auto f_host_tensor_descriptor =
+        [](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W) {
+            using namespace ck::literals;
+            // reference need Tensor with NCHW order
+            return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, 1_uz, W * C_, C_});
+        };
+
+    // in
+    Tensor<InDataType> in_n_c_hi_wi(f_host_tensor_descriptor(N, C, Hi, Wi));
+
+    // out
+    Tensor<OutDataType> out_n_c_ho_wo_host(f_host_tensor_descriptor(N, C, Ho, Wo));
+    Tensor<OutDataType> out_n_c_ho_wo_device(f_host_tensor_descriptor(N, C, Ho, Wo));
+
+    // indices
+    Tensor<IndexDataType> indices_n_c_ho_wo_device(f_host_tensor_descriptor(N, C, Ho, Wo));
+    Tensor<IndexDataType> indices_n_c_ho_wo_host(f_host_tensor_descriptor(N, C, Ho, Wo));
+
+    // dout
+    Tensor<DOutDataType> dout_n_c_ho_wo(f_host_tensor_descriptor(N, C, Ho, Wo));
+
+    // din
+    Tensor<DInDataType> din_n_c_hi_wi_host(f_host_tensor_descriptor(N, C, Hi, Wi));
+    Tensor<DInDataType> din_n_c_hi_wi_device(f_host_tensor_descriptor(N, C, Hi, Wi));
+
+    std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl;
+    std::cout << "out_n_c_ho_wo: " << out_n_c_ho_wo_host.mDesc << std::endl;
+    std::cout << "indices_n_c_ho_wo: " << indices_n_c_ho_wo_host.mDesc << std::endl;
+    std::cout << "dout_n_c_ho_wo: " << dout_n_c_ho_wo.mDesc << std::endl;
+    std::cout << "din_n_c_hi_wi: " << din_n_c_hi_wi_host.mDesc << std::endl;
+
+    in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{-1.0, 1.0});
+    dout_n_c_ho_wo.GenerateTensorValue(GeneratorTensor_3<DOutDataType>{-1.0, 1.0});
+
+    DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpaceSize());
+    DeviceMem out_device_buf(sizeof(OutDataType) *
+                             out_n_c_ho_wo_device.mDesc.GetElementSpaceSize());
+    DeviceMem indices_device_buf(sizeof(IndexDataType) *
+                                 indices_n_c_ho_wo_device.mDesc.GetElementSpaceSize());
+    DeviceMem dout_device_buf(sizeof(DOutDataType) * dout_n_c_ho_wo.mDesc.GetElementSpaceSize());
+    DeviceMem din_device_buf(sizeof(DInDataType) *
+                             din_n_c_hi_wi_device.mDesc.GetElementSpaceSize());
+
+    in_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
+    dout_device_buf.ToDevice(dout_n_c_ho_wo.mData.data());
+
+    auto pool_fwd              = DevicePoolFwdInstance{};
+    auto pool_fwd_invoker_ptr  = pool_fwd.MakeInvokerPointer();
+    auto pool_fwd_argument_ptr = pool_fwd.MakeArgumentPointer(
+        static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
+        static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
+        static_cast<IndexDataType*>(indices_device_buf.GetDeviceBuffer()),
+        {N, C, Hi, Wi},
+        window_spatial_lengths,
+        {N, C, Ho, Wo},
+        {C * Hi * Wi, 1, Wi * C, C},
+        {C * Ho * Wo, 1, Wo * C, C},
+        {C * Ho * Wo, 1, Wo * C, C},
+        window_strides,
+        input_left_pads,
+        input_right_pads,
+        {2, 3});
+
+    if(!pool_fwd.IsSupportedArgument(pool_fwd_argument_ptr.get()))
+    {
+        throw std::runtime_error("wrong! pool_fwd with the specified compilation parameters does "
+                                 "not support this problem");
+    }
+
+    float ave_time_fwd =
+        pool_fwd_invoker_ptr->Run(pool_fwd_argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+
+    auto pool_bwd              = DeviceMaxPoolBwdInstance{};
+    auto pool_bwd_invoker_ptr  = pool_bwd.MakeInvokerPointer();
+    auto pool_bwd_argument_ptr = pool_bwd.MakeArgumentPointer(
+        static_cast<DOutDataType*>(dout_device_buf.GetDeviceBuffer()),
+        static_cast<IndexDataType*>(indices_device_buf.GetDeviceBuffer()),
+        static_cast<DInDataType*>(din_device_buf.GetDeviceBuffer()),
+        dout_n_c_ho_wo.mDesc.GetElementSpaceSize(),
+        din_n_c_hi_wi_device.mDesc.GetElementSpaceSize(),
+        window_spatial_lengths,
+        window_strides);
+
+    if(!pool_bwd.IsSupportedArgument(pool_bwd_argument_ptr.get()))
+    {
+        throw std::runtime_error("wrong! pool_bwd with the specified compilation parameters does "
+                                 "not support this problem");
+    }
+
+    size_t pool_bwd_workspace_sz = pool_bwd.GetWorkSpaceSize(pool_bwd_argument_ptr.get());
+    DeviceMem pool_bwd_workspace_device_buf(pool_bwd_workspace_sz);
+    pool_bwd.SetWorkSpacePointer(pool_bwd_argument_ptr.get(),
+                                 pool_bwd_workspace_device_buf.GetDeviceBuffer());
+
+    float ave_time_bwd =
+        pool_bwd_invoker_ptr->Run(pool_bwd_argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+
+    std::cout << "Pool fwd perf: " << ave_time_fwd << " ms" << std::endl;
+    std::cout << "Pool bwd perf: " << ave_time_bwd << " ms" << std::endl;
+
+    bool pass = true;
+
+    if(do_verification)
+    {
+        using ReferencePoolingFwdInstance =
+            ck::tensor_operation::host::ReferencePoolingFwd<4,
+                                                            2,
+                                                            InDataType,
+                                                            OutDataType,
+                                                            ComputeDataType,
+                                                            IndexDataType,
+                                                            ck::ReduceTensorOp::MAX,
+                                                            PropagateNan,
+                                                            true>;
+
+        auto ref_pooling_fwd          = ReferencePoolingFwdInstance{};
+        auto ref_pooling_fwd_invoker  = ref_pooling_fwd.MakeInvoker();
+        auto ref_pooling_fwd_argument = ref_pooling_fwd.MakeArgument(in_n_c_hi_wi,
+                                                                     out_n_c_ho_wo_host,
+                                                                     indices_n_c_ho_wo_host,
+                                                                     window_spatial_lengths,
+                                                                     window_strides,
+                                                                     input_left_pads,
+                                                                     input_right_pads);
+        ref_pooling_fwd_invoker.Run(ref_pooling_fwd_argument);
+
+        using ReferencePoolingBwdInstance =
+            ck::tensor_operation::host::ReferenceMaxPoolBwd<DOutDataType,
+                                                            IndexDataType,
+                                                            ComputeDataType,
+                                                            DInDataType,
+                                                            PassThrough>;
+
+        auto ref_pooling_bwd          = ReferencePoolingBwdInstance{};
+        auto ref_pooling_bwd_invoker  = ref_pooling_bwd.MakeInvoker();
+        auto ref_pooling_bwd_argument = ref_pooling_bwd.MakeArgument(
+            dout_n_c_ho_wo, indices_n_c_ho_wo_host, din_n_c_hi_wi_host, PassThrough{});
+
+        ref_pooling_bwd_invoker.Run(ref_pooling_bwd_argument);
+
+        out_device_buf.FromDevice(out_n_c_ho_wo_device.mData.data());
+        indices_device_buf.FromDevice(indices_n_c_ho_wo_device.mData.data());
+        din_device_buf.FromDevice(din_n_c_hi_wi_device.mData.data());
+
+        pass = pass && ck::utils::check_err(out_n_c_ho_wo_device, out_n_c_ho_wo_host);
+        pass = pass && ck::utils::check_err(indices_n_c_ho_wo_device, indices_n_c_ho_wo_host);
+        pass = pass && ck::utils::check_err(din_n_c_hi_wi_device, din_n_c_hi_wi_host);
+    }
+
+    return (pass);
+};
--- a/example/49_maxpool2d_bwd/maxpool2d_bwd_fp16.cpp
+++ b/example/49_maxpool2d_bwd/maxpool2d_bwd_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/utility/reduction_enums.hpp"
+
+#include "maxpool2d_bwd_common.hpp"
+
+using InDataType      = ck::half_t;
+using OutDataType     = ck::half_t;
+using IndexDataType   = int32_t;
+using ComputeDataType = float;
+using DInDataType     = ck::half_t;
+using DOutDataType    = ck::half_t;
+
+static constexpr bool PropagateNan = false;
+
+int main()
+{
+    bool do_verification = true;
+    bool time_kernel     = false;
+
+    // Pool shape
+    ck::index_t N               = 1;
+    ck::index_t C               = 1;
+    ck::index_t Y               = 3;
+    ck::index_t X               = 3;
+    ck::index_t Hi              = 32;
+    ck::index_t Wi              = 32;
+    ck::index_t window_stride_h = 1;
+    ck::index_t window_stride_w = 1;
+    ck::index_t in_left_pad_h   = 0;
+    ck::index_t in_left_pad_w   = 0;
+    ck::index_t in_right_pad_h  = 0;
+    ck::index_t in_right_pad_w  = 0;
+
+    bool pass = maxpool_bwd_test<InDataType,
+                                 OutDataType,
+                                 IndexDataType,
+                                 ComputeDataType,
+                                 DInDataType,
+                                 DOutDataType,
+                                 PropagateNan>(do_verification,
+                                               time_kernel,
+                                               N,
+                                               C,
+                                               Y,
+                                               X,
+                                               Hi,
+                                               Wi,
+                                               window_stride_h,
+                                               window_stride_w,
+                                               in_left_pad_h,
+                                               in_left_pad_w,
+                                               in_right_pad_h,
+                                               in_right_pad_w);
+
+    return (pass ? 0 : 1);
+}
--- a/example/49_maxpool2d_bwd/maxpool2d_bwd_fp32.cpp
+++ b/example/49_maxpool2d_bwd/maxpool2d_bwd_fp32.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/utility/reduction_enums.hpp"
+
+#include "maxpool2d_bwd_common.hpp"
+
+using InDataType      = float;
+using OutDataType     = float;
+using IndexDataType   = int32_t;
+using ComputeDataType = float;
+using DInDataType     = float;
+using DOutDataType    = float;
+
+static constexpr bool PropagateNan = false;
+
+int main()
+{
+    bool do_verification = true;
+    bool time_kernel     = false;
+
+    // Pool shape
+    ck::index_t N               = 1;
+    ck::index_t C               = 1;
+    ck::index_t Y               = 2;
+    ck::index_t X               = 2;
+    ck::index_t Hi              = 32;
+    ck::index_t Wi              = 32;
+    ck::index_t window_stride_h = 2;
+    ck::index_t window_stride_w = 2;
+    ck::index_t in_left_pad_h   = 0;
+    ck::index_t in_left_pad_w   = 0;
+    ck::index_t in_right_pad_h  = 0;
+    ck::index_t in_right_pad_w  = 0;
+
+    bool pass = maxpool_bwd_test<InDataType,
+                                 OutDataType,
+                                 IndexDataType,
+                                 ComputeDataType,
+                                 DInDataType,
+                                 DOutDataType,
+                                 PropagateNan>(do_verification,
+                                               time_kernel,
+                                               N,
+                                               C,
+                                               Y,
+                                               X,
+                                               Hi,
+                                               Wi,
+                                               window_stride_h,
+                                               window_stride_w,
+                                               in_left_pad_h,
+                                               in_left_pad_w,
+                                               in_right_pad_h,
+                                               in_right_pad_w);
+
+    return (pass ? 0 : 1);
+}
--- a/example/50_put_element/CMakeLists.txt
+++ b/example/50_put_element/CMakeLists.txt
+add_example_executable(example_put_element_fp16 put_element_fp16.cpp)
--- a/example/50_put_element/put_element_fp16.cpp
+++ b/example/50_put_element/put_element_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_put_element_impl.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+
+using XDataType     = ck::half_t;
+using YDataType     = ck::half_t;
+using IndexDataType = int32_t;
+
+using YElementwiseOp = ck::tensor_operation::element_wise::PassThrough;
+
+using DeviceInstance =
+    ck::tensor_operation::device::DevicePutElementImpl<XDataType,     // XDataType
+                                                       IndexDataType, // IndexDataType
+                                                       YDataType,     // YDataType
+                                                       YElementwiseOp,
+                                                       ck::InMemoryDataOperationEnum::Set,
+                                                       1>;
+
+int main()
+{
+    bool do_verification = true;
+    bool time_kernel     = false;
+
+    int N = 1024;
+
+    Tensor<XDataType> x(HostTensorDescriptor{N, 1});
+    Tensor<IndexDataType> indices(HostTensorDescriptor{N, 1});
+    Tensor<YDataType> y(HostTensorDescriptor{N, 1});
+
+    x.GenerateTensorValue(GeneratorTensor_3<XDataType>{-1.0, 1.0});
+    for(int i = 0; i < N; ++i)
+        indices(i) = i;
+
+    DeviceMem x_device_buf(sizeof(XDataType) * x.mDesc.GetElementSpaceSize());
+    DeviceMem y_device_buf(sizeof(YDataType) * y.mDesc.GetElementSpaceSize());
+    DeviceMem indices_device_buf(sizeof(IndexDataType) * indices.mDesc.GetElementSpaceSize());
+
+    x_device_buf.ToDevice(x.mData.data());
+    indices_device_buf.ToDevice(indices.mData.data());
+
+    auto put_instance     = DeviceInstance{};
+    auto put_invoker_ptr  = put_instance.MakeInvokerPointer();
+    auto put_argument_ptr = put_instance.MakeArgumentPointer(
+        static_cast<XDataType*>(x_device_buf.GetDeviceBuffer()),
+        static_cast<IndexDataType*>(indices_device_buf.GetDeviceBuffer()),
+        static_cast<YDataType*>(y_device_buf.GetDeviceBuffer()),
+        N,
+        N,
+        YElementwiseOp{});
+
+    if(!put_instance.IsSupportedArgument(put_argument_ptr.get()))
+    {
+        throw std::runtime_error("argument is not supported!");
+    }
+
+    float ave_time =
+        put_invoker_ptr->Run(put_argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+
+    std::cout << "perf: " << ave_time << " ms" << std::endl;
+
+    bool pass = true;
+    if(do_verification)
+    {
+        Tensor<YDataType> y_host(HostTensorDescriptor{N, 1});
+
+        for(int i = 0; i < N; ++i)
+        {
+            IndexDataType idx = indices(i);
+            y_host(idx)       = x(i);
+        }
+
+        y_device_buf.FromDevice(y.mData.data());
+        pass = ck::utils::check_err(y, y_host);
+    }
+
+    return (pass ? 0 : 1);
+}
--- a/include/ck/ck.hpp
+++ b/include/ck/ck.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -18,36 +18,49 @@
 #define CK_USE_LAUNCH_BOUNDS 1

 #ifdef CK_USE_LAUNCH_BOUNDS
+// for most kernels
 #define CK_MAX_THREAD_PER_BLOCK 256
 #define CK_MIN_BLOCK_PER_CU 2
+
+// for wavelet GEMM kernel
+#define CK_WAVELET_MAX_THREAD_PER_BLOCK 512
+#define CK_WAVELET_MIN_BLOCK_PER_CU 2
+#endif
+
+// kernel attribute: amdgpu_waves_per_eu()
+#ifdef CK_USE_WAVES_PER_EU
+// for 1-wave kernels, control arguments of amdgpu_waves_per_eu() attribute
+#ifndef CK_MIN_WAVES_PER_EU
+#define CK_MIN_WAVES_PER_EU 0
 #endif

-// check GPU target
-#ifdef __HIP_DEVICE_COMPILE__
-#if !(defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__) || defined(__gfx908__) || \
-      defined(__gfx90a__) || defined(__gfx1030__) || defined(__gfx1100__))
-#error Not supported target
+#ifndef CK_MAX_WAVES_PER_EU
+#define CK_MAX_WAVES_PER_EU 0
 #endif
+
+#else
+#define CK_USE_WAVES_PER_EU 0
 #endif

 // buffer resource
 #ifndef __HIP_DEVICE_COMPILE__ // for host code
 #define CK_BUFFER_RESOURCE_3RD_DWORD -1
 #elif defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__) || defined(__gfx908__) || \
-    defined(__gfx90a__) // for GPU code
+    defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) ||                          \
+    defined(__gfx942__) // for GPU code
 #define CK_BUFFER_RESOURCE_3RD_DWORD 0x00020000
 #elif defined(__gfx1030__) // for GPU code
 #define CK_BUFFER_RESOURCE_3RD_DWORD 0x31014000
-#elif defined(__gfx1100__) // for GPU code
-#define CK_BUFFER_RESOURCE_3RD_DWORD 0x10020000
+#elif defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) // for GPU code
+#define CK_BUFFER_RESOURCE_3RD_DWORD 0x31004000
 #endif

 // FMA instruction
 #ifndef __HIP_DEVICE_COMPILE__                   // for host code, define nothing
 #elif defined(__gfx803__) || defined(__gfx900__) // for GPU code
 #define CK_USE_AMD_V_MAC_F32
-#elif defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx1030__) // for GPU code
+#elif defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__) || \
+    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) // for GPU code
 #define CK_USE_AMD_V_FMAC_F32
 #define CK_USE_AMD_V_DOT2_F32_F16
 #define CK_USE_AMD_V_DOT4_I32_I8
@@ -56,18 +69,23 @@
 // MFMA instruction
 #ifndef __HIP_DEVICE_COMPILE__ // for host code
 #define CK_USE_AMD_MFMA
-#elif defined(__gfx908__) || defined(__gfx90a__) // for GPU code
+#elif defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || \
+    defined(__gfx942__) // for GPU code
 #define CK_USE_AMD_MFMA
 #endif

-#if defined(__gfx90a__)
+#if(defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
 #define CK_USE_AMD_MFMA_BF16_1K_OP
 #endif

+#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+#define CK_USE_AMD_MFMA_GFX940
+#endif
+
 // WMMA instruction
 #ifndef __HIP_DEVICE_COMPILE__ // for host code
 #define CK_USE_AMD_WMMA
-#elif defined(__gfx1100__) // for GPU code
+#elif defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) // for GPU code
 #define CK_USE_AMD_WMMA
 #endif

@@ -83,13 +101,15 @@
 // buffer atomic add: floating point
 #ifndef __HIP_DEVICE_COMPILE__ // for host code
 #define CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 1
-#elif defined(__gfx908__) || defined(__gfx90a__) // for GPU code
+#elif defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || \
+    defined(__gfx942__) // for GPU code
 #define CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 1
 #else // for GPU code
 #define CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 0
 #endif

-#if defined(__gfx90a__) // for GPU code
+#if(defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || \
+    defined(__gfx942__)) // for GPU code
 #define CK_USE_AMD_BUFFER_ATOMIC_MAX_FLOAT64 1
 #else
 #define CK_USE_AMD_BUFFER_ATOMIC_MAX_FLOAT64 0
@@ -143,6 +163,10 @@
 #define CK_EXPERIMENTAL_INTER_WAVE_INSTANCES 1
 // experimental feature: add instances using pipeline v2
 #define CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES 1
+// experimental feature: optimize pipeline v2 by IGLP strategy (value=ID of strategy)
+#ifndef CK_EXPERIMENTAL_PIPELINE_V2_IGLP_OPT
+#define CK_EXPERIMENTAL_PIPELINE_V2_IGLP_OPT 0
+#endif

 // hack: have underlying assumption that need to be satsified, otherwise it's a bug
 // hack for forcing register to keep idx_diff_low_const in SGPR. idx_diff_low_const must be
@@ -163,16 +187,26 @@
 // tuning parameter
 #define CK_WORKAROUND_SWDEV_325164 0

-// workaround: a BF16 attention kernel for gfx908 is likely affected by a compiler issue
-#ifdef __gfx908__
-#define CK_WORKAROUND_SWDEV_XXXXXX_BF16_ATTEN_FWD_GFX908_ISSUE 1
-#else // __gfx90a__, ...
-#define CK_WORKAROUND_SWDEV_XXXXXX_BF16_ATTEN_FWD_GFX908_ISSUE 0
-#endif // __gfx908__
+// workaround: compiler not emiting reciprocal instruction frm __frcp_rn()
+#define CK_WORKAROUND_SWDEV_383542 1
+
+// workaround: compiler issue on gfx908
+#define CK_WORKAROUND_SWDEV_388832 1
+
+// workaround: Grouped Conv2d_bwd_data fails for already implemented instance
+#define CK_WORKAROUND_SWDEV_3318619 0

 // flag to enable (1) or disable (0) the debugging output in some kernels
 #define DEBUG_LOG 0

+// denorm test fix, required to work around dissue
+#ifndef CK_WORKAROUND_DENORM_FIX
+#define CK_WORKAROUND_DENORM_FIX 0
+#elif
+// enable only on MI200
+#define CK_WORKAROUND_DENORM_FIX = CK_WORKAROUND_DENORM_FIX && defined(__gfx90a__)
+#endif // CK_WORKAROUND_DENORM_FIX
+
 namespace ck {

 enum struct InMemoryDataOperationEnum

--- a/include/ck/host_utility/device_prop.hpp
+++ b/include/ck/host_utility/device_prop.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once


--- a/include/ck/host_utility/hip_check_error.hpp
+++ b/include/ck/host_utility/hip_check_error.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once