Commit 7e689d57 (aska-0096): initial commit
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <initializer_list>
#include <cstdlib>
#include <getopt.h>
#include "ck/utility/reduction_enums.hpp"
#include "reduce_blockwise_impl.hpp"
#include "reduce_example_common.hpp"
using namespace ck;
using namespace ck::tensor_operation::device;
static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'},
{"verify", required_argument, nullptr, 'v'},
{"help", no_argument, nullptr, '?'},
{nullptr, 0, nullptr, 0}};
class SimpleAppArgs
{
private:
int option_index = 0;
public:
std::vector<size_t> inLengths = {16, 64, 32, 960};
std::vector<int> reduceDims = {0, 1, 2};
std::vector<float> scales = {1.0f, 0.0f};
bool do_verification = true;
int data_type = 1;
int init_method = 2;
bool time_kernel = true;
public:
void show_usage(const char* cmd)
{
std::cout << "Usage of " << cmd << std::endl;
std::cout << "--inLengths or -D, comma separated list of input tensor dimension lengths"
<< std::endl;
std::cout << "--reduceDims or -R, comma separated list of to-reduce dimensions"
<< std::endl;
std::cout << "--verify or -v, 1/0 to indicate whether to verify the reduction result by "
"comparing with the host-based reduction"
<< std::endl;
std::cout << "Arg1: data type (0: fp16, 1: fp32, 3: int8, 5: bp16, 6: fp64, 7: int4)"
<< std::endl;
std::cout << "Arg2 -- init method (0=no init, 1=single integer value, 2=scope integer "
"value, 3=decimal value)"
<< std::endl;
std::cout << "Arg3 -- time kernel (0=no, 1=yes)" << std::endl;
};
int processArgs(int argc, char* argv[])
{
using ck::host_common::getTypeValuesFromString;
int ch;
while(1)
{
ch = getopt_long(argc, argv, "D:R:v:l:", long_options, &option_index);
if(ch == -1)
break;
switch(ch)
{
case 'D':
if(!optarg)
throw std::runtime_error("Invalid option format!");
inLengths = getTypeValuesFromString<size_t>(optarg);
break;
case 'R':
if(!optarg)
throw std::runtime_error("Invalid option format!");
reduceDims = getTypeValuesFromString<int>(optarg);
break;
case 'v':
if(!optarg)
throw std::runtime_error("Invalid option format!");
do_verification = static_cast<bool>(std::atoi(optarg));
break;
case '?':
if(std::string(long_options[option_index].name) == "help")
{
show_usage(argv[0]);
return (-1);
};
break;
default: show_usage(argv[0]); return (-1);
};
};
if(optind + 3 > argc)
{
throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!");
};
data_type = std::atoi(argv[optind++]);
init_method = std::atoi(argv[optind++]);
time_kernel = static_cast<bool>(std::atoi(argv[optind]));
if(scales.empty())
{
scales.push_back(1.0f);
scales.push_back(0.0f);
};
return (0);
};
};
template <typename InOutDataType,
typename AccDataType,
ReduceTensorOp ReduceOpId,
bool PropagateNan,
bool OutputIndex>
bool reduce_blockwise_test(bool do_verification,
int init_method,
bool time_kernel,
const std::vector<size_t>& inLengths,
const std::vector<int>& reduceDims,
float alpha,
float beta)
{
bool matched = false;
int result = 0;
const auto tuple_object = reduce_shape_instances{};
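// Compile-time dispatch: static_for walks every (Rank, NumReduceDim) combination in
// reduce_shape_instances and instantiates reduce_blockwise_impl for the first one whose
// compile-time rank and reduce-dim count match the runtime arguments.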
static_for<0, std::tuple_size<reduce_shape_instances>::value, 1>{}([&](auto i) {
if(matched)
return;
using ShapeType = remove_cvref_t<decltype(std::get<i>(tuple_object))>;
if(ShapeType::Rank_ != inLengths.size() || ShapeType::NumReduceDim_ != reduceDims.size())
return;
std::array<int, ShapeType::NumReduceDim_> arrReduceDims;
ck::ranges::copy(reduceDims, arrReduceDims.begin());
result = reduce_blockwise_impl<InOutDataType,
AccDataType,
ReduceOpId,
ShapeType::Rank_,
ShapeType::NumReduceDim_,
PropagateNan,
OutputIndex>(
do_verification, init_method, time_kernel, inLengths, arrReduceDims, alpha, beta);
matched = true;
});
return result == 0;
};
constexpr ReduceTensorOp ReduceOpId = ReduceTensorOp::AVG;
constexpr bool PropagateNan = true;
constexpr bool OutputIndex = false;
int main(int argc, char* argv[])
{
bool pass = true;
if(argc > 1)
{
SimpleAppArgs arg;
if(arg.processArgs(argc, argv) < 0)
return (-1);
if(arg.data_type == 0)
{
pass = reduce_blockwise_test<ck::half_t, float, ReduceOpId, PropagateNan, OutputIndex>(
arg.do_verification,
arg.init_method,
arg.time_kernel,
arg.inLengths,
arg.reduceDims,
arg.scales[0],
arg.scales[1]);
}
else if(arg.data_type == 1)
{
pass = reduce_blockwise_test<float, float, ReduceOpId, PropagateNan, OutputIndex>(
arg.do_verification,
arg.init_method,
arg.time_kernel,
arg.inLengths,
arg.reduceDims,
arg.scales[0],
arg.scales[1]);
}
else if(arg.data_type == 3)
{
pass = reduce_blockwise_test<int8_t, float, ReduceOpId, PropagateNan, OutputIndex>(
arg.do_verification,
arg.init_method,
arg.time_kernel,
arg.inLengths,
arg.reduceDims,
arg.scales[0],
arg.scales[1]);
}
else if(arg.data_type == 5)
{
pass = reduce_blockwise_test<ck::bhalf_t, float, ReduceOpId, PropagateNan, OutputIndex>(
arg.do_verification,
arg.init_method,
arg.time_kernel,
arg.inLengths,
arg.reduceDims,
arg.scales[0],
arg.scales[1]);
}
else if(arg.data_type == 6)
{
pass = reduce_blockwise_test<double, double, ReduceOpId, PropagateNan, OutputIndex>(
arg.do_verification,
arg.init_method,
arg.time_kernel,
arg.inLengths,
arg.reduceDims,
arg.scales[0],
arg.scales[1]);
}
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
else if(arg.data_type == 7)
{
pass = reduce_blockwise_test<int4_t, int32_t, ReduceTensorOp::AVG, false, false>(
arg.do_verification,
arg.init_method,
arg.time_kernel,
arg.inLengths,
arg.reduceDims,
arg.scales[0],
arg.scales[1]);
pass = pass && reduce_blockwise_test<int4_t, int8_t, ReduceTensorOp::MAX, false, false>(
arg.do_verification,
arg.init_method,
arg.time_kernel,
arg.inLengths,
arg.reduceDims,
arg.scales[0],
arg.scales[1]);
}
#endif
}
else
{
// for testing half_t
pass =
pass && reduce_blockwise_test<ck::half_t, float, ReduceOpId, PropagateNan, OutputIndex>(
true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f);
// for testing float
pass = pass && reduce_blockwise_test<float, float, ReduceOpId, PropagateNan, OutputIndex>(
true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f);
// for testing double
pass = pass && reduce_blockwise_test<double, double, ReduceOpId, PropagateNan, OutputIndex>(
true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f);
// for testing bhalf_t
pass = pass &&
reduce_blockwise_test<ck::bhalf_t, float, ReduceOpId, PropagateNan, OutputIndex>(
true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f);
// for testing int8_t
pass =
pass && reduce_blockwise_test<int8_t, int32_t, ReduceOpId, PropagateNan, OutputIndex>(
true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f);
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
// for testing int4_t using AVG operation
pass = pass && reduce_blockwise_test<int4_t, int32_t, ReduceTensorOp::AVG, false, false>(
true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f);
// for testing int4_t using MAX operation
pass = pass && reduce_blockwise_test<int4_t, int8_t, ReduceTensorOp::MAX, false, false>(
true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f);
#endif
// for testing 3D input
pass = pass && reduce_blockwise_test<float, float, ReduceOpId, PropagateNan, OutputIndex>(
true, 2, true, {16, 64, 960}, {0, 1}, 1.0f, 0.0f);
// for testing 5D input
pass = pass && reduce_blockwise_test<float, float, ReduceOpId, PropagateNan, OutputIndex>(
true, 2, true, {16, 64, 32, 2, 960}, {0, 1, 2, 3}, 1.0f, 0.0f);
};
return (pass ? 0 : 1);
};
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include "ck/ck.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_reduce.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/host_common_util.hpp"
#include "reduce_example_common.hpp"
template <typename InOutDataType,
typename AccDataType,
ck::ReduceTensorOp ReduceOpId,
ck::index_t Rank,
ck::index_t NumReduceDim,
bool PropagateNan,
bool OutputIndex>
int reduce_blockwise_impl(bool do_verification,
int init_method,
bool time_kernel,
const std::vector<size_t>& inLengths,
const std::array<int, NumReduceDim>& reduceDims,
float alpha,
float beta)
{
using namespace ck;
using namespace ck::tensor_operation::device;
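// When all dimensions are reduced, the output degenerates to a single element, so its
// descriptor is given rank 1 instead of rank 0.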
constexpr index_t NumOutDim = (Rank - NumReduceDim == 0) ? 1 : Rank - NumReduceDim;
constexpr bool op_support_indices =
(ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
ReduceOpId == ReduceTensorOp::AMAX);
constexpr bool invalid_reduce_1 = OutputIndex && !op_support_indices;
// 1) If InOutDataType is half_t, must use half_t as AccDataType for indexable reduction
// operations 2) If InOutDataType is half_t, must use float as AccDataType for non-indexable
// reduction operations
constexpr bool invalid_reduce_2 =
std::is_same<InOutDataType, half_t>::value &&
((!op_support_indices && !std::is_same<AccDataType, float>::value) ||
(op_support_indices && !std::is_same<AccDataType, half_t>::value));
// 1) If InOutDataType is float, must use float as AccDataType for indexable reduction
// operations
constexpr bool invalid_reduce_3 =
std::is_same<InOutDataType, float>::value &&
(op_support_indices && !std::is_same<AccDataType, float>::value);
// 1) If InOutDataType is int8_t or int4_t, must use int8_t as AccDataType for indexable
// reduction operations 2) If InOutDataType is int8_t or int4_t, must use int32_t as AccDataType
// for non-indexable reduction operations
constexpr bool invalid_reduce_4 =
std::is_same<InOutDataType, int8_t>::value &&
((!op_support_indices && !std::is_same<AccDataType, int32_t>::value) ||
(op_support_indices && !std::is_same<AccDataType, int8_t>::value));
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
constexpr bool invalid_reduce_4_2 =
std::is_same<InOutDataType, int4_t>::value &&
((!op_support_indices && !std::is_same<AccDataType, int32_t>::value) ||
(op_support_indices && !std::is_same<AccDataType, int8_t>::value));
#endif
// 1) If InOutDataType is int8_t or int4_t, the supported operation must be either indexable
// operations or ADD/AVG
constexpr bool invalid_reduce_5 = std::is_same<InOutDataType, int8_t>::value &&
(!op_support_indices && ReduceOpId != ReduceTensorOp::ADD &&
ReduceOpId != ReduceTensorOp::AVG);
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
constexpr bool invalid_reduce_5_2 = std::is_same<InOutDataType, int4_t>::value &&
(!op_support_indices && ReduceOpId != ReduceTensorOp::ADD &&
ReduceOpId != ReduceTensorOp::AVG);
#endif
// 1) If InOutDataType is bhalf_t, must use float as AccDataType for all reduction operations
constexpr bool invalid_reduce_6 =
std::is_same<InOutDataType, bhalf_t>::value && !std::is_same<AccDataType, float>::value;
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
constexpr bool invalid_reduce =
(invalid_reduce_1 || invalid_reduce_2 || invalid_reduce_3 || invalid_reduce_4 ||
invalid_reduce_5 || invalid_reduce_6 || invalid_reduce_4_2 || invalid_reduce_5_2);
#else
constexpr bool invalid_reduce = (invalid_reduce_1 || invalid_reduce_2 || invalid_reduce_3 ||
invalid_reduce_4 || invalid_reduce_5 || invalid_reduce_6);
#endif
if constexpr(invalid_reduce)
{
std::cerr << "The reduction setting is invalid, exiting!" << std::endl;
return (-1);
};
using ReduceOperation = typename reduce_binary_operator<ReduceOpId>::opType;
using InElementwiseOperation =
typename reduce_unary_operator<ReduceOpId, true, true>::InElementwiseOperation;
using AccElementwiseOperation =
typename reduce_unary_operator<ReduceOpId, true, true>::AccElementwiseOperation;
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
using InOutDataTypeInDevice = typename std::
conditional<std::is_same<InOutDataType, int4_t>::value, int8_t, InOutDataType>::type;
#else
using InOutDataTypeInDevice = InOutDataType;
#endif
using DeviceReduceInstance =
ck::tensor_operation::device::DeviceReduceMultiBlock<InOutDataTypeInDevice,
AccDataType,
InOutDataTypeInDevice,
Rank,
NumReduceDim,
ReduceOperation,
InElementwiseOperation,
AccElementwiseOperation,
InMemoryDataOperationEnum::Set,
PropagateNan,
OutputIndex,
false, // HaveIndexInputIfOutputIndex
256, // BlockSize
4, // MThreadClusterSize
64, // KThreadClusterSize
1, // MThreadSliceSize
1, // KThreadSliceSize
0, // InSrcVectorDim
1, // InSrcVectorSize
1>; // OutDstVectorSize
Tensor<InOutDataType> in(inLengths);
std::vector<size_t> outLengths;
auto invariantDims = get_invariant_dims<Rank, NumReduceDim>(reduceDims);
if(invariantDims.empty())
outLengths.push_back(1);
else
for(auto dim : invariantDims)
outLengths.push_back(inLengths[dim]);
Tensor<InOutDataType> out_ref(outLengths);
Tensor<InOutDataType> out(outLengths);
Tensor<int> out_indices_ref(outLengths);
Tensor<int> out_indices(outLengths);
auto inStrides = in.mDesc.GetStrides();
auto outStrides = out.mDesc.GetStrides();
size_t invariant_total_length = out.mDesc.GetElementSize();
size_t reduce_total_length = in.mDesc.GetElementSize() / invariant_total_length;
std::size_t num_thread = 1;
if(do_verification)
{
switch(init_method)
{
case 0: break;
case 1:
in.GenerateTensorValue(GeneratorTensor_1<InOutDataType>{1}, num_thread);
if(beta != 0.0f)
out_ref.GenerateTensorValue(GeneratorTensor_1<InOutDataType>{1}, num_thread);
break;
case 2:
in.GenerateTensorValue(GeneratorTensor_2<InOutDataType>{-5, 5}, num_thread);
if(beta != 0.0f)
out_ref.GenerateTensorValue(GeneratorTensor_2<InOutDataType>{-5, 5}, num_thread);
break;
default:
in.GenerateTensorValue(GeneratorTensor_3<InOutDataType>{-5.0, 5.0}, num_thread);
if(beta != 0.0f)
out_ref.GenerateTensorValue(GeneratorTensor_3<InOutDataType>{-5.0, 5.0},
num_thread);
}
if(beta != 0.0f)
for(size_t i = 0; i < out_ref.mDesc.GetElementSpaceSize(); i++)
out.mData[i] = out_ref.mData[i];
};
// these buffers are usually provided by the user application
DeviceMem in_dev(sizeof(InOutDataTypeInDevice) * in.mDesc.GetElementSpaceSize());
DeviceMem out_dev(sizeof(InOutDataTypeInDevice) * out.mDesc.GetElementSpaceSize());
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
if(std::is_same<InOutDataType, int4_t>::value)
{
std::vector<InOutDataTypeInDevice> tmp_buf(in.mData.size());
std::copy_n(in.mData.data(), in.mData.size(), tmp_buf.data());
in_dev.ToDevice(tmp_buf.data());
}
else
#endif
in_dev.ToDevice(in.mData.data());
if(beta != 0.0f)
{
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
if(std::is_same<InOutDataType, int4_t>::value)
{
std::vector<InOutDataTypeInDevice> tmp_buf(out.mData.size());
std::copy_n(out.mData.data(), out.mData.size(), tmp_buf.data());
out_dev.ToDevice(tmp_buf.data());
}
else
#endif
out_dev.ToDevice(out.mData.data());
};
size_t indicesSizeInBytes = OutputIndex ? out.mDesc.GetElementSize() * sizeof(int32_t) : 0;
DeviceMem out_index_dev(indicesSizeInBytes);
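// The index buffer is only needed when OutputIndex is enabled (i.e. for the indexable
// MIN/MAX/AMAX operations); otherwise a zero-sized allocation is made.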
InElementwiseOperation in_elementwise_op;
AccElementwiseOperation acc_elementwise_op;
std::tie(in_elementwise_op, acc_elementwise_op) =
reduce_unary_operator<ReduceOpId, true, true>::GetElementwiseOperator(
static_cast<int32_t>(reduce_total_length));
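// The reduce length is forwarded so that length-dependent operators can be constructed;
// for AVG, the acc elementwise operator divides the accumulated sum by this element count.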
std::array<index_t, Rank> arrInLengths;
std::array<index_t, Rank> arrInStrides;
std::array<index_t, NumOutDim> arrOutLengths;
std::array<index_t, NumOutDim> arrOutStrides;
ck::ranges::copy(inLengths, arrInLengths.begin());
ck::ranges::copy(inStrides, arrInStrides.begin());
ck::ranges::copy(outLengths, arrOutLengths.begin());
ck::ranges::copy(outStrides, arrOutStrides.begin());
if(do_verification)
{
using ReferenceReduceInstance =
ck::tensor_operation::host::ReferenceReduce<InOutDataType,
AccDataType,
InOutDataType,
Rank,
NumReduceDim,
ReduceOperation,
InElementwiseOperation,
AccElementwiseOperation,
PropagateNan,
OutputIndex>;
auto reduce_ref = ReferenceReduceInstance{};
auto argument_ptr_ref = reduce_ref.MakeArgumentPointer(arrInLengths,
arrInStrides,
arrOutLengths,
arrOutStrides,
reduceDims,
static_cast<double>(alpha),
static_cast<double>(beta),
in.mData.data(),
nullptr,
out_ref.mData.data(),
out_indices_ref.mData.data(),
in_elementwise_op,
acc_elementwise_op);
if(!reduce_ref.IsSupportedArgument(argument_ptr_ref.get()))
{
std::cout << "The runtime parameters not supported by the reduce reference, exiting!"
<< std::endl;
return (-1);
};
auto invoker_ptr_ref = reduce_ref.MakeInvokerPointer();
invoker_ptr_ref->Run(argument_ptr_ref.get());
};
auto reduce = DeviceReduceInstance{};
auto argument_ptr = reduce.MakeArgumentPointer(arrInLengths,
arrInStrides,
arrOutLengths,
arrOutStrides,
reduceDims,
static_cast<double>(alpha),
static_cast<double>(beta),
in_dev.GetDeviceBuffer(),
nullptr,
out_dev.GetDeviceBuffer(),
out_index_dev.GetDeviceBuffer(),
in_elementwise_op,
acc_elementwise_op);
if(!reduce.IsSupportedArgument(argument_ptr.get()))
{
std::cerr << "The runtime parameters not supported by the DeviceReduce instance, exiting!"
<< std::endl;
return (-2);
};
std::string reduce_name = reduce.GetTypeString();
auto invoker_ptr = reduce.MakeInvokerPointer();
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t num_bytes = invariant_total_length * reduce_total_length * sizeof(InOutDataType) +
invariant_total_length * sizeof(InOutDataType);
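// Traffic estimate: one full read of the input plus one write of the output (the extra
// output read performed when beta != 0 is ignored here).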
float gb_per_sec = num_bytes / 1.E6 / avg_time;
std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s, " << reduce_name
<< std::endl;
bool pass = true;
if(do_verification)
{
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
if(std::is_same<InOutDataType, int4_t>::value)
{
std::vector<InOutDataTypeInDevice> tmp_buf(out.mData.size());
out_dev.FromDevice(tmp_buf.data());
std::copy_n(tmp_buf.data(), out.mData.size(), out.mData.data());
}
else
#endif
out_dev.FromDevice(out.mData.data());
pass = pass && ck::utils::check_err(out, out_ref);
if(OutputIndex)
{
out_index_dev.FromDevice(out_indices.mData.data());
pass = pass && ck::utils::check_err(out_indices, out_indices_ref);
};
};
return (pass ? 0 : 1);
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <sstream>
#include <initializer_list>
#include <cstdlib>
#include <getopt.h>
#include "ck/ck.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_reduce.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/host_common_util.hpp"
using namespace ck;
using namespace ck::tensor_operation::device;
using InOutDataType = ck::half_t;
using AccDataType = float;
constexpr ReduceTensorOp ReduceOpId = ReduceTensorOp::NORM2;
constexpr bool PropagateNan = true;
constexpr bool OutputIndex = false;
using ReduceOperation = typename reduce_binary_operator<ReduceOpId>::opType;
using InElementwiseOperation =
typename reduce_unary_operator<ReduceOpId, true, true>::InElementwiseOperation;
using AccElementwiseOperation =
typename reduce_unary_operator<ReduceOpId, true, true>::AccElementwiseOperation;
using PassThroughOp = tensor_operation::element_wise::PassThrough;
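// For NORM2, reduce_unary_operator maps to a square on the input side and a sqrt on the
// accumulated result. The two-call scheme below relies on this: the first reduction applies
// the square and sums along dim 4 (sqrt deferred via PassThrough), and the second reduction
// sums along dim 3 and applies the sqrt, so sqrt(sum(x^2)) is computed exactly once.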
using DeviceReduceInstance_1 = DeviceReduceMultiBlock<InOutDataType,
AccDataType,
InOutDataType,
5, // Rank
1, // NumReduceDim
ReduceOperation,
InElementwiseOperation,
PassThroughOp,
InMemoryDataOperationEnum::Set,
PropagateNan,
OutputIndex,
false, // HaveIndexInputIfOutputIndex
256,
32,
8,
1,
1,
1, // vector dim
1,
1>;
using DeviceReduceInstance_2 = DeviceReduceMultiBlock<InOutDataType,
AccDataType,
InOutDataType,
4, // Rank
1, // NumReduceDim
ReduceOperation,
PassThroughOp,
AccElementwiseOperation,
InMemoryDataOperationEnum::Set,
PropagateNan,
OutputIndex,
false, // HaveIndexInputIfOutputIndex
256,
128,
2,
1,
1,
1, // vector dim
1,
1>;
static bool do_verify;
static int init_method;
static float alpha;
static float beta;
static bool time_kernel;
int main(int argc, char* argv[])
{
// used by the device reduction
const std::array<int, 1> reduceDims_1 = {4};
// const std::array<int, 4> invariantDims_1 = {0, 1, 2, 3};
const std::array<int, 1> reduceDims_2 = {3};
// const std::array<int, 3> invariantDims_2 = {0, 1, 2};
// used by the host reduction
const std::array<int, 2> reduceDims = {3, 4};
// const std::array<int, 3> invariantDims = {0, 1, 2};
const std::vector<size_t> inLengths_1 = {64, 320, 80, 4, 128};
// input lengths of the second reduction, which is also the output lengths of the first
// reduction
const std::vector<size_t> inLengths_2 = {64, 320, 80, 4};
const std::vector<size_t> outLengths = {64, 320, 80};
if(argc == 1)
{
do_verify = true;
init_method = 2;
time_kernel = true;
}
else if(argc == 4)
{
do_verify = static_cast<bool>(std::atoi(argv[1]));
init_method = std::atoi(argv[2]);
time_kernel = static_cast<bool>(std::atoi(argv[3]));
}
else
{
std::ostringstream ostr;
ostr << "Wrong parameter! " << std::endl
<< "Usage: " << argv[0] << "[verify 0/1] init_method time_kernel" << std::endl;
throw std::runtime_error(ostr.str());
};
alpha = 1.0f;
beta = 0.0f;
Tensor<InOutDataType> in_1(inLengths_1);
Tensor<InOutDataType> out_ref(outLengths);
Tensor<InOutDataType> in_2(inLengths_2); // also the output tensor of the first reduction
Tensor<InOutDataType> out(outLengths);
auto inStrides_1 = in_1.mDesc.GetStrides();
auto inStrides_2 = in_2.mDesc.GetStrides();
auto outStrides = out.mDesc.GetStrides();
size_t invariant_total_length = out.mDesc.GetElementSize();
size_t reduce_total_length = in_1.mDesc.GetElementSize() / invariant_total_length;
std::size_t num_thread = 1;
if(do_verify)
{
switch(init_method)
{
case 0: break;
case 1:
in_1.GenerateTensorValue(GeneratorTensor_1<InOutDataType>{1}, num_thread);
if(beta != 0.0f)
out_ref.GenerateTensorValue(GeneratorTensor_1<InOutDataType>{1}, num_thread);
break;
case 2:
in_1.GenerateTensorValue(GeneratorTensor_2<InOutDataType>{-5, 5}, num_thread);
if(beta != 0.0f)
out_ref.GenerateTensorValue(GeneratorTensor_2<InOutDataType>{-5, 5}, num_thread);
break;
default:
in_1.GenerateTensorValue(GeneratorTensor_3<InOutDataType>{-5.0, 5.0}, num_thread);
if(beta != 0.0f)
out_ref.GenerateTensorValue(GeneratorTensor_3<InOutDataType>{-5.0, 5.0},
num_thread);
}
if(beta != 0.0f)
for(size_t i = 0; i < out_ref.mDesc.GetElementSpaceSize(); i++)
out.mData[i] = out_ref.mData[i];
};
DeviceMem in_1_dev(sizeof(InOutDataType) * in_1.mDesc.GetElementSpaceSize());
DeviceMem in_2_dev(sizeof(InOutDataType) * in_2.mDesc.GetElementSpaceSize());
DeviceMem out_dev(sizeof(InOutDataType) * out.mDesc.GetElementSpaceSize());
in_1_dev.ToDevice(in_1.mData.data());
if(beta != 0.0f)
out_dev.ToDevice(out.mData.data());
InElementwiseOperation in_elementwise_op;
AccElementwiseOperation acc_elementwise_op;
std::tie(in_elementwise_op, acc_elementwise_op) =
reduce_unary_operator<ReduceOpId, true, true>::GetElementwiseOperator(
static_cast<int32_t>(reduce_total_length));
std::array<index_t, 5> arrInLengths_1;
std::array<index_t, 5> arrInStrides_1;
std::array<index_t, 4> arrInLengths_2;
std::array<index_t, 4> arrInStrides_2;
std::array<index_t, 3> arrOutLengths;
std::array<index_t, 3> arrOutStrides;
ck::ranges::copy(inLengths_1, arrInLengths_1.begin());
ck::ranges::copy(inStrides_1, arrInStrides_1.begin());
ck::ranges::copy(inLengths_2, arrInLengths_2.begin());
ck::ranges::copy(inStrides_2, arrInStrides_2.begin());
ck::ranges::copy(outLengths, arrOutLengths.begin());
ck::ranges::copy(outStrides, arrOutStrides.begin());
if(do_verify)
{
using ReferenceReduceInstance =
ck::tensor_operation::host::ReferenceReduce<InOutDataType,
AccDataType,
InOutDataType,
5,
2,
ReduceOperation,
InElementwiseOperation,
AccElementwiseOperation,
PropagateNan,
OutputIndex>;
auto reduce_ref = ReferenceReduceInstance{};
auto argument_ptr_ref = reduce_ref.MakeArgumentPointer(arrInLengths_1,
arrInStrides_1,
arrOutLengths,
arrOutStrides,
reduceDims,
static_cast<double>(alpha),
static_cast<double>(beta),
in_1.mData.data(),
nullptr,
out_ref.mData.data(),
nullptr,
in_elementwise_op,
acc_elementwise_op);
if(!reduce_ref.IsSupportedArgument(argument_ptr_ref.get()))
{
std::cout << "The runtime parameters not supported by the reduce reference, exiting!"
<< std::endl;
return (-1);
};
auto invoker_ptr_ref = reduce_ref.MakeInvokerPointer();
invoker_ptr_ref->Run(argument_ptr_ref.get());
};
auto reduce_1 = DeviceReduceInstance_1{};
auto argument_ptr_1 = reduce_1.MakeArgumentPointer(arrInLengths_1,
arrInStrides_1,
arrInLengths_2,
arrInStrides_2,
reduceDims_1,
1.0,
0.0,
in_1_dev.GetDeviceBuffer(),
nullptr,
in_2_dev.GetDeviceBuffer(),
nullptr,
in_elementwise_op,
PassThroughOp{});
if(!reduce_1.IsSupportedArgument(argument_ptr_1.get()))
{
std::cout << "The runtime parameters seems supported by the DeviceReduce instance, exiting!"
<< std::endl;
};
auto invoker_ptr_1 = reduce_1.MakeInvokerPointer();
auto reduce_2 = DeviceReduceInstance_2{};
auto argument_ptr_2 = reduce_2.MakeArgumentPointer(arrInLengths_2,
arrInStrides_2,
arrOutLengths,
arrOutStrides,
reduceDims_2,
static_cast<double>(alpha),
static_cast<double>(beta),
in_2_dev.GetDeviceBuffer(),
nullptr,
out_dev.GetDeviceBuffer(),
nullptr,
PassThroughOp{},
acc_elementwise_op);
if(!reduce_2.IsSupportedArgument(argument_ptr_2.get()))
{
std::cout << "The runtime parameters not supported by the DeviceReduce instance, exiting!"
<< std::endl;
return (-1);
};
auto invoker_ptr_2 = reduce_2.MakeInvokerPointer();
float avg_time_1 = invoker_ptr_1->Run(argument_ptr_1.get(), StreamConfig{nullptr, time_kernel});
float avg_time_2 = invoker_ptr_2->Run(argument_ptr_2.get(), StreamConfig{nullptr, time_kernel});
std::size_t num_bytes = invariant_total_length * reduce_total_length * sizeof(InOutDataType) +
invariant_total_length * sizeof(InOutDataType);
float gb_per_sec = num_bytes / 1.E6 / (avg_time_1 + avg_time_2);
std::cout << "Perf: " << avg_time_1 + avg_time_2 << " ms, " << gb_per_sec << " GB/s, "
<< reduce_1.GetTypeString() << " => " << reduce_2.GetTypeString() << std::endl;
bool pass = true;
if(do_verify)
{
out_dev.FromDevice(out.mData.data());
pass = pass && ck::utils::check_err(out, out_ref);
};
return (pass ? 0 : 1);
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/ck.hpp"
template <int Rank, int NumReduceDim>
static inline std::array<int, Rank - NumReduceDim>
get_invariant_dims(const std::array<int, NumReduceDim>& reduceDims)
{
int reduceFlag = 0;
// flag the bits for the reduceDims
for(int i = 0; i < NumReduceDim; i++)
{
reduceFlag |= 1 << reduceDims[i];
};
std::array<int, Rank - NumReduceDim> invariantDims;
// collect invariant dimensions
int dim = 0;
for(int i = 0; i < Rank; i++)
if((reduceFlag & (1 << i)) == 0)
{
invariantDims[dim] = i;
dim++;
};
return invariantDims;
};
template <ck::index_t Rank, ck::index_t NumReduceDim>
struct ReduceShape
{
static constexpr ck::index_t Rank_ = Rank;
static constexpr ck::index_t NumReduceDim_ = NumReduceDim;
};
using reduce_shape_instances = std::tuple<ReduceShape<3, 1>,
ReduceShape<3, 2>,
ReduceShape<4, 1>,
ReduceShape<4, 2>,
ReduceShape<4, 3>,
ReduceShape<5, 1>,
ReduceShape<5, 2>,
ReduceShape<5, 3>,
ReduceShape<5, 4>>;
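// The set of (Rank, NumReduceDim) combinations pre-instantiated by the examples; the
// test drivers match the runtime inLengths/reduceDims sizes against this list.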
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <initializer_list>
#include <cstdlib>
#include <getopt.h>
#include "ck/utility/reduction_enums.hpp"
#include "reduce_multiblock_atomic_add_impl.hpp"
#include "reduce_example_common.hpp"
using namespace ck;
using namespace ck::tensor_operation::device;
static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'},
{"verify", required_argument, nullptr, 'v'},
{"help", no_argument, nullptr, '?'},
{nullptr, 0, nullptr, 0}};
class SimpleAppArgs
{
private:
int option_index = 0;
public:
std::vector<size_t> inLengths = {16, 64, 32, 960};
std::vector<int> reduceDims = {0, 1, 2};
std::vector<float> scales = {1.0f, 0.0f};
bool do_verification = true;
int data_type = 1;
int init_method = 2;
bool time_kernel = true;
public:
void show_usage(const char* cmd)
{
std::cout << "Usage of " << cmd << std::endl;
std::cout << "--inLengths or -D, comma separated list of input tensor dimension lengths"
<< std::endl;
std::cout << "--reduceDims or -R, comma separated list of to-reduce dimensions"
<< std::endl;
std::cout << "--verify or -v, 1/0 to indicate whether to verify the reduction result by "
"comparing with the host-based reduction"
<< std::endl;
std::cout << "Arg1: data type (0: fp32, 1: fp64)" << std::endl;
std::cout << "Arg2 -- init method (0=no init, 1=single integer value, 2=scope integer "
"value, 3=decimal value)"
<< std::endl;
std::cout << "Arg3 -- time kernel (0=no, 1=yes)" << std::endl;
};
int processArgs(int argc, char* argv[])
{
using ck::host_common::getTypeValuesFromString;
int ch;
while(1)
{
ch = getopt_long(argc, argv, "D:R:v:l:", long_options, &option_index);
if(ch == -1)
break;
switch(ch)
{
case 'D':
if(!optarg)
throw std::runtime_error("Invalid option format!");
inLengths = getTypeValuesFromString<size_t>(optarg);
break;
case 'R':
if(!optarg)
throw std::runtime_error("Invalid option format!");
reduceDims = getTypeValuesFromString<int>(optarg);
break;
case 'v':
if(!optarg)
throw std::runtime_error("Invalid option format!");
do_verification = static_cast<bool>(std::atoi(optarg));
break;
case '?':
if(std::string(long_options[option_index].name) == "help")
{
show_usage(argv[0]);
return (-1);
};
break;
default: show_usage(argv[0]); return (-1);
};
};
if(optind + 3 > argc)
{
throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!");
};
data_type = std::atoi(argv[optind++]);
init_method = std::atoi(argv[optind++]);
time_kernel = static_cast<bool>(std::atoi(argv[optind]));
if(scales.empty())
{
scales.push_back(1.0f);
scales.push_back(0.0f);
};
return (0);
};
};
template <typename InOutDataType,
typename AccDataType,
ReduceTensorOp ReduceOpId,
bool PropagateNan>
bool reduce_multiblock_atomic_add_test(bool do_verification,
int init_method,
bool time_kernel,
const std::vector<size_t>& inLengths,
const std::vector<int>& reduceDims,
float alpha,
float beta)
{
bool matched = false;
int result = 0;
const auto tuple_object = reduce_shape_instances{};
static_for<0, std::tuple_size<reduce_shape_instances>::value, 1>{}([&](auto i) {
if(matched)
return;
using ShapeType = remove_cvref_t<decltype(std::get<i>(tuple_object))>;
if(ShapeType::Rank_ != inLengths.size() || ShapeType::NumReduceDim_ != reduceDims.size())
return;
std::array<int, ShapeType::NumReduceDim_> a_reduceDims;
ck::ranges::copy(reduceDims, a_reduceDims.begin());
result = reduce_multiblock_atomic_add_impl<InOutDataType,
AccDataType,
ReduceOpId,
ShapeType::Rank_,
ShapeType::NumReduceDim_,
PropagateNan>(
do_verification, init_method, time_kernel, inLengths, a_reduceDims, alpha, beta);
matched = true;
});
return result == 0;
};
constexpr ReduceTensorOp ReduceOpId = ReduceTensorOp::AVG;
constexpr bool PropagateNan = true;
int main(int argc, char* argv[])
{
bool pass = true;
if(argc > 1)
{
SimpleAppArgs arg;
if(arg.processArgs(argc, argv) < 0)
return (-1);
if(arg.data_type == 0)
{
pass = reduce_multiblock_atomic_add_test<float, float, ReduceOpId, PropagateNan>(
arg.do_verification,
arg.init_method,
arg.time_kernel,
arg.inLengths,
arg.reduceDims,
arg.scales[0],
arg.scales[1]);
}
else if(arg.data_type == 1)
{
pass = reduce_multiblock_atomic_add_test<double, double, ReduceOpId, PropagateNan>(
arg.do_verification,
arg.init_method,
arg.time_kernel,
arg.inLengths,
arg.reduceDims,
arg.scales[0],
arg.scales[1]);
}
}
else
{
// for testing float
pass = pass && reduce_multiblock_atomic_add_test<float, float, ReduceOpId, PropagateNan>(
true, 2, false, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f);
// for testing double
pass = pass && reduce_multiblock_atomic_add_test<double, double, ReduceOpId, PropagateNan>(
true, 2, false, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f);
// for testing 3D input
pass = pass && reduce_multiblock_atomic_add_test<float, float, ReduceOpId, PropagateNan>(
true, 2, false, {16, 64, 960}, {0, 1}, 1.0f, 0.0f);
// for testing 5D input
pass = pass && reduce_multiblock_atomic_add_test<float, float, ReduceOpId, PropagateNan>(
true, 2, false, {16, 64, 32, 2, 960}, {0, 1, 2, 3}, 1.0f, 0.0f);
};
return (pass ? 0 : 1);
};
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include "ck/ck.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_reduce.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/host_common_util.hpp"
#include "reduce_example_common.hpp"
template <typename InOutDataType,
typename AccDataType,
ck::ReduceTensorOp ReduceOpId,
ck::index_t Rank,
ck::index_t NumReduceDim,
bool PropagateNan>
int reduce_multiblock_atomic_add_impl(bool do_verification,
int init_method,
bool time_kernel,
const std::vector<size_t>& inLengths,
const std::array<int, NumReduceDim>& reduceDims,
float alpha,
float beta)
{
using namespace ck;
using namespace ck::tensor_operation::device;
constexpr index_t NumOutDim = (Rank - NumReduceDim == 0) ? 1 : Rank - NumReduceDim;
constexpr bool op_support_atomic_add =
(ReduceOpId == ReduceTensorOp::ADD || ReduceOpId == ReduceTensorOp::AVG);
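// Atomic-add accumulation only works when the final combine is a plain sum: ADD directly,
// and AVG as a sum followed by a scaling. The checks below also restrict the data type to
// float/double, for which global atomic add is available.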
constexpr bool invalid_reduce_1 = !op_support_atomic_add;
constexpr bool invalid_reduce_2 =
!(std::is_same<InOutDataType, float>::value || std::is_same<InOutDataType, double>::value);
constexpr bool invalid_reduce = (invalid_reduce_1 || invalid_reduce_2);
if(invalid_reduce)
{
std::cerr << "The reduction setting is invalid, exiting!" << std::endl;
return (-1);
};
using ReduceOperation = typename reduce_binary_operator<ReduceOpId>::opType;
using InElementwiseOperation =
typename reduce_unary_operator<ReduceOpId, true, true>::InElementwiseOperation;
using AccElementwiseOperation =
typename reduce_unary_operator<ReduceOpId, true, true>::AccElementwiseOperation;
using DeviceReduceInstance =
ck::tensor_operation::device::DeviceReduceMultiBlock<InOutDataType,
AccDataType,
InOutDataType,
Rank,
NumReduceDim,
ReduceOperation,
InElementwiseOperation,
AccElementwiseOperation,
InMemoryDataOperationEnum::AtomicAdd,
PropagateNan,
false,
false, // HaveIndexInputIfOutputIndex
256,
4,
64,
1,
1,
0,
1,
1>;
Tensor<InOutDataType> in(inLengths);
std::vector<size_t> outLengths;
auto invariantDims = get_invariant_dims<Rank, NumReduceDim>(reduceDims);
if(invariantDims.empty())
outLengths.push_back(1);
else
for(auto dim : invariantDims)
outLengths.push_back(inLengths[dim]);
Tensor<InOutDataType> out_ref(outLengths);
Tensor<InOutDataType> out(outLengths);
auto inStrides = in.mDesc.GetStrides();
auto outStrides = out.mDesc.GetStrides();
size_t invariant_total_length = out.mDesc.GetElementSize();
size_t reduce_total_length = in.mDesc.GetElementSize() / invariant_total_length;
std::size_t num_thread = 1;
if(do_verification)
{
switch(init_method)
{
case 0: break;
case 1:
in.GenerateTensorValue(GeneratorTensor_1<InOutDataType>{1}, num_thread);
if(beta != 0.0f)
out_ref.GenerateTensorValue(GeneratorTensor_1<InOutDataType>{1}, num_thread);
break;
case 2:
in.GenerateTensorValue(GeneratorTensor_2<InOutDataType>{-5, 5}, num_thread);
if(beta != 0.0f)
out_ref.GenerateTensorValue(GeneratorTensor_2<InOutDataType>{-5, 5}, num_thread);
break;
default:
in.GenerateTensorValue(GeneratorTensor_3<InOutDataType>{-5.0, 5.0}, num_thread);
if(beta != 0.0f)
out_ref.GenerateTensorValue(GeneratorTensor_3<InOutDataType>{-5.0, 5.0},
num_thread);
}
if(beta != 0.0f)
for(size_t i = 0; i < out_ref.mDesc.GetElementSpaceSize(); i++)
out.mData[i] = out_ref.mData[i];
};
// these buffers are usually provided by the user application
DeviceMem in_dev(sizeof(InOutDataType) * in.mDesc.GetElementSpaceSize());
DeviceMem out_dev(sizeof(InOutDataType) * out.mDesc.GetElementSpaceSize());
in_dev.ToDevice(in.mData.data());
if(beta != 0.0f)
out_dev.ToDevice(out.mData.data());
InElementwiseOperation in_elementwise_op;
AccElementwiseOperation acc_elementwise_op;
std::tie(in_elementwise_op, acc_elementwise_op) =
reduce_unary_operator<ReduceOpId, true, true>::GetElementwiseOperator(
static_cast<int32_t>(reduce_total_length));
std::array<index_t, Rank> arrInLengths;
std::array<index_t, Rank> arrInStrides;
std::array<index_t, NumOutDim> arrOutLengths;
std::array<index_t, NumOutDim> arrOutStrides;
ck::ranges::copy(inLengths, arrInLengths.begin());
ck::ranges::copy(inStrides, arrInStrides.begin());
ck::ranges::copy(outLengths, arrOutLengths.begin());
ck::ranges::copy(outStrides, arrOutStrides.begin());
if(do_verification)
{
using ReferenceReduceInstance =
ck::tensor_operation::host::ReferenceReduce<InOutDataType,
AccDataType,
InOutDataType,
Rank,
NumReduceDim,
ReduceOperation,
InElementwiseOperation,
AccElementwiseOperation,
PropagateNan,
false>;
auto reduce_ref = ReferenceReduceInstance{};
auto argument_ptr_ref = reduce_ref.MakeArgumentPointer(arrInLengths,
arrInStrides,
arrOutLengths,
arrOutStrides,
reduceDims,
static_cast<double>(alpha),
static_cast<double>(beta),
in.mData.data(),
nullptr,
out_ref.mData.data(),
nullptr,
in_elementwise_op,
acc_elementwise_op);
if(!reduce_ref.IsSupportedArgument(argument_ptr_ref.get()))
{
std::cout << "The runtime parameters not supported by the reduce reference, exiting!"
<< std::endl;
return (-1);
};
auto invoker_ptr_ref = reduce_ref.MakeInvokerPointer();
invoker_ptr_ref->Run(argument_ptr_ref.get());
};
auto reduce = DeviceReduceInstance{};
auto argument_ptr = reduce.MakeArgumentPointer(arrInLengths,
arrInStrides,
arrOutLengths,
arrOutStrides,
reduceDims,
static_cast<double>(alpha),
static_cast<double>(beta),
in_dev.GetDeviceBuffer(),
nullptr,
out_dev.GetDeviceBuffer(),
nullptr,
in_elementwise_op,
acc_elementwise_op);
if(!reduce.IsSupportedArgument(argument_ptr.get()))
{
std::cerr << "The runtime parameters not supported by the DeviceReduce instance, exiting!"
<< std::endl;
return (-2);
};
std::string reduce_name = reduce.GetTypeString();
auto invoker_ptr = reduce.MakeInvokerPointer();
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t num_bytes = invariant_total_length * reduce_total_length * sizeof(InOutDataType) +
invariant_total_length * sizeof(InOutDataType);
float gb_per_sec = num_bytes / 1.E6 / avg_time;
std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s, " << reduce_name
<< std::endl;
bool pass = true;
if(do_verification)
{
out_dev.FromDevice(out.mData.data());
pass = pass && ck::utils::check_err(out, out_ref);
};
return (pass ? 0 : 1);
}
if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
add_example_executable(example_pool2d_fwd_fp16 pool2d_fwd_fp16.cpp)
endif()
if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES)
add_example_executable(example_pool2d_fwd_fp32 pool2d_fwd_fp32.cpp)
endif()
# Instructions for ```example_pool2d_fwd``` Examples
## Run ```example_pool2d_fwd_fp16```
```bash
#arg1: verification (0=no, 1=yes)
#arg2: initialization (0=no init, 1=single integer value, 2=scoped integer value, 3=decimal value)
#arg3: time kernel (0=no, 1=yes)
#arg4 to 17: N, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx
./bin/example_pool2d_fwd_fp16 1 1 1
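#or pass an explicit shape (argc must then be 18); the values below match the built-in defaults:
./bin/example_pool2d_fwd_fp16 1 1 1 128 192 3 3 71 71 2 2 1 1 1 1 1 1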
```
Result
```
in_n_c_hi_wi: dim 4, lengths {128, 192, 71, 71}, strides {967872, 1, 13632, 192}
out_n_c_ho_wo: dim 4, lengths {128, 192, 36, 36}, strides {248832, 1, 6912, 192}
launch_and_time_kernel: grid_dim {124416, 1, 1}, block_dim {64, 1, 1}
Warm up 1 time
Start running 10 times...
Perf: 0.397436 ms, 1.44252 TFlops, 783.713 GB/s
```
## Run ```example_pool2d_fwd_fp32```
```bash
#arg1: verification (0=no, 1=yes)
#arg2: initialization (0=no init, 1=single integer value, 2=scoped integer value, 3=decimal value)
#arg3: time kernel (0=no, 1=yes)
#arg4 to 17: N, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx
./bin/example_pool2d_fwd_fp32 1 1 1
```
Result
```
in_n_c_hi_wi: dim 4, lengths {128, 192, 71, 71}, strides {967872, 1, 13632, 192}
out_n_c_ho_wo: dim 4, lengths {128, 192, 36, 36}, strides {248832, 1, 6912, 192}
launch_and_time_kernel: grid_dim {124416, 1, 1}, block_dim {64, 1, 1}
Warm up 1 time
Start running 10 times...
Perf: 1.01823 ms, 0.563045 TFlops, 611.8 GB/s
```
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include "ck/ck.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/utility/reduction_functions_accumulate.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_pool2d_fwd_nhwc_nhwc.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp"
template <typename InDataType,
typename OutDataType,
typename ComputeDataType,
typename IndexDataType,
typename InLayout,
typename OutLayout,
ck::ReduceTensorOp ReduceOpId,
bool PropagateNan,
bool OutputIndex>
bool pool_test(bool do_verification,
int init_method,
bool time_kernel,
ck::index_t N,
ck::index_t C,
ck::index_t Y,
ck::index_t X,
ck::index_t Hi,
ck::index_t Wi,
ck::index_t window_stride_h,
ck::index_t window_stride_w,
ck::index_t window_dilation_h,
ck::index_t window_dilation_w,
ck::index_t in_left_pad_h,
ck::index_t in_left_pad_w,
ck::index_t in_right_pad_h,
ck::index_t in_right_pad_w)
{
using DevicePoolFwdInstance =
ck::tensor_operation::device::DevicePool2dFwd_NHWC_NHWC<InDataType,
OutDataType,
IndexDataType,
ComputeDataType,
ReduceOpId,
OutputIndex,
64, // BlockSize
64, // ReduceMThreadClusterSize
1, // ReduceKThreadClusterSize
4, // ReduceMThreadSliceSize
1, // ReduceKThreadSliceSize
1>; // InSrcOutDstVectorSize
const ck::index_t Ys = (Y - 1) * window_dilation_h + 1;
const ck::index_t Xs = (X - 1) * window_dilation_w + 1;
const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - Ys) / window_stride_h + 1;
const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - Xs) / window_stride_w + 1;
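// Ys/Xs are the effective (dilated) window extents; the output size follows the usual
// pooling formula Ho = (Hi + pad_top + pad_bottom - Ys) / stride_h + 1 (integer division),
// and likewise for Wo.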
const std::vector<ck::index_t> window_spatial_lengths{Y, X};
const std::vector<ck::index_t> window_strides{window_stride_h, window_stride_w};
const std::vector<ck::index_t> window_dilations{window_dilation_h, window_dilation_w};
const std::vector<ck::index_t> input_left_pads{in_left_pad_h, in_left_pad_w};
const std::vector<ck::index_t> input_right_pads{in_right_pad_h, in_right_pad_w};
// tensor layout
auto f_host_tensor_descriptor =
[](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W, auto layout) {
using namespace ck::literals;
if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NCHW>::value)
{
return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, H * W, W, 1_uz});
}
else if constexpr(ck::is_same<decltype(layout),
ck::tensor_layout::convolution::NHWC>::value)
{
return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, 1_uz, W * C_, C_});
}
};
Tensor<InDataType> in_n_c_hi_wi(f_host_tensor_descriptor(N, C, Hi, Wi, InLayout{}));
Tensor<OutDataType> out_n_c_ho_wo_host(f_host_tensor_descriptor(N, C, Ho, Wo, OutLayout{}));
Tensor<IndexDataType> out_indices_n_c_ho_wo_host(
f_host_tensor_descriptor(N, C, Ho, Wo, OutLayout{}));
Tensor<OutDataType> out_n_c_ho_wo_device(f_host_tensor_descriptor(N, C, Ho, Wo, OutLayout{}));
Tensor<IndexDataType> out_indices_n_c_ho_wo_device(
f_host_tensor_descriptor(N, C, Ho, Wo, OutLayout{}));
std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl;
std::cout << "out_n_c_ho_wo: " << out_n_c_ho_wo_host.mDesc << std::endl;
switch(init_method)
{
case 0: break;
case 1: in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_1<InDataType>{1}); break;
case 2: in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}); break;
default: in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{-5.0, 5.0});
}
DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpaceSize());
DeviceMem out_device_buf(sizeof(OutDataType) *
out_n_c_ho_wo_device.mDesc.GetElementSpaceSize());
DeviceMem out_indices_device_buf(sizeof(IndexDataType) *
out_indices_n_c_ho_wo_device.mDesc.GetElementSpaceSize());
in_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
auto pool = DevicePoolFwdInstance{};
auto invoker_ptr = pool.MakeInvokerPointer();
auto argument_ptr = pool.MakeArgumentPointer(
static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
static_cast<IndexDataType*>(out_indices_device_buf.GetDeviceBuffer()),
{N, C, Hi, Wi},
{Y, X},
{N, C, Ho, Wo},
{C * Hi * Wi, 1, Wi * C, C},
{C * Ho * Wo, 1, Wo * C, C},
{C * Ho * Wo, 1, Wo * C, C},
window_strides,
window_dilations,
input_left_pads,
input_right_pads,
{2, 3});
if(!pool.IsSupportedArgument(argument_ptr.get()))
{
throw std::runtime_error("wrong! device_op with the specified compilation parameters does "
"not support this problem");
}
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * N * C * Ho * Wo * Y * X;
std::size_t num_btype =
sizeof(InDataType) * (N * C * Hi * Wi) + sizeof(OutDataType) * (N * C * Ho * Wo);
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
<< " GB / s " << std::endl;
bool pass = true;
if(do_verification)
{
using ReferencePoolingFwdInstance =
ck::tensor_operation::host::ReferencePoolingFwd<4,
2,
InDataType,
OutDataType,
ComputeDataType,
IndexDataType,
ReduceOpId,
PropagateNan,
OutputIndex>;
auto ref_pooling = ReferencePoolingFwdInstance{};
auto ref_pooling_invoker = ref_pooling.MakeInvoker();
auto ref_pooling_argument = ref_pooling.MakeArgument(in_n_c_hi_wi,
out_n_c_ho_wo_host,
out_indices_n_c_ho_wo_host,
window_spatial_lengths,
window_strides,
window_dilations,
input_left_pads,
input_right_pads);
ref_pooling_invoker.Run(ref_pooling_argument);
out_device_buf.FromDevice(out_n_c_ho_wo_device.mData.data());
pass = pass && ck::utils::check_err(out_n_c_ho_wo_device, out_n_c_ho_wo_host);
if constexpr(OutputIndex)
{
out_indices_device_buf.FromDevice(out_indices_n_c_ho_wo_device.mData.data());
pass = pass &&
ck::utils::check_err(out_indices_n_c_ho_wo_device, out_indices_n_c_ho_wo_host);
};
}
return (pass);
};
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "pool2d_fwd_common.hpp"
using InDataType = ck::half_t;
using OutDataType = ck::half_t;
using ComputeDataType = float;
using IndexDataType = int32_t;
using InLayout = ck::tensor_layout::convolution::NHWC;
using OutLayout = ck::tensor_layout::convolution::NHWC;
#if 1
static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;
#else
static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
#endif
static constexpr bool OutputIndex = false;
static constexpr bool PropagateNan = false;
int main(int argc, char* argv[])
{
bool do_verification;
int init_method;
bool time_kernel;
// Pool shape
ck::index_t N = 128;
ck::index_t C = 192;
ck::index_t Y = 3;
ck::index_t X = 3;
ck::index_t Hi = 71;
ck::index_t Wi = 71;
ck::index_t window_stride_h = 2;
ck::index_t window_stride_w = 2;
ck::index_t window_dilation_h = 1;
ck::index_t window_dilation_w = 1;
ck::index_t in_left_pad_h = 1;
ck::index_t in_left_pad_w = 1;
ck::index_t in_right_pad_h = 1;
ck::index_t in_right_pad_w = 1;
if(argc == 1)
{
do_verification = true;
init_method = 1;
time_kernel = true;
}
else if(argc == 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = static_cast<bool>(std::stoi(argv[3]));
}
else if(argc == 18)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = static_cast<bool>(std::stoi(argv[3]));
N = std::stoi(argv[4]);
C = std::stoi(argv[5]);
Y = std::stoi(argv[6]);
X = std::stoi(argv[7]);
Hi = std::stoi(argv[8]);
Wi = std::stoi(argv[9]);
window_stride_h = std::stoi(argv[10]);
window_stride_w = std::stoi(argv[11]);
window_dilation_h = std::stoi(argv[12]);
window_dilation_w = std::stoi(argv[13]);
in_left_pad_h = std::stoi(argv[14]);
in_left_pad_w = std::stoi(argv[15]);
in_right_pad_h = std::stoi(argv[16]);
in_right_pad_w = std::stoi(argv[17]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: time kernel (0=no, 1=yes)\n");
printf("arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
"RightPx\n");
exit(0);
}
bool pass = pool_test<InDataType,
OutDataType,
ComputeDataType,
IndexDataType,
InLayout,
OutLayout,
ReduceOpId,
PropagateNan,
OutputIndex>(do_verification,
init_method,
time_kernel,
N,
C,
Y,
X,
Hi,
Wi,
window_stride_h,
window_stride_w,
window_dilation_h,
window_dilation_w,
in_left_pad_h,
in_left_pad_w,
in_right_pad_h,
in_right_pad_w);
return (pass ? 0 : 1);
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "ck/ck.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "pool2d_fwd_common.hpp"
using InDataType = float;
using OutDataType = float;
using ComputeDataType = float;
using IndexDataType = int32_t;
using InLayout = ck::tensor_layout::convolution::NHWC;
using OutLayout = ck::tensor_layout::convolution::NHWC;
#if 1
static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;
#else
static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
#endif
static constexpr bool OutputIndex = false;
static constexpr bool PropagateNan = false;
int main(int argc, char* argv[])
{
bool do_verification;
int init_method;
bool time_kernel;
// Pool shape
ck::index_t N = 128;
ck::index_t C = 192;
ck::index_t Y = 3;
ck::index_t X = 3;
ck::index_t Hi = 71;
ck::index_t Wi = 71;
ck::index_t window_stride_h = 2;
ck::index_t window_stride_w = 2;
ck::index_t window_dilation_h = 1;
ck::index_t window_dilation_w = 1;
ck::index_t in_left_pad_h = 1;
ck::index_t in_left_pad_w = 1;
ck::index_t in_right_pad_h = 1;
ck::index_t in_right_pad_w = 1;
if(argc == 1)
{
do_verification = true;
init_method = 1;
time_kernel = true;
}
else if(argc == 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = static_cast<bool>(std::stoi(argv[3]));
}
else if(argc == 18)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = static_cast<bool>(std::stoi(argv[3]));
N = std::stoi(argv[4]);
C = std::stoi(argv[5]);
Y = std::stoi(argv[6]);
X = std::stoi(argv[7]);
Hi = std::stoi(argv[8]);
Wi = std::stoi(argv[9]);
window_stride_h = std::stoi(argv[10]);
window_stride_w = std::stoi(argv[11]);
window_dilation_h = std::stoi(argv[12]);
window_dilation_w = std::stoi(argv[13]);
in_left_pad_h = std::stoi(argv[14]);
in_left_pad_w = std::stoi(argv[15]);
in_right_pad_h = std::stoi(argv[16]);
in_right_pad_w = std::stoi(argv[17]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: time kernel (0=no, 1=yes)\n");
printf("arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
"RightPx\n");
exit(0);
}
bool pass = pool_test<InDataType,
OutDataType,
ComputeDataType,
IndexDataType,
InLayout,
OutLayout,
ReduceOpId,
PropagateNan,
OutputIndex>(do_verification,
init_method,
time_kernel,
N,
C,
Y,
X,
Hi,
Wi,
window_stride_h,
window_stride_w,
window_dilation_h,
window_dilation_w,
in_left_pad_h,
in_left_pad_w,
in_right_pad_h,
in_right_pad_w);
return (pass ? 0 : 1);
}
if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES)
# dlops
if(DL_KERNELS)
add_example_executable(example_gemm_dl_quantization_int8 gemm_dl_quantization_int8.cpp)
endif()
# xdlops
list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
set(target 0)
foreach(gpu IN LISTS GPU_TARGETS)
if(gpu IN_LIST gpu_list AND target EQUAL 0)
add_example_executable(example_gemm_xdl_bias_relu_quantization_int8 gemm_xdl_bias_relu_quantization_int8.cpp)
add_example_executable(example_gemm_xdl_quantization_int8 gemm_xdl_quantization_int8.cpp)
set(target 1)
endif()
endforeach()
endif()
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_dl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using I8 = int8_t;
using I32 = int32_t;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using ActivationOp = PassThrough;
using CDEElementOp = ck::tensor_operation::element_wise::Activation_Mul_Clamp<ActivationOp>;
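// Requantization epilogue: Activation_Mul_Clamp applies the activation to the int32
// accumulator, multiplies by a float requantization scale, and saturates the result into
// the int8 output range.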
using ADataType = I8;
using BDataType = I8;
using AccDataType = I32;
using CShuffleDataType = I32;
using DsDataType = ck::Tuple<>;
using EDataType = I8;
using ALayout = Row;
using BLayout = Col;
using DsLayout = ck::Tuple<>;
using ELayout = Row;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Dl<
ALayout,
BLayout,
DsLayout,
ELayout,
ADataType,
BDataType,
AccDataType,
DsDataType,
EDataType,
AElementOp,
BElementOp,
CDEElementOp,
GemmDefault,
256, // BlockSize
128, // MPerBlock
128, // NPerBlock
16, // K0PerBlock
4, // K1
4, // M1PerThread
4, // N1PerThread
1, // KPerThread
S<8, 2>, // M1N1ThreadClusterM1Xs
S<8, 2>, // M1N1ThreadClusterN1Xs
S<8, 1, 1, 4>, // ABlockTransferThreadSliceLengths_K0_M0_M1_K1
S<2, 1, 128, 1>, // ABlockTransferThreadClusterLengths_K0_M0_M1_K1
S<1, 2, 0, 3>, // ABlockTransferThreadClusterArrangeOrder
S<1, 2, 0, 3>, // ABlockTransferSrcAccessOrder
S<4, 1, 1, 4>, // ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1
S<1, 2, 0, 3>, // ABlockTransferSrcVectorTensorContiguousDimOrder
S<1, 1, 1, 4>, // ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1
S<8, 1, 1, 4>, // BBlockTransferThreadSliceLengths_K0_N0_N1_K1
S<2, 1, 128, 1>, // BBlockTransferThreadClusterLengths_K0_N0_N1_K1
S<1, 2, 0, 3>, // BBlockTransferThreadClusterArrangeOrder
S<1, 2, 0, 3>, // BBlockTransferSrcAccessOrder
S<4, 1, 1, 4>, // BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1
S<1, 2, 0, 3>, // BBlockTransferSrcVectorTensorContiguousDimOrder
S<1, 1, 1, 4>, // BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1
S<0, 1, 2, 3, 4, 5>, // CThreadTransferSrcDstAccessOrder
5, // CThreadTransferSrcDstVectorDim
4>; // CThreadTransferDstScalarPerVector
using ReferenceGemmInstance = ck::tensor_operation::host::
ReferenceGemm<ADataType, BDataType, EDataType, float, PassThrough, PassThrough, CDEElementOp>;
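// Note: the host reference accumulates in float rather than int32. With int8 inputs in
// [-5, 5] and K = 1024 the sums stay far below 2^24, so float accumulation is still exact
// and verification remains meaningful.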
int main()
{
bool do_verification = true;
bool time_kernel = false;
// GEMM shape
ck::index_t M = 1024;
ck::index_t N = 1024;
ck::index_t K = 1024;
ck::index_t StrideA = 1024;
ck::index_t StrideB = 1024;
ck::index_t StrideE = 1024;
float requant_scale = 0.03f;
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
using namespace ck::literals;
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({stride, 1_uz}));
}
else
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({1_uz, stride}));
}
};
Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;
a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
a_device_buf.ToDevice(a_m_k.mData.data());
b_device_buf.ToDevice(b_k_n.mData.data());
auto a_element_op = AElementOp{};
auto b_element_op = BElementOp{};
auto cde_element_op = CDEElementOp{requant_scale, ActivationOp{}};
// do GEMM
auto gemm = DeviceGemmInstance{};
auto invoker = gemm.MakeInvoker();
auto argument = gemm.MakeArgument(a_device_buf.GetDeviceBuffer(),
b_device_buf.GetDeviceBuffer(),
{},
e_device_buf.GetDeviceBuffer(),
M,
N,
K,
StrideA,
StrideB,
{},
StrideE,
a_element_op,
b_element_op,
cde_element_op);
if(!gemm.IsSupportedArgument(argument))
{
throw std::runtime_error(
"wrong! device_gemm with the specified compilation parameters does "
"not support this GEMM problem");
}
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_btype =
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< gemm.GetTypeString() << std::endl;
e_device_buf.FromDevice(e_m_n_device_result.mData.data());
if(do_verification)
{
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument = ref_gemm.MakeArgument(
a_m_k, b_k_n, e_m_n_host_result, a_element_op, b_element_op, cde_element_op);
ref_invoker.Run(ref_argument);
return ck::utils::check_err(e_m_n_device_result, e_m_n_host_result) ? 0 : 1;
}
return 0;
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
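// Quantized int8 GEMM with a fused int32 bias and ReLU,
// E = int8(scale * relu(A * B + bias)), using the XDL CShuffle pipeline.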
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using I8 = int8_t;
using I32 = int32_t;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using ActivationOp = ck::tensor_operation::element_wise::Relu;
using CDEElementOp = ck::tensor_operation::element_wise::Add_Activation_Mul_Clamp<ActivationOp>;
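// Add_Activation_Mul_Clamp adds the int32 bias to the accumulator, applies the
// activation (ReLU here), scales by the requantization factor, and clamps the
// result into the int8 output range.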
using ADataType = I8;
using BDataType = I8;
using AccDataType = I32;
using CShuffleDataType = I32;
using BiasDataType = I32;
using DsDataType = ck::Tuple<BiasDataType>;
using EDataType = I8;
using ALayout = Row;
using BLayout = Col;
using BiasLayout = Row;
using DsLayout = ck::Tuple<BiasLayout>;
using ELayout = Row;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
// clang-format off
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_CShuffle<
ALayout,
BLayout,
DsLayout,
ELayout,
ADataType,
BDataType,
AccDataType,
CShuffleDataType,
DsDataType,
EDataType,
PassThrough, // AElementwiseOperation,
PassThrough, // BElementwiseOperation,
CDEElementOp, // CDEElementwiseOperation,
GemmDefault, // GemmSpecialization GemmSpec,
1, // NumGemmKPrefetchStage,
256, // BlockSize,
256, // MPerBlock,
128, // NPerBlock,
64, // KPerBlock,
16, // AK1,
16, // BK1,
32, // MPerXDL,
32, // NPerXDL,
4, // MXdlPerWave,
2, // NXdlPerWave,
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1,
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder,
S<1, 0, 2>, // ABlockTransferSrcAccessOrder,
2, // index_t ABlockTransferSrcVectorDim,
16, // index_t ABlockTransferSrcScalarPerVector,
16, // index_t ABlockTransferDstScalarPerVector_AK1,
1, // bool ABlockLdsExtraM,
S<4, 64, 1>, // typename BBlockTransferThreadClusterLengths_BK0_N_BK1,
S<1, 0, 2>, // typename BBlockTransferThreadClusterArrangeOrder,
S<1, 0, 2>, // typename BBlockTransferSrcAccessOrder,
2, // index_t BBlockTransferSrcVectorDim,
8, // index_t BBlockTransferSrcScalarPerVector,
8, // index_t BBlockTransferDstScalarPerVector_BK1,
1, // bool BBlockLdsExtraN,
1, // index_t CShuffleMXdlPerWavePerShuffle,
1, // index_t CShuffleNXdlPerWavePerShuffle,
S<1, 64, 1, 4>, // typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
8>; // index_t CShuffleBlockTransferScalarPerVector_NPerBlock>
// clang-format on
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
BDataType,
AccDataType,
AccDataType,
PassThrough,
PassThrough,
PassThrough>;
int main()
{
bool do_verification = true;
bool time_kernel = false;
// GEMM shape
ck::index_t M = 1024;
ck::index_t N = 1024;
ck::index_t K = 1024;
ck::index_t StrideA = 1024;
ck::index_t StrideB = 1024;
ck::index_t StrideBias = 0; // 1-D bias of length N, broadcast along M via a zero row stride
ck::index_t StrideE = 1024;
float requant_scale = 0.03f;
auto f_host_tensor_descriptor2d =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
using namespace ck::literals;
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({stride, 1_uz}));
}
else
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({1_uz, stride}));
}
};
auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) {
return HostTensorDescriptor(std::vector<std::size_t>({len}),
std::vector<std::size_t>({stride}));
};
Tensor<ADataType> a_m_k(f_host_tensor_descriptor2d(M, K, StrideA, ALayout{}));
Tensor<BDataType> b_k_n(f_host_tensor_descriptor2d(K, N, StrideB, BLayout{}));
Tensor<BiasDataType> bias_n(f_host_tensor_descriptor1d(N, 1));
Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor2d(M, N, StrideE, ELayout{}));
Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor2d(M, N, StrideE, ELayout{}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "bias_n: " << bias_n.mDesc << std::endl;
std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;
a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-128, 127});
b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-128, 127});
bias_n.GenerateTensorValue(GeneratorTensor_2<BiasDataType>{-128, 127});
DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem bias_device_buf(sizeof(BiasDataType) * bias_n.mDesc.GetElementSpaceSize());
DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
a_device_buf.ToDevice(a_m_k.mData.data());
b_device_buf.ToDevice(b_k_n.mData.data());
bias_device_buf.ToDevice(bias_n.mData.data());
auto a_element_op = PassThrough{};
auto b_element_op = PassThrough{};
auto cde_element_op = CDEElementOp{requant_scale, ActivationOp{}};
// do GEMM
auto gemm = DeviceGemmInstance{};
auto invoker = gemm.MakeInvoker();
auto argument = gemm.MakeArgument(a_device_buf.GetDeviceBuffer(),
b_device_buf.GetDeviceBuffer(),
{bias_device_buf.GetDeviceBuffer()},
e_device_buf.GetDeviceBuffer(),
M,
N,
K,
StrideA,
StrideB,
{StrideBias},
StrideE,
a_element_op,
b_element_op,
cde_element_op);
if(!gemm.IsSupportedArgument(argument))
{
throw std::runtime_error(
"wrong! device_gemm with the specified compilation parameters does "
"not support this GEMM problem");
}
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_btype =
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< gemm.GetTypeString() << std::endl;
e_device_buf.FromDevice(e_m_n_device_result.mData.data());
if(do_verification)
{
Tensor<AccDataType> c_m_n(HostTensorDescriptor{M, N});
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument =
ref_gemm.MakeArgument(a_m_k, b_k_n, c_m_n, a_element_op, b_element_op, PassThrough{});
ref_invoker.Run(ref_argument);
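// Reproduce the kernel's fused epilogue on the host: add the bias, apply ReLU,
// scale, and clamp each int32 result down to int8.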
for(int m = 0; m < M; ++m)
{
for(int n = 0; n < N; ++n)
{
cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n), bias_n(n));
}
}
return ck::utils::check_err(e_m_n_device_result, e_m_n_host_result) ? 0 : 1;
}
return 0;
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
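// Pure requantization variant of the int8 GEMM: E = int8(scale * (A * B)) with
// no bias and no activation, using the XDL CShuffle pipeline.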
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using I8 = int8_t;
using I32 = int32_t;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using ActivationOp = PassThrough;
using CDEElementOp = ck::tensor_operation::element_wise::Activation_Mul_Clamp<ActivationOp>;
using ADataType = I8;
using BDataType = I8;
using AccDataType = I32;
using CShuffleDataType = I32;
using DsDataType = ck::Tuple<>;
using EDataType = I8;
using ALayout = Row;
using BLayout = Col;
using DsLayout = ck::Tuple<>;
using ELayout = Row;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
// clang-format off
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_CShuffle<
ALayout,
BLayout,
DsLayout,
ELayout,
ADataType,
BDataType,
AccDataType,
CShuffleDataType,
DsDataType,
EDataType,
PassThrough, // AElementwiseOperation,
PassThrough, // BElementwiseOperation,
CDEElementOp, // CDEElementwiseOperation,
GemmDefault, // GemmSpecialization GemmSpec,
1, // NumGemmKPrefetchStage,
256, // BlockSize,
256, // MPerBlock,
128, // NPerBlock,
64, // KPerBlock,
16, // AK1,
16, // BK1,
32, // MPerXDL,
32, // NPerXDL,
4, // MXdlPerWave,
2, // NXdlPerWave,
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1,
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder,
S<1, 0, 2>, // ABlockTransferSrcAccessOrder,
2, // index_t ABlockTransferSrcVectorDim,
16, // index_t ABlockTransferSrcScalarPerVector,
16, // index_t ABlockTransferDstScalarPerVector_AK1,
1, // bool ABlockLdsExtraM,
S<4, 64, 1>, // typename BBlockTransferThreadClusterLengths_BK0_N_BK1,
S<1, 0, 2>, // typename BBlockTransferThreadClusterArrangeOrder,
S<1, 0, 2>, // typename BBlockTransferSrcAccessOrder,
2, // index_t BBlockTransferSrcVectorDim,
8, // index_t BBlockTransferSrcScalarPerVector,
8, // index_t BBlockTransferDstScalarPerVector_BK1,
1, // bool BBlockLdsExtraN,
1, // index_t CShuffleMXdlPerWavePerShuffle,
1, // index_t CShuffleNXdlPerWavePerShuffle,
S<1, 64, 1, 4>, // typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
16>; // index_t CShuffleBlockTransferScalarPerVector_NPerBlock>
// clang-format on
using ReferenceGemmInstance = ck::tensor_operation::host::
ReferenceGemm<ADataType, BDataType, EDataType, float, PassThrough, PassThrough, CDEElementOp>;
int main()
{
bool do_verification = true;
bool time_kernel = false;
// GEMM shape
ck::index_t M = 1024;
ck::index_t N = 1024;
ck::index_t K = 1024;
ck::index_t StrideA = 1024;
ck::index_t StrideB = 1024;
ck::index_t StrideE = 1024;
float requant_scale = 0.03f;
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
using namespace ck::literals;
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({stride, 1_uz}));
}
else
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({1_uz, stride}));
}
};
Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;
a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-128, 127});
b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-128, 127});
DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
a_device_buf.ToDevice(a_m_k.mData.data());
b_device_buf.ToDevice(b_k_n.mData.data());
auto a_element_op = PassThrough{};
auto b_element_op = PassThrough{};
auto cde_element_op = CDEElementOp{requant_scale, ActivationOp{}};
// do GEMM
auto gemm = DeviceGemmInstance{};
auto invoker = gemm.MakeInvoker();
auto argument = gemm.MakeArgument(a_device_buf.GetDeviceBuffer(),
b_device_buf.GetDeviceBuffer(),
{},
e_device_buf.GetDeviceBuffer(),
M,
N,
K,
StrideA,
StrideB,
{},
StrideE,
a_element_op,
b_element_op,
cde_element_op);
if(!gemm.IsSupportedArgument(argument))
{
throw std::runtime_error(
"wrong! device_gemm with the specified compilation parameters does "
"not support this GEMM problem");
}
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_btype =
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< gemm.GetTypeString() << std::endl;
e_device_buf.FromDevice(e_m_n_device_result.mData.data());
if(do_verification)
{
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument = ref_gemm.MakeArgument(
a_m_k, b_k_n, e_m_n_host_result, a_element_op, b_element_op, cde_element_op);
ref_invoker.Run(ref_argument);
return ck::utils::check_err(e_m_n_device_result, e_m_n_host_result) ? 0 : 1;
}
return 0;
}
add_custom_target(example_grouped_gemm_xdl)
if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES)
add_example_executable(example_grouped_gemm_xdl_fp32 grouped_gemm_xdl_fp32.cpp)
add_dependencies(example_grouped_gemm_xdl example_grouped_gemm_xdl_fp32)
endif()
if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
add_example_executable(example_grouped_gemm_xdl_fp16 grouped_gemm_xdl_fp16.cpp)
add_example_executable(example_grouped_gemm_multiple_d_dl_fp16 grouped_gemm_multiple_d_dl_fp16.cpp)
add_example_executable(example_grouped_gemm_xdl_splitk_fp16 grouped_gemm_xdl_splitk_fp16.cpp)
add_example_executable(example_grouped_gemm_xdl_fixed_nk_fp16 grouped_gemm_xdl_fixed_nk_fp16.cpp)
add_example_executable(example_grouped_gemm_xdl_fixed_nk_bias_fp16 grouped_gemm_xdl_fixed_nk_bias_fp16.cpp)
add_dependencies(example_grouped_gemm_xdl
example_grouped_gemm_xdl_fp16
example_grouped_gemm_multiple_d_dl_fp16
example_grouped_gemm_xdl_splitk_fp16
example_grouped_gemm_xdl_fixed_nk_fp16
example_grouped_gemm_xdl_fixed_nk_bias_fp16)
endif()
if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES)
add_example_executable(example_grouped_gemm_xdl_bfp16 grouped_gemm_xdl_bfp16.cpp)
add_dependencies(example_grouped_gemm_xdl example_grouped_gemm_xdl_bfp16)
endif()
if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES)
add_example_executable(example_grouped_gemm_xdl_int8 grouped_gemm_xdl_int8.cpp)
add_dependencies(example_grouped_gemm_xdl example_grouped_gemm_xdl_int8)
endif()
if(USE_BITINT_EXTENSION_INT4)
add_example_executable(example_grouped_gemm_xdl_int4 grouped_gemm_xdl_int4.cpp)
add_dependencies(example_grouped_gemm_xdl example_grouped_gemm_xdl_int4)
endif()
# Instructions for ```example_grouped_gemm_xdl```
## Run ```example_grouped_gemm_xdl```
```bash
#arg1: verification (0=no, 1=yes)
#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
#arg3: run kernel # of times (>1)
./bin/example_grouped_gemm_xdl_fp16 0 1 5
```
Result (MI100 @ 1087 MHz, 133.5 TFlops peak FP16)
```
gemm[0] a_m_k: dim 2, lengths {256, 64}, strides {64, 1} b_k_n: dim 2, lengths {64, 128}, strides {1, 64} c_m_n: dim 2, lengths {256, 128}, strides {128, 1}
gemm[1] a_m_k: dim 2, lengths {512, 128}, strides {128, 1} b_k_n: dim 2, lengths {128, 256}, strides {1, 128} c_m_n: dim 2, lengths {512, 256}, strides {256, 1}
gemm[2] a_m_k: dim 2, lengths {768, 192}, strides {192, 1} b_k_n: dim 2, lengths {192, 384}, strides {1, 192} c_m_n: dim 2, lengths {768, 384}, strides {384, 1}
gemm[3] a_m_k: dim 2, lengths {1024, 256}, strides {256, 1} b_k_n: dim 2, lengths {256, 512}, strides {1, 256} c_m_n: dim 2, lengths {1024, 512}, strides {512, 1}
group: 0 arg.a_grid_desc_k0_m_k1_{8, 256, 8}, arg.b_grid_desc_k0_n_k1_{8, 128, 8}, arg.c_grid_desc_m_n_{ 256, 128}
group: 1 arg.a_grid_desc_k0_m_k1_{16, 512, 8}, arg.b_grid_desc_k0_n_k1_{16, 256, 8}, arg.c_grid_desc_m_n_{ 512, 256}
group: 2 arg.a_grid_desc_k0_m_k1_{24, 768, 8}, arg.b_grid_desc_k0_n_k1_{24, 384, 8}, arg.c_grid_desc_m_n_{ 768, 384}
group: 3 arg.a_grid_desc_k0_m_k1_{32, 1024, 8}, arg.b_grid_desc_k0_n_k1_{32, 512, 8}, arg.c_grid_desc_m_n_{ 1024, 512}
launch_and_time_kernel: grid_dim {30, 1, 1}, block_dim {256, 1, 1}
Warm up
Start running 5 times...
Perf: 0.037887 ms, 11.0706 TFlops, 90.8132 GB/s, DeviceGroupedGemmXdl<256, 256, 128, 4, 8, 32, 32, 4, 2>
```
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
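// Grouped GEMM: several independent GEMM problems are launched as a single kernel.
// This fp16 instance uses the DL pipeline with no extra D tensors (DsDataType = ck::Tuple<>).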
#include <algorithm>
#include <cstddef>
#include <initializer_list>
#include <iostream>
#include <numeric>
#include <stdexcept>
#include <string>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/utility/data_type.hpp"
#include "ck/utility/tuple.hpp"
#include "ck/utility/sequence.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using F16 = ck::half_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using ADataType = F16;
using BDataType = F16;
using AccDataType = F32;
using DsDataType = ck::Tuple<>;
using EDataType = F16;
using ALayout = Row;
using BLayout = Row;
using DsLayout = ck::Tuple<>;
using ELayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CDEElementOp = PassThrough;
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNPadding;
// clang-format off
using DeviceGemmInstance = ck::tensor_operation::device::
// ##################| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| DsData| EData| A| B| CDE| GEMM| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer|
// ##################| | | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
// ##################| | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_N0_N1_K1| K0_N0_N1_K1| ArrangeOrder| Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1| Order| | |
// ##################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
DeviceGroupedGemmMultipleD_Dl< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 256, 128, 128, 16, 2, 4, 4, 1, S<8, 2>, S<8, 2>, S<8, 1, 1, 2>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<2, 1, 4, 2>, S<8, 1, 32, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>, 5, 4>;
// clang-format on
#include "run_grouped_gemm_example.inc"
int main(int argc, char* argv[]) { return !run_grouped_gemm_example(argc, argv); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
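// bf16 grouped GEMM with fp32 accumulation, sharing the common driver in
// run_grouped_gemm_example.inc.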
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using BF16 = ck::bhalf_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using ADataType = BF16;
using BDataType = BF16;
using AccDataType = F32;
using CShuffleDataType = BF16;
using DsDataType = ck::Tuple<>;
using EDataType = BF16;
using ALayout = Row;
using BLayout = Col;
using DsLayout = ck::Tuple<>;
using ELayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CDEElementOp = PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGroupedGemm_Xdl
// clang-format off
//######| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
//######| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//######| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>;
// clang-format on
#include "run_grouped_gemm_example.inc"
int main(int argc, char* argv[]) { return !run_grouped_gemm_example(argc, argv); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
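// Fixed-NK grouped GEMM: all groups share the same N and K (only M varies per
// group), and per-group kernel arguments are staged in device memory (see
// SetDeviceKernelArgs below) so they can be updated without rebuilding the
// host-side argument. Each group also adds a row-broadcast fp32 bias (D0).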
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_fixed_nk.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_gemm.hpp"
#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using F16 = ck::half_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using Add = ck::tensor_operation::element_wise::Add;
using ADataType = F16;
using BDataType = F16;
using AccDataType = F32;
using CShuffleDataType = F32;
using D0DataType = F32;
using DsDataType = ck::Tuple<D0DataType>;
using EDataType = F32;
using ALayout = Row;
using BLayout = Row;
using D0Layout = Row;
using DsLayout = ck::Tuple<D0Layout>;
using ELayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CDEElementOp = Add;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::MPadding;
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGroupedGemm_Xdl_Fixed_NK
// clang-format off
//######| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
//######| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//######| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 128, 16, 128, 32, 8, 8, 16, 16, 1, 4, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 4>;
// clang-format on
struct ProblemSize final
{
std::vector<ck::index_t> Ms;
std::vector<ck::index_t> Ns;
std::vector<ck::index_t> Ks;
std::vector<ck::index_t> stride_As;
std::vector<ck::index_t> stride_Bs;
std::vector<ck::index_t> stride_Cs;
ck::index_t group_count;
};
struct ExecutionConfig final
{
bool do_verification = true;
int init_method = 1;
bool time_kernel = false;
int k_batch = 1;
};
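// k_batch is the split-K factor: K is partitioned into k_batch slices whose
// partial products are accumulated into the output.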
bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
{
auto group_count = problem_size.group_count;
// GEMM shape
std::vector<ck::tensor_operation::device::GemmDesc> gemm_descs;
gemm_descs.reserve(group_count);
int sum_of_m = 0;
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
using namespace ck::literals;
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
std::vector<Tensor<ADataType>> a_tensors;
std::vector<Tensor<BDataType>> b_tensors;
std::vector<Tensor<D0DataType>> d0_tensors;
std::vector<Tensor<EDataType>> c_host_tensors;
std::vector<Tensor<EDataType>> c_device_tensors;
a_tensors.reserve(group_count);
b_tensors.reserve(group_count);
d0_tensors.reserve(group_count);
c_host_tensors.reserve(group_count);
c_device_tensors.reserve(group_count);
using DeviceMemPtr = std::unique_ptr<DeviceMem>;
std::vector<DeviceMemPtr> a_tensors_device, b_tensors_device, d0_tensors_device,
c_tensors_device;
a_tensors_device.reserve(group_count);
b_tensors_device.reserve(group_count);
d0_tensors_device.reserve(group_count);
c_tensors_device.reserve(group_count);
std::size_t flop = 0, num_btype = 0;
for(int i = 0; i < group_count; i++)
{
sum_of_m += problem_size.Ms[i];
a_tensors.push_back(Tensor<ADataType>(f_host_tensor_descriptor(
problem_size.Ms[i], problem_size.Ks[i], problem_size.stride_As[i], ALayout{})));
b_tensors.push_back(Tensor<BDataType>(f_host_tensor_descriptor(
problem_size.Ks[i], problem_size.Ns[i], problem_size.stride_Bs[i], BLayout{})));
d0_tensors.push_back(Tensor<D0DataType>(
f_host_tensor_descriptor(problem_size.Ms[i], problem_size.Ns[i], 0, ELayout{})));
c_host_tensors.push_back(Tensor<EDataType>(f_host_tensor_descriptor(
problem_size.Ms[i], problem_size.Ns[i], problem_size.stride_Cs[i], ELayout{})));
c_device_tensors.push_back(Tensor<EDataType>(f_host_tensor_descriptor(
problem_size.Ms[i], problem_size.Ns[i], problem_size.stride_Cs[i], ELayout{})));
std::cout << "gemm[" << i << "] a_m_k: " << a_tensors[i].mDesc
<< " b_k_n: " << b_tensors[i].mDesc << " d_m_n: " << d0_tensors[i].mDesc
<< " c_m_n: " << c_device_tensors[i].mDesc << std::endl;
flop += std::size_t(2) * problem_size.Ms[i] * problem_size.Ks[i] * problem_size.Ns[i];
num_btype += sizeof(ADataType) * a_tensors[i].mDesc.GetElementSize() +
sizeof(BDataType) * b_tensors[i].mDesc.GetElementSize() +
sizeof(D0DataType) * d0_tensors[i].mDesc.GetElementSize() +
sizeof(EDataType) * c_device_tensors[i].mDesc.GetElementSize();
switch(config.init_method)
{
case 0: break;
case 1:
a_tensors[i].GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
b_tensors[i].GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
break;
case 2:
a_tensors[i].GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
b_tensors[i].GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
break;
default:
a_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<0>{});
b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<1>{});
}
d0_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<1>{});
}
using GroupedGemmKernelArgument = ck::tensor_operation::device::GroupedGemmKernelArgument<1>;
std::vector<GroupedGemmKernelArgument> grouped_gemm_kernel_args_;
grouped_gemm_kernel_args_.reserve(group_count);
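// Note: the A and E device buffers below are sized with sum_of_m (the total M
// over all groups) rather than each group's own M, which leaves headroom for
// the fixed-NK path to redistribute rows between groups at run time.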
for(int i = 0; i < group_count; i++)
{
a_tensors_device.emplace_back(
std::make_unique<DeviceMem>(sizeof(ADataType) * sum_of_m * problem_size.Ks[i]));
b_tensors_device.emplace_back(std::make_unique<DeviceMem>(
sizeof(BDataType) * problem_size.Ns[i] * problem_size.Ks[i]));
d0_tensors_device.emplace_back(
std::make_unique<DeviceMem>(sizeof(D0DataType) * problem_size.Ns[i]));
c_tensors_device.emplace_back(
std::make_unique<DeviceMem>(sizeof(EDataType) * sum_of_m * problem_size.Ns[i]));
a_tensors_device[i]->ToDevice(a_tensors[i].mData.data(),
a_tensors[i].mDesc.GetElementSpaceSize() * sizeof(ADataType));
b_tensors_device[i]->ToDevice(b_tensors[i].mData.data(),
b_tensors[i].mDesc.GetElementSpaceSize() * sizeof(BDataType));
d0_tensors_device[i]->ToDevice(d0_tensors[i].mData.data());
c_tensors_device[i]->SetZero();
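// Only the shapes in gemm_descs matter here; the actual pointers and strides
// reach the kernel through GroupedGemmKernelArgument below, so placeholder
// strides of 1 are passed for A and E.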
gemm_descs.push_back({sum_of_m,
problem_size.Ns[i],
problem_size.Ks[i],
1,
problem_size.stride_Bs[i],
1,
{0}});
grouped_gemm_kernel_args_.push_back(
{a_tensors_device[i]->GetDeviceBuffer(),
b_tensors_device[i]->GetDeviceBuffer(),
std::array<const void*, 1>{d0_tensors_device[i]->GetDeviceBuffer()},
c_tensors_device[i]->GetDeviceBuffer(),
problem_size.Ms[i],
problem_size.Ns[i],
problem_size.Ks[i],
problem_size.stride_As[i],
problem_size.stride_Bs[i],
std::array<ck::index_t, 1>{0},
problem_size.stride_Cs[i]});
}
auto a_element_op = AElementOp{};
auto b_element_op = BElementOp{};
auto cde_element_op = CDEElementOp{};
auto gemm = DeviceGemmInstance{};
auto invoker = gemm.MakeInvoker();
std::vector<const void*> p_As = {};
std::vector<const void*> p_Bs = {};
std::vector<std::array<const void*, 1>> p_Ds = {};
std::vector<void*> p_Cs = {};
// do GEMM
auto argument = gemm.MakeArgument(
p_As, p_Bs, p_Ds, p_Cs, gemm_descs, a_element_op, b_element_op, cde_element_op);
if(!gemm.IsSupportedArgument(argument))
{
throw std::runtime_error(
"wrong! device_gemm with the specified compilation parameters does "
"not support this GEMM problem");
}
DeviceMem gemm_workspace_dev(gemm.GetWorkSpaceSize(&argument));
gemm.SetWorkSpacePointer(&argument, gemm_workspace_dev.GetDeviceBuffer());
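// The fixed-NK path reads per-group arguments from device memory: copy the
// host-side argument array over and hand its device pointer to the kernel.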
DeviceMem gemm_kernel_args_dev(gemm.GetDeviceKernelArgSize(&argument));
hip_check_error(hipMemcpy(gemm_kernel_args_dev.GetDeviceBuffer(),
grouped_gemm_kernel_args_.data(),
gemm.GetDeviceKernelArgSize(&argument),
hipMemcpyHostToDevice));
gemm.SetDeviceKernelArgs(argument, gemm_kernel_args_dev.GetDeviceBuffer());
gemm.SetKBatch(argument, config.k_batch);
invoker.Run(argument, StreamConfig{nullptr, false});
if(config.time_kernel)
{
float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
<< " GB/s, " << gemm.GetTypeString() << std::endl;
}
bool pass = true;
if(config.do_verification)
{
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
BDataType,
EDataType,
AccDataType,
AElementOp,
BElementOp,
PassThrough>;
for(std::size_t i = 0; i < gemm_descs.size(); i++)
{
c_tensors_device[i]->FromDevice(c_device_tensors[i].mData.data(),
c_device_tensors[i].mDesc.GetElementSize() *
sizeof(EDataType));
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument = ref_gemm.MakeArgument(a_tensors[i],
b_tensors[i],
c_host_tensors[i],
a_element_op,
b_element_op,
PassThrough{});
ref_invoker.Run(ref_argument);
for(int m = 0; m < problem_size.Ms[i]; ++m)
{
for(int n = 0; n < problem_size.Ns[i]; ++n)
{
cde_element_op(
c_host_tensors[i](m, n), c_host_tensors[i](m, n), d0_tensors[i](m, n));
}
}
pass &= ck::utils::check_err(c_device_tensors[i], c_host_tensors[i]);
}
}
return pass;
}
int main(int argc, char* argv[])
{
ProblemSize problem_size;
ExecutionConfig config;
problem_size.group_count = 16;
problem_size.Ms = {0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0};
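// Most groups are empty (M == 0); only groups 1 and 14 do any work, which
// exercises the kernel's handling of zero-sized problems.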
for(int i = 0; i < problem_size.group_count; i++)
{
problem_size.Ns.push_back(768);
problem_size.Ks.push_back(4608);
problem_size.stride_As.push_back(problem_size.Ks[i]);
problem_size.stride_Bs.push_back(problem_size.Ns[i]);
problem_size.stride_Cs.push_back(problem_size.Ns[i]);
}
if(argc == 5)
{
config.do_verification = std::stoi(argv[1]);
config.init_method = std::stoi(argv[2]);
config.time_kernel = std::stoi(argv[3]);
config.k_batch = std::stoi(argv[4]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: time kernel (0=n0, 1=yes)\n");
printf("arg4: k_batch (>0)\n");
exit(0);
}
return !run_grouped_gemm(problem_size, config);
}