Commit 300337cd authored by letaoqin

Merge branch 'develop' into jizhan/reduce_threadwise_multi_d

parents f306d02e 02fa2c29
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
......@@ -78,6 +78,9 @@ using HostConvBwdWeightInstance = ck::tensor_operation::host::ReferenceConvBwdWe
InElementOp,
WeiElementOp,
OutElementOp,
0,
0,
0,
ComputeTypeA,
ComputeTypeB>;
......
......@@ -119,7 +119,10 @@ bool run_grouped_conv_bwd_weight(const ExecutionConfig& config,
conv_param.input_right_pads_,
InElementOp{},
WeiElementOp{},
OutElementOp{});
OutElementOp{},
{},
{},
{});
ref_invoker.Run(ref_argument);
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
......@@ -9,7 +9,7 @@
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_dynamic_vector_dims_impl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/device_memory.hpp"
......@@ -103,9 +103,14 @@ using DeviceNormalizeInstance = ck::tensor_operation::device::DeviceElementwiseI
ck::Tuple<LayerNormOutDataType>, // y
NormalizeFunctor,
2,
8, // MPerthread
ck::Sequence<8, 1, 1, 8, 8>, // scalarPerVector: x(gemm_out), mean, meansquare, gamma, beta
ck::Sequence<8>>; // scalarPerVector: y(layerNorm_out)
64, // BlockSize
16, // MPerBlock
16, // NPerBlock
2, // MPerThread
2, // NPerThread
ck::Sequence<1, 0>, // ThreadClusterArrangeOrder
ck::Sequence<2, 1, 1, 2, 2>, // scalarPerVector: x(gemm_out), mean, meansquare, gamma, beta
ck::Sequence<2>>; // scalarPerVector: y(layerNorm_out)
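// Sanity check on the new tiling: (MPerBlock * NPerBlock) / (MPerThread * NPerThread) = (16 * 16) / (2 * 2) = 64,
// which equals BlockSize, so each of the 64 threads in a block covers one 2x2 sub-tile.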
auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) {
return HostTensorDescriptor({len}, {stride});
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
......@@ -9,7 +9,7 @@
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_dynamic_vector_dims_impl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/device_memory.hpp"
......@@ -102,9 +102,14 @@ using DeviceNormalizeInstance = ck::tensor_operation::device::DeviceElementwiseI
ck::Tuple<LayerNormOutDataType>, // y
NormalizeFunctor,
2,
8, // MPerthread
ck::Sequence<8, 1, 1, 8, 8>, // scalarPerVector: x(gemm_out), mean, meansquare, gamma, beta
ck::Sequence<8>>; // scalarPerVector: y(layerNorm_out)
64, // BlockSize
16, // MPerBlock
16, // NPerBlock
2, // MPerThread
2, // NPerThread
ck::Sequence<1, 0>, // ThreadClusterArrangeOrder
ck::Sequence<2, 1, 1, 2, 2>, // scalarPerVector: x(gemm_out), mean, meansquare, gamma, beta
ck::Sequence<2>>; // scalarPerVector: y(layerNorm_out)
auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) {
return HostTensorDescriptor({len}, {stride});
......
......@@ -7,14 +7,3 @@
#arg3: time kernel (0=no, 1=yes)
./bin/example_contraction_bilinear_xdl_fp32 1 1 1
```
Result (MI100 @ dynamic freq, 46TFlops peak FP32)
```
a_ms_ks: dim 4, lengths {30, 128, 32, 64}, strides {524288, 4096, 128, 1}
b_ks_ns: dim 4, lengths {32, 64, 32, 64}, strides {128, 1, 524288, 4096}
c_ms_ns: dim 4, lengths {30, 128, 32, 64}, strides {524288, 4096, 128, 1}
launch_and_time_kernel: grid_dim {240, 1, 1}, block_dim {256, 1, 1}
Warm up 1 time
Start running 10 times...
Perf: 0.843286 ms, 38.1985 TFlops, 94.5014 GB/s, DeviceContractionMultipleD_Xdl_CShuffle<256, 256, 128, 16, 4, 4>
```
......@@ -16,15 +16,3 @@ Following arguments (depending on number of spatial dims):
./bin/example_grouped_conv_fwd_bias_relu_add_xdl_fp16 1 1 1
```
Result (MI100)
```
in: dim 5, lengths {1, 128, 192, 71, 71}, strides {192, 967872, 1, 13632, 192}
wei: dim 5, lengths {1, 256, 192, 3, 3}, strides {442368, 1728, 1, 576, 192}
bias: dim 5, lengths {1, 128, 256, 36, 36}, strides {256, 0, 1, 0, 0}
residual: dim 5, lengths {1, 128, 256, 36, 36}, strides {256, 0, 1, 0, 0}
out: dim 5, lengths {1, 128, 256, 36, 36}, strides {256, 331776, 1, 9216, 256}
launch_and_time_kernel: grid_dim {1296, 1, 1}, block_dim {256, 1, 1}
Warm up 1 time
Start running 10 times...
Perf: 1.55981 ms, 94.0927 TFlops, 213.868 GB/s, DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 256, 16, Default>
```
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
......@@ -10,7 +10,7 @@
#include "ck/utility/sequence.hpp"
#include "ck/utility/tuple.hpp"
#include "ck/utility/reduction_operator.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_dynamic_vector_dims_impl.hpp"
#include "batchnorm_common.hpp"
......@@ -54,7 +54,12 @@ int bnorm_infer(
ck::Tuple<YDataType>, // y
NormalizeInInfer,
Rank,
2, // MPerthread
64, // BlockSize
32, // MPerBlock
32, // NPerBlock
4, // MPerThread
4, // NPerThread
ck::Sequence<1, 0>, // ThreadClusterArrangeOrder
ck::Sequence<1, 1, 1, 1, 1>, // x, mean, variance, scale, bias
ck::Sequence<1>>; // scalarPerVector: y
......
add_example_executable(example_elementwise_permute_4D_fp16 elementwise_permute_4D_fp16.cpp)
add_example_executable(example_elementwise_permute_4D_fp16_2d elementwise_permute_4D_fp16_2d.cpp)
add_example_executable(example_elementwise_permute_4D_fp32_row elementwise_permute_4D_fp32_row.cpp)
add_example_executable(example_elementwise_permute_4D_fp16_row elementwise_permute_4D_fp16_row.cpp)
add_example_executable(example_elementwise_permute_4D_fp32_col elementwise_permute_4D_fp32_col.cpp)
add_example_executable(example_elementwise_permute_4D_fp16_col elementwise_permute_4D_fp16_col.cpp)
add_example_executable(example_elementwise_binary_4D_fp16 elementwise_binary_4D_fp16.cpp)
add_example_executable(example_elementwise_trinary_4D_fp16 elementwise_trinary_4D_fp16.cpp)
add_example_executable(example_elementwise_permute elementwise_permute.cpp)
if((NOT GPU_TARGETS MATCHES "gfx940") AND (NOT GPU_TARGETS MATCHES "gfx941") AND (NOT GPU_TARGETS MATCHES "gfx942"))
add_example_executable(example_elementwise_permute_3d elementwise_permute_3d.cpp)
endif()
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_elementwise.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
using F16 = ck::half_t;
using F32 = float;
using ADataType = F16;
using BDataType = F16;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using DeviceElementwisePermuteInstance =
ck::tensor_operation::device::DeviceElementwiseImpl<ck::Tuple<ADataType>, // InDataTypeTuple
ck::Tuple<BDataType>, // OutDataTypeTuple
PassThrough, // ElementwiseOp
5, // NumDim
8, // MPerThread
ck::Sequence<1>, // InScalarPerVectorSeq
ck::Sequence<1>>; // OutScalarPerVectorSeq
int main()
{
bool do_verification = true;
bool time_kernel = true;
std::vector<std::size_t> ncdhw = {16, 8, 8, 8, 8};
std::vector<std::size_t> ndhwc = {16, 8, 8, 8, 8};
std::array<ck::index_t, 5> ab_lengths;
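// The element-wise problem is indexed in (n, d, h, w, c) order: a_strides map that index into the
// NCDHW-contiguous input, and b_strides make the output NDHWC-contiguous. Since C, D, H and W are
// all 8 here, the same lengths array describes both orders.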
std::array<ck::index_t, 5> a_strides = {
static_cast<int>(ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4]),
static_cast<int>(ncdhw[3] * ncdhw[4]),
static_cast<int>(ncdhw[4]),
1,
static_cast<int>(ncdhw[2] * ncdhw[3] * ncdhw[4])};
std::array<ck::index_t, 5> b_strides = {
static_cast<int>(ndhwc[1] * ndhwc[2] * ndhwc[3] * ndhwc[4]),
static_cast<int>(ndhwc[2] * ndhwc[3] * ndhwc[4]),
static_cast<int>(ndhwc[3] * ndhwc[4]),
static_cast<int>(ndhwc[4]),
1};
ck::ranges::copy(ncdhw, ab_lengths.begin());
std::array<Tensor<ADataType>, 1> as = {Tensor<ADataType>(ab_lengths, a_strides)};
Tensor<ADataType>& a = as[0];
Tensor<BDataType> b(ab_lengths, b_strides);
a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());
a_device_buf.ToDevice(a.mData.data());
std::array<const void*, 1> input = {a_device_buf.GetDeviceBuffer()};
std::array<void*, 1> output = {b_device_buf.GetDeviceBuffer()};
auto broadcastPermute = DeviceElementwisePermuteInstance{};
auto argument = broadcastPermute.MakeArgumentPointer(
ab_lengths, {a_strides}, {b_strides}, input, output, PassThrough{});
if(!broadcastPermute.IsSupportedArgument(argument.get()))
{
throw std::runtime_error(
"The runtime parameters seems not supported by the device instance, exiting!");
};
std::cout << "A (ncdhw): " << a.mDesc << std::endl;
std::cout << "B (ndhwc): " << b.mDesc << std::endl;
auto broadcastPermute_invoker_ptr = broadcastPermute.MakeInvokerPointer();
float ave_time =
broadcastPermute_invoker_ptr->Run(argument.get(), StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4];
std::size_t num_btype =
sizeof(ADataType) * (ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4]) +
sizeof(BDataType) * (ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4]);
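// ave_time is reported in ms, so flop / 1e9 / ms yields TFlop/s and bytes / 1e6 / ms yields GB/s
// (flop is a nominal 2 ops per element for this permute).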
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
<< std::endl;
bool pass = true;
if(do_verification)
{
Tensor<BDataType> host_b(ab_lengths, b_strides);
using ReferenceElementwiseInstance =
ck::tensor_operation::host::ReferenceElementwise<1, ADataType, BDataType, PassThrough>;
auto ref_elementwise = ReferenceElementwiseInstance{};
auto ref_invoker = ref_elementwise.MakeInvoker();
auto ref_argument = ref_elementwise.MakeArgument(as, host_b, PassThrough{});
ref_invoker.Run(ref_argument);
b_device_buf.FromDevice(b.mData.data());
pass &=
ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
}
return pass ? 0 : 1;
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_3d_impl.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_elementwise.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
using F16 = ck::half_t;
using F32 = float;
using ADataType = F32;
using BDataType = F32;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using DeviceElementwisePermuteInstance =
ck::tensor_operation::device::DeviceElementwise3dImpl<ck::Tuple<ADataType>, // InDataTypeTuple
ck::Tuple<BDataType>, // OutDataTypeTuple
PassThrough, // ElementwiseOp
2, // NumDim_m, {N, C}
2, // NumDim_n, {H, W}
1, // NumDim_k, {D}
4, // MPerThread
4, // NPerThread
4, // KPerThread
ck::Sequence<4>, // InScalarPerVectorSeq
ck::Sequence<4>>; // OutScalarPerVectorSeq
int main()
{
bool do_verification = true;
bool time_kernel = true;
const int N = 4;
const int C = 16;
const int H = 32;
const int W = 5;
const int D = 16;
std::array<ck::index_t, 5> ab_lengths{N, C, H, W, D};
std::array<ck::index_t, 5> a_strides = {C * D * H * W, H * W, W, 1, D * H * W}; // N, C, D, H, W
std::array<ck::index_t, 5> b_strides = {C * H * W * D, H * W * D, W * D, D, 1}; // N, D, H, W, C
std::array<Tensor<ADataType>, 1> as = {Tensor<ADataType>(ab_lengths, a_strides)};
Tensor<ADataType>& a = as[0];
Tensor<BDataType> b(ab_lengths, b_strides);
a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());
a_device_buf.ToDevice(a.mData.data());
std::array<const void*, 1> input = {a_device_buf.GetDeviceBuffer()};
std::array<void*, 1> output = {b_device_buf.GetDeviceBuffer()};
auto broadcastPermute = DeviceElementwisePermuteInstance{};
auto argument = broadcastPermute.MakeArgumentPointer(
ab_lengths, {a_strides}, {b_strides}, input, output, PassThrough{});
if(!broadcastPermute.IsSupportedArgument(argument.get()))
{
throw std::runtime_error(
"The runtime parameters seems not supported by the device instance, exiting!");
};
std::cout << "A (ncdhw): " << a.mDesc << std::endl;
std::cout << "B (ndhwc): " << b.mDesc << std::endl;
auto broadcastPermute_invoker_ptr = broadcastPermute.MakeInvokerPointer();
float ave_time =
broadcastPermute_invoker_ptr->Run(argument.get(), StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * ab_lengths[0] * ab_lengths[1] * ab_lengths[2] *
ab_lengths[3] * ab_lengths[4];
std::size_t num_btype =
(sizeof(ADataType) + sizeof(BDataType)) *
(ab_lengths[0] * ab_lengths[1] * ab_lengths[2] * ab_lengths[3] * ab_lengths[4]);
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
<< std::endl;
bool pass = true;
if(do_verification)
{
Tensor<BDataType> host_b(ab_lengths, b_strides);
using ReferenceElementwiseInstance =
ck::tensor_operation::host::ReferenceElementwise<1, ADataType, BDataType, PassThrough>;
auto ref_elementwise = ReferenceElementwiseInstance{};
auto ref_invoker = ref_elementwise.MakeInvoker();
auto ref_argument = ref_elementwise.MakeArgument(as, host_b, PassThrough{});
ref_invoker.Run(ref_argument);
b_device_buf.FromDevice(b.mData.data());
pass &=
ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
}
return pass ? 0 : 1;
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_2d_impl.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_elementwise.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
using F16 = ck::half_t;
using ADataType = F16;
using BDataType = F16;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using DeviceElementwisePermuteInstance =
ck::tensor_operation::device::DeviceElementwise2dImpl<ck::Tuple<ADataType>, // InDataTypeTuple
ck::Tuple<BDataType>, // OutDataTypeTuple
PassThrough, // Elementwise op
3, // NumDim_M
1, // NumDim_N
1, // MPerThread
1, // NPerThread
ck::Sequence<1>, // InScalarPerVectorSeq
ck::Sequence<1>>; // OutScalarPerVectorSeq
int main()
{
bool do_verification = true;
bool time_kernel = true;
const int N = 120;
const int C = 128;
const int H = 32;
const int W = 32;
std::array<ck::index_t, 4> ab_lengths{N, H, W, C};
std::array<ck::index_t, 4> a_strides = {C * H * W, W, 1, H * W};
std::array<ck::index_t, 4> b_strides = {H * W * C, W * C, C, 1};
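// The problem is indexed in (n, h, w, c) order: a_strides read the NCHW-contiguous input while
// b_strides write an NHWC-contiguous output.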
std::array<Tensor<ADataType>, 1> as = {Tensor<ADataType>(ab_lengths, a_strides)};
Tensor<ADataType>& a = as[0];
Tensor<BDataType> b(ab_lengths, b_strides);
a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());
a_device_buf.ToDevice(a.mData.data());
std::array<const void*, 1> input = {a_device_buf.GetDeviceBuffer()};
std::array<void*, 1> output = {b_device_buf.GetDeviceBuffer()};
auto broadcastPermute = DeviceElementwisePermuteInstance{};
auto argument = broadcastPermute.MakeArgumentPointer(
ab_lengths, {a_strides}, {b_strides}, input, output, PassThrough{});
if(!broadcastPermute.IsSupportedArgument(argument.get()))
{
throw std::runtime_error(
"The runtime parameters seems not supported by the device instance, exiting!");
};
std::cout << "A (nchw): " << a.mDesc << std::endl;
std::cout << "B (nhwc): " << b.mDesc << std::endl;
auto broadcastPermute_invoker_ptr = broadcastPermute.MakeInvokerPointer();
float ave_time =
broadcastPermute_invoker_ptr->Run(argument.get(), StreamConfig{nullptr, time_kernel});
std::size_t flop =
std::size_t(2) * ab_lengths[0] * ab_lengths[1] * ab_lengths[2] * ab_lengths[3];
std::size_t num_btype = (sizeof(ADataType) + sizeof(BDataType)) *
(ab_lengths[0] * ab_lengths[1] * ab_lengths[2] * ab_lengths[3]);
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
<< std::endl;
bool pass = true;
if(do_verification)
{
Tensor<BDataType> host_b(ab_lengths, b_strides);
using ReferenceElementwiseInstance =
ck::tensor_operation::host::ReferenceElementwise<1, ADataType, BDataType, PassThrough>;
auto ref_elementwise = ReferenceElementwiseInstance{};
auto ref_invoker = ref_elementwise.MakeInvoker();
auto ref_argument = ref_elementwise.MakeArgument(as, host_b, PassThrough{});
ref_invoker.Run(ref_argument);
b_device_buf.FromDevice(b.mData.data());
pass &=
ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
}
return pass ? 0 : 1;
}
......@@ -8,19 +8,3 @@
#arg4 to 11: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD0, StrideD1, StrideE"
./bin/example_gemm_add_multiply_dl_fp16 1 1 1
```
Result (MI100 @ 1087MHz, 133.5TFlops peak FP16)
```
a_m_k: dim 2, lengths {3840, 4096}, strides {4096, 1}
b_k_n: dim 2, lengths {4096, 4096}, strides {4096, 1}
d0_m_n: dim 2, lengths {3840, 4096}, strides {0, 1}
d1_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1}
e_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1}
arg.a_grid_desc_k0_m0_m1_k1_{2048, 3840, 2}
arg.b_grid_desc_k0_n0_n1_k1_{2048, 4096, 2}
arg.e_grid_desc_m_n_{ 3840, 4096}
launch_and_time_kernel: grid_dim {960, 1, 1}, block_dim {256, 1, 1}
Warm up 1 time
Start running 10 times...
Perf: 3.99904 ms, 32.22 TFlops, 31.9913 GB/s, DeviceGemmMultipleD_Dl<256, 128, 128, 16, 2, 4, 4, 1>
```
add_custom_target(example_grouped_gemm_xdl_multi_abd)
add_example_executable(example_grouped_gemm_multi_abd_xdl_fixed_nk_bias_fp16 grouped_gemm_multi_abd_xdl_fixed_nk_bias_fp16.cpp)
add_example_dependencies(example_grouped_gemm_xdl_multi_abd example_grouped_gemm_multi_abd_xdl_fixed_nk_bias_fp16)
add_example_executable(example_grouped_gemm_multi_abd_xdl_fixed_nk_bias_bf16_i8 grouped_gemm_multi_abd_xdl_fixed_nk_bias_bf16_i8.cpp)
add_example_dependencies(example_grouped_gemm_xdl_multi_abd example_grouped_gemm_multi_abd_xdl_fixed_nk_bias_bf16_i8)
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multi_abd_xdl_fixed_nk.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_gemm_multi_abd.hpp"
#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using BF16 = ck::bhalf_t;
using I8 = int8_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using Add = ck::tensor_operation::element_wise::Add;
using A0DataType = BF16;
using AsDataType = ck::Tuple<A0DataType>;
using B0DataType = I8;
using B1DataType = BF16;
using BsDataType = ck::Tuple<B0DataType, B1DataType>;
using AccDataType = F32;
using CShuffleDataType = BF16;
using D0DataType = BF16;
using DsDataType = ck::Tuple<D0DataType>;
using EDataType = BF16;
using A0Layout = Row;
using AsLayout = ck::Tuple<A0Layout>;
using B0Layout = Col;
using B1Layout = B0Layout;
using BsLayout = ck::Tuple<B0Layout, B1Layout>;
using DsLayout = ck::Tuple<Row>;
using ELayout = Row;
using Multiply = ck::tensor_operation::element_wise::Multiply;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using AddFastGelu = ck::tensor_operation::element_wise::AddFastGelu;
using AElementOp = PassThrough;
using BElementOp = Multiply;
using CDEElementOp = AddFastGelu;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGroupedGemm_Xdl_Multi_ABD_Fixed_NK
// clang-format off
///######| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
///######| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
///######| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
///######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 128, 16, 128, 32, 8, 8, 16, 16, 1, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>;
// clang-format on
struct ProblemSize final
{
std::vector<ck::index_t> Ms;
std::vector<ck::index_t> Ns;
std::vector<ck::index_t> Ks;
std::vector<ck::index_t> stride_As;
std::vector<ck::index_t> stride_Bs;
std::vector<ck::index_t> stride_Cs;
ck::index_t group_count;
};
struct ExecutionConfig final
{
bool do_verification = true;
int init_method = 1;
bool time_kernel = false;
int k_batch = 1;
};
bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
{
auto group_count = problem_size.group_count;
// GEMM shape
std::vector<ck::tensor_operation::device::GemmMultiABDDesc> gemm_descs;
gemm_descs.reserve(group_count);
int sum_of_m = 0;
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
using namespace ck::literals;
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
std::vector<Tensor<A0DataType>> a0_tensors;
std::vector<Tensor<B1DataType>> b_tensors;
std::vector<Tensor<B0DataType>> b0_tensors;
std::vector<Tensor<B1DataType>> b1_tensors;
std::vector<Tensor<D0DataType>> d0_tensors;
std::vector<Tensor<EDataType>> c_host_tensors;
std::vector<Tensor<EDataType>> c_device_tensors;
a0_tensors.reserve(group_count);
b_tensors.reserve(group_count);
b0_tensors.reserve(group_count);
b1_tensors.reserve(group_count);
d0_tensors.reserve(group_count);
c_host_tensors.reserve(group_count);
c_device_tensors.reserve(group_count);
using DeviceMemPtr = std::unique_ptr<DeviceMem>;
std::vector<DeviceMemPtr> a0_tensors_device, b0_tensors_device, b1_tensors_device,
d0_tensors_device, c_tensors_device;
a0_tensors_device.reserve(group_count);
b0_tensors_device.reserve(group_count);
b1_tensors_device.reserve(group_count);
d0_tensors_device.reserve(group_count);
c_tensors_device.reserve(group_count);
std::size_t flop = 0, num_btype = 0;
for(int i = 0; i < group_count; i++)
{
sum_of_m += problem_size.Ms[i];
a0_tensors.push_back(Tensor<A0DataType>(f_host_tensor_descriptor(
problem_size.Ms[i], problem_size.Ks[i], problem_size.stride_As[i], A0Layout{})));
b_tensors.push_back(Tensor<B1DataType>(f_host_tensor_descriptor(
problem_size.Ks[i], problem_size.Ns[i], problem_size.stride_Bs[i], B0Layout{})));
b0_tensors.push_back(Tensor<B0DataType>(f_host_tensor_descriptor(
problem_size.Ks[i], problem_size.Ns[i], problem_size.stride_Bs[i], B0Layout{})));
b1_tensors.push_back(Tensor<B1DataType>(
f_host_tensor_descriptor(problem_size.Ks[i], problem_size.Ns[i], 0, B1Layout{})));
d0_tensors.push_back(Tensor<D0DataType>(
f_host_tensor_descriptor(problem_size.Ms[i], problem_size.Ns[i], 0, ELayout{})));
c_host_tensors.push_back(Tensor<EDataType>(f_host_tensor_descriptor(
problem_size.Ms[i], problem_size.Ns[i], problem_size.stride_Cs[i], ELayout{})));
c_device_tensors.push_back(Tensor<EDataType>(f_host_tensor_descriptor(
problem_size.Ms[i], problem_size.Ns[i], problem_size.stride_Cs[i], ELayout{})));
std::cout << "gemm[" << i << "] a_m_k: " << a0_tensors[i].mDesc
<< " b_k_n: " << b0_tensors[i].mDesc << " d_m_n: " << d0_tensors[i].mDesc
<< " c_m_n: " << c_device_tensors[i].mDesc << std::endl;
flop += std::size_t(2) * problem_size.Ms[i] * problem_size.Ks[i] * problem_size.Ns[i];
num_btype += sizeof(A0DataType) * a0_tensors[i].mDesc.GetElementSize() +
sizeof(B0DataType) * b0_tensors[i].mDesc.GetElementSize() +
sizeof(B1DataType) * b1_tensors[i].mDesc.GetElementSize() +
sizeof(D0DataType) * d0_tensors[i].mDesc.GetElementSize() +
sizeof(EDataType) * c_device_tensors[i].mDesc.GetElementSize();
switch(config.init_method)
{
case 0: break;
case 1:
a0_tensors[i].GenerateTensorValue(GeneratorTensor_2<A0DataType>{-5, 5});
b0_tensors[i].GenerateTensorValue(GeneratorTensor_2<B0DataType>{-5, 5});
b1_tensors[i].GenerateTensorValue(GeneratorTensor_2<B1DataType>{0, 5});
break;
case 2:
a0_tensors[i].GenerateTensorValue(GeneratorTensor_3<A0DataType>{0.0, 1.0});
b0_tensors[i].GenerateTensorValue(GeneratorTensor_3<B0DataType>{-5, 5});
b1_tensors[i].GenerateTensorValue(GeneratorTensor_3<B1DataType>{-0.5, 0.5});
break;
default:
a0_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<0>{});
b0_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<1>{});
b1_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<1>{});
}
d0_tensors[i].GenerateTensorValue(GeneratorTensor_3<D0DataType>{-0.5, 0.5});
}
constexpr ck::index_t NumATensor = 1;
constexpr ck::index_t NumBTensor = 2;
constexpr ck::index_t NumDTensor = 1;
using GroupedGemmKernelArgument = ck::tensor_operation::device::
GroupedGemmMultiABDKernelArgument<NumATensor, NumBTensor, NumDTensor>;
std::vector<GroupedGemmKernelArgument> grouped_gemm_kernel_args_;
grouped_gemm_kernel_args_.reserve(group_count);
for(int i = 0; i < group_count; i++)
{
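// Note: the A and E device buffers below are sized with sum_of_m rows (the total M over all groups)
// rather than Ms[i]; the fixed-NK grouped GEMM appears to reserve the full M budget so per-group M
// can be changed at run time without reallocating.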
a0_tensors_device.emplace_back(
std::make_unique<DeviceMem>(sizeof(A0DataType) * sum_of_m * problem_size.Ks[i]));
b0_tensors_device.emplace_back(std::make_unique<DeviceMem>(
sizeof(B0DataType) * problem_size.Ns[i] * problem_size.Ks[i]));
b1_tensors_device.emplace_back(
std::make_unique<DeviceMem>(sizeof(B1DataType) * problem_size.Ns[i]));
d0_tensors_device.emplace_back(
std::make_unique<DeviceMem>(sizeof(D0DataType) * problem_size.Ns[i]));
c_tensors_device.emplace_back(
std::make_unique<DeviceMem>(sizeof(EDataType) * sum_of_m * problem_size.Ns[i]));
a0_tensors_device[i]->ToDevice(a0_tensors[i].mData.data(),
a0_tensors[i].mDesc.GetElementSpaceSize() *
sizeof(A0DataType));
b0_tensors_device[i]->ToDevice(b0_tensors[i].mData.data(),
b0_tensors[i].mDesc.GetElementSpaceSize() *
sizeof(B0DataType));
b1_tensors_device[i]->ToDevice(b1_tensors[i].mData.data(),
b1_tensors[i].mDesc.GetElementSpaceSize() *
sizeof(B1DataType));
d0_tensors_device[i]->ToDevice(d0_tensors[i].mData.data());
c_tensors_device[i]->SetZero();
gemm_descs.push_back(
{sum_of_m, problem_size.Ns[i], problem_size.Ks[i], {1}, {1, 1}, {0}, 1});
grouped_gemm_kernel_args_.push_back(
{std::array<const void*, NumATensor>{a0_tensors_device[i]->GetDeviceBuffer()},
std::array<const void*, NumBTensor>{b0_tensors_device[i]->GetDeviceBuffer(),
b1_tensors_device[i]->GetDeviceBuffer()},
std::array<const void*, NumDTensor>{d0_tensors_device[i]->GetDeviceBuffer()},
c_tensors_device[i]->GetDeviceBuffer(),
problem_size.Ms[i],
problem_size.Ns[i],
problem_size.Ks[i],
std::array<ck::index_t, NumATensor>{problem_size.stride_As[i]},
std::array<ck::index_t, NumBTensor>{problem_size.stride_Bs[i], 0},
std::array<ck::index_t, NumDTensor>{0},
problem_size.stride_Cs[i]});
}
auto a_element_op = AElementOp{};
auto b_element_op = BElementOp{};
auto cde_element_op = CDEElementOp{};
auto gemm = DeviceGemmInstance{};
auto invoker = gemm.MakeInvoker();
std::vector<std::array<const void*, NumATensor>> p_As = {};
std::vector<std::array<const void*, NumBTensor>> p_Bs = {};
std::vector<std::array<const void*, NumDTensor>> p_Ds = {};
std::vector<void*> p_Cs = {};
// do GEMM
auto argument = gemm.MakeArgument(p_As, p_Bs, p_Ds, p_Cs, gemm_descs);
if(!gemm.IsSupportedArgument(argument))
{
throw std::runtime_error(
"wrong! device_gemm with the specified compilation parameters does "
"not support this GEMM problem");
}
DeviceMem gemm_workspace_dev(gemm.GetWorkSpaceSize(&argument));
gemm.SetWorkSpacePointer(&argument, gemm_workspace_dev.GetDeviceBuffer());
DeviceMem gemm_kernel_args_dev(gemm.GetDeviceKernelArgSize(&argument));
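// The per-group kernel arguments are staged on the host and copied into gemm_kernel_args_dev;
// SetDeviceKernelArgs then points the kernel at this device-side argument buffer.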
hip_check_error(hipMemcpy(gemm_kernel_args_dev.GetDeviceBuffer(),
grouped_gemm_kernel_args_.data(),
gemm.GetDeviceKernelArgSize(&argument),
hipMemcpyHostToDevice));
gemm.SetDeviceKernelArgs(argument, gemm_kernel_args_dev.GetDeviceBuffer());
gemm.SetKBatch(argument, config.k_batch);
gemm.SetElementwiseOps(argument, a_element_op, b_element_op, cde_element_op);
invoker.Run(argument, StreamConfig{nullptr, false});
if(config.time_kernel)
{
float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
<< " GB/s, " << gemm.GetTypeString() << std::endl;
}
bool pass = true;
if(config.do_verification)
{
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<A0DataType,
B1DataType,
EDataType,
AccDataType,
PassThrough,
PassThrough,
PassThrough>;
for(std::size_t i = 0; i < gemm_descs.size(); i++)
{
for(int n = 0; n < problem_size.Ns[i]; ++n)
{
for(int k = 0; k < problem_size.Ks[i]; ++k)
{
b_element_op(b_tensors[i](k, n), b0_tensors[i](k, n), b1_tensors[i](k, n));
}
}
c_tensors_device[i]->FromDevice(c_device_tensors[i].mData.data(),
c_device_tensors[i].mDesc.GetElementSize() *
sizeof(EDataType));
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument = ref_gemm.MakeArgument(a0_tensors[i],
b_tensors[i],
c_host_tensors[i],
PassThrough{},
PassThrough{},
PassThrough{});
ref_invoker.Run(ref_argument);
for(int m = 0; m < problem_size.Ms[i]; ++m)
{
for(int n = 0; n < problem_size.Ns[i]; ++n)
{
cde_element_op(
c_host_tensors[i](m, n), c_host_tensors[i](m, n), d0_tensors[i](m, n));
}
}
pass &= ck::utils::check_err(c_device_tensors[i], c_host_tensors[i]);
}
}
return pass;
}
int main(int argc, char* argv[])
{
ProblemSize problem_size;
ExecutionConfig config;
problem_size.group_count = 16;
for(int i = 0; i < problem_size.group_count; i++)
{
problem_size.Ms.push_back(32 + rand() % 32);
problem_size.Ns.push_back(1024);
problem_size.Ks.push_back(512);
problem_size.stride_As.push_back(problem_size.Ks[i]);
problem_size.stride_Bs.push_back(problem_size.Ks[i]);
problem_size.stride_Cs.push_back(problem_size.Ns[i]);
}
if(argc == 5)
{
config.do_verification = std::stoi(argv[1]);
config.init_method = std::stoi(argv[2]);
config.time_kernel = std::stoi(argv[3]);
config.k_batch = std::stoi(argv[4]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: time kernel (0=n0, 1=yes)\n");
printf("arg4: k_batch (>0)\n");
exit(0);
}
return !run_grouped_gemm(problem_size, config);
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multi_abd_xdl_fixed_nk.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_gemm_multi_abd.hpp"
#include "ck/tensor_operation/gpu/element/combined_element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using F16 = ck::half_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using Add = ck::tensor_operation::element_wise::Add;
using Scale = ck::tensor_operation::element_wise::Scale;
using AddScale = ck::tensor_operation::element_wise::BinaryWithUnaryCombinedOp<Add, Scale, Scale>;
using A0DataType = F16;
using A1DataType = F32;
using AsDataType = ck::Tuple<A0DataType, A1DataType>;
using B0DataType = F16;
using BsDataType = ck::Tuple<B0DataType>;
using AccDataType = F32;
using CShuffleDataType = F32;
using D0DataType = F32;
using DsDataType = ck::Tuple<D0DataType>;
using EDataType = F32;
using A0Layout = Row;
using A1Layout = Row;
using AsLayout = ck::Tuple<A0Layout, A1Layout>;
using B0Layout = Col;
using BsLayout = ck::Tuple<B0Layout>;
using D0Layout = Row;
using DsLayout = ck::Tuple<D0Layout>;
using ELayout = Row;
using AElementOp = AddScale;
using BElementOp = PassThrough;
using CDEElementOp = Add;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGroupedGemm_Xdl_Multi_ABD_Fixed_NK
// clang-format off
///######| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
///######| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
///######| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
///######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 128, 16, 128, 32, 8, 8, 16, 16, 1, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1, ck::half_t>;
// clang-format on
struct ProblemSize final
{
std::vector<ck::index_t> Ms;
std::vector<ck::index_t> Ns;
std::vector<ck::index_t> Ks;
std::vector<ck::index_t> stride_As;
std::vector<ck::index_t> stride_Bs;
std::vector<ck::index_t> stride_Cs;
ck::index_t group_count;
};
struct ExecutionConfig final
{
bool do_verification = true;
int init_method = 1;
bool time_kernel = false;
int k_batch = 1;
};
bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
{
auto group_count = problem_size.group_count;
// GEMM shape
std::vector<ck::tensor_operation::device::GemmMultiABDDesc> gemm_descs;
gemm_descs.reserve(group_count);
int sum_of_m = 0;
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
using namespace ck::literals;
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
std::vector<Tensor<A0DataType>> a0_tensors;
std::vector<Tensor<A1DataType>> a1_tensors;
std::vector<Tensor<B0DataType>> b_tensors;
std::vector<Tensor<D0DataType>> d0_tensors;
std::vector<Tensor<EDataType>> e_host_tensors;
std::vector<Tensor<EDataType>> e_device_tensors;
a0_tensors.reserve(group_count);
a1_tensors.reserve(group_count);
b_tensors.reserve(group_count);
d0_tensors.reserve(group_count);
e_host_tensors.reserve(group_count);
e_device_tensors.reserve(group_count);
using DeviceMemPtr = std::unique_ptr<DeviceMem>;
std::vector<DeviceMemPtr> a0_tensors_device, a1_tensors_device, b_tensors_device,
d0_tensors_device, c_tensors_device;
a0_tensors_device.reserve(group_count);
a1_tensors_device.reserve(group_count);
b_tensors_device.reserve(group_count);
d0_tensors_device.reserve(group_count);
c_tensors_device.reserve(group_count);
std::size_t flop = 0, num_btype = 0;
for(int i = 0; i < group_count; i++)
{
sum_of_m += problem_size.Ms[i];
a0_tensors.push_back(Tensor<A0DataType>(f_host_tensor_descriptor(
problem_size.Ms[i], problem_size.Ks[i], problem_size.stride_As[i], A0Layout{})));
a1_tensors.push_back(Tensor<A1DataType>(f_host_tensor_descriptor(
problem_size.Ms[i], problem_size.Ks[i], problem_size.stride_As[i], A1Layout{})));
b_tensors.push_back(Tensor<B0DataType>(f_host_tensor_descriptor(
problem_size.Ks[i], problem_size.Ns[i], problem_size.stride_Bs[i], B0Layout{})));
d0_tensors.push_back(Tensor<D0DataType>(
f_host_tensor_descriptor(problem_size.Ms[i], problem_size.Ns[i], 0, ELayout{})));
e_host_tensors.push_back(Tensor<EDataType>(f_host_tensor_descriptor(
problem_size.Ms[i], problem_size.Ns[i], problem_size.stride_Cs[i], ELayout{})));
e_device_tensors.push_back(Tensor<EDataType>(f_host_tensor_descriptor(
problem_size.Ms[i], problem_size.Ns[i], problem_size.stride_Cs[i], ELayout{})));
std::cout << "gemm[" << i << "] a_m_k: " << a0_tensors[i].mDesc
<< " b_k_n: " << b_tensors[i].mDesc << " d_m_n: " << d0_tensors[i].mDesc
<< " c_m_n: " << e_device_tensors[i].mDesc << std::endl;
flop += std::size_t(2) * problem_size.Ms[i] * problem_size.Ks[i] * problem_size.Ns[i];
num_btype += sizeof(A0DataType) * a0_tensors[i].mDesc.GetElementSize() +
sizeof(A1DataType) * a1_tensors[i].mDesc.GetElementSize() +
sizeof(B0DataType) * b_tensors[i].mDesc.GetElementSize() +
sizeof(D0DataType) * d0_tensors[i].mDesc.GetElementSize() +
sizeof(EDataType) * e_device_tensors[i].mDesc.GetElementSize();
switch(config.init_method)
{
case 0: break;
case 1:
a0_tensors[i].GenerateTensorValue(GeneratorTensor_2<A0DataType>{-5, 5});
a1_tensors[i].GenerateTensorValue(GeneratorTensor_2<A1DataType>{-5, 5});
b_tensors[i].GenerateTensorValue(GeneratorTensor_2<B0DataType>{-5, 5});
break;
case 2:
a0_tensors[i].GenerateTensorValue(GeneratorTensor_3<A0DataType>{0.0, 1.0});
a1_tensors[i].GenerateTensorValue(GeneratorTensor_3<A1DataType>{0.0, 1.0});
b_tensors[i].GenerateTensorValue(GeneratorTensor_3<B0DataType>{-0.5, 0.5});
break;
default:
a0_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<0>{});
a1_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<0>{});
b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<1>{});
}
d0_tensors[i].GenerateTensorValue(GeneratorTensor_3<D0DataType>{-0.5, 0.5});
}
constexpr ck::index_t NumATensor = 2;
constexpr ck::index_t NumBTensor = 1;
constexpr ck::index_t NumDTensor = 1;
using GroupedGemmKernelArgument = ck::tensor_operation::device::
GroupedGemmMultiABDKernelArgument<NumATensor, NumBTensor, NumDTensor>;
std::vector<GroupedGemmKernelArgument> grouped_gemm_kernel_args_;
grouped_gemm_kernel_args_.reserve(group_count);
for(int i = 0; i < group_count; i++)
{
a0_tensors_device.emplace_back(
std::make_unique<DeviceMem>(sizeof(A0DataType) * sum_of_m * problem_size.Ks[i]));
a1_tensors_device.emplace_back(
std::make_unique<DeviceMem>(sizeof(A1DataType) * sum_of_m * problem_size.Ks[i]));
b_tensors_device.emplace_back(std::make_unique<DeviceMem>(
sizeof(B0DataType) * problem_size.Ns[i] * problem_size.Ks[i]));
d0_tensors_device.emplace_back(
std::make_unique<DeviceMem>(sizeof(D0DataType) * problem_size.Ns[i]));
c_tensors_device.emplace_back(
std::make_unique<DeviceMem>(sizeof(EDataType) * sum_of_m * problem_size.Ns[i]));
a0_tensors_device[i]->ToDevice(a0_tensors[i].mData.data(),
a0_tensors[i].mDesc.GetElementSpaceSize() *
sizeof(A0DataType));
a1_tensors_device[i]->ToDevice(a1_tensors[i].mData.data(),
a1_tensors[i].mDesc.GetElementSpaceSize() *
sizeof(A1DataType));
b_tensors_device[i]->ToDevice(b_tensors[i].mData.data(),
b_tensors[i].mDesc.GetElementSpaceSize() *
sizeof(B0DataType));
d0_tensors_device[i]->ToDevice(d0_tensors[i].mData.data());
c_tensors_device[i]->SetZero();
gemm_descs.push_back({sum_of_m,
problem_size.Ns[i],
problem_size.Ks[i],
{1, 1},
{problem_size.stride_Bs[i]},
{0},
1});
grouped_gemm_kernel_args_.push_back(
{std::array<const void*, NumATensor>{a0_tensors_device[i]->GetDeviceBuffer(),
a1_tensors_device[i]->GetDeviceBuffer()},
std::array<const void*, NumBTensor>{b_tensors_device[i]->GetDeviceBuffer()},
std::array<const void*, NumDTensor>{d0_tensors_device[i]->GetDeviceBuffer()},
c_tensors_device[i]->GetDeviceBuffer(),
problem_size.Ms[i],
problem_size.Ns[i],
problem_size.Ks[i],
std::array<ck::index_t, NumATensor>{problem_size.stride_As[i],
problem_size.stride_As[i]},
std::array<ck::index_t, NumBTensor>{problem_size.stride_Bs[i]},
std::array<ck::index_t, NumDTensor>{0},
problem_size.stride_Cs[i]});
}
constexpr float scale = 1.f;
auto a_element_op = AElementOp{Add{}, Scale{scale}, Scale{scale}};
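// AddScale is BinaryWithUnaryCombinedOp<Add, Scale, Scale>: presumably each A operand is first
// scaled and the results are then added, i.e. a = scale * a0 + scale * a1 (with scale = 1.f this
// reduces to a plain a0 + a1).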
auto b_element_op = BElementOp{};
auto cde_element_op = CDEElementOp{};
auto gemm = DeviceGemmInstance{};
auto invoker = gemm.MakeInvoker();
std::vector<std::array<const void*, NumATensor>> p_As = {};
std::vector<std::array<const void*, NumBTensor>> p_Bs = {};
std::vector<std::array<const void*, NumDTensor>> p_Ds = {};
std::vector<void*> p_Cs = {};
// do GEMM
auto argument = gemm.MakeArgument(p_As, p_Bs, p_Ds, p_Cs, gemm_descs);
if(!gemm.IsSupportedArgument(argument))
{
throw std::runtime_error(
"wrong! device_gemm with the specified compilation parameters does "
"not support this GEMM problem");
}
DeviceMem gemm_workspace_dev(gemm.GetWorkSpaceSize(&argument));
gemm.SetWorkSpacePointer(&argument, gemm_workspace_dev.GetDeviceBuffer());
DeviceMem gemm_kernel_args_dev(gemm.GetDeviceKernelArgSize(&argument));
hip_check_error(hipMemcpy(gemm_kernel_args_dev.GetDeviceBuffer(),
grouped_gemm_kernel_args_.data(),
gemm.GetDeviceKernelArgSize(&argument),
hipMemcpyHostToDevice));
gemm.SetDeviceKernelArgs(argument, gemm_kernel_args_dev.GetDeviceBuffer());
gemm.SetKBatch(argument, config.k_batch);
gemm.SetElementwiseOps(argument, a_element_op, b_element_op, cde_element_op);
invoker.Run(argument, StreamConfig{nullptr, false});
if(config.time_kernel)
{
float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
<< " GB/s, " << gemm.GetTypeString() << std::endl;
}
bool pass = true;
if(config.do_verification)
{
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<A0DataType,
B0DataType,
EDataType,
AccDataType,
PassThrough,
BElementOp,
PassThrough>;
for(std::size_t i = 0; i < gemm_descs.size(); i++)
{
for(int m = 0; m < problem_size.Ms[i]; ++m)
{
for(int k = 0; k < problem_size.Ks[i]; ++k)
{
a_element_op(a0_tensors[i](m, k), a0_tensors[i](m, k), a1_tensors[i](m, k));
}
}
c_tensors_device[i]->FromDevice(e_device_tensors[i].mData.data(),
e_device_tensors[i].mDesc.GetElementSize() *
sizeof(EDataType));
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument = ref_gemm.MakeArgument(a0_tensors[i],
b_tensors[i],
e_host_tensors[i],
PassThrough{},
b_element_op,
PassThrough{});
ref_invoker.Run(ref_argument);
for(int m = 0; m < problem_size.Ms[i]; ++m)
{
for(int n = 0; n < problem_size.Ns[i]; ++n)
{
cde_element_op(
e_host_tensors[i](m, n), e_host_tensors[i](m, n), d0_tensors[i](m, n));
}
}
pass &= ck::utils::check_err(e_device_tensors[i], e_host_tensors[i]);
}
}
return pass;
}
int main(int argc, char* argv[])
{
ProblemSize problem_size;
ExecutionConfig config;
problem_size.group_count = 16;
for(int i = 0; i < problem_size.group_count; i++)
{
problem_size.Ms.push_back(32 + rand() % 32);
problem_size.Ns.push_back(1024);
problem_size.Ks.push_back(512);
problem_size.stride_As.push_back(problem_size.Ks[i]);
problem_size.stride_Bs.push_back(problem_size.Ks[i]);
problem_size.stride_Cs.push_back(problem_size.Ns[i]);
}
if(argc == 5)
{
config.do_verification = std::stoi(argv[1]);
config.init_method = std::stoi(argv[2]);
config.time_kernel = std::stoi(argv[3]);
config.k_batch = std::stoi(argv[4]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: time kernel (0=n0, 1=yes)\n");
printf("arg4: k_batch (>0)\n");
exit(0);
}
return !run_grouped_gemm(problem_size, config);
}
add_example_executable(example_gemm_multi_ABD_xdl_fp16 gemm_multi_ABD_xdl_fp16.cpp)
add_example_executable(example_gemm_multi_ABD_xdl_bias_fastgelu_bf16_i8 gemm_multi_ABD_xdl_bias_fastgelu_bf16_i8.cpp)
add_example_executable(example_gemm_multi_ABD_xdl_multiply_bias_fastgelu_bf16_i8 gemm_multi_ABD_xdl_multiply_bias_fastgelu_bf16_i8.cpp)
add_example_executable(example_gemm_multi_ABD_xdl_fastgelu_bf16_i8 gemm_multi_ABD_xdl_fastgelu_bf16_i8.cpp)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_abd_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/utility/blkgemmpipe_scheduler.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using F16 = ck::half_t;
using BF16 = ck::bhalf_t;
using I8 = int8_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using A0DataType = BF16;
using AsDataType = ck::Tuple<A0DataType>;
using B0DataType = I8;
using B1DataType = BF16;
using BsDataType = ck::Tuple<B0DataType, B1DataType>;
using AccDataType = F32;
using CShuffleDataType = BF16;
using D0DataType = BF16;
using DsDataType = ck::Tuple<D0DataType>;
using EDataType = BF16;
using A0Layout = Row;
using AsLayout = ck::Tuple<A0Layout>;
using B0Layout = Row;
using B1Layout = B0Layout;
using BsLayout = ck::Tuple<B0Layout, B1Layout>;
using D0Layout = Row;
using DsLayout = ck::Tuple<D0Layout>;
using ELayout = Row;
using Multiply = ck::tensor_operation::element_wise::Multiply;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using AddFastGelu = ck::tensor_operation::element_wise::AddFastGelu;
using AElementOp = PassThrough;
using BElementOp = Multiply;
using CDEElementOp = AddFastGelu;
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::Default;
using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleABD_Xdl_CShuffle
// clang-format off
///######| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
///######| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
///######| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
///######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 128, 128, 64, 8, 4, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v4>;
// clang-format on
int main(int argc, char* argv[])
{
bool do_verification = true;
int init_method = 1;
bool time_kernel = false;
// GEMM shape
ck::index_t M = 4096;
ck::index_t N = 768;
ck::index_t K = 6144;
ck::index_t StrideA = K;
ck::index_t StrideB = N;
ck::index_t StrideD = 0;
ck::index_t StrideE = N;
if(argc == 1)
{
// use default case
}
else if(argc == 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
}
else if(argc == 11)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
M = std::stoi(argv[4]);
N = std::stoi(argv[5]);
K = std::stoi(argv[6]);
StrideA = std::stoi(argv[7]);
StrideB = std::stoi(argv[8]);
StrideD = std::stoi(argv[9]);
StrideE = std::stoi(argv[10]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: time kernel (0=no, 1=yes)\n");
printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD, StrideE\n");
exit(0);
}
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
using namespace ck::literals;
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
Tensor<A0DataType> a0_m_k(f_host_tensor_descriptor(M, K, StrideA, A0Layout{}));
Tensor<B0DataType> b0_k_n(f_host_tensor_descriptor(K, N, StrideB, B0Layout{}));
Tensor<B1DataType> b1_k_n(f_host_tensor_descriptor(K, N, 0, B1Layout{}));
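// b1_k_n uses stride 0 along K (row-major strides {0, 1}), i.e. one bf16 value per column n
// broadcast over k; the Multiply B element-op scales the int8 b0 with it, effectively dequantizing B.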
Tensor<D0DataType> d_m_n(f_host_tensor_descriptor(M, N, StrideD, D0Layout{}));
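// With the default StrideD = 0, d_m_n is a single row of N values broadcast over M (a per-column
// bias); AddFastGelu combines it with the GEMM output (an add followed by FastGelu).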
Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
std::cout << "a0_m_k: " << a0_m_k.mDesc << std::endl;
std::cout << "b0_k_n: " << b0_k_n.mDesc << std::endl;
std::cout << "b1_k_n: " << b1_k_n.mDesc << std::endl;
std::cout << "d_m_n: " << d_m_n.mDesc << std::endl;
std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;
switch(init_method)
{
case 0: break;
case 1:
a0_m_k.GenerateTensorValue(GeneratorTensor_2<A0DataType>{-5, 5});
b0_k_n.GenerateTensorValue(GeneratorTensor_2<B0DataType>{-5, 5});
b1_k_n.GenerateTensorValue(GeneratorTensor_2<B1DataType>{0, 5});
d_m_n.GenerateTensorValue(GeneratorTensor_2<D0DataType>{-5, 5});
break;
default:
a0_m_k.GenerateTensorValue(GeneratorTensor_3<A0DataType>{0.0, 1.0});
b0_k_n.GenerateTensorValue(GeneratorTensor_2<B0DataType>{-5, 5});
b1_k_n.GenerateTensorValue(GeneratorTensor_3<B1DataType>{0, 5});
d_m_n.GenerateTensorValue(GeneratorTensor_3<D0DataType>{-0.5, 0.5});
}
DeviceMem a0_device_buf(sizeof(A0DataType) * a0_m_k.mDesc.GetElementSpaceSize());
DeviceMem b0_device_buf(sizeof(B0DataType) * b0_k_n.mDesc.GetElementSpaceSize());
DeviceMem b1_device_buf(sizeof(B1DataType) * b1_k_n.mDesc.GetElementSpaceSize());
DeviceMem d_device_buf(sizeof(D0DataType) * d_m_n.mDesc.GetElementSpaceSize());
DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
a0_device_buf.ToDevice(a0_m_k.mData.data());
b0_device_buf.ToDevice(b0_k_n.mData.data());
b1_device_buf.ToDevice(b1_k_n.mData.data());
d_device_buf.ToDevice(d_m_n.mData.data());
e_device_buf.ToDevice(e_m_n_device_result.mData.data());
auto a_element_op = AElementOp{};
auto b_element_op = BElementOp{};
auto cde_element_op = CDEElementOp{};
constexpr ck::index_t NumATensor = 1;
constexpr ck::index_t NumBTensor = 2;
constexpr ck::index_t NumDTensor = 1;
// do GEMM
auto device_op = DeviceOpInstance{};
auto invoker = device_op.MakeInvoker();
auto argument =
device_op.MakeArgument(std::array<const void*, NumATensor>{a0_device_buf.GetDeviceBuffer()},
std::array<const void*, NumBTensor>{b0_device_buf.GetDeviceBuffer(),
b1_device_buf.GetDeviceBuffer()},
std::array<const void*, NumDTensor>{d_device_buf.GetDeviceBuffer()},
e_device_buf.GetDeviceBuffer(),
M,
N,
K,
std::array<ck::index_t, NumATensor>{StrideA},
std::array<ck::index_t, NumBTensor>{StrideB, 0},
std::array<ck::index_t, NumDTensor>{StrideD},
StrideE,
a_element_op,
b_element_op,
cde_element_op);
if(!device_op.IsSupportedArgument(argument))
{
throw std::runtime_error(
"wrong! device_gemm with the specified compilation parameters does "
"not support this GEMM problem");
}
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_btype =
sizeof(A0DataType) * M * K + sizeof(B0DataType) * K * N + sizeof(EDataType) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
<< std::endl;
e_device_buf.FromDevice(e_m_n_device_result.mData.data());
if(do_verification)
{
Tensor<CShuffleDataType> c_m_n({M, N});
Tensor<A0DataType> a_m_k({M, K});
Tensor<B1DataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, B0Layout{}));
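// Fold the two device-side B operands into a single host tensor, b = b_element_op(b0, b1),
// so the reference GEMM below only needs to consume one B input.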
for(int n = 0; n < N; ++n)
{
for(int k = 0; k < K; ++k)
{
b_element_op(b_k_n(k, n), b0_k_n(k, n), b1_k_n(k, n));
}
}
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<A0DataType,
B1DataType,
CShuffleDataType,
AccDataType,
PassThrough,
PassThrough,
PassThrough>;
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument = ref_gemm.MakeArgument(
a0_m_k, b_k_n, c_m_n, PassThrough{}, PassThrough{}, PassThrough{});
ref_invoker.Run(ref_argument);
for(int m = 0; m < M; ++m)
{
for(int n = 0; n < N; ++n)
{
cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n), d_m_n(m, n));
}
}
e_device_buf.FromDevice(e_m_n_device_result.mData.data());
return ck::utils::check_err(e_m_n_device_result, e_m_n_host_result) ? 0 : 1;
}
return 0;
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_abd_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/utility/blkgemmpipe_scheduler.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using F16 = ck::half_t;
using BF16 = ck::bhalf_t;
using I8 = int8_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using A0DataType = BF16;
using AsDataType = ck::Tuple<A0DataType>;
using B0DataType = I8;
using B1DataType = BF16;
using BsDataType = ck::Tuple<B0DataType, B1DataType>;
using AccDataType = F32;
using CShuffleDataType = F32;
using D0DataType = BF16;
using DsDataType = ck::Tuple<>;
using EDataType = BF16;
using A0Layout = Row;
using AsLayout = ck::Tuple<A0Layout>;
using B0Layout = Row;
using B1Layout = B0Layout;
using BsLayout = ck::Tuple<B0Layout, B1Layout>;
using D0Layout = Row;
using DsLayout = ck::Tuple<>;
using ELayout = Row;
using Multiply = ck::tensor_operation::element_wise::Multiply;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using FastGelu = ck::tensor_operation::element_wise::FastGelu;
using AElementOp = PassThrough;
using BElementOp = Multiply;
using CDEElementOp = FastGelu;
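// This example computes E = FastGelu(A0 * (B0 .* B1)): bf16 activations against int8 weights
// that are dequantized element-wise by the bf16 tensor B1 (a per-N scale broadcast along K via
// a zero stride). No D tensors are bound, so the d_m_n tensor created below is unused.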
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::Default;
using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleABD_Xdl_CShuffle
// clang-format off
///######| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
///######| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
///######| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
///######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 128, 128, 64, 8, 4, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v4>;
// clang-format on
int main(int argc, char* argv[])
{
bool do_verification = true;
int init_method = 1;
bool time_kernel = false;
// GEMM shape
ck::index_t M = 4096;
ck::index_t N = 768;
ck::index_t K = 6144;
ck::index_t StrideA = K;
ck::index_t StrideB = N;
ck::index_t StrideD = 0;
ck::index_t StrideE = N;
if(argc == 1)
{
// use default case
}
else if(argc == 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
}
else if(argc == 11)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
M = std::stoi(argv[4]);
N = std::stoi(argv[5]);
K = std::stoi(argv[6]);
StrideA = std::stoi(argv[7]);
StrideB = std::stoi(argv[8]);
StrideD = std::stoi(argv[9]);
StrideE = std::stoi(argv[10]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: time kernel (0=no, 1=yes)\n");
printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD, StrideE\n");
exit(0);
}
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
using namespace ck::literals;
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
Tensor<A0DataType> a0_m_k(f_host_tensor_descriptor(M, K, StrideA, A0Layout{}));
Tensor<B0DataType> b0_k_n(f_host_tensor_descriptor(K, N, StrideB, B0Layout{}));
Tensor<B1DataType> b1_k_n(f_host_tensor_descriptor(K, N, 0, B1Layout{}));
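// b1 is created with a row stride of 0: it holds one bf16 scale per column (N), broadcast
// along K, matching the stride of 0 passed for it in MakeArgument below.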
Tensor<D0DataType> d_m_n(f_host_tensor_descriptor(M, N, StrideD, D0Layout{}));
Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
std::cout << "a0_m_k: " << a0_m_k.mDesc << std::endl;
std::cout << "b0_k_n: " << b0_k_n.mDesc << std::endl;
std::cout << "b1_k_n: " << b1_k_n.mDesc << std::endl;
std::cout << "d_m_n: " << d_m_n.mDesc << std::endl;
std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;
switch(init_method)
{
case 0: break;
case 1:
a0_m_k.GenerateTensorValue(GeneratorTensor_2<A0DataType>{-5, 5});
b0_k_n.GenerateTensorValue(GeneratorTensor_2<B0DataType>{-5, 5});
b1_k_n.GenerateTensorValue(GeneratorTensor_2<B1DataType>{0, 5});
d_m_n.GenerateTensorValue(GeneratorTensor_2<D0DataType>{-5, 5});
break;
default:
a0_m_k.GenerateTensorValue(GeneratorTensor_3<A0DataType>{0.0, 1.0});
b0_k_n.GenerateTensorValue(GeneratorTensor_2<B0DataType>{-5, 5});
b1_k_n.GenerateTensorValue(GeneratorTensor_3<B1DataType>{0, 5});
d_m_n.GenerateTensorValue(GeneratorTensor_3<D0DataType>{-0.5, 0.5});
}
DeviceMem a0_device_buf(sizeof(A0DataType) * a0_m_k.mDesc.GetElementSpaceSize());
DeviceMem b0_device_buf(sizeof(B0DataType) * b0_k_n.mDesc.GetElementSpaceSize());
DeviceMem b1_device_buf(sizeof(B1DataType) * b1_k_n.mDesc.GetElementSpaceSize());
DeviceMem d_device_buf(sizeof(D0DataType) * d_m_n.mDesc.GetElementSpaceSize());
DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
a0_device_buf.ToDevice(a0_m_k.mData.data());
b0_device_buf.ToDevice(b0_k_n.mData.data());
b1_device_buf.ToDevice(b1_k_n.mData.data());
d_device_buf.ToDevice(d_m_n.mData.data());
e_device_buf.ToDevice(e_m_n_device_result.mData.data());
auto a_element_op = AElementOp{};
auto b_element_op = BElementOp{};
auto cde_element_op = CDEElementOp{};
constexpr ck::index_t NumATensor = 1;
constexpr ck::index_t NumBTensor = 2;
constexpr ck::index_t NumDTensor = 0;
// do GEMM
auto device_op = DeviceOpInstance{};
auto invoker = device_op.MakeInvoker();
auto argument =
device_op.MakeArgument(std::array<const void*, NumATensor>{a0_device_buf.GetDeviceBuffer()},
std::array<const void*, NumBTensor>{b0_device_buf.GetDeviceBuffer(),
b1_device_buf.GetDeviceBuffer()},
std::array<const void*, NumDTensor>{},
e_device_buf.GetDeviceBuffer(),
M,
N,
K,
std::array<ck::index_t, NumATensor>{StrideA},
std::array<ck::index_t, NumBTensor>{StrideB, 0},
std::array<ck::index_t, NumDTensor>{},
StrideE,
a_element_op,
b_element_op,
cde_element_op);
if(!device_op.IsSupportedArgument(argument))
{
throw std::runtime_error(
"wrong! device_gemm with the specified compilation parameters does "
"not support this GEMM problem");
}
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_btype =
sizeof(A0DataType) * M * K + sizeof(B0DataType) * K * N + sizeof(EDataType) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
<< std::endl;
e_device_buf.FromDevice(e_m_n_device_result.mData.data());
if(do_verification)
{
Tensor<CShuffleDataType> c_m_n({M, N});
Tensor<A0DataType> a_m_k({M, K});
Tensor<B1DataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, B0Layout{}));
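// Host-side dequantization: Multiply writes b(k, n) = b0(k, n) * b1(k, n), so the reference
// GEMM below runs on the already-scaled bf16 weights.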
for(int n = 0; n < N; ++n)
{
for(int k = 0; k < K; ++k)
{
b_element_op(b_k_n(k, n), b0_k_n(k, n), b1_k_n(k, n));
}
}
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<A0DataType,
B1DataType,
CShuffleDataType,
AccDataType,
PassThrough,
PassThrough,
PassThrough>;
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument = ref_gemm.MakeArgument(
a0_m_k, b_k_n, c_m_n, PassThrough{}, PassThrough{}, PassThrough{});
ref_invoker.Run(ref_argument);
for(int m = 0; m < M; ++m)
{
for(int n = 0; n < N; ++n)
{
cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n));
}
}
e_device_buf.FromDevice(e_m_n_device_result.mData.data());
return ck::utils::check_err(e_m_n_device_result, e_m_n_host_result) ? 0 : 1;
}
return 0;
}
......@@ -37,7 +37,7 @@ using DDataType = F16;
using EDataType = F16;
using ALayout = Row;
using BLayout = Col;
using BLayout = Row;
using DLayout = Row;
using ELayout = Row;
......@@ -141,9 +141,9 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleABD_Xdl
S<4, 64, 1>,
S<1, 0, 2>,
S<1, 0, 2>,
1,
2,
8,
8,
1,
1,
1,
......@@ -161,10 +161,10 @@ int main(int argc, char* argv[])
ck::index_t N = 4096;
ck::index_t K = 4096;
ck::index_t StrideA = 4096;
ck::index_t StrideB = 4096;
ck::index_t StrideD = 4096;
ck::index_t StrideE = 4096;
ck::index_t StrideA = K;
ck::index_t StrideB = N;
ck::index_t StrideD = N;
ck::index_t StrideE = N;
float alpha = 1.0f;
float beta = 1.0f;
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_abd_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/utility/blkgemmpipe_scheduler.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using F16 = ck::half_t;
using BF16 = ck::bhalf_t;
using I8 = int8_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using A0DataType = BF16;
using AsDataType = ck::Tuple<A0DataType>;
using B0DataType = I8;
using B1DataType = BF16;
using BsDataType = ck::Tuple<B0DataType>;
using AccDataType = F32;
using CShuffleDataType = F32;
using D0DataType = BF16;
using DsDataType = ck::Tuple<B1DataType, D0DataType>;
using EDataType = BF16;
using A0Layout = Row;
using AsLayout = ck::Tuple<A0Layout>;
using B0Layout = Row;
using B1Layout = B0Layout;
using BsLayout = ck::Tuple<B0Layout>;
using D0Layout = Row;
using DsLayout = ck::Tuple<B1Layout, D0Layout>;
using ELayout = Row;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using MultiplyAddFastGelu = ck::tensor_operation::element_wise::MultiplyAddFastGelu;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CDEElementOp = MultiplyAddFastGelu;
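// Same int8-weight problem as the previous example, but here only B0 is bound as a B operand;
// the bf16 dequantization scale (b1) and the bias (d) are bound as D tensors, and the CDE
// epilogue fuses E = FastGelu(Acc * scale + bias), as exercised in the verification loop below.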
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::Default;
using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleABD_Xdl_CShuffle
// clang-format off
///######| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
///######| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
///######| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
///######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 128, 128, 64, 8, 4, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v4>;
// clang-format on
int main(int argc, char* argv[])
{
bool do_verification = true;
int init_method = 1;
bool time_kernel = false;
// GEMM shape
ck::index_t M = 4096;
ck::index_t N = 768;
ck::index_t K = 6144;
ck::index_t StrideA = K;
ck::index_t StrideB = N;
ck::index_t StrideD = 0;
ck::index_t StrideE = N;
if(argc == 1)
{
// use default case
}
else if(argc == 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
}
else if(argc == 11)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
M = std::stoi(argv[4]);
N = std::stoi(argv[5]);
K = std::stoi(argv[6]);
StrideA = std::stoi(argv[7]);
StrideB = std::stoi(argv[8]);
StrideD = std::stoi(argv[9]);
StrideE = std::stoi(argv[10]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: time kernel (0=no, 1=yes)\n");
printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD, StrideE\n");
exit(0);
}
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
using namespace ck::literals;
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
Tensor<A0DataType> a0_m_k(f_host_tensor_descriptor(M, K, StrideA, A0Layout{}));
Tensor<B0DataType> b0_k_n(f_host_tensor_descriptor(K, N, StrideB, B0Layout{}));
Tensor<B1DataType> b1_k_n(f_host_tensor_descriptor(K, N, 0, B1Layout{}));
Tensor<D0DataType> d_m_n(f_host_tensor_descriptor(M, N, StrideD, D0Layout{}));
Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
std::cout << "a0_m_k: " << a0_m_k.mDesc << std::endl;
std::cout << "b0_k_n: " << b0_k_n.mDesc << std::endl;
std::cout << "b1_k_n: " << b1_k_n.mDesc << std::endl;
std::cout << "d_m_n: " << d_m_n.mDesc << std::endl;
std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;
switch(init_method)
{
case 0: break;
case 1:
a0_m_k.GenerateTensorValue(GeneratorTensor_2<A0DataType>{-5, 5});
b0_k_n.GenerateTensorValue(GeneratorTensor_2<B0DataType>{-5, 5});
b1_k_n.GenerateTensorValue(GeneratorTensor_2<B1DataType>{0, 5});
d_m_n.GenerateTensorValue(GeneratorTensor_2<D0DataType>{-5, 5});
break;
default:
a0_m_k.GenerateTensorValue(GeneratorTensor_3<A0DataType>{0.0, 1.0});
b0_k_n.GenerateTensorValue(GeneratorTensor_2<B0DataType>{-5, 5});
b1_k_n.GenerateTensorValue(GeneratorTensor_3<B1DataType>{0, 5});
d_m_n.GenerateTensorValue(GeneratorTensor_3<D0DataType>{-0.5, 0.5});
}
DeviceMem a0_device_buf(sizeof(A0DataType) * a0_m_k.mDesc.GetElementSpaceSize());
DeviceMem b0_device_buf(sizeof(B0DataType) * b0_k_n.mDesc.GetElementSpaceSize());
DeviceMem b1_device_buf(sizeof(B1DataType) * b1_k_n.mDesc.GetElementSpaceSize());
DeviceMem d_device_buf(sizeof(D0DataType) * d_m_n.mDesc.GetElementSpaceSize());
DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
a0_device_buf.ToDevice(a0_m_k.mData.data());
b0_device_buf.ToDevice(b0_k_n.mData.data());
b1_device_buf.ToDevice(b1_k_n.mData.data());
d_device_buf.ToDevice(d_m_n.mData.data());
e_device_buf.ToDevice(e_m_n_device_result.mData.data());
auto a_element_op = AElementOp{};
auto b_element_op = BElementOp{};
auto cde_element_op = CDEElementOp{};
constexpr ck::index_t NumATensor = 1;
constexpr ck::index_t NumBTensor = 1;
constexpr ck::index_t NumDTensor = 2;
// do GEMM
auto device_op = DeviceOpInstance{};
auto invoker = device_op.MakeInvoker();
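// The dequantization scale (b1) and the bias (d) are passed through the Ds array; b1 gets a
// stride of 0 so its single row of per-N scales is broadcast over all M output rows.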
auto argument =
device_op.MakeArgument(std::array<const void*, NumATensor>{a0_device_buf.GetDeviceBuffer()},
std::array<const void*, NumBTensor>{b0_device_buf.GetDeviceBuffer()},
std::array<const void*, NumDTensor>{b1_device_buf.GetDeviceBuffer(),
d_device_buf.GetDeviceBuffer()},
e_device_buf.GetDeviceBuffer(),
M,
N,
K,
std::array<ck::index_t, NumATensor>{StrideA},
std::array<ck::index_t, NumBTensor>{StrideB},
std::array<ck::index_t, NumDTensor>{0, StrideD},
StrideE,
a_element_op,
b_element_op,
cde_element_op);
if(!device_op.IsSupportedArgument(argument))
{
throw std::runtime_error(
"wrong! device_gemm with the specified compilation parameters does "
"not support this GEMM problem");
}
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_btype =
sizeof(A0DataType) * M * K + sizeof(B0DataType) * K * N + sizeof(EDataType) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
<< std::endl;
e_device_buf.FromDevice(e_m_n_device_result.mData.data());
if(do_verification)
{
Tensor<CShuffleDataType> c_m_n({M, N});
Tensor<A0DataType> a_m_k({M, K});
Tensor<B1DataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, B0Layout{}));
#if 0
for(int n = 0; n < N; ++n)
{
for(int k = 0; k < K; ++k)
{
b_element_op(b_k_n(k, n), b0_k_n(k, n), b1_k_n(k, n));
}
}
#endif
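// The host-side B0*B1 fold is left disabled here: the reference GEMM runs directly on the
// int8 B0, and the per-N scale is applied afterwards in the CDE epilogue loop.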
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<A0DataType,
B0DataType,
CShuffleDataType,
AccDataType,
PassThrough,
PassThrough,
PassThrough>;
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument = ref_gemm.MakeArgument(
a0_m_k, b0_k_n, c_m_n, PassThrough{}, PassThrough{}, PassThrough{});
ref_invoker.Run(ref_argument);
for(int m = 0; m < M; ++m)
{
for(int n = 0; n < N; ++n)
{
cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n), b1_k_n(0, n), d_m_n(m, n));
}
}
e_device_buf.FromDevice(e_m_n_device_result.mData.data());
return ck::utils::check_err(e_m_n_device_result, e_m_n_host_result) ? 0 : 1;
}
return 0;
}