Merge branch 'develop' into wavelet_model

b89a88b5 · Adam Osewski · 41d5fca7 · 43c898f6 · b89a88b5 · b89a88b5
Commit b89a88b5 authored Sep 19, 2022 by Adam Osewski
20 changed files
--- a/example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp
+++ b/example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstdlib>
+#include <iostream>
+#include <iterator>
+#include <numeric>
+#include <type_traits>
+#include <vector>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d_multiple_r_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/convolution_parameter.hpp"
+#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/fill.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
+
+using BF16 = ck::bhalf_t;
+using FP16 = ck::half_t;
+using FP32 = float;
+#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
+using I4 = ck::int4_t;
+#endif
+using I8  = std::int8_t;
+using I32 = std::int32_t;
+
+template <typename ALay, typename BLay, typename DELay, typename RLay>
+struct LayoutSetting
+{
+    using ALayout  = ALay;
+    using BLayout  = BLay;
+    using DELayout = DELay;
+    using RLayout  = RLay;
+};
+
+template <ck::index_t NDimSpatial>
+struct LayoutSettingSelector;
+
+namespace ctl = ck::tensor_layout::convolution;
+
+template <>
+struct LayoutSettingSelector<1> final : LayoutSetting<ctl::GNWC, ctl::GKXC, ctl::GNWK, ctl::GNW>
+{
+};
+
+template <>
+struct LayoutSettingSelector<2> final : LayoutSetting<ctl::GNHWC, ctl::GKYXC, ctl::GNHWK, ctl::GNHW>
+{
+};
+
+template <>
+struct LayoutSettingSelector<3> final
+    : LayoutSetting<ctl::GNDHWC, ctl::GKZYXC, ctl::GNDHWK, ctl::GNDHW>
+{
+};
+
+template <ck::index_t NDimSpatial>
+using ALayout = typename LayoutSettingSelector<NDimSpatial>::ALayout;
+
+template <ck::index_t NDimSpatial>
+using BLayout = typename LayoutSettingSelector<NDimSpatial>::BLayout;
+
+template <ck::index_t NDimSpatial>
+using DELayout = typename LayoutSettingSelector<NDimSpatial>::DELayout;
+
+template <ck::index_t NDimSpatial>
+using RLayout = typename LayoutSettingSelector<NDimSpatial>::RLayout;
+
+struct ExecutionConfig final
+{
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
+};
+
+inline void print_help_msg()
+{
+    std::cerr << "arg1: verification (0=no, 1=yes)\n"
+              << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
+              << "arg3: time kernel (0=no, 1=yes)\n"
+              << ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl;
+}
+
+inline bool parse_cmd_args(int argc,
+                           char* argv[],
+                           ck::utils::conv::ConvParam& problem_size,
+                           ExecutionConfig& config)
+{
+    constexpr int num_execution_config_args =
+        3; // arguments for do_verification, init_method, time_kernel
+    constexpr int num_conv_param_leading_args = 5; // arguments for num_dim_spatial_, G_, N_, K_, C_
+
+    constexpr int threshold_to_catch_partial_args = 1 + num_execution_config_args;
+    constexpr int threshold_to_catch_all_args =
+        threshold_to_catch_partial_args + num_conv_param_leading_args;
+
+    if(argc == 1)
+    {
+        // use default
+    }
+    // catch only ExecutionConfig arguments
+    else if(argc == threshold_to_catch_partial_args)
+    {
+        config.do_verification = std::stoi(argv[1]);
+        config.init_method     = std::stoi(argv[2]);
+        config.time_kernel     = std::stoi(argv[3]);
+    }
+    // catch both ExecutionConfig & ConvParam arguments
+    else if(threshold_to_catch_all_args < argc && ((argc - threshold_to_catch_all_args) % 3 == 0))
+    {
+        config.do_verification = std::stoi(argv[1]);
+        config.init_method     = std::stoi(argv[2]);
+        config.time_kernel     = std::stoi(argv[3]);
+
+        const ck::index_t num_dim_spatial = std::stoi(argv[4]);
+        problem_size                      = ck::utils::conv::parse_conv_param(
+            num_dim_spatial, threshold_to_catch_partial_args, argv);
+    }
+    else
+    {
+        print_help_msg();
+        return false;
+    }
+
+    return true;
+}
+
+inline HostTensorDescriptor
+make_r0_host_tensor_descriptor(const ck::utils::conv::ConvParam& problem_size)
+{
+    std::vector<ck::index_t> dimensions{problem_size.G_, problem_size.N_};
+
+    std::copy(begin(problem_size.output_spatial_lengths_),
+              end(problem_size.output_spatial_lengths_),
+              std::back_inserter(dimensions));
+
+    return HostTensorDescriptor(dimensions);
+}
+
+template <typename Lengths, typename Strides>
+void unpack_host_tensor_descriptor(const HostTensorDescriptor& descriptor,
+                                   Lengths& lengths,
+                                   Strides& strides)
+{
+    assert(size(descriptor.GetLengths()) == size(lengths));
+    std::copy_n(begin(descriptor.GetLengths()), size(descriptor.GetLengths()), begin(lengths));
+
+    assert(size(descriptor.GetStrides()) == size(strides));
+    std::copy_n(begin(descriptor.GetStrides()), size(descriptor.GetStrides()), begin(strides));
+}
+
+template <typename Range, typename OutputIterator>
+auto copy(const Range& range, OutputIterator iter)
+    -> decltype(std::copy(std::begin(range), std::end(range), iter))
+{
+    return std::copy(std::begin(range), std::end(range), iter);
+}
--- a/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_bf16.cpp
+++ b/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_bf16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+
+using ADataType         = BF16;
+using BDataType         = BF16;
+using AccDataType       = FP32;
+using CShuffleDataType  = FP32;
+using DsDataType        = ck::Tuple<>;
+using EDataType         = BF16;
+using ReduceAccDataType = FP32;
+using R0DataType        = FP32;
+using RsDataType        = ck::Tuple<R0DataType>;
+
+#include "run_convnd_fwd_max_example.inc"
+
+int main(int argc, char* argv[]) { return !run_convnd_fwd_max_example(argc, argv); }
--- a/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_fp16.cpp
+++ b/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+
+using ADataType         = FP16;
+using BDataType         = FP16;
+using AccDataType       = FP32;
+using CShuffleDataType  = FP32;
+using DsDataType        = ck::Tuple<>;
+using EDataType         = FP16;
+using ReduceAccDataType = FP32;
+using R0DataType        = FP32;
+using RsDataType        = ck::Tuple<R0DataType>;
+
+#include "run_convnd_fwd_max_example.inc"
+
+int main(int argc, char* argv[]) { return !run_convnd_fwd_max_example(argc, argv); }
--- a/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_fp32.cpp
+++ b/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_fp32.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+
+using ADataType         = FP32;
+using BDataType         = FP32;
+using AccDataType       = FP32;
+using CShuffleDataType  = FP32;
+using DsDataType        = ck::Tuple<>;
+using EDataType         = FP32;
+using ReduceAccDataType = FP32;
+using R0DataType        = FP32;
+using RsDataType        = ck::Tuple<R0DataType>;
+
+#include "run_convnd_fwd_max_example.inc"
+
+int main(int argc, char* argv[]) { return !run_convnd_fwd_max_example(argc, argv); }
--- a/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_int4.cpp
+++ b/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_int4.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#ifndef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
+#error Should compile this file with ck::int4_t support
+#endif
+
+#define BUILD_INT4_EXAMPLE
+
+#include "common.hpp"
+
+using ADataType         = I4;
+using BDataType         = I4;
+using KernelADataType   = I8;
+using KernelBDataType   = I8;
+using AccDataType       = I32;
+using CShuffleDataType  = I32;
+using DsDataType        = ck::Tuple<>;
+using EDataType         = I32;
+using ReduceAccDataType = I32;
+using R0DataType        = I32;
+using RsDataType        = ck::Tuple<R0DataType>;
+
+#include "run_convnd_fwd_max_example.inc"
+
+int main(int argc, char* argv[]) { return !run_convnd_fwd_max_example(argc, argv); }
--- a/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_int8.cpp
+++ b/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_int8.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+
+using ADataType         = I8;
+using BDataType         = I8;
+using AccDataType       = I32;
+using CShuffleDataType  = I32;
+using DsDataType        = ck::Tuple<>;
+using EDataType         = I32;
+using ReduceAccDataType = I32;
+using R0DataType        = I32;
+using RsDataType        = ck::Tuple<R0DataType>;
+
+#include "run_convnd_fwd_max_example.inc"
+
+int main(int argc, char* argv[]) { return !run_convnd_fwd_max_example(argc, argv); }
--- a/example/10_convnd_fwd_multiple_d_multiple_reduce/run_convnd_fwd_max_example.inc
+++ b/example/10_convnd_fwd_multiple_d_multiple_reduce/run_convnd_fwd_max_example.inc
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+using AElementOp   = PassThrough;
+using BElementOp   = PassThrough;
+using CDEElementOp = PassThrough;
+using QsElementOp  = ck::Tuple<PassThrough>;
+using RsElementOp  = ck::Tuple<PassThrough>;
+
+// ReduceOp
+using RsThreadReduceOp = ck::Tuple<ck::reduce::Max>;
+
+using RsGlobalReduceOp =
+    ck::InMemoryDataOperationEnumSequence<ck::InMemoryDataOperationEnum::AtomicMax>;
+
+static constexpr auto ConvSpec =
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+// clang-format off
+template <ck::index_t NDimSpatial>
+using DeviceInstance =
+    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleDMultipleR_Xdl_CShuffle
+//######| NDimSpatial|              ALayout|              BLayout|              DELayout|              RLayout|           AData|           BData|     AccData|         CShuffle|     DsData|     EData|     ReduceAccData|     RsData|           A|           B|          CDE|          Qs|          Rs|           Thread|           Global|           Conv|          GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|    CDRThreadTransfer|                  CDE|    RThreadTransfer|
+//######|            |                     |                     |                      |                     |            Type|            Type|        Type|         DataType|       Type|      Type|              Type|       Type| Elementwise| Elementwise|  Elementwise| Elementwise| Elementwise|           Reduce|           Reduce|            Fwd|Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|       ClusterLengths| ReduceThreadTransfer| DstScalarPerVector|
+//######|            |                     |                     |                      |                     |                |                |            |                 |           |          |                  |           |   Operation|   Operation|    Operation|   Operation|   Operation|        Operation|        Operation| Specialization|              |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _MPerBlock_NPerBlock|      ScalarPerVector|         _MPerBlock|
+//######|            |                     |                     |                      |                     |                |                |            |                 |           |          |                  |           |            |            |             |            |            |                 |                 |               |              |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                     |           _NPerBlock|                   |
+#ifdef BUILD_INT4_EXAMPLE
+        < NDimSpatial, ALayout<NDimSpatial>, BLayout<NDimSpatial>, DELayout<NDimSpatial>, RLayout<NDimSpatial>, KernelADataType, KernelBDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ReduceAccDataType, RsDataType,  AElementOp,  BElementOp, CDEElementOp, QsElementOp, RsElementOp, RsThreadReduceOp, RsGlobalReduceOp,       ConvSpec,   GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,             S<64, 4>,                    4,                  1>;
+#else
+        < NDimSpatial, ALayout<NDimSpatial>, BLayout<NDimSpatial>, DELayout<NDimSpatial>, RLayout<NDimSpatial>,       ADataType,       BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ReduceAccDataType, RsDataType,  AElementOp,  BElementOp, CDEElementOp, QsElementOp, RsElementOp, RsThreadReduceOp, RsGlobalReduceOp,       ConvSpec,   GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,             S<64, 4>,                    4,                  1>;
+#endif
+
+template <ck::index_t NDimSpatial>
+using HostInstance = ck::tensor_operation::host::ReferenceConvFwd
+        <NDimSpatial, ADataType, BDataType, EDataType, AElementOp, BElementOp, PassThrough>;
+// clang-format on
+
+template <ck::index_t NDimSpatial>
+bool run_convnd_fwd_max(const ck::utils::conv::ConvParam& problem_size,
+                        const ExecutionConfig& config)
+{
+    static_assert(1 <= NDimSpatial && NDimSpatial <= 3, "Unsupported NDimSpatial");
+
+#if defined(BUILD_INT4_EXAMPLE) && defined(CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4)
+    static_assert(sizeof(ck::int4_t) == sizeof(int8_t));
+#endif
+
+    const auto conv_input_g_n_c_wis_desc =
+        ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<ALayout<NDimSpatial>>(
+            problem_size);
+
+    const auto conv_weight_g_k_c_xs_desc =
+        ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<BLayout<NDimSpatial>>(
+            problem_size);
+
+    const auto conv_output_g_n_k_wos_desc =
+        ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<DELayout<NDimSpatial>>(
+            problem_size);
+
+    const auto r0_desc = make_r0_host_tensor_descriptor(problem_size);
+
+    Tensor<ADataType> conv_input(conv_input_g_n_c_wis_desc);
+    Tensor<BDataType> conv_weight(conv_weight_g_k_c_xs_desc);
+    Tensor<EDataType> conv_output_device(conv_output_g_n_k_wos_desc);
+    Tensor<R0DataType> r0_device(r0_desc);
+
+    switch(config.init_method)
+    {
+    case 0: break;
+    case 1:
+        ck::utils::FillUniformDistributionIntegerValue<ADataType>{-8, 7}(conv_input.begin(),
+                                                                         conv_input.end());
+        ck::utils::FillUniformDistributionIntegerValue<BDataType>{-8, 7}(conv_weight.begin(),
+                                                                         conv_weight.end());
+        break;
+    default:
+        ck::utils::FillUniformDistribution<ADataType>{-5, 5}(conv_input.begin(), conv_input.end());
+        ck::utils::FillUniformDistribution<BDataType>{-5, 5}(conv_weight.begin(),
+                                                             conv_weight.end());
+    }
+
+    DeviceMem conv_input_device_buf(sizeof(ADataType) * conv_input.mDesc.GetElementSpaceSize());
+    DeviceMem conv_weight_device_buf(sizeof(BDataType) * conv_weight.mDesc.GetElementSpaceSize());
+    DeviceMem conv_output_device_buf(sizeof(EDataType) *
+                                     conv_output_device.mDesc.GetElementSpaceSize());
+    DeviceMem r0_device_buf(sizeof(R0DataType) * r0_device.mDesc.GetElementSpaceSize());
+
+#ifdef BUILD_INT4_EXAMPLE
+    const Tensor<KernelADataType> conv_input_converted(conv_input);
+    const Tensor<KernelBDataType> conv_weight_converted(conv_weight);
+
+    conv_input_device_buf.ToDevice(conv_input_converted.mData.data());
+    conv_weight_device_buf.ToDevice(conv_weight_converted.mData.data());
+#else
+    conv_input_device_buf.ToDevice(conv_input.mData.data());
+    conv_weight_device_buf.ToDevice(conv_weight.mData.data());
+#endif
+
+    std::array<ck::index_t, NDimSpatial + 3> conv_input_g_n_c_wis_lengths{},
+        conv_input_g_n_c_wis_strides{};
+    std::array<ck::index_t, NDimSpatial + 3> conv_weight_g_k_c_xs_lengths{},
+        conv_weight_g_k_c_xs_strides{};
+    std::array<ck::index_t, NDimSpatial + 3> conv_output_g_n_k_wos_lengths{},
+        conv_output_g_n_k_wos_strides{};
+    std::array<ck::index_t, NDimSpatial + 2> r0_lengths{}, r0_strides{};
+    std::array<ck::index_t, NDimSpatial> conv_filter_strides{}, conv_filter_dilations{};
+    std::array<ck::index_t, NDimSpatial> input_left_pads{}, input_right_pads{};
+
+    unpack_host_tensor_descriptor(
+        conv_input_g_n_c_wis_desc, conv_input_g_n_c_wis_lengths, conv_input_g_n_c_wis_strides);
+    unpack_host_tensor_descriptor(
+        conv_weight_g_k_c_xs_desc, conv_weight_g_k_c_xs_lengths, conv_weight_g_k_c_xs_strides);
+    unpack_host_tensor_descriptor(
+        conv_output_g_n_k_wos_desc, conv_output_g_n_k_wos_lengths, conv_output_g_n_k_wos_strides);
+    unpack_host_tensor_descriptor(r0_desc, r0_lengths, r0_strides);
+
+    copy(problem_size.conv_filter_strides_, begin(conv_filter_strides));
+    copy(problem_size.conv_filter_dilations_, begin(conv_filter_dilations));
+    copy(problem_size.input_left_pads_, begin(input_left_pads));
+    copy(problem_size.input_right_pads_, begin(input_right_pads));
+
+    // run Conv + Reduction on device
+    auto conv     = DeviceInstance<NDimSpatial>{};
+    auto invoker  = conv.MakeInvoker();
+    auto argument = conv.MakeArgument(conv_input_device_buf.GetDeviceBuffer(),
+                                      conv_weight_device_buf.GetDeviceBuffer(),
+                                      std::array<const void*, 0>{},
+                                      conv_output_device_buf.GetDeviceBuffer(),
+                                      {r0_device_buf.GetDeviceBuffer()},
+                                      conv_input_g_n_c_wis_lengths,
+                                      conv_input_g_n_c_wis_strides,
+                                      conv_weight_g_k_c_xs_lengths,
+                                      conv_weight_g_k_c_xs_strides,
+                                      std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{{}},
+                                      std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{{}},
+                                      conv_output_g_n_k_wos_lengths,
+                                      conv_output_g_n_k_wos_strides,
+                                      r0_lengths,
+                                      r0_strides,
+                                      conv_filter_strides,
+                                      conv_filter_dilations,
+                                      input_left_pads,
+                                      input_right_pads,
+                                      AElementOp{},
+                                      BElementOp{},
+                                      CDEElementOp{},
+                                      QsElementOp{},
+                                      RsElementOp{});
+
+    if(!conv.IsSupportedArgument(argument))
+    {
+        std::cerr << "wrong! device_conv with the specified compilation parameters does "
+                     "not support this Conv problem"
+                  << std::endl;
+        return false;
+    }
+
+    const float avg_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
+
+    const std::size_t flop      = problem_size.GetFlops();
+    const std::size_t num_btype = problem_size.GetByte<ADataType, BDataType, EDataType>();
+
+    const float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
+    const float gb_per_sec = num_btype / 1.E6 / avg_time;
+    std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << conv.GetTypeString() << std::endl;
+
+    if(config.do_verification)
+    {
+        Tensor<EDataType> conv_output_host(conv_output_g_n_k_wos_desc);
+
+        // run Conv + Reduction on host
+        auto ref_conv     = HostInstance<NDimSpatial>{};
+        auto ref_invoker  = ref_conv.MakeInvoker();
+        auto ref_argument = ref_conv.MakeArgument(conv_input,
+                                                  conv_weight,
+                                                  conv_output_host,
+                                                  problem_size.conv_filter_strides_,
+                                                  problem_size.conv_filter_dilations_,
+                                                  problem_size.input_left_pads_,
+                                                  problem_size.input_right_pads_,
+                                                  AElementOp{},
+                                                  BElementOp{},
+                                                  PassThrough{});
+
+        ref_invoker.Run(ref_argument);
+
+        Tensor<R0DataType> r0_host(r0_device.mDesc);
+
+        auto reduce0_op = RsThreadReduceOp{}[ck::Number<0>{}];
+
+        auto& output_dims = conv_output_g_n_k_wos_desc.GetLengths();
+
+        if constexpr(NDimSpatial == 1)
+        {
+            for(std::size_t g = 0; g < output_dims[0]; ++g)
+            {
+                for(std::size_t n = 0; n < output_dims[1]; ++n)
+                {
+                    for(std::size_t w = 0; w < output_dims[3]; ++w)
+                    {
+                        auto reduce0_acc = reduce0_op.GetIdentityValue<ReduceAccDataType>();
+                        for(std::size_t k = 0; k < output_dims[2]; ++k)
+                        {
+
+                            auto e_val =
+                                ck::type_convert<ReduceAccDataType>(conv_output_host(g, n, k, w));
+                            reduce0_op(reduce0_acc, e_val);
+                        }
+                        r0_host(g, n, w) = ck::type_convert<R0DataType>(reduce0_acc);
+                    }
+                }
+            }
+        }
+        else if constexpr(NDimSpatial == 2)
+        {
+            for(std::size_t g = 0; g < output_dims[0]; ++g)
+            {
+                for(std::size_t n = 0; n < output_dims[1]; ++n)
+                {
+                    for(std::size_t h = 0; h < output_dims[3]; ++h)
+                    {
+                        for(std::size_t w = 0; w < output_dims[4]; ++w)
+                        {
+                            auto reduce0_acc = reduce0_op.GetIdentityValue<ReduceAccDataType>();
+                            for(std::size_t k = 0; k < output_dims[2]; ++k)
+                            {
+
+                                auto e_val = ck::type_convert<ReduceAccDataType>(
+                                    conv_output_host(g, n, k, h, w));
+                                reduce0_op(reduce0_acc, e_val);
+                            }
+                            r0_host(g, n, h, w) = ck::type_convert<R0DataType>(reduce0_acc);
+                        }
+                    }
+                }
+            }
+        }
+        else if constexpr(NDimSpatial == 3)
+        {
+            for(std::size_t g = 0; g < output_dims[0]; ++g)
+            {
+                for(std::size_t n = 0; n < output_dims[1]; ++n)
+                {
+                    for(std::size_t d = 0; d < output_dims[3]; ++d)
+                    {
+                        for(std::size_t h = 0; h < output_dims[4]; ++h)
+                        {
+                            for(std::size_t w = 0; w < output_dims[5]; ++w)
+                            {
+                                auto reduce0_acc = reduce0_op.GetIdentityValue<ReduceAccDataType>();
+                                for(std::size_t k = 0; k < output_dims[2]; ++k)
+                                {
+
+                                    auto e_val = ck::type_convert<ReduceAccDataType>(
+                                        conv_output_host(g, n, k, d, h, w));
+                                    reduce0_op(reduce0_acc, e_val);
+                                }
+                                r0_host(g, n, d, h, w) = ck::type_convert<R0DataType>(reduce0_acc);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        conv_output_device_buf.FromDevice(conv_output_device.mData.data());
+        r0_device_buf.FromDevice(r0_device.mData.data());
+
+        return ck::utils::check_err(conv_output_device.mData,
+                                    conv_output_host.mData,
+                                    "Error: incorrect results! (Matrix E)",
+                                    1e-5f,
+                                    1e-4f) &&
+               ck::utils::check_err(r0_device.mData,
+                                    r0_host.mData,
+                                    "Error: incorrect results! (Matrix R0)",
+                                    1e-5f,
+                                    1e-4f);
+    }
+
+    return true;
+}
+
+bool run_convnd_fwd_max_example(int argc, char* argv[])
+{
+    ck::utils::conv::ConvParam problem_size{
+        2, 1, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};
+    ExecutionConfig config;
+
+    if(!parse_cmd_args(argc, argv, problem_size, config))
+    {
+        return false;
+    }
+
+    switch(problem_size.num_dim_spatial_)
+    {
+    case 1: return run_convnd_fwd_max<1>(problem_size, config);
+    case 2: return run_convnd_fwd_max<2>(problem_size, config);
+    case 3: return run_convnd_fwd_max<3>(problem_size, config);
+    }
+
+    return false;
+}
--- a/example/12_reduce/reduce_blockwise.cpp
+++ b/example/12_reduce/reduce_blockwise.cpp
@@ -225,6 +225,28 @@ int main(int argc, char* argv[])
                arg.scales[0],
                arg.scales[1]);
        }
+#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
+        else if(arg.data_type == 7)
+        {
+            pass = reduce_blockwise_test<int4_t, int32_t, ReduceTensorOp::AVG, false, false>(
+                arg.do_verification,
+                arg.init_method,
+                arg.time_kernel,
+                arg.inLengths,
+                arg.reduceDims,
+                arg.scales[0],
+                arg.scales[1]);
+
+            pass = pass && reduce_blockwise_test<int4_t, int8_t, ReduceTensorOp::MAX, false, false>(
+                               arg.do_verification,
+                               arg.init_method,
+                               arg.time_kernel,
+                               arg.inLengths,
+                               arg.reduceDims,
+                               arg.scales[0],
+                               arg.scales[1]);
+        }
+#endif
    }
    else
    {
@@ -251,6 +273,15 @@ int main(int argc, char* argv[])
            pass && reduce_blockwise_test<int8_t, int32_t, ReduceOpId, PropagateNan, OutputIndex>(
                        true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f);

+#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
+        // for testing int4_t using AVG operation
+        pass = pass && reduce_blockwise_test<int4_t, int32_t, ReduceTensorOp::AVG, false, false>(
+                           true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f);
+
+        // for testing int4_t using MAX operation
+        pass = pass && reduce_blockwise_test<int4_t, int8_t, ReduceTensorOp::MAX, false, false>(
+                           true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f);
+#endif
        // for testing 3D input
        pass = pass && reduce_blockwise_test<float, float, ReduceOpId, PropagateNan, OutputIndex>(
                           true, 2, true, {16, 64, 960}, {0, 1}, 1.0f, 0.0f);

--- a/example/12_reduce/reduce_blockwise_impl.hpp
+++ b/example/12_reduce/reduce_blockwise_impl.hpp
@@ -58,28 +58,47 @@ int reduce_blockwise_impl(bool do_verification,
        std::is_same<InOutDataType, float>::value &&
        (op_support_indices && !std::is_same<AccDataType, float>::value);

-    // 1) If InOutDataType is int8_t, must use int8_t as AccDataType for indexable reduction
-    // operations 2) If InOutDataType is int8_t, must use int32_t as AccDataType for non-indexable
-    // reduction operations
+    // 1) If InOutDataType is int8_t or int4_t, must use int8_t as AccDataType for indexable
+    // reduction operations 2) If InOutDataType is int8_t or int4_t, must use int32_t as AccDataType
+    // for non-indexable reduction operations
    constexpr bool invalid_reduce_4 =
        std::is_same<InOutDataType, int8_t>::value &&
        ((!op_support_indices && !std::is_same<AccDataType, int32_t>::value) ||
         (op_support_indices && !std::is_same<AccDataType, int8_t>::value));

-    // 1) If InOutDataType is int8_t, the supported operation must be either indexable operations or
-    // ADD/AVG
+#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
+    constexpr bool invalid_reduce_4_2 =
+        std::is_same<InOutDataType, int4_t>::value &&
+        ((!op_support_indices && !std::is_same<AccDataType, int32_t>::value) ||
+         (op_support_indices && !std::is_same<AccDataType, int8_t>::value));
+#endif
+
+    // 1) If InOutDataType is int8_t or int4_t, the supported operation must be either indexable
+    // operations or ADD/AVG
    constexpr bool invalid_reduce_5 = std::is_same<InOutDataType, int8_t>::value &&
                                      (!op_support_indices && ReduceOpId != ReduceTensorOp::ADD &&
                                       ReduceOpId != ReduceTensorOp::AVG);

+#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
+    constexpr bool invalid_reduce_5_2 = std::is_same<InOutDataType, int4_t>::value &&
+                                        (!op_support_indices && ReduceOpId != ReduceTensorOp::ADD &&
+                                         ReduceOpId != ReduceTensorOp::AVG);
+#endif
+
    // 1) If InOutDataType is bhalf_t, must use float as AccDataType for all reduction operations
    constexpr bool invalid_reduce_6 =
        std::is_same<InOutDataType, bhalf_t>::value && !std::is_same<AccDataType, float>::value;

+#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
+    constexpr bool invalid_reduce =
+        (invalid_reduce_1 || invalid_reduce_2 || invalid_reduce_3 || invalid_reduce_4 ||
+         invalid_reduce_5 || invalid_reduce_6 || invalid_reduce_4_2 || invalid_reduce_5_2);
+#else
    constexpr bool invalid_reduce = (invalid_reduce_1 || invalid_reduce_2 || invalid_reduce_3 ||
                                     invalid_reduce_4 || invalid_reduce_5 || invalid_reduce_6);
+#endif

-    if(invalid_reduce)
+    if constexpr(invalid_reduce)
    {
        std::cerr << "The reduction setting is invalid, exiting!" << std::endl;
        return (-1);
@@ -91,10 +110,17 @@ int reduce_blockwise_impl(bool do_verification,
    using AccElementwiseOperation =
        typename reduce_unary_operator<ReduceOpId, true, true>::AccElementwiseOperation;

+#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
+    using InOutDataTypeInDevice = typename std::
+        conditional<std::is_same<InOutDataType, int4_t>::value, int8_t, InOutDataType>::type;
+#else
+    using InOutDataTypeInDevice   = InOutDataType;
+#endif
+
    using DeviceReduceInstance =
-        ck::tensor_operation::device::DeviceReduceMultiBlock<InOutDataType,
+        ck::tensor_operation::device::DeviceReduceMultiBlock<InOutDataTypeInDevice,
                                                             AccDataType,
-                                                             InOutDataType,
+                                                             InOutDataTypeInDevice,
                                                             Rank,
                                                             NumReduceDim,
                                                             ReduceOperation,
@@ -166,13 +192,35 @@ int reduce_blockwise_impl(bool do_verification,
    };

    // these buffers are usually provided by the user application
-    DeviceMem in_dev(sizeof(InOutDataType) * in.mDesc.GetElementSpaceSize());
-    DeviceMem out_dev(sizeof(InOutDataType) * out.mDesc.GetElementSpaceSize());
+    DeviceMem in_dev(sizeof(InOutDataTypeInDevice) * in.mDesc.GetElementSpaceSize());
+    DeviceMem out_dev(sizeof(InOutDataTypeInDevice) * out.mDesc.GetElementSpaceSize());

-    in_dev.ToDevice(in.mData.data());
+#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
+    if(std::is_same<InOutDataType, int4_t>::value)
+    {
+        std::vector<InOutDataTypeInDevice> tmp_buf(in.mData.size());
+
+        std::copy_n(in.mData.data(), in.mData.size(), tmp_buf.data());
+        in_dev.ToDevice(tmp_buf.data());
+    }
+    else
+#endif
+        in_dev.ToDevice(in.mData.data());

    if(beta != 0.0f)
-        out_dev.ToDevice(out.mData.data());
+    {
+#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
+        if(std::is_same<InOutDataType, int4_t>::value)
+        {
+            std::vector<InOutDataTypeInDevice> tmp_buf(in.mData.size());
+
+            std::copy_n(out.mData.data(), out.mData.size(), tmp_buf.data());
+            out_dev.ToDevice(tmp_buf.data());
+        }
+        else
+#endif
+            out_dev.ToDevice(out.mData.data());
+    };

    size_t indicesSizeInBytes = OutputIndex ? out.mDesc.GetElementSize() * sizeof(int32_t) : 0;

@@ -261,7 +309,19 @@ int reduce_blockwise_impl(bool do_verification,

    if(do_verification)
    {
-        out_dev.FromDevice(out.mData.data());
+#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
+        if(std::is_same<InOutDataType, int4_t>::value)
+        {
+            std::vector<InOutDataTypeInDevice> tmp_buf(out.mData.size());
+
+            out_dev.FromDevice(tmp_buf.data());
+
+            std::copy_n(tmp_buf.data(), out.mData.size(), out.mData.data());
+        }
+        else
+#endif
+            out_dev.FromDevice(out.mData.data());
+
        pass = pass && ck::utils::check_err(out.mData, out_ref.mData);

        if(OutputIndex)

--- a/example/15_grouped_gemm/CMakeLists.txt
+++ b/example/15_grouped_gemm/CMakeLists.txt
+add_custom_target(example_grouped_gemm_xdl)
+
+add_example_executable(example_grouped_gemm_xdl_fp32 grouped_gemm_xdl_fp32.cpp)
 add_example_executable(example_grouped_gemm_xdl_fp16 grouped_gemm_xdl_fp16.cpp)
+add_example_executable(example_grouped_gemm_xdl_bfp16 grouped_gemm_xdl_bfp16.cpp)
+add_example_executable(example_grouped_gemm_xdl_int8 grouped_gemm_xdl_int8.cpp)
+
+add_dependencies(example_grouped_gemm_xdl
+                 example_grouped_gemm_xdl_fp32
+                 example_grouped_gemm_xdl_fp16
+                 example_grouped_gemm_xdl_bfp16
+                 example_grouped_gemm_xdl_int8)
+
+if(USE_BITINT_EXTENSION_INT4)
+  add_example_executable(example_grouped_gemm_xdl_int4 grouped_gemm_xdl_int4.cpp)
+  add_dependencies(example_grouped_gemm_xdl example_grouped_gemm_xdl_int4)
+endif()
--- a/example/15_grouped_gemm/grouped_gemm_xdl_bfp16.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_bfp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using BF16 = ck::bhalf_t;
+using F32  = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+using ADataType        = BF16;
+using BDataType        = BF16;
+using AccDataType      = F32;
+using CShuffleDataType = BF16;
+using DsDataType       = ck::Tuple<>;
+using EDataType        = BF16;
+
+using ALayout  = Row;
+using BLayout  = Col;
+using DsLayout = ck::Tuple<>;
+using ELayout  = Row;
+
+using AElementOp   = PassThrough;
+using BElementOp   = PassThrough;
+using CDEElementOp = PassThrough;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+using DeviceGemmInstance = ck::tensor_operation::device::DeviceGroupedGemm_Xdl
+    // clang-format off
+//######| ALayout| BLayout| DsLayout| ELayout|     AData|     BData|     AccData|         CShuffle|     DsData|     EData|           A|           B|          CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+//######|        |        |         |        |      Type|      Type|        Type|         DataType|       Type|      Type| Elementwise| Elementwise|  Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+//######|        |        |         |        |          |          |            |                 |           |          |   Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+//######|        |        |         |        |          |          |            |                 |           |          |            |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>;
+// clang-format on
+
+#include "run_grouped_gemm_example.inc"
+
+int main(int argc, char* argv[]) { return !run_grouped_gemm_example(argc, argv); }
--- a/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp
--- a/example/15_grouped_gemm/grouped_gemm_xdl_fp32.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_fp32.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+using ADataType        = F32;
+using BDataType        = F32;
+using AccDataType      = F32;
+using CShuffleDataType = F32;
+using DsDataType       = ck::Tuple<>;
+using EDataType        = F32;
+
+using ALayout  = Row;
+using BLayout  = Col;
+using DsLayout = ck::Tuple<>;
+using ELayout  = Row;
+
+using AElementOp   = PassThrough;
+using BElementOp   = PassThrough;
+using CDEElementOp = PassThrough;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+using DeviceGemmInstance = ck::tensor_operation::device::DeviceGroupedGemm_Xdl
+    // clang-format off
+//######| ALayout| BLayout| DsLayout| ELayout|     AData|     BData|     AccData|         CShuffle|     DsData|     EData|           A|           B|          CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+//######|        |        |         |        |      Type|      Type|        Type|         DataType|       Type|      Type| Elementwise| Elementwise|  Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+//######|        |        |         |        |          |          |            |                 |           |          |   Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+//######|        |        |         |        |          |          |            |                 |           |          |            |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,               S<1, 32, 1, 8>,               4>;
+// clang-format on
+
+#include "run_grouped_gemm_example.inc"
+
+int main(int argc, char* argv[]) { return !run_grouped_gemm_example(argc, argv); }
--- a/example/15_grouped_gemm/grouped_gemm_xdl_int4.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_int4.cpp
--- a/example/15_grouped_gemm/grouped_gemm_xdl_int8.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_int8.cpp
--- a/example/15_grouped_gemm/run_grouped_gemm_example.inc
+++ b/example/15_grouped_gemm/run_grouped_gemm_example.inc
--- a/example/16_gemm_multi_d_multi_reduces/CMakeLists.txt
+++ b/example/16_gemm_multi_d_multi_reduces/CMakeLists.txt
--- a/example/16_gemm_multi_d_multi_reduces/gemm_add_addsquare_xdl_int8.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_add_addsquare_xdl_int8.cpp
--- a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_bf16.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_bf16.cpp
--- a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp16.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp16.cpp