first commit

4cccaba1 · Yang0001 · 4cccaba1 · 4cccaba1 · 4cccaba1 · 4cccaba1
Commit 4cccaba1 authored Jun 07, 2023 by Yang0001
20 changed files
--- a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_bf16.cpp
+++ b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_bf16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+#include <iostream>
+#include <numeric>
+#include <type_traits>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/utility/algorithm.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/convolution_parameter.hpp"
+#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
+
+using In0DataType       = ck::bhalf_t;
+using Wei0DataType      = ck::bhalf_t;
+using Acc0DataType      = float;
+using Wei1DataType      = ck::bhalf_t;
+using Acc1DataType      = float;
+using C1ShuffleDataType = float;
+using Out1DataType      = ck::bhalf_t;
+
+// This is used for reference code
+using Out0DataType = ck::bhalf_t;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using In0ElementOp  = ck::tensor_operation::element_wise::PassThrough;
+using Wei0ElementOp = ck::tensor_operation::element_wise::PassThrough;
+using Wei1ElementOp = ck::tensor_operation::element_wise::PassThrough;
+using Out0ElementOp = ck::tensor_operation::element_wise::PassThrough;
+using Out1ElementOp = ck::tensor_operation::element_wise::UnaryConvert;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+using DeviceBatchedGemmGemmInstance =
+    ck::tensor_operation::device::DeviceBatchedGemmGemm_Xdl_CShuffle<
+        Row,               // ALayout
+        Col,               // B0Layout
+        Col,               // B1Layout
+        Row,               // CLayout
+        In0DataType,       // ADataType,
+        Wei0DataType,      // B0DataType,
+        Wei1DataType,      // B1DataType,
+        Out1DataType,      // CDataType,
+        Acc0DataType,      // AccDataType,
+        C1ShuffleDataType, // CShuffleDataType,
+        In0ElementOp,      // AElementOp,
+        Wei0ElementOp,     // B0ElementOp,
+        Out0ElementOp,     // Acc0ElementOp,
+        Wei1ElementOp,     // B1ElementOp,
+        Out1ElementOp,     // CElementOp,
+        GemmDefault,
+        1,
+        256,
+        128,         // MPerBlock
+        128,         // NPerBlock
+        32,          // KPerBlock
+        128,         // Gemm1NPerBlock
+        32,          // Gemm1KPerBlock
+        8,           // AK1
+        8,           // BK1
+        4,           // B1K1
+        32,          // MPerXDL
+        32,          // NPerXDL
+        1,           // MXdlPerWave
+        4,           // NXdlPerWave
+        4,           // Gemm1NXdlPerWave
+        S<4, 64, 1>, // ABlockTransfer
+        S<1, 0, 2>,
+        S<1, 0, 2>,
+        2,
+        8,
+        8,
+        true,
+        S<4, 64, 1>, // BBlockTransfer
+        S<1, 0, 2>,
+        S<1, 0, 2>,
+        2,
+        8,
+        8,
+        true,
+        S<4, 64, 1>, // B1BlockTransfer
+        S<1, 0, 2>,
+        S<1, 0, 2>,
+        2,
+        4,
+        4,
+        true,
+        1,              // CShuffleMXdlPerWavePerShuffle
+        2,              // CShuffleNXdlPerWavePerShuffle
+        S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+        8>;             // CShuffleBlockTransferScalarPerVector_NPerBlock
+
+#include "run_grouped_conv_conv_fwd_example.inc"
+
+int main(int argc, char* argv[]) { return run_grouped_conv_conv_fwd_example(argc, argv) ? 0 : 1; }
--- a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp16.cpp
+++ b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+#include <iostream>
+#include <numeric>
+#include <type_traits>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/utility/algorithm.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/convolution_parameter.hpp"
+#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
+
+using In0DataType       = ck::half_t;
+using Wei0DataType      = ck::half_t;
+using Acc0DataType      = float;
+using Wei1DataType      = ck::half_t;
+using Acc1DataType      = float;
+using C1ShuffleDataType = float;
+using Out1DataType      = ck::half_t;
+
+// This is used for reference code
+using Out0DataType = ck::half_t;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using In0ElementOp  = ck::tensor_operation::element_wise::PassThrough;
+using Wei0ElementOp = ck::tensor_operation::element_wise::PassThrough;
+using Wei1ElementOp = ck::tensor_operation::element_wise::PassThrough;
+using Out0ElementOp = ck::tensor_operation::element_wise::PassThrough;
+using Out1ElementOp = ck::tensor_operation::element_wise::UnaryConvert;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+using DeviceBatchedGemmGemmInstance =
+    ck::tensor_operation::device::DeviceBatchedGemmGemm_Xdl_CShuffle<
+        Row,               // ALayout
+        Col,               // B0Layout
+        Col,               // B1Layout
+        Row,               // CLayout
+        In0DataType,       // ADataType,
+        Wei0DataType,      // B0DataType,
+        Wei1DataType,      // B1DataType,
+        Out1DataType,      // CDataType,
+        Acc0DataType,      // AccDataType,
+        C1ShuffleDataType, // CShuffleDataType,
+        In0ElementOp,      // AElementOp,
+        Wei0ElementOp,     // B0ElementOp,
+        Out0ElementOp,     // Acc0ElementOp,
+        Wei1ElementOp,     // B1ElementOp,
+        Out1ElementOp,     // CElementOp,
+        GemmDefault,
+        1,
+        256,
+        128,         // MPerBlock
+        128,         // NPerBlock
+        32,          // KPerBlock
+        128,         // Gemm1NPerBlock
+        32,          // Gemm1KPerBlock
+        8,           // AK1
+        8,           // BK1
+        4,           // B1K1
+        32,          // MPerXDL
+        32,          // NPerXDL
+        1,           // MXdlPerWave
+        4,           // NXdlPerWave
+        4,           // Gemm1NXdlPerWave
+        S<4, 64, 1>, // ABlockTransfer
+        S<1, 0, 2>,
+        S<1, 0, 2>,
+        2,
+        8,
+        8,
+        true,
+        S<4, 64, 1>, // BBlockTransfer
+        S<1, 0, 2>,
+        S<1, 0, 2>,
+        2,
+        8,
+        8,
+        true,
+        S<4, 64, 1>, // B1BlockTransfer
+        S<1, 0, 2>,
+        S<1, 0, 2>,
+        2,
+        4,
+        4,
+        true,
+        1,              // CShuffleMXdlPerWavePerShuffle
+        2,              // CShuffleNXdlPerWavePerShuffle
+        S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+        8>;             // CShuffleBlockTransferScalarPerVector_NPerBlock
+
+#include "run_grouped_conv_conv_fwd_example.inc"
+
+int main(int argc, char* argv[]) { return run_grouped_conv_conv_fwd_example(argc, argv) ? 0 : 1; }
--- a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp32.cpp
+++ b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp32.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+#include <iostream>
+#include <numeric>
+#include <type_traits>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/utility/algorithm.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/convolution_parameter.hpp"
+#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
+
+using In0DataType       = float;
+using Wei0DataType      = float;
+using Acc0DataType      = float;
+using Wei1DataType      = float;
+using Acc1DataType      = float;
+using C1ShuffleDataType = float;
+using Out1DataType      = float;
+
+// This is used for reference code
+using Out0DataType = float;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using In0ElementOp  = ck::tensor_operation::element_wise::PassThrough;
+using Wei0ElementOp = ck::tensor_operation::element_wise::PassThrough;
+using Wei1ElementOp = ck::tensor_operation::element_wise::PassThrough;
+using Out0ElementOp = ck::tensor_operation::element_wise::PassThrough;
+using Out1ElementOp = ck::tensor_operation::element_wise::UnaryConvert;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+using DeviceBatchedGemmGemmInstance =
+    ck::tensor_operation::device::DeviceBatchedGemmGemm_Xdl_CShuffle<
+        Row,               // ALayout
+        Col,               // B0Layout
+        Col,               // B1Layout
+        Row,               // CLayout
+        In0DataType,       // ADataType,
+        Wei0DataType,      // B0DataType,
+        Wei1DataType,      // B1DataType,
+        Out1DataType,      // CDataType,
+        Acc0DataType,      // AccDataType,
+        C1ShuffleDataType, // CShuffleDataType,
+        In0ElementOp,      // AElementOp,
+        Wei0ElementOp,     // B0ElementOp,
+        Out0ElementOp,     // Acc0ElementOp,
+        Wei1ElementOp,     // B1ElementOp,
+        Out1ElementOp,     // CElementOp,
+        GemmDefault,
+        1,
+        256,
+        128,         // MPerBlock
+        128,         // NPerBlock
+        16,          // KPerBlock
+        128,         // Gemm1NPerBlock
+        16,          // Gemm1KPerBlock
+        4,           // AK1
+        4,           // BK1
+        2,           // B1K1
+        32,          // MPerXDL
+        32,          // NPerXDL
+        1,           // MXdlPerWave
+        4,           // NXdlPerWave
+        4,           // Gemm1NXdlPerWave
+        S<4, 64, 1>, // ABlockTransfer
+        S<1, 0, 2>,
+        S<1, 0, 2>,
+        2,
+        4,
+        4,
+        true,
+        S<4, 64, 1>, // BBlockTransfer
+        S<1, 0, 2>,
+        S<1, 0, 2>,
+        2,
+        4,
+        4,
+        true,
+        S<4, 64, 1>, // B1BlockTransfer
+        S<1, 0, 2>,
+        S<1, 0, 2>,
+        2,
+        2,
+        2,
+        true,
+        1,               // CShuffleMXdlPerWavePerShuffle
+        2,               // CShuffleNXdlPerWavePerShuffle
+        S<1, 16, 1, 16>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+        4>;              // CShuffleBlockTransferScalarPerVector_NPerBlock
+
+#include "run_grouped_conv_conv_fwd_example.inc"
+
+int main(int argc, char* argv[]) { return run_grouped_conv_conv_fwd_example(argc, argv) ? 0 : 1; }
--- a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_int4.cpp
+++ b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_int4.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#ifndef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
+#error Should compile this file with ck::int4_t support
+#endif
+
+#include <cstdlib>
+#include <iostream>
+#include <numeric>
+#include <type_traits>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/utility/algorithm.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/convolution_parameter.hpp"
+#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
+
+using In0DataType        = ck::int4_t;
+using Wei0DataType       = ck::int4_t;
+using KernelIn0DataType  = int8_t;
+using KernelWei0DataType = int8_t;
+using Acc0DataType       = int32_t;
+using Wei1DataType       = ck::int4_t;
+using KernelWei1DataType = int8_t;
+using Acc1DataType       = int32_t;
+using C1ShuffleDataType  = int32_t;
+using Out1DataType       = ck::int4_t;
+using KernelOut1DataType = int8_t;
+
+// This is used for reference code
+using Out0DataType = ck::int4_t;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using In0ElementOp  = ck::tensor_operation::element_wise::PassThrough;
+using Wei0ElementOp = ck::tensor_operation::element_wise::PassThrough;
+using Wei1ElementOp = ck::tensor_operation::element_wise::PassThrough;
+using Out0ElementOp = ck::tensor_operation::element_wise::PassThrough;
+using Out1ElementOp = ck::tensor_operation::element_wise::UnaryConvert;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+using DeviceBatchedGemmGemmInstance =
+    ck::tensor_operation::device::DeviceBatchedGemmGemm_Xdl_CShuffle<
+        Row,                // ALayout
+        Col,                // B0Layout
+        Col,                // B1Layout
+        Row,                // CLayout
+        KernelIn0DataType,  // ADataType,
+        KernelWei0DataType, // B0DataType,
+        KernelWei1DataType, // B1DataType,
+        KernelOut1DataType, // CDataType,
+        Acc0DataType,       // AccDataType,
+        C1ShuffleDataType,  // CShuffleDataType,
+        In0ElementOp,       // AElementOp,
+        Wei0ElementOp,      // B0ElementOp,
+        Out0ElementOp,      // Acc0ElementOp,
+        Wei1ElementOp,      // B1ElementOp,
+        Out1ElementOp,      // CElementOp,
+        GemmDefault,
+        1,
+        256,
+        128,         // MPerBlock
+        128,         // NPerBlock
+        64,          // KPerBlock
+        128,         // Gemm1NPerBlock
+        64,          // Gemm1KPerBlock
+        16,          // AK1
+        16,          // BK1
+        4,           // B1K1
+        32,          // MPerXDL
+        32,          // NPerXDL
+        1,           // MXdlPerWave
+        4,           // NXdlPerWave
+        4,           // Gemm1NXdlPerWave
+        S<4, 64, 1>, // ABlockTransfer
+        S<1, 0, 2>,
+        S<1, 0, 2>,
+        2,
+        16,
+        16,
+        true,
+        S<4, 64, 1>, // BBlockTransfer
+        S<1, 0, 2>,
+        S<1, 0, 2>,
+        2,
+        16,
+        16,
+        true,
+        S<4, 64, 1>, // B1BlockTransfer
+        S<1, 0, 2>,
+        S<1, 0, 2>,
+        2,
+        4,
+        4,
+        true,
+        1,              // CShuffleMXdlPerWavePerShuffle
+        2,              // CShuffleNXdlPerWavePerShuffle
+        S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+        8>;             // CShuffleBlockTransferScalarPerVector_NPerBlock
+
+#define BUILD_INT4_EXAMPLE
+#include "run_grouped_conv_conv_fwd_example.inc"
+
+#if defined(BUILD_INT4_EXAMPLE) && defined(CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4)
+static_assert(sizeof(ck::int4_t) == sizeof(int8_t));
+#endif
+
+int main(int argc, char* argv[]) { return run_grouped_conv_conv_fwd_example(argc, argv) ? 0 : 1; }
--- a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_int8.cpp
+++ b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_int8.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+#include <iostream>
+#include <numeric>
+#include <type_traits>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/utility/algorithm.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/convolution_parameter.hpp"
+#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
+
+using In0DataType       = int8_t;
+using Wei0DataType      = int8_t;
+using Acc0DataType      = int32_t;
+using Wei1DataType      = int8_t;
+using Acc1DataType      = int32_t;
+using C1ShuffleDataType = int32_t;
+using Out1DataType      = int8_t;
+
+// This is used for reference code
+using Out0DataType = int8_t;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using In0ElementOp  = ck::tensor_operation::element_wise::PassThrough;
+using Wei0ElementOp = ck::tensor_operation::element_wise::PassThrough;
+using Wei1ElementOp = ck::tensor_operation::element_wise::PassThrough;
+using Out0ElementOp = ck::tensor_operation::element_wise::PassThrough;
+using Out1ElementOp = ck::tensor_operation::element_wise::UnaryConvert;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+using DeviceBatchedGemmGemmInstance =
+    ck::tensor_operation::device::DeviceBatchedGemmGemm_Xdl_CShuffle<
+        Row,               // ALayout
+        Col,               // B0Layout
+        Col,               // B1Layout
+        Row,               // CLayout
+        In0DataType,       // ADataType,
+        Wei0DataType,      // B0DataType,
+        Wei1DataType,      // B1DataType,
+        Out1DataType,      // CDataType,
+        Acc0DataType,      // AccDataType,
+        C1ShuffleDataType, // CShuffleDataType,
+        In0ElementOp,      // AElementOp,
+        Wei0ElementOp,     // B0ElementOp,
+        Out0ElementOp,     // Acc0ElementOp,
+        Wei1ElementOp,     // B1ElementOp,
+        Out1ElementOp,     // CElementOp,
+        GemmDefault,
+        1,
+        256,
+        128,         // MPerBlock
+        128,         // NPerBlock
+        64,          // KPerBlock
+        128,         // Gemm1NPerBlock
+        64,          // Gemm1KPerBlock
+        16,          // AK1
+        16,          // BK1
+        4,           // B1K1
+        32,          // MPerXDL
+        32,          // NPerXDL
+        1,           // MXdlPerWave
+        4,           // NXdlPerWave
+        4,           // Gemm1NXdlPerWave
+        S<4, 64, 1>, // ABlockTransfer
+        S<1, 0, 2>,
+        S<1, 0, 2>,
+        2,
+        16,
+        16,
+        true,
+        S<4, 64, 1>, // BBlockTransfer
+        S<1, 0, 2>,
+        S<1, 0, 2>,
+        2,
+        16,
+        16,
+        true,
+        S<4, 64, 1>, // B1BlockTransfer
+        S<1, 0, 2>,
+        S<1, 0, 2>,
+        2,
+        4,
+        4,
+        true,
+        1,              // CShuffleMXdlPerWavePerShuffle
+        2,              // CShuffleNXdlPerWavePerShuffle
+        S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+        8>;             // CShuffleBlockTransferScalarPerVector_NPerBlock
+
+#include "run_grouped_conv_conv_fwd_example.inc"
+
+int main(int argc, char* argv[]) { return run_grouped_conv_conv_fwd_example(argc, argv) ? 0 : 1; }
--- a/example/41_grouped_conv_conv_fwd/run_grouped_conv_conv_fwd_example.inc
+++ b/example/41_grouped_conv_conv_fwd/run_grouped_conv_conv_fwd_example.inc
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+template <ck::index_t NDimSpatial,
+          typename In0DataType,
+          typename Wei0DataType,
+          typename Out0DataType,
+          typename Wei1DataType,
+          typename Out1DataType,
+          typename In0ElementOp,
+          typename Wei0ElementOp,
+          typename Out0ElementOp,
+          typename Wei1ElementOp,
+          typename Out1ElementOp,
+          typename DeviceOpInstance>
+bool run_grouped_conv_conv_fwd(bool do_verification,
+                               int init_method,
+                               bool time_kernel,
+                               const ck::utils::conv::ConvParam& conv0_param,
+                               const ck::utils::conv::ConvParam& conv1_param,
+                               const HostTensorDescriptor& in0_g_n_c_wis_desc,
+                               const HostTensorDescriptor& wei0_g_k_c_xs_desc,
+                               const HostTensorDescriptor& out0_g_n_k_wos_desc,
+                               const HostTensorDescriptor& wei1_g_k_c_xs_desc,
+                               const HostTensorDescriptor& out1_g_n_k_wos_desc,
+                               const In0ElementOp& in0_element_op,
+                               const Wei0ElementOp& wei0_element_op,
+                               const Wei1ElementOp& wei1_element_op,
+                               const Out0ElementOp& out0_element_op,
+                               const Out1ElementOp& out1_element_op)
+{
+    Tensor<In0DataType> in0(in0_g_n_c_wis_desc);
+    Tensor<Wei0DataType> wei0(wei0_g_k_c_xs_desc);
+    Tensor<Wei1DataType> wei1(wei1_g_k_c_xs_desc);
+    Tensor<Out1DataType> out1_host(out1_g_n_k_wos_desc);
+    Tensor<Out1DataType> out1_device(out1_g_n_k_wos_desc);
+
+    std::cout << "in0: " << in0.mDesc << std::endl;
+    std::cout << "wei0: " << wei0.mDesc << std::endl;
+    std::cout << "wei1: " << wei1.mDesc << std::endl;
+    std::cout << "out1: " << out1_host.mDesc << std::endl;
+
+    switch(init_method)
+    {
+    case 0: break;
+    case 1:
+        in0.GenerateTensorValue(GeneratorTensor_2<In0DataType>{-5, 5});
+        wei0.GenerateTensorValue(GeneratorTensor_2<Wei0DataType>{-5, 5});
+        wei1.GenerateTensorValue(GeneratorTensor_2<Wei1DataType>{-5, 5});
+        break;
+    default:
+        in0.GenerateTensorValue(GeneratorTensor_3<In0DataType>{0.0, 1.0});
+        wei0.GenerateTensorValue(GeneratorTensor_3<Wei0DataType>{-0.5, 0.5});
+        wei1.GenerateTensorValue(GeneratorTensor_3<Wei1DataType>{-0.5, 0.5});
+    }
+
+#ifdef BUILD_INT4_EXAMPLE
+    DeviceMem in0_device_buf(sizeof(KernelIn0DataType) * in0.mDesc.GetElementSpaceSize());
+    DeviceMem wei0_device_buf(sizeof(KernelWei0DataType) * wei0.mDesc.GetElementSpaceSize());
+    DeviceMem wei1_device_buf(sizeof(KernelWei1DataType) * wei1.mDesc.GetElementSpaceSize());
+    DeviceMem out1_device_buf(sizeof(KernelOut1DataType) * out1_device.mDesc.GetElementSpaceSize());
+
+    const Tensor<KernelIn0DataType> in0_converted(in0);
+    const Tensor<KernelWei0DataType> wei0_converted(wei0);
+    const Tensor<KernelWei1DataType> wei1_converted(wei1);
+
+    in0_device_buf.ToDevice(in0_converted.mData.data());
+    wei0_device_buf.ToDevice(wei0_converted.mData.data());
+    wei1_device_buf.ToDevice(wei1_converted.mData.data());
+#else
+    DeviceMem in0_device_buf(sizeof(In0DataType) * in0.mDesc.GetElementSpaceSize());
+    DeviceMem wei0_device_buf(sizeof(Wei0DataType) * wei0.mDesc.GetElementSpaceSize());
+    DeviceMem wei1_device_buf(sizeof(Wei1DataType) * wei1.mDesc.GetElementSpaceSize());
+    DeviceMem out1_device_buf(sizeof(Out1DataType) * out1_device.mDesc.GetElementSpaceSize());
+
+    in0_device_buf.ToDevice(in0.mData.data());
+    wei0_device_buf.ToDevice(wei0.mData.data());
+    wei1_device_buf.ToDevice(wei1.mData.data());
+#endif
+
+    std::array<ck::index_t, NDimSpatial + 3> a0_g_n_c_wis_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> a0_g_n_c_wis_strides{};
+    std::array<ck::index_t, NDimSpatial + 3> b0_g_k_c_xs_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> b0_g_k_c_xs_strides{};
+    std::array<ck::index_t, NDimSpatial + 3> b1_g_k_c_xs_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> b1_g_k_c_xs_strides{};
+    std::array<ck::index_t, NDimSpatial + 3> e1_g_n_k_wos_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> e1_g_n_k_wos_strides{};
+    std::array<ck::index_t, NDimSpatial> conv0_filter_strides{};
+    std::array<ck::index_t, NDimSpatial> conv0_filter_dilations{};
+    std::array<ck::index_t, NDimSpatial> input0_left_pads{};
+    std::array<ck::index_t, NDimSpatial> input0_right_pads{};
+    std::array<ck::index_t, NDimSpatial> conv1_filter_strides{};
+    std::array<ck::index_t, NDimSpatial> conv1_filter_dilations{};
+    std::array<ck::index_t, NDimSpatial> input1_left_pads{};
+    std::array<ck::index_t, NDimSpatial> input1_right_pads{};
+
+    auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); };
+
+    copy(in0_g_n_c_wis_desc.GetLengths(), a0_g_n_c_wis_lengths);
+    copy(in0_g_n_c_wis_desc.GetStrides(), a0_g_n_c_wis_strides);
+    copy(wei0_g_k_c_xs_desc.GetLengths(), b0_g_k_c_xs_lengths);
+    copy(wei0_g_k_c_xs_desc.GetStrides(), b0_g_k_c_xs_strides);
+    copy(wei1_g_k_c_xs_desc.GetLengths(), b1_g_k_c_xs_lengths);
+    copy(wei1_g_k_c_xs_desc.GetStrides(), b1_g_k_c_xs_strides);
+    copy(out1_g_n_k_wos_desc.GetLengths(), e1_g_n_k_wos_lengths);
+    copy(out1_g_n_k_wos_desc.GetStrides(), e1_g_n_k_wos_strides);
+    copy(conv0_param.conv_filter_strides_, conv0_filter_strides);
+    copy(conv0_param.conv_filter_dilations_, conv0_filter_dilations);
+    copy(conv0_param.input_left_pads_, input0_left_pads);
+    copy(conv0_param.input_right_pads_, input0_right_pads);
+    copy(conv1_param.conv_filter_strides_, conv1_filter_strides);
+    copy(conv1_param.conv_filter_dilations_, conv1_filter_dilations);
+    copy(conv1_param.input_left_pads_, input1_left_pads);
+    copy(conv1_param.input_right_pads_, input1_right_pads);
+
+    // do Conv using GEMM, only works for 1x1 conv for now
+    const ck::index_t gemm_batch = a0_g_n_c_wis_lengths[0];
+
+    const ck::index_t gemm0_m_length =
+        e1_g_n_k_wos_lengths[1] *
+        ck::accumulate_n<ck::index_t>(
+            e1_g_n_k_wos_lengths.begin() + 3, NDimSpatial, 1, std::multiplies<>{});
+
+    const ck::index_t gemm0_n_length = b0_g_k_c_xs_lengths[1];
+
+    const ck::index_t gemm0_k_length = ck::accumulate_n<ck::index_t>(
+        b0_g_k_c_xs_lengths.begin() + 2, NDimSpatial + 1, 1, std::multiplies<>{});
+
+    const ck::index_t gemm1_n_length = b1_g_k_c_xs_lengths[1];
+
+    //
+    const ck::index_t a0_stride = a0_g_n_c_wis_strides[2 + NDimSpatial];
+    const ck::index_t b0_stride = b0_g_k_c_xs_strides[2 + NDimSpatial];
+    const ck::index_t b1_stride = b1_g_k_c_xs_strides[2 + NDimSpatial];
+    const ck::index_t e1_stride = e1_g_n_k_wos_strides[2 + NDimSpatial];
+
+    //
+    const ck::index_t a0_batch_stride = a0_g_n_c_wis_strides[0];
+    const ck::index_t b0_batch_stride = b0_g_k_c_xs_strides[0];
+    const ck::index_t b1_batch_stride = b1_g_k_c_xs_strides[0];
+    const ck::index_t e1_batch_stride = e1_g_n_k_wos_strides[0];
+
+    auto device_op = DeviceOpInstance{};
+    auto invoker   = device_op.MakeInvoker();
+    auto argument  = device_op.MakeArgument(
+#ifdef BUILD_INT4_EXAMPLE
+        static_cast<KernelIn0DataType*>(in0_device_buf.GetDeviceBuffer()),
+        static_cast<KernelWei0DataType*>(wei0_device_buf.GetDeviceBuffer()),
+        static_cast<KernelWei1DataType*>(wei1_device_buf.GetDeviceBuffer()),
+        static_cast<KernelOut1DataType*>(out1_device_buf.GetDeviceBuffer()),
+#else
+        static_cast<In0DataType*>(in0_device_buf.GetDeviceBuffer()),
+        static_cast<Wei0DataType*>(wei0_device_buf.GetDeviceBuffer()),
+        static_cast<Wei1DataType*>(wei1_device_buf.GetDeviceBuffer()),
+        static_cast<Out1DataType*>(out1_device_buf.GetDeviceBuffer()),
+#endif
+        gemm0_m_length,
+        gemm0_n_length,
+        gemm0_k_length,
+        gemm1_n_length,
+        gemm_batch,
+        a0_stride,
+        b0_stride,
+        b1_stride,
+        e1_stride,
+        a0_batch_stride,
+        b0_batch_stride,
+        b1_batch_stride,
+        e1_batch_stride,
+        in0_element_op,
+        wei0_element_op,
+        out0_element_op,
+        wei1_element_op,
+        out1_element_op);
+
+    if(!device_op.IsSupportedArgument(argument))
+    {
+        throw std::runtime_error(
+            "wrong! device_conv with the specified compilation parameters does "
+            "not support this Conv problem");
+    }
+
+    float avg_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+
+    std::size_t flop      = conv0_param.GetFlops() + conv1_param.GetFlops();
+    std::size_t num_btype = conv0_param.template GetInputByte<In0DataType>() +
+                            conv0_param.template GetWeightByte<Wei0DataType>() +
+                            conv1_param.template GetWeightByte<Wei1DataType>() +
+                            conv1_param.template GetOutputByte<Out1DataType>();
+
+    float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
+    float gb_per_sec = num_btype / 1.E6 / avg_time;
+    std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << device_op.GetTypeString() << std::endl;
+
+    if(do_verification)
+    {
+        using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+        Tensor<Out0DataType> out0_host(out0_g_n_k_wos_desc);
+
+        auto ref_conv0 = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
+                                                                      In0DataType,
+                                                                      Wei0DataType,
+                                                                      Out0DataType,
+                                                                      In0ElementOp,
+                                                                      Wei0ElementOp,
+                                                                      Out0ElementOp>();
+
+        auto ref_conv1 = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
+                                                                      Out0DataType,
+                                                                      Wei1DataType,
+                                                                      Out1DataType,
+                                                                      PassThrough,
+                                                                      Wei1ElementOp,
+                                                                      Out1ElementOp>();
+
+        auto ref_conv0_invoker = ref_conv0.MakeInvoker();
+        auto ref_conv1_invoker = ref_conv1.MakeInvoker();
+
+        auto ref_conv0_argument = ref_conv0.MakeArgument(in0,
+                                                         wei0,
+                                                         out0_host,
+                                                         conv0_param.conv_filter_strides_,
+                                                         conv0_param.conv_filter_dilations_,
+                                                         conv0_param.input_left_pads_,
+                                                         conv0_param.input_right_pads_,
+                                                         in0_element_op,
+                                                         wei0_element_op,
+                                                         out0_element_op);
+
+        auto ref_conv1_argument = ref_conv1.MakeArgument(out0_host,
+                                                         wei1,
+                                                         out1_host,
+                                                         conv1_param.conv_filter_strides_,
+                                                         conv1_param.conv_filter_dilations_,
+                                                         conv1_param.input_left_pads_,
+                                                         conv1_param.input_right_pads_,
+                                                         out0_element_op,
+                                                         wei1_element_op,
+                                                         out1_element_op);
+
+        ref_conv0_invoker.Run(ref_conv0_argument);
+        ref_conv1_invoker.Run(ref_conv1_argument);
+
+#ifdef BUILD_INT4_EXAMPLE
+        Tensor<KernelOut1DataType> out1_device_converted(out1_host.mDesc);
+
+        out1_device_buf.FromDevice(out1_device_converted.mData.data());
+
+        out1_device = out1_device_converted.CopyAsType<Out1DataType>();
+#else
+        out1_device_buf.FromDevice(out1_device.mData.data());
+#endif
+
+        return ck::utils::check_err(
+            out1_device, out1_host, "Error: incorrect results!", 1e-5f, 1e-4f);
+    }
+
+    return true;
+}
+
+bool run_grouped_conv_conv_fwd_example(int argc, char* argv[])
+{
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
+
+    ck::utils::conv::ConvParam conv0_param{
+        2, 1, 128, 512, 128, {1, 1}, {28, 28}, {1, 1}, {1, 1}, {0, 0}, {0, 0}};
+
+    ck::utils::conv::ConvParam conv1_param{
+        2, 1, 128, 128, 512, {1, 1}, {28, 28}, {1, 1}, {1, 1}, {0, 0}, {0, 0}};
+
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 4)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+        printf("arg3: time kernel (0=no, 1=yes)\n");
+        exit(0);
+    }
+
+    const auto in0_element_op  = In0ElementOp{};
+    const auto wei0_element_op = Wei0ElementOp{};
+    const auto wei1_element_op = Wei1ElementOp{};
+    const auto out0_element_op = Out0ElementOp{};
+    const auto out1_element_op = Out1ElementOp{};
+
+    const auto run = [&](auto ndim_spatial,
+                         auto in0_layout,
+                         auto wei0_layout,
+                         auto wei1_layout,
+                         auto out1_layout) {
+        constexpr ck::index_t ndim_spatial_value = ndim_spatial.value;
+
+        using In0Layout  = decltype(in0_layout);
+        using Wei0Layout = decltype(wei0_layout);
+        using Wei1Layout = decltype(wei1_layout);
+        using Out1Layout = decltype(out1_layout);
+
+        const auto in0_g_n_c_wis_desc =
+            ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<In0Layout>(
+                conv0_param);
+
+        const auto wei0_g_k_c_xs_desc =
+            ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<Wei0Layout>(
+                conv0_param);
+
+        // out0 doesn't physical exist, any layout for host verification is OK
+        const auto out0_g_n_k_wos_desc =
+            ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<Out1Layout>(
+                conv0_param);
+
+        const auto wei1_g_k_c_xs_desc =
+            ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<Wei1Layout>(
+                conv1_param);
+
+        const auto out1_g_n_k_wos_desc =
+            ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<Out1Layout>(
+                conv1_param);
+
+        return run_grouped_conv_conv_fwd<ndim_spatial_value,
+                                         In0DataType,
+                                         Wei0DataType,
+                                         Out0DataType,
+                                         Wei1DataType,
+                                         Out1DataType,
+                                         In0ElementOp,
+                                         Wei0ElementOp,
+                                         Out0ElementOp,
+                                         Wei1ElementOp,
+                                         Out1ElementOp,
+                                         DeviceBatchedGemmGemmInstance>(do_verification,
+                                                                        init_method,
+                                                                        time_kernel,
+                                                                        conv0_param,
+                                                                        conv1_param,
+                                                                        in0_g_n_c_wis_desc,
+                                                                        wei0_g_k_c_xs_desc,
+                                                                        out0_g_n_k_wos_desc,
+                                                                        wei1_g_k_c_xs_desc,
+                                                                        out1_g_n_k_wos_desc,
+                                                                        in0_element_op,
+                                                                        wei0_element_op,
+                                                                        wei1_element_op,
+                                                                        out0_element_op,
+                                                                        out1_element_op);
+    };
+
+    namespace ctc = ck::tensor_layout::convolution;
+
+    if(conv0_param.num_dim_spatial_ == 1)
+    {
+        return run(ck::Number<1>{}, ctc::GNWC{}, ctc::GKXC{}, ctc::GKXC{}, ctc::GNWK{});
+    }
+    else if(conv0_param.num_dim_spatial_ == 2)
+    {
+        return run(ck::Number<2>{}, ctc::GNHWC{}, ctc::GKYXC{}, ctc::GKYXC{}, ctc::GNHWK{});
+    }
+    else if(conv0_param.num_dim_spatial_ == 3)
+    {
+        return run(ck::Number<3>{}, ctc::GNDHWC{}, ctc::GKZYXC{}, ctc::GKZYXC{}, ctc::GNDHWK{});
+    }
+
+    return true;
+}
--- a/example/42_groupnorm/CMakeLists.txt
+++ b/example/42_groupnorm/CMakeLists.txt
+add_example_executable(example_groupnorm_sigmoid_fp16 groupnorm_sigmoid_fp16.cpp)
--- a/example/42_groupnorm/groupnorm_sigmoid_fp16.cpp
+++ b/example/42_groupnorm/groupnorm_sigmoid_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+#include <getopt.h>
+
+#include "ck/ck.hpp"
+#include "ck/utility/reduction_enums.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp"
+#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
+
+#include "ck/library/utility/fill.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_common_util.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_groupnorm.hpp"
+
+constexpr int Rank         = 5;
+constexpr int NumReduceDim = 3;
+
+using XDataType     = ck::half_t;
+using GammaDataType = ck::half_t;
+using BetaDataType  = ck::half_t;
+using YDataType     = ck::half_t;
+using AccDataType   = float;
+
+struct YElementOp
+{
+    template <typename T>
+    __host__ __device__ void operator()(T& y, const T& x) const
+    {
+        static_assert(ck::is_same<T, float>::value || ck::is_same<T, double>::value ||
+                          ck::is_same<T, ck::half_t>::value,
+                      "Data type is not supported by this operation!");
+
+        T a;
+
+        ck::tensor_operation::element_wise::Sigmoid{}(a, x);
+
+        y = x * a;
+    };
+};
+
+using DeviceInstance =
+    ck::tensor_operation::device::DeviceNormalizationImpl<XDataType,
+                                                          GammaDataType,
+                                                          BetaDataType,
+                                                          AccDataType,
+                                                          YDataType,
+                                                          YElementOp,
+                                                          Rank,
+                                                          NumReduceDim,
+                                                          1024, // BlockSize
+                                                          1,    // ClusterM
+                                                          1024, // ClusterK
+                                                          1,    // SliceM
+                                                          32,   // SliceK
+                                                          1,    // SrcVecDim (0=M, 1=K)
+                                                          2,    // SrcScalarPerVector
+                                                          1,    // GammaVecDim (0=M, 1=K)
+                                                          2,    // GammaScalarPerVector
+                                                          1,    // BetaVecDim (0=M, 1=K)
+                                                          2,    // BetaScalarPerVector
+                                                          2>;   // OutScalarPerVector
+
+int main(int argc, char* argv[])
+{
+    ck::index_t N = 2;
+    ck::index_t H = 32;
+    ck::index_t W = 32;
+    ck::index_t G = 32;
+    ck::index_t C = 30;
+
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 6)
+    {
+        N = std::stoi(argv[1]);
+        H = std::stoi(argv[2]);
+        W = std::stoi(argv[3]);
+        G = std::stoi(argv[4]);
+        C = std::stoi(argv[5]);
+    }
+    else
+    {
+        std::cerr << "arg1 to 5: N, H, W, G, C" << std::endl;
+
+        return 1;
+    }
+
+    Tensor<XDataType> x({N, H, W, G, C});
+    Tensor<YDataType> y({N, H, W, G, C});
+    Tensor<GammaDataType> gamma({G, C});
+    Tensor<BetaDataType> beta({G, C});
+
+    ck::utils::FillUniformDistribution<XDataType>{0.f, 1.f}(x);
+    ck::utils::FillUniformDistribution<GammaDataType>{0.f, 1.f}(gamma);
+    ck::utils::FillUniformDistribution<BetaDataType>{0.f, 1.f}(beta);
+
+    DeviceMem x_dev(sizeof(XDataType) * x.mDesc.GetElementSpaceSize());
+    DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpaceSize());
+    DeviceMem beta_dev(sizeof(BetaDataType) * beta.mDesc.GetElementSpaceSize());
+    DeviceMem y_dev(sizeof(YDataType) * y.mDesc.GetElementSpaceSize());
+
+    x_dev.ToDevice(x.mData.data());
+    gamma_dev.ToDevice(gamma.mData.data());
+    beta_dev.ToDevice(beta.mData.data());
+
+    const auto y_element_op = YElementOp{};
+
+    auto device_instance = DeviceInstance{};
+    auto argument_ptr    = device_instance.MakeArgumentPointer(
+        {N, H, W, G, C},
+        std::vector<ck::index_t>{x.mDesc.GetStrides().begin(), x.mDesc.GetStrides().end()},
+        {0, 0, 0, C, 1},
+        {0, 0, 0, C, 1},
+        std::vector<ck::index_t>{y.mDesc.GetStrides().begin(), y.mDesc.GetStrides().end()},
+        {1, 2, 4}, // reduction dimension: [H, W, C]
+        1e-6,
+        x_dev.GetDeviceBuffer(),
+        gamma_dev.GetDeviceBuffer(),
+        beta_dev.GetDeviceBuffer(),
+        y_dev.GetDeviceBuffer(),
+        nullptr,
+        nullptr,
+        y_element_op);
+
+    if(!device_instance.IsSupportedArgument(argument_ptr.get()))
+    {
+        std::cout << "The runtime parameters are not supported" << std::endl;
+        return 1;
+    };
+
+    auto invoker_ptr = device_instance.MakeInvokerPointer();
+    float ave_time   = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true, true});
+
+    std::size_t num_btype = sizeof(XDataType) * N * H * W * G * C +
+                            sizeof(YDataType) * N * H * W * G * C + sizeof(GammaDataType) * G * C +
+                            sizeof(BetaDataType) * G * C;
+
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+    std::cout << "Perf: " << ave_time << " ms, " << gb_per_sec << " GB/s, "
+              << device_instance.GetTypeString() << std::endl;
+
+    bool pass = true;
+    {
+        Tensor<YDataType> host_y({N, H, W, G, C});
+        using ReferenceInstance = ck::tensor_operation::host::ReferenceGroupnorm<XDataType,
+                                                                                 GammaDataType,
+                                                                                 BetaDataType,
+                                                                                 YDataType,
+                                                                                 AccDataType,
+                                                                                 YElementOp>;
+
+        ReferenceInstance ref;
+        auto ref_argument =
+            ref.MakeArgument(x, gamma, beta, host_y, y_element_op, {N, H, W, G, C}, 1e-6);
+        auto ref_invoker = ref.MakeInvoker();
+        ref_invoker.Run(ref_argument);
+
+        y_dev.FromDevice(y.mData.data());
+        pass &= ck::utils::check_err(y, host_y, "Error: Incorrect results", 1e-3, 1e-3);
+    }
+
+    return (pass ? 0 : 1);
+}
--- a/example/43_splitk_gemm_bias_e_permute/CMakeLists.txt
+++ b/example/43_splitk_gemm_bias_e_permute/CMakeLists.txt
+add_example_executable(example_splitk_gemm_bias_e_permute_xdl_fp16 splitk_gemm_bias_e_permute_xdl_fp16.cpp)
+add_example_executable(example_splitk_gemm_bias_e_permute_xdl_fp32 splitk_gemm_bias_e_permute_xdl_fp32.cpp)
--- a/example/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp16.cpp
+++ b/example/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/device_splitk_contraction_multiple_d_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using Add         = ck::tensor_operation::element_wise::Add;
+
+using ADataType        = F16;
+using BDataType        = F16;
+using AccDataType      = F32;
+using CShuffleDataType = F16;
+using DDataType        = F16;
+using DsDataType       = ck::Tuple<DDataType>;
+using EDataType        = F16;
+
+static constexpr ck::index_t NumDimG = 2;
+static constexpr ck::index_t NumDimM = 2;
+static constexpr ck::index_t NumDimN = 2;
+static constexpr ck::index_t NumDimK = 1;
+
+using AElementOp   = ck::tensor_operation::element_wise::PassThrough;
+using BElementOp   = ck::tensor_operation::element_wise::PassThrough;
+using CDEElementOp = ck::tensor_operation::element_wise::Add;
+
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::Default;
+
+static constexpr auto ABSpec = ck::tensor_operation::device::TensorSpecialization::Packed;
+static constexpr auto DESpec = ck::tensor_operation::device::TensorSpecialization::Default;
+
+// clang-format off
+using DeviceOpInstanceKKNN = ck::tensor_operation::device::
+        //############################################| NumDimG| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle|     DsData| EData|            A|           B|          CDE|           Gemm|              A|              B|             DE| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|CBlockTransferClusterLengths|   CBlockTransfer|
+        //############################################|        |        |        |        |  Type|  Type|    Type| DataType|       Type|  Type|  Elementwise| Elementwise|  Elementwise| Spacialization| Spacialization| Spacialization| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|        _MBlock_MWaveMPerXdl|  ScalarPerVector|
+        //############################################|        |        |        |        |      |      |        |         |           |      |    Operation|   Operation|    Operation|               |               |               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|        _NBlock_NWaveNPerXdl|    _NWaveNPerXdl|
+        //############################################|        |        |        |        |      |      |        |         |           |      |             |            |             |               |               |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                            |                 |
+        DeviceSplitKContractionMultipleD_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK,   F16,   F16,     F32,      F16, DsDataType,   F16,   AElementOp,  BElementOp, CDEElementOp,       GemmSpec,         ABSpec,         ABSpec,         DESpec,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,              S<1, 32, 1, 4>,               8>;
+// clang-format on
+
+using DeviceOpInstance = DeviceOpInstanceKKNN;
+
+// hardcoded for NumDimM == NumDimN == NumDimK == 2
+template <ck::index_t NumDimG,
+          ck::index_t NumDimM,
+          ck::index_t NumDimN,
+          ck::index_t NumDimK,
+          typename ADataType,
+          typename BDataType,
+          typename EDataType,
+          typename AccDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CDEElementwiseOperation,
+          ck::enable_if_t<NumDimG == 2 && NumDimM == 2 && NumDimN == 2 && NumDimK == 1, bool> =
+              false>
+struct ReferenceContraction_G2_M2_N2_K1 : public ck::tensor_operation::device::BaseOperator
+{
+    // Argument
+    struct Argument : public ck::tensor_operation::device::BaseArgument
+    {
+        Argument(const Tensor<ADataType>& a_gs_ms_ks,
+                 const Tensor<BDataType>& b_gs_ns_ks,
+                 Tensor<EDataType>& e_gs_ms_ns,
+                 AElementwiseOperation a_element_op,
+                 BElementwiseOperation b_element_op,
+                 CDEElementwiseOperation cde_element_op)
+            : a_gs_ms_ks_{a_gs_ms_ks},
+              b_gs_ns_ks_{b_gs_ns_ks},
+              e_gs_ms_ns_{e_gs_ms_ns},
+              a_element_op_{a_element_op},
+              b_element_op_{b_element_op},
+              cde_element_op_{cde_element_op}
+        {
+        }
+
+        const Tensor<ADataType>& a_gs_ms_ks_;
+        const Tensor<BDataType>& b_gs_ns_ks_;
+        Tensor<EDataType>& e_gs_ms_ns_;
+
+        AElementwiseOperation a_element_op_;
+        BElementwiseOperation b_element_op_;
+        CDEElementwiseOperation cde_element_op_;
+    };
+
+    // Invoker
+    struct Invoker : public ck::tensor_operation::device::BaseInvoker
+    {
+        using Argument = ReferenceContraction_G2_M2_N2_K1::Argument;
+
+        float Run(const Argument& arg)
+        {
+            auto f_ms_ns = [&](auto g0, auto g1, auto m0, auto m1, auto n0, auto n1) {
+                const int K0 = arg.a_gs_ms_ks_.mDesc.GetLengths()[4];
+
+                AccDataType v_acc = 0;
+
+                for(int k0 = 0; k0 < K0; ++k0)
+                {
+                    AccDataType v_a;
+                    AccDataType v_b;
+
+                    arg.a_element_op_(
+                        v_a,
+                        ck::type_convert<const AccDataType>(arg.a_gs_ms_ks_(g0, g1, m0, m1, k0)));
+                    arg.b_element_op_(
+                        v_b,
+                        ck::type_convert<const AccDataType>(arg.b_gs_ns_ks_(g0, g1, n0, n1, k0)));
+
+                    v_acc += v_a * v_b;
+                }
+
+                AccDataType v_c;
+
+                arg.cde_element_op_(v_c, v_acc);
+
+                arg.e_gs_ms_ns_(g0, g1, m0, m1, n0, n1) = v_c;
+            };
+
+            make_ParallelTensorFunctor(f_ms_ns,
+                                       arg.e_gs_ms_ns_.mDesc.GetLengths()[0],
+                                       arg.e_gs_ms_ns_.mDesc.GetLengths()[1],
+                                       arg.e_gs_ms_ns_.mDesc.GetLengths()[2],
+                                       arg.e_gs_ms_ns_.mDesc.GetLengths()[3],
+                                       arg.e_gs_ms_ns_.mDesc.GetLengths()[4],
+                                       arg.e_gs_ms_ns_.mDesc.GetLengths()[5])(
+                std::thread::hardware_concurrency());
+
+            return 0;
+        }
+
+        float Run(const ck::tensor_operation::device::BaseArgument* p_arg,
+                  const StreamConfig& /* stream_config */ = StreamConfig{}) override
+        {
+            return Run(*dynamic_cast<const Argument*>(p_arg));
+        }
+    };
+
+    static constexpr bool IsValidCompilationParameter()
+    {
+        // TODO: properly implement this check
+        return true;
+    }
+
+    bool IsSupportedArgument(const ck::tensor_operation::device::BaseArgument*) override
+    {
+        return true;
+    }
+
+    static auto MakeArgument(const Tensor<ADataType>& a_gs_ms_ks,
+                             const Tensor<BDataType>& b_gs_ns_ks,
+                             Tensor<EDataType>& e_gs_ms_ns,
+                             AElementwiseOperation a_element_op,
+                             BElementwiseOperation b_element_op,
+                             CDEElementwiseOperation cde_element_op)
+    {
+        return Argument{
+            a_gs_ms_ks, b_gs_ns_ks, e_gs_ms_ns, a_element_op, b_element_op, cde_element_op};
+    }
+
+    static auto MakeInvoker() { return Invoker{}; }
+
+    virtual std::unique_ptr<ck::tensor_operation::device::BaseInvoker> MakeInvokerPointer()
+    {
+        return std::make_unique<Invoker>(Invoker{});
+    }
+
+    std::string GetTypeString() const override
+    {
+        auto str = std::stringstream();
+
+        // clang-format off
+        str << "ReferenceContraction_G2_M2_N2_K1"
+            << std::endl;
+        // clang-format on
+
+        return str.str();
+    }
+};
+
+int main(int argc, char* argv[])
+{
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
+    int split_k          = 1;
+
+    ck::index_t G0 = 1;
+    ck::index_t G1 = 2;
+
+    ck::index_t M0 = 4;
+    ck::index_t M1 = 256;
+
+    ck::index_t N0 = 16;
+    ck::index_t N1 = 128;
+
+    ck::index_t K0 = 64 * 2;
+
+    // A[G0, G1, M0, M1, K0]
+    std::vector<ck::index_t> a_gs_ms_ks_lengths{G0, G1, M0, M1, K0};
+    std::vector<ck::index_t> a_gs_ms_ks_strides{G1 * M0 * M1 * K0, M0 * M1 * K0, M1 * K0, K0, 1};
+    // B[G0, G1, N0, N1, K0]
+    std::vector<ck::index_t> b_gs_ns_ks_lengths{G0, G1, N0, N1, K0};
+    std::vector<ck::index_t> b_gs_ns_ks_strides{G1 * N0 * N1 * K0, N0 * N1 * K0, N1 * K0, K0, 1};
+
+    // D[G0, G1, M0, N0, M1, N1]
+    std::vector<ck::index_t> d_gs_ms_ns_lengths{G0, G1, M0, M1, N0, N1};
+    std::vector<ck::index_t> d_gs_ms_ns_strides{G1 * N0 * N1, N0 * N1, 0, 0, N1, 1};
+    // E[G0, G1, M0, N0, M1, N1]
+    std::vector<ck::index_t> e_gs_ms_ns_lengths{G0, G1, M0, M1, N0, N1};
+    std::vector<ck::index_t> e_gs_ms_ns_strides{
+        G1 * M0 * N0 * M1 * N1, M0 * N0 * M1 * N1, N0 * M1 * N1, N1, M1 * N1, 1};
+
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 5)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+        split_k         = std::stoi(argv[4]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+        printf("arg3: time kernel (0=no, 1=yes)\n");
+        exit(0);
+    }
+
+    Tensor<ADataType> a_gs_ms_ks(
+        std::vector<std::size_t>(a_gs_ms_ks_lengths.begin(), a_gs_ms_ks_lengths.end()),
+        std::vector<std::size_t>(a_gs_ms_ks_strides.begin(), a_gs_ms_ks_strides.end()));
+    Tensor<BDataType> b_gs_ns_ks(
+        std::vector<std::size_t>(b_gs_ns_ks_lengths.begin(), b_gs_ns_ks_lengths.end()),
+        std::vector<std::size_t>(b_gs_ns_ks_strides.begin(), b_gs_ns_ks_strides.end()));
+    Tensor<DDataType> d_gs_ms_ns(
+        std::vector<std::size_t>(d_gs_ms_ns_lengths.begin(), d_gs_ms_ns_lengths.end()),
+        std::vector<std::size_t>(d_gs_ms_ns_strides.begin(), d_gs_ms_ns_strides.end()));
+    Tensor<EDataType> e_gs_ms_ns_host_result(
+        std::vector<std::size_t>(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()),
+        std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()));
+    Tensor<EDataType> e_gs_ms_ns_device_result(
+        std::vector<std::size_t>(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()),
+        std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()));
+
+    std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl;
+    std::cout << "b_gs_ns_ks: " << b_gs_ns_ks.mDesc << std::endl;
+    std::cout << "d_gs_ms_ns: " << d_gs_ms_ns.mDesc << std::endl;
+    std::cout << "e_gs_ms_ns: " << e_gs_ms_ns_host_result.mDesc << std::endl;
+
+    switch(init_method)
+    {
+    case 0: break;
+    case 1:
+        a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
+        b_gs_ns_ks.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+        d_gs_ms_ns.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+        break;
+    case 2:
+        a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+        b_gs_ns_ks.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+        d_gs_ms_ns.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+        break;
+    default:
+        a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
+        b_gs_ns_ks.GenerateTensorValue(GeneratorTensor_1<BDataType>{1});
+        d_gs_ms_ns.GenerateTensorValue(GeneratorTensor_1<BDataType>{1});
+        break;
+    }
+
+    DeviceMem a_device_buf(sizeof(ADataType) * a_gs_ms_ks.mDesc.GetElementSpaceSize());
+    DeviceMem b_device_buf(sizeof(BDataType) * b_gs_ns_ks.mDesc.GetElementSpaceSize());
+    DeviceMem d_device_buf(sizeof(DDataType) * d_gs_ms_ns.mDesc.GetElementSpaceSize());
+    DeviceMem e_device_buf(sizeof(EDataType) *
+                           e_gs_ms_ns_device_result.mDesc.GetElementSpaceSize());
+
+    a_device_buf.ToDevice(a_gs_ms_ks.mData.data());
+    b_device_buf.ToDevice(b_gs_ns_ks.mData.data());
+    d_device_buf.ToDevice(d_gs_ms_ns.mData.data());
+
+    // set zero
+    e_device_buf.SetZero();
+
+    auto a_element_op   = AElementOp{};
+    auto b_element_op   = BElementOp{};
+    auto cde_element_op = CDEElementOp{};
+
+    // device operation
+    auto op       = DeviceOpInstance{};
+    auto invoker  = op.MakeInvoker();
+    auto argument = op.MakeArgument(a_device_buf.GetDeviceBuffer(),
+                                    b_device_buf.GetDeviceBuffer(),
+                                    std::array<const void*, 1>{d_device_buf.GetDeviceBuffer()},
+                                    e_device_buf.GetDeviceBuffer(),
+                                    a_gs_ms_ks_lengths,
+                                    a_gs_ms_ks_strides,
+                                    b_gs_ns_ks_lengths,
+                                    b_gs_ns_ks_strides,
+                                    std::array<std::vector<ck::index_t>, 1>{d_gs_ms_ns_lengths},
+                                    std::array<std::vector<ck::index_t>, 1>{d_gs_ms_ns_strides},
+                                    e_gs_ms_ns_lengths,
+                                    e_gs_ms_ns_strides,
+                                    a_element_op,
+                                    b_element_op,
+                                    cde_element_op,
+                                    split_k);
+
+    if(!op.IsSupportedArgument(argument))
+    {
+        std::cout << op.GetTypeString() << " does not support this problem" << std::endl;
+
+        return 0;
+    }
+
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+
+    ck::index_t G = std::accumulate(e_gs_ms_ns_lengths.begin(),
+                                    e_gs_ms_ns_lengths.begin() + NumDimG,
+                                    ck::index_t{1},
+                                    std::multiplies<ck::index_t>{});
+
+    ck::index_t M = std::accumulate(e_gs_ms_ns_lengths.begin() + NumDimG,
+                                    e_gs_ms_ns_lengths.begin() + NumDimG + NumDimM,
+                                    ck::index_t{1},
+                                    std::multiplies<ck::index_t>{});
+
+    ck::index_t N = std::accumulate(e_gs_ms_ns_lengths.begin() + NumDimG + NumDimM,
+                                    e_gs_ms_ns_lengths.begin() + NumDimG + NumDimM + NumDimN,
+                                    ck::index_t{1},
+                                    std::multiplies<ck::index_t>{});
+
+    ck::index_t K = std::accumulate(a_gs_ms_ks_lengths.begin() + NumDimG + NumDimM,
+                                    a_gs_ms_ks_lengths.begin() + NumDimG + NumDimM + NumDimK,
+                                    ck::index_t{1},
+                                    std::multiplies<ck::index_t>{});
+
+    std::size_t flop      = std::size_t(2) * G * M * N * K;
+    std::size_t num_btype = sizeof(ADataType) * G * M * K + sizeof(BDataType) * G * K * N +
+                            sizeof(DDataType) * G * M * N + sizeof(EDataType) * G * M * N;
+
+    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << op.GetTypeString() << std::endl;
+
+    e_device_buf.FromDevice(e_gs_ms_ns_device_result.mData.data());
+
+    if(do_verification)
+    {
+        Tensor<CShuffleDataType> c_ms_ns_host_result(
+            std::vector<std::size_t>(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()),
+            std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()));
+
+        using ReferenceOpInstance = ReferenceContraction_G2_M2_N2_K1<NumDimG,
+                                                                     NumDimM,
+                                                                     NumDimN,
+                                                                     NumDimK,
+                                                                     ADataType,
+                                                                     BDataType,
+                                                                     CShuffleDataType,
+                                                                     AccDataType,
+                                                                     AElementOp,
+                                                                     BElementOp,
+                                                                     PassThrough>;
+
+        auto ref_gemm    = ReferenceOpInstance{};
+        auto ref_invoker = ref_gemm.MakeInvoker();
+
+        auto ref_argument = ref_gemm.MakeArgument(
+            a_gs_ms_ks, b_gs_ns_ks, c_ms_ns_host_result, a_element_op, b_element_op, PassThrough{});
+
+        ref_invoker.Run(ref_argument);
+
+        e_gs_ms_ns_host_result.ForEach([&](auto&, auto idx) {
+            cde_element_op(e_gs_ms_ns_host_result(idx), c_ms_ns_host_result(idx), d_gs_ms_ns(idx));
+        });
+
+        return ck::utils::check_err(e_gs_ms_ns_device_result.mData, e_gs_ms_ns_host_result.mData)
+                   ? 0
+                   : 1;
+    }
+
+    return 0;
+}
--- a/example/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp32.cpp
+++ b/example/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp32.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/device_splitk_contraction_multiple_d_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using Add         = ck::tensor_operation::element_wise::Add;
+
+using ADataType        = F32;
+using BDataType        = F32;
+using AccDataType      = F32;
+using CShuffleDataType = F32;
+using DDataType        = F32;
+using DsDataType       = ck::Tuple<DDataType>;
+using EDataType        = F32;
+
+static constexpr ck::index_t NumDimG = 2;
+static constexpr ck::index_t NumDimM = 2;
+static constexpr ck::index_t NumDimN = 2;
+static constexpr ck::index_t NumDimK = 1;
+
+using AElementOp   = ck::tensor_operation::element_wise::PassThrough;
+using BElementOp   = ck::tensor_operation::element_wise::PassThrough;
+using CDEElementOp = ck::tensor_operation::element_wise::Add;
+
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::Default;
+
+static constexpr auto ABSpec = ck::tensor_operation::device::TensorSpecialization::Packed;
+static constexpr auto DESpec = ck::tensor_operation::device::TensorSpecialization::Default;
+
+// clang-format off
+using DeviceOpInstanceKKNN = ck::tensor_operation::device::
+        //############################################| NumDimG| NumDimM| NumDimN| NumDimK|     AData|     BData|     AccData|         CShuffle|     DsData|     EData|            A|           B|          CDE|           Gemm|              A|              B|             DE| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|CBlockTransferClusterLengths|   CBlockTransfer|
+        //############################################|        |        |        |        |      Type|      Type|        Type|         DataType|       Type|      Type|  Elementwise| Elementwise|  Elementwise| Spacialization| Spacialization| Spacialization| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|        _MBlock_MWaveMPerXdl|  ScalarPerVector|
+        //############################################|        |        |        |        |          |          |            |                 |           |          |    Operation|   Operation|    Operation|               |               |               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|        _NBlock_NWaveNPerXdl|    _NWaveNPerXdl|
+        //############################################|        |        |        |        |          |          |            |                 |           |          |             |            |             |               |               |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                            |                 |
+        DeviceSplitKContractionMultipleD_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,   AElementOp,  BElementOp, CDEElementOp,       GemmSpec,         ABSpec,         ABSpec,         DESpec,        1,   256,   256,   128,    32,   4,   4,   32,   32,    4,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              4,              4,         1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              4,              4,         1,           1,           1,              S<1, 32, 1, 4>,               4>;
+// clang-format on
+
+using DeviceOpInstance = DeviceOpInstanceKKNN;
+
+// hardcoded for NumDimM == NumDimN == NumDimK == 2
+template <ck::index_t NumDimG,
+          ck::index_t NumDimM,
+          ck::index_t NumDimN,
+          ck::index_t NumDimK,
+          typename ADataType,
+          typename BDataType,
+          typename EDataType,
+          typename AccDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CDEElementwiseOperation,
+          ck::enable_if_t<NumDimG == 2 && NumDimM == 2 && NumDimN == 2 && NumDimK == 1, bool> =
+              false>
+struct ReferenceContraction_G2_M2_N2_K1 : public ck::tensor_operation::device::BaseOperator
+{
+    // Argument
+    struct Argument : public ck::tensor_operation::device::BaseArgument
+    {
+        Argument(const Tensor<ADataType>& a_gs_ms_ks,
+                 const Tensor<BDataType>& b_gs_ns_ks,
+                 Tensor<EDataType>& e_gs_ms_ns,
+                 AElementwiseOperation a_element_op,
+                 BElementwiseOperation b_element_op,
+                 CDEElementwiseOperation cde_element_op)
+            : a_gs_ms_ks_{a_gs_ms_ks},
+              b_gs_ns_ks_{b_gs_ns_ks},
+              e_gs_ms_ns_{e_gs_ms_ns},
+              a_element_op_{a_element_op},
+              b_element_op_{b_element_op},
+              cde_element_op_{cde_element_op}
+        {
+        }
+
+        const Tensor<ADataType>& a_gs_ms_ks_;
+        const Tensor<BDataType>& b_gs_ns_ks_;
+        Tensor<EDataType>& e_gs_ms_ns_;
+
+        AElementwiseOperation a_element_op_;
+        BElementwiseOperation b_element_op_;
+        CDEElementwiseOperation cde_element_op_;
+    };
+
+    // Invoker
+    struct Invoker : public ck::tensor_operation::device::BaseInvoker
+    {
+        using Argument = ReferenceContraction_G2_M2_N2_K1::Argument;
+
+        float Run(const Argument& arg)
+        {
+            auto f_ms_ns = [&](auto g0, auto g1, auto m0, auto m1, auto n0, auto n1) {
+                const int K0 = arg.a_gs_ms_ks_.mDesc.GetLengths()[4];
+
+                AccDataType v_acc = 0;
+
+                for(int k0 = 0; k0 < K0; ++k0)
+                {
+                    AccDataType v_a;
+                    AccDataType v_b;
+
+                    arg.a_element_op_(
+                        v_a,
+                        ck::type_convert<const AccDataType>(arg.a_gs_ms_ks_(g0, g1, m0, m1, k0)));
+                    arg.b_element_op_(
+                        v_b,
+                        ck::type_convert<const AccDataType>(arg.b_gs_ns_ks_(g0, g1, n0, n1, k0)));
+
+                    v_acc += v_a * v_b;
+                }
+
+                AccDataType v_c;
+
+                arg.cde_element_op_(v_c, v_acc);
+
+                arg.e_gs_ms_ns_(g0, g1, m0, m1, n0, n1) = v_c;
+            };
+
+            make_ParallelTensorFunctor(f_ms_ns,
+                                       arg.e_gs_ms_ns_.mDesc.GetLengths()[0],
+                                       arg.e_gs_ms_ns_.mDesc.GetLengths()[1],
+                                       arg.e_gs_ms_ns_.mDesc.GetLengths()[2],
+                                       arg.e_gs_ms_ns_.mDesc.GetLengths()[3],
+                                       arg.e_gs_ms_ns_.mDesc.GetLengths()[4],
+                                       arg.e_gs_ms_ns_.mDesc.GetLengths()[5])(
+                std::thread::hardware_concurrency());
+
+            return 0;
+        }
+
+        float Run(const ck::tensor_operation::device::BaseArgument* p_arg,
+                  const StreamConfig& /* stream_config */ = StreamConfig{}) override
+        {
+            return Run(*dynamic_cast<const Argument*>(p_arg));
+        }
+    };
+
+    static constexpr bool IsValidCompilationParameter()
+    {
+        // TODO: properly implement this check
+        return true;
+    }
+
+    bool IsSupportedArgument(const ck::tensor_operation::device::BaseArgument*) override
+    {
+        return true;
+    }
+
+    static auto MakeArgument(const Tensor<ADataType>& a_gs_ms_ks,
+                             const Tensor<BDataType>& b_gs_ns_ks,
+                             Tensor<EDataType>& e_gs_ms_ns,
+                             AElementwiseOperation a_element_op,
+                             BElementwiseOperation b_element_op,
+                             CDEElementwiseOperation cde_element_op)
+    {
+        return Argument{
+            a_gs_ms_ks, b_gs_ns_ks, e_gs_ms_ns, a_element_op, b_element_op, cde_element_op};
+    }
+
+    static auto MakeInvoker() { return Invoker{}; }
+
+    virtual std::unique_ptr<ck::tensor_operation::device::BaseInvoker> MakeInvokerPointer()
+    {
+        return std::make_unique<Invoker>(Invoker{});
+    }
+
+    std::string GetTypeString() const override
+    {
+        auto str = std::stringstream();
+
+        // clang-format off
+        str << "ReferenceContraction_G2_M2_N2_K1"
+            << std::endl;
+        // clang-format on
+
+        return str.str();
+    }
+};
+
+int main(int argc, char* argv[])
+{
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
+    int split_k          = 1;
+
+    ck::index_t G0 = 1;
+    ck::index_t G1 = 2;
+
+    ck::index_t M0 = 4;
+    ck::index_t M1 = 256;
+
+    ck::index_t N0 = 16;
+    ck::index_t N1 = 128;
+
+    ck::index_t K0 = 64 * 2;
+
+    // A[G0, G1, M0, M1, K0]
+    std::vector<ck::index_t> a_gs_ms_ks_lengths{G0, G1, M0, M1, K0};
+    std::vector<ck::index_t> a_gs_ms_ks_strides{G1 * M0 * M1 * K0, M0 * M1 * K0, M1 * K0, K0, 1};
+    // B[G0, G1, N0, N1, K0]
+    std::vector<ck::index_t> b_gs_ns_ks_lengths{G0, G1, N0, N1, K0};
+    std::vector<ck::index_t> b_gs_ns_ks_strides{G1 * N0 * N1 * K0, N0 * N1 * K0, N1 * K0, K0, 1};
+
+    // D[G0, G1, M0, N0, M1, N1]
+    std::vector<ck::index_t> d_gs_ms_ns_lengths{G0, G1, M0, M1, N0, N1};
+    std::vector<ck::index_t> d_gs_ms_ns_strides{G1 * N0 * N1, N0 * N1, 0, 0, N1, 1};
+    // E[G0, G1, M0, N0, M1, N1]
+    std::vector<ck::index_t> e_gs_ms_ns_lengths{G0, G1, M0, M1, N0, N1};
+    std::vector<ck::index_t> e_gs_ms_ns_strides{
+        G1 * M0 * N0 * M1 * N1, M0 * N0 * M1 * N1, N0 * M1 * N1, N1, M1 * N1, 1};
+
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 5)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+        split_k         = std::stoi(argv[4]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+        printf("arg3: time kernel (0=no, 1=yes)\n");
+        exit(0);
+    }
+
+    Tensor<ADataType> a_gs_ms_ks(
+        std::vector<std::size_t>(a_gs_ms_ks_lengths.begin(), a_gs_ms_ks_lengths.end()),
+        std::vector<std::size_t>(a_gs_ms_ks_strides.begin(), a_gs_ms_ks_strides.end()));
+    Tensor<BDataType> b_gs_ns_ks(
+        std::vector<std::size_t>(b_gs_ns_ks_lengths.begin(), b_gs_ns_ks_lengths.end()),
+        std::vector<std::size_t>(b_gs_ns_ks_strides.begin(), b_gs_ns_ks_strides.end()));
+    Tensor<DDataType> d_gs_ms_ns(
+        std::vector<std::size_t>(d_gs_ms_ns_lengths.begin(), d_gs_ms_ns_lengths.end()),
+        std::vector<std::size_t>(d_gs_ms_ns_strides.begin(), d_gs_ms_ns_strides.end()));
+    Tensor<EDataType> e_gs_ms_ns_host_result(
+        std::vector<std::size_t>(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()),
+        std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()));
+    Tensor<EDataType> e_gs_ms_ns_device_result(
+        std::vector<std::size_t>(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()),
+        std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()));
+
+    std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl;
+    std::cout << "b_gs_ns_ks: " << b_gs_ns_ks.mDesc << std::endl;
+    std::cout << "d_gs_ms_ns: " << d_gs_ms_ns.mDesc << std::endl;
+    std::cout << "e_gs_ms_ns: " << e_gs_ms_ns_host_result.mDesc << std::endl;
+
+    switch(init_method)
+    {
+    case 0: break;
+    case 1:
+        a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
+        b_gs_ns_ks.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+        d_gs_ms_ns.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+        break;
+    case 2:
+        a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+        b_gs_ns_ks.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+        d_gs_ms_ns.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+        break;
+    default:
+        a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
+        b_gs_ns_ks.GenerateTensorValue(GeneratorTensor_1<BDataType>{1});
+        d_gs_ms_ns.GenerateTensorValue(GeneratorTensor_1<BDataType>{1});
+        break;
+    }
+
+    DeviceMem a_device_buf(sizeof(ADataType) * a_gs_ms_ks.mDesc.GetElementSpaceSize());
+    DeviceMem b_device_buf(sizeof(BDataType) * b_gs_ns_ks.mDesc.GetElementSpaceSize());
+    DeviceMem d_device_buf(sizeof(DDataType) * d_gs_ms_ns.mDesc.GetElementSpaceSize());
+    DeviceMem e_device_buf(sizeof(EDataType) *
+                           e_gs_ms_ns_device_result.mDesc.GetElementSpaceSize());
+
+    a_device_buf.ToDevice(a_gs_ms_ks.mData.data());
+    b_device_buf.ToDevice(b_gs_ns_ks.mData.data());
+    d_device_buf.ToDevice(d_gs_ms_ns.mData.data());
+
+    // set zero
+    e_device_buf.SetZero();
+
+    auto a_element_op   = AElementOp{};
+    auto b_element_op   = BElementOp{};
+    auto cde_element_op = CDEElementOp{};
+
+    // device operation
+    auto op       = DeviceOpInstance{};
+    auto invoker  = op.MakeInvoker();
+    auto argument = op.MakeArgument(a_device_buf.GetDeviceBuffer(),
+                                    b_device_buf.GetDeviceBuffer(),
+                                    std::array<const void*, 1>{d_device_buf.GetDeviceBuffer()},
+                                    e_device_buf.GetDeviceBuffer(),
+                                    a_gs_ms_ks_lengths,
+                                    a_gs_ms_ks_strides,
+                                    b_gs_ns_ks_lengths,
+                                    b_gs_ns_ks_strides,
+                                    std::array<std::vector<ck::index_t>, 1>{d_gs_ms_ns_lengths},
+                                    std::array<std::vector<ck::index_t>, 1>{d_gs_ms_ns_strides},
+                                    e_gs_ms_ns_lengths,
+                                    e_gs_ms_ns_strides,
+                                    a_element_op,
+                                    b_element_op,
+                                    cde_element_op,
+                                    split_k);
+
+    if(!op.IsSupportedArgument(argument))
+    {
+        std::cout << op.GetTypeString() << " does not support this problem" << std::endl;
+
+        return 0;
+    }
+
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+
+    ck::index_t G = std::accumulate(e_gs_ms_ns_lengths.begin(),
+                                    e_gs_ms_ns_lengths.begin() + NumDimG,
+                                    ck::index_t{1},
+                                    std::multiplies<ck::index_t>{});
+
+    ck::index_t M = std::accumulate(e_gs_ms_ns_lengths.begin() + NumDimG,
+                                    e_gs_ms_ns_lengths.begin() + NumDimG + NumDimM,
+                                    ck::index_t{1},
+                                    std::multiplies<ck::index_t>{});
+
+    ck::index_t N = std::accumulate(e_gs_ms_ns_lengths.begin() + NumDimG + NumDimM,
+                                    e_gs_ms_ns_lengths.begin() + NumDimG + NumDimM + NumDimN,
+                                    ck::index_t{1},
+                                    std::multiplies<ck::index_t>{});
+
+    ck::index_t K = std::accumulate(a_gs_ms_ks_lengths.begin() + NumDimG + NumDimM,
+                                    a_gs_ms_ks_lengths.begin() + NumDimG + NumDimM + NumDimK,
+                                    ck::index_t{1},
+                                    std::multiplies<ck::index_t>{});
+
+    std::size_t flop      = std::size_t(2) * G * M * N * K;
+    std::size_t num_btype = sizeof(ADataType) * G * M * K + sizeof(BDataType) * G * K * N +
+                            sizeof(DDataType) * G * M * N + sizeof(EDataType) * G * M * N;
+
+    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << op.GetTypeString() << std::endl;
+
+    e_device_buf.FromDevice(e_gs_ms_ns_device_result.mData.data());
+
+    if(do_verification)
+    {
+        Tensor<CShuffleDataType> c_ms_ns_host_result(
+            std::vector<std::size_t>(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()),
+            std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()));
+
+        using ReferenceOpInstance = ReferenceContraction_G2_M2_N2_K1<NumDimG,
+                                                                     NumDimM,
+                                                                     NumDimN,
+                                                                     NumDimK,
+                                                                     ADataType,
+                                                                     BDataType,
+                                                                     CShuffleDataType,
+                                                                     AccDataType,
+                                                                     AElementOp,
+                                                                     BElementOp,
+                                                                     PassThrough>;
+
+        auto ref_gemm    = ReferenceOpInstance{};
+        auto ref_invoker = ref_gemm.MakeInvoker();
+
+        auto ref_argument = ref_gemm.MakeArgument(
+            a_gs_ms_ks, b_gs_ns_ks, c_ms_ns_host_result, a_element_op, b_element_op, PassThrough{});
+
+        ref_invoker.Run(ref_argument);
+
+        e_gs_ms_ns_host_result.ForEach([&](auto&, auto idx) {
+            cde_element_op(e_gs_ms_ns_host_result(idx), c_ms_ns_host_result(idx), d_gs_ms_ns(idx));
+        });
+
+        return ck::utils::check_err(e_gs_ms_ns_device_result.mData, e_gs_ms_ns_host_result.mData)
+                   ? 0
+                   : 1;
+    }
+
+    return 0;
+}
--- a/example/44_conv2d_fwd_quantization/CMakeLists.txt
+++ b/example/44_conv2d_fwd_quantization/CMakeLists.txt
+add_example_executable(example_conv2d_fwd_xdl_perchannel_quantization_int8 conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp)
+add_example_executable(example_conv2d_fwd_xdl_perlayer_quantization_int8 conv2d_fwd_xdl_perlayer_quantization_int8.cpp)
+add_example_executable(example_conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8 conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp)
--- a/example/44_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp
+++ b/example/44_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/utility/algorithm.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+#include "ck/library/utility/convolution_parameter.hpp"
+#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
+
+using InDataType           = int8_t;
+using WeiDataType          = int8_t;
+using BiasDataType         = int32_t;
+using RequantScaleDataType = float;
+using AccDataType          = int32_t;
+using CShuffleDataType     = int32_t;
+using OutDataType          = int8_t;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough  = ck::tensor_operation::element_wise::PassThrough;
+using InElementOp  = PassThrough;
+using WeiElementOp = PassThrough;
+using ActivationOp = ck::tensor_operation::element_wise::Relu;
+using OutElementOp = ck::tensor_operation::element_wise::Add_Activation_Mul2_Clamp<ActivationOp>;
+
+static constexpr auto ConvSpec =
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
+
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+template <ck::index_t NDimSpatial,
+          typename InLayout,
+          typename WeiLayout,
+          typename BiasLayout,
+          typename RequantScaleLayout,
+          typename OutLayout>
+using DeviceGroupedConvNDFwdInstance =
+    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
+        NDimSpatial,
+        InLayout,
+        WeiLayout,
+        ck::Tuple<BiasLayout, RequantScaleLayout>,
+        OutLayout,
+        InDataType,
+        WeiDataType,
+        AccDataType,
+        CShuffleDataType,
+        ck::Tuple<BiasDataType, RequantScaleDataType>,
+        OutDataType,
+        InElementOp,
+        WeiElementOp,
+        OutElementOp,
+        ConvSpec,    // ConvForwardSpecialization
+        GemmSpec,    // GemmSpecialization
+        1,           //
+        256,         // BlockSize
+        128,         // MPerBlock
+        256,         // NPerBlock
+        64,          // KPerBlock
+        16,          // AK1
+        16,          // BK1
+        32,          // MPerXdl
+        32,          // NPerXdl
+        2,           // MXdlPerWave
+        4,           // NXdlPerWave
+        S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
+        S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
+        2,           // ABlockTransferSrcVectorDim
+        16,          // ABlockTransferSrcScalarPerVector
+        16,          // ABlockTransferDstScalarPerVector_AK1
+        1,           // ABlockLdsExtraM
+        S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
+        S<1, 0, 2>,  // BBlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,  // BBlockTransferSrcAccessOrder
+        2,           // BBlockTransferSrcVectorDim
+        16,          // BBlockTransferSrcScalarPerVector
+        16,          // BBlockTransferDstScalarPerVector_BK1
+        1,           // BBlockLdsExtraN
+        1,
+        1,
+        S<1, 64, 1, 4>,
+        8>;
+
+template <ck::index_t NDimSpatial,
+          typename InDataType,
+          typename WeiDataType,
+          typename OutDataType,
+          typename InElementOp,
+          typename WeiElementOp,
+          typename OutElementOp,
+          typename DeviceConvNDFwdInstance>
+bool run_grouped_conv_fwd(bool do_verification,
+                          bool time_kernel,
+                          const ck::utils::conv::ConvParam& conv_param,
+                          const HostTensorDescriptor& in_g_n_c_wis_desc,
+                          const HostTensorDescriptor& wei_g_k_c_xs_desc,
+                          const HostTensorDescriptor& bias_g_k_desc,
+                          const HostTensorDescriptor& requant_scale_g_k_desc,
+                          const HostTensorDescriptor& out_g_n_k_wos_desc,
+                          const InElementOp& in_element_op,
+                          const WeiElementOp& wei_element_op,
+                          const OutElementOp& out_element_op)
+{
+    Tensor<InDataType> in(in_g_n_c_wis_desc);
+    Tensor<WeiDataType> wei(wei_g_k_c_xs_desc);
+    Tensor<BiasDataType> bias(bias_g_k_desc);
+    Tensor<RequantScaleDataType> requant_scale(requant_scale_g_k_desc);
+    Tensor<OutDataType> out_host(out_g_n_k_wos_desc);
+    Tensor<OutDataType> out_device(out_g_n_k_wos_desc);
+
+    std::cout << "in: " << in.mDesc << std::endl;
+    std::cout << "wei: " << wei.mDesc << std::endl;
+    std::cout << "bias: " << bias.mDesc << std::endl;
+    std::cout << "requant_scale: " << requant_scale.mDesc << std::endl;
+    std::cout << "out: " << out_host.mDesc << std::endl;
+
+    in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-128, 127});
+    wei.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-128, 127});
+    bias.GenerateTensorValue(GeneratorTensor_2<BiasDataType>{-128, 127});
+    requant_scale.GenerateTensorValue(GeneratorTensor_2<RequantScaleDataType>{0, 1});
+
+    DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
+    DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize());
+    DeviceMem bias_device_buf(sizeof(BiasDataType) * bias.mDesc.GetElementSpaceSize());
+    DeviceMem requant_scale_device_buf(sizeof(RequantScaleDataType) *
+                                       requant_scale.mDesc.GetElementSpaceSize());
+    DeviceMem out_device_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize());
+
+    in_device_buf.ToDevice(in.mData.data());
+    wei_device_buf.ToDevice(wei.mData.data());
+    bias_device_buf.ToDevice(bias.mData.data());
+    requant_scale_device_buf.ToDevice(requant_scale.mData.data());
+
+    std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_strides{};
+    std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_strides{};
+    std::array<ck::index_t, NDimSpatial + 3> d0_g_n_k_wos_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> d0_g_n_k_wos_strides{};
+    std::array<ck::index_t, NDimSpatial + 3> d1_g_n_k_wos_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> d1_g_n_k_wos_strides{};
+    std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_strides{};
+    std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
+    std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
+    std::array<ck::index_t, NDimSpatial> input_left_pads{};
+    std::array<ck::index_t, NDimSpatial> input_right_pads{};
+
+    auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); };
+
+    copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths);
+    copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides);
+    copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths);
+    copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides);
+    copy(bias_g_k_desc.GetLengths(), d0_g_n_k_wos_lengths);
+    copy(bias_g_k_desc.GetStrides(), d0_g_n_k_wos_strides);
+    copy(requant_scale_g_k_desc.GetLengths(), d1_g_n_k_wos_lengths);
+    copy(requant_scale_g_k_desc.GetStrides(), d1_g_n_k_wos_strides);
+    copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths);
+    copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides);
+    copy(conv_param.conv_filter_strides_, conv_filter_strides);
+    copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
+    copy(conv_param.input_left_pads_, input_left_pads);
+    copy(conv_param.input_right_pads_, input_right_pads);
+
+    // do Conv
+    auto conv     = DeviceConvNDFwdInstance{};
+    auto invoker  = conv.MakeInvoker();
+    auto argument = conv.MakeArgument(
+        in_device_buf.GetDeviceBuffer(),
+        wei_device_buf.GetDeviceBuffer(),
+        {bias_device_buf.GetDeviceBuffer(), requant_scale_device_buf.GetDeviceBuffer()},
+        out_device_buf.GetDeviceBuffer(),
+        a_g_n_c_wis_lengths,
+        a_g_n_c_wis_strides,
+        b_g_k_c_xs_lengths,
+        b_g_k_c_xs_strides,
+        {d0_g_n_k_wos_lengths, d1_g_n_k_wos_lengths},
+        {d0_g_n_k_wos_strides, d1_g_n_k_wos_strides},
+        e_g_n_k_wos_lengths,
+        e_g_n_k_wos_strides,
+        conv_filter_strides,
+        conv_filter_dilations,
+        input_left_pads,
+        input_right_pads,
+        in_element_op,
+        wei_element_op,
+        out_element_op);
+
+    if(!conv.IsSupportedArgument(argument))
+    {
+        throw std::runtime_error(
+            "wrong! device_conv with the specified compilation parameters does "
+            "not support this Conv problem");
+    }
+
+    float avg_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+
+    std::size_t flop      = conv_param.GetFlops();
+    std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>();
+
+    float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
+    float gb_per_sec = num_btype / 1.E6 / avg_time;
+    std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << conv.GetTypeString() << std::endl;
+
+    bool pass = true;
+
+    if(do_verification)
+    {
+        Tensor<CShuffleDataType> c_host(out_g_n_k_wos_desc);
+
+        auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
+                                                                     InDataType,
+                                                                     WeiDataType,
+                                                                     CShuffleDataType,
+                                                                     InElementOp,
+                                                                     WeiElementOp,
+                                                                     PassThrough>();
+
+        auto ref_invoker  = ref_conv.MakeInvoker();
+        auto ref_argument = ref_conv.MakeArgument(in,
+                                                  wei,
+                                                  c_host,
+                                                  conv_param.conv_filter_strides_,
+                                                  conv_param.conv_filter_dilations_,
+                                                  conv_param.input_left_pads_,
+                                                  conv_param.input_right_pads_,
+                                                  in_element_op,
+                                                  wei_element_op,
+                                                  PassThrough{});
+
+        ref_invoker.Run(ref_argument);
+
+        // TODO: implement elementwise operation for host
+        out_host.ForEach([&](auto&, auto idx) {
+            out_element_op(out_host(idx), c_host(idx), bias(idx), requant_scale(idx));
+        });
+
+        out_device_buf.FromDevice(out_device.mData.data());
+
+        pass &=
+            ck::utils::check_err(out_device, out_host, "Error: incorrect results!", 1e-5f, 1e-4f);
+    }
+
+    return (pass ? 0 : 1);
+}
+
+int main()
+{
+    bool do_verification           = true;
+    bool time_kernel               = true;
+    const ck::index_t ndim_spatial = 2;
+
+    ck::utils::conv::ConvParam conv_param{
+        ndim_spatial, // n_dim
+        1,            // group
+        4,            // batch
+        64,           // output channels
+        32,           // input chanels
+        {3, 3},       // weight HW
+        {71, 71},     // x HW
+        {2, 2},       // strides
+        {1, 1},       // dilations
+        {1, 1},       // left_pads
+        {1, 1}        // right_pads
+    };
+
+    const auto in_element_op  = InElementOp{};
+    const auto wei_element_op = WeiElementOp{};
+    const auto out_element_op = OutElementOp{ActivationOp{}};
+
+    using InLayout           = ck::tensor_layout::convolution::GNHWC;
+    using WeiLayout          = ck::tensor_layout::convolution::GKYXC;
+    using BiasLayout         = ck::tensor_layout::convolution::G_K;
+    using RequantScaleLayout = ck::tensor_layout::convolution::G_K;
+    using OutLayout          = ck::tensor_layout::convolution::GNHWK;
+
+    const auto in_g_n_c_wis_desc =
+        ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
+
+    const auto wei_g_k_c_xs_desc =
+        ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(conv_param);
+
+    // TODO - make_bias_host_tensor_descriptor_g_n_k_wos_packed()
+    const auto bias_g_k_desc = HostTensorDescriptor({conv_param.G_,
+                                                     conv_param.N_,
+                                                     conv_param.K_,
+                                                     conv_param.output_spatial_lengths_[0],
+                                                     conv_param.output_spatial_lengths_[1]},
+                                                    {
+                                                        conv_param.K_, // g
+                                                        0,             // n
+                                                        1,             // k
+                                                        0,             // ho
+                                                        0              // wo
+                                                    });
+
+    const auto requant_scale_g_k_desc = bias_g_k_desc;
+
+    const auto out_g_n_k_wos_desc =
+        ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
+
+    std::cout << out_g_n_k_wos_desc << std::endl;
+
+    using deviceOp = DeviceGroupedConvNDFwdInstance<ndim_spatial,
+                                                    InLayout,
+                                                    WeiLayout,
+                                                    BiasLayout,
+                                                    RequantScaleLayout,
+                                                    OutLayout>;
+
+    return run_grouped_conv_fwd<ndim_spatial,
+                                InDataType,
+                                WeiDataType,
+                                OutDataType,
+                                InElementOp,
+                                WeiElementOp,
+                                OutElementOp,
+                                deviceOp>(do_verification,
+                                          time_kernel,
+                                          conv_param,
+                                          in_g_n_c_wis_desc,
+                                          wei_g_k_c_xs_desc,
+                                          bias_g_k_desc,
+                                          requant_scale_g_k_desc,
+                                          out_g_n_k_wos_desc,
+                                          in_element_op,
+                                          wei_element_op,
+                                          out_element_op);
+}
--- a/example/44_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp
+++ b/example/44_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/utility/algorithm.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+#include "ck/library/utility/convolution_parameter.hpp"
+#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
+
+using InDataType       = int8_t;
+using WeiDataType      = int8_t;
+using BiasDataType     = int32_t;
+using AccDataType      = int32_t;
+using CShuffleDataType = int32_t;
+using OutDataType      = int8_t;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough  = ck::tensor_operation::element_wise::PassThrough;
+using InElementOp  = PassThrough;
+using WeiElementOp = PassThrough;
+using ActivationOp = ck::tensor_operation::element_wise::Relu;
+using OutElementOp = ck::tensor_operation::element_wise::Add_Activation_Mul_Clamp<ActivationOp>;
+
+static constexpr auto ConvSpec =
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
+
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+template <ck::index_t NDimSpatial,
+          typename InLayout,
+          typename WeiLayout,
+          typename BiasLayout,
+          typename OutLayout>
+using DeviceGroupedConvNDFwdInstance =
+    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
+        NDimSpatial,
+        InLayout,
+        WeiLayout,
+        ck::Tuple<BiasLayout>,
+        OutLayout,
+        InDataType,
+        WeiDataType,
+        AccDataType,
+        CShuffleDataType,
+        ck::Tuple<BiasDataType>,
+        OutDataType,
+        InElementOp,
+        WeiElementOp,
+        OutElementOp,
+        ConvSpec,    // ConvForwardSpecialization
+        GemmSpec,    // GemmSpecialization
+        1,           //
+        256,         // BlockSize
+        128,         // MPerBlock
+        256,         // NPerBlock
+        64,          // KPerBlock
+        16,          // AK1
+        16,          // BK1
+        32,          // MPerXdl
+        32,          // NPerXdl
+        2,           // MXdlPerWave
+        4,           // NXdlPerWave
+        S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
+        S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
+        2,           // ABlockTransferSrcVectorDim
+        16,          // ABlockTransferSrcScalarPerVector
+        16,          // ABlockTransferDstScalarPerVector_AK1
+        1,           // ABlockLdsExtraM
+        S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
+        S<1, 0, 2>,  // BBlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,  // BBlockTransferSrcAccessOrder
+        2,           // BBlockTransferSrcVectorDim
+        16,          // BBlockTransferSrcScalarPerVector
+        16,          // BBlockTransferDstScalarPerVector_BK1
+        1,           // BBlockLdsExtraN
+        1,
+        1,
+        S<1, 64, 1, 4>,
+        8>;
+
+template <ck::index_t NDimSpatial,
+          typename InDataType,
+          typename WeiDataType,
+          typename OutDataType,
+          typename InElementOp,
+          typename WeiElementOp,
+          typename OutElementOp,
+          typename DeviceConvNDFwdInstance>
+bool run_grouped_conv_fwd(bool do_verification,
+                          bool time_kernel,
+                          const ck::utils::conv::ConvParam& conv_param,
+                          const HostTensorDescriptor& in_g_n_c_wis_desc,
+                          const HostTensorDescriptor& wei_g_k_c_xs_desc,
+                          const HostTensorDescriptor& bias_g_k_desc,
+                          const HostTensorDescriptor& out_g_n_k_wos_desc,
+                          const InElementOp& in_element_op,
+                          const WeiElementOp& wei_element_op,
+                          const OutElementOp& out_element_op)
+{
+    Tensor<InDataType> in(in_g_n_c_wis_desc);
+    Tensor<WeiDataType> wei(wei_g_k_c_xs_desc);
+    Tensor<BiasDataType> bias(bias_g_k_desc);
+    Tensor<OutDataType> out_host(out_g_n_k_wos_desc);
+    Tensor<OutDataType> out_device(out_g_n_k_wos_desc);
+
+    std::cout << "in: " << in.mDesc << std::endl;
+    std::cout << "wei: " << wei.mDesc << std::endl;
+    std::cout << "bias: " << bias.mDesc << std::endl;
+    std::cout << "out: " << out_host.mDesc << std::endl;
+
+    in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
+    wei.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
+    bias.GenerateTensorValue(GeneratorTensor_2<BiasDataType>{-5, 5});
+
+    DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
+    DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize());
+    DeviceMem bias_device_buf(sizeof(BiasDataType) * bias.mDesc.GetElementSpaceSize());
+    DeviceMem out_device_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize());
+
+    in_device_buf.ToDevice(in.mData.data());
+    wei_device_buf.ToDevice(wei.mData.data());
+    bias_device_buf.ToDevice(bias.mData.data());
+
+    std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_strides{};
+    std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_strides{};
+    std::array<ck::index_t, NDimSpatial + 3> d0_g_n_k_wos_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> d0_g_n_k_wos_strides{};
+    std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_strides{};
+    std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
+    std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
+    std::array<ck::index_t, NDimSpatial> input_left_pads{};
+    std::array<ck::index_t, NDimSpatial> input_right_pads{};
+
+    auto copy = [](auto& x, auto& y) { ck::ranges::copy(x, y.begin()); };
+
+    copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths);
+    copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides);
+    copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths);
+    copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides);
+    copy(bias_g_k_desc.GetLengths(), d0_g_n_k_wos_lengths);
+    copy(bias_g_k_desc.GetStrides(), d0_g_n_k_wos_strides);
+    copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths);
+    copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides);
+    copy(conv_param.conv_filter_strides_, conv_filter_strides);
+    copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
+    copy(conv_param.input_left_pads_, input_left_pads);
+    copy(conv_param.input_right_pads_, input_right_pads);
+
+    // do Conv
+    auto conv     = DeviceConvNDFwdInstance{};
+    auto invoker  = conv.MakeInvoker();
+    auto argument = conv.MakeArgument(in_device_buf.GetDeviceBuffer(),
+                                      wei_device_buf.GetDeviceBuffer(),
+                                      {bias_device_buf.GetDeviceBuffer()},
+                                      out_device_buf.GetDeviceBuffer(),
+                                      a_g_n_c_wis_lengths,
+                                      a_g_n_c_wis_strides,
+                                      b_g_k_c_xs_lengths,
+                                      b_g_k_c_xs_strides,
+                                      {d0_g_n_k_wos_lengths},
+                                      {d0_g_n_k_wos_strides},
+                                      e_g_n_k_wos_lengths,
+                                      e_g_n_k_wos_strides,
+                                      conv_filter_strides,
+                                      conv_filter_dilations,
+                                      input_left_pads,
+                                      input_right_pads,
+                                      in_element_op,
+                                      wei_element_op,
+                                      out_element_op);
+
+    if(!conv.IsSupportedArgument(argument))
+    {
+        throw std::runtime_error(
+            "wrong! device_conv with the specified compilation parameters does "
+            "not support this Conv problem");
+    }
+
+    float avg_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+
+    std::size_t flop      = conv_param.GetFlops();
+    std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>();
+
+    float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
+    float gb_per_sec = num_btype / 1.E6 / avg_time;
+    std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << conv.GetTypeString() << std::endl;
+
+    bool pass = true;
+
+    if(do_verification)
+    {
+        Tensor<CShuffleDataType> c_host(out_g_n_k_wos_desc);
+
+        auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
+                                                                     InDataType,
+                                                                     WeiDataType,
+                                                                     CShuffleDataType,
+                                                                     InElementOp,
+                                                                     WeiElementOp,
+                                                                     PassThrough>();
+
+        auto ref_invoker  = ref_conv.MakeInvoker();
+        auto ref_argument = ref_conv.MakeArgument(in,
+                                                  wei,
+                                                  c_host,
+                                                  conv_param.conv_filter_strides_,
+                                                  conv_param.conv_filter_dilations_,
+                                                  conv_param.input_left_pads_,
+                                                  conv_param.input_right_pads_,
+                                                  in_element_op,
+                                                  wei_element_op,
+                                                  PassThrough{});
+
+        ref_invoker.Run(ref_argument);
+
+        // TODO: implement elementwise operation for host
+        out_host.ForEach(
+            [&](auto&, auto idx) { out_element_op(out_host(idx), c_host(idx), bias(idx)); });
+
+        out_device_buf.FromDevice(out_device.mData.data());
+
+        pass &=
+            ck::utils::check_err(out_device, out_host, "Error: incorrect results!", 1e-5f, 1e-4f);
+    }
+
+    return (pass ? 0 : 1);
+}
+
+int main()
+{
+    bool do_verification           = true;
+    bool time_kernel               = true;
+    const ck::index_t ndim_spatial = 2;
+
+    ck::utils::conv::ConvParam conv_param{
+        ndim_spatial, // n_dim
+        1,            // group
+        4,            // batch
+        64,           // output channels
+        32,           // input chanels
+        {3, 3},       // weight HW
+        {71, 71},     // x HW
+        {2, 2},       // strides
+        {1, 1},       // dilations
+        {1, 1},       // left_pads
+        {1, 1}        // right_pads
+    };
+
+    const auto in_element_op  = InElementOp{};
+    const auto wei_element_op = WeiElementOp{};
+    const auto out_element_op = OutElementOp{0.5f, ActivationOp{}};
+
+    using InLayout   = ck::tensor_layout::convolution::GNHWC;
+    using WeiLayout  = ck::tensor_layout::convolution::GKYXC;
+    using BiasLayout = ck::tensor_layout::convolution::G_K;
+    using OutLayout  = ck::tensor_layout::convolution::GNHWK;
+
+    const auto in_g_n_c_wis_desc =
+        ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
+
+    const auto wei_g_k_c_xs_desc =
+        ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(conv_param);
+
+    // TODO - make_bias_host_tensor_descriptor_g_n_k_wos_packed()
+    const auto bias_g_k_desc = HostTensorDescriptor({conv_param.G_,
+                                                     conv_param.N_,
+                                                     conv_param.K_,
+                                                     conv_param.output_spatial_lengths_[0],
+                                                     conv_param.output_spatial_lengths_[1]},
+                                                    {
+                                                        conv_param.K_, // g
+                                                        0,             // n
+                                                        1,             // k
+                                                        0,             // ho
+                                                        0              // wo
+                                                    });
+
+    const auto out_g_n_k_wos_desc =
+        ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
+
+    std::cout << out_g_n_k_wos_desc << std::endl;
+
+    return run_grouped_conv_fwd<
+        ndim_spatial,
+        InDataType,
+        WeiDataType,
+        OutDataType,
+        InElementOp,
+        WeiElementOp,
+        OutElementOp,
+        DeviceGroupedConvNDFwdInstance<ndim_spatial, InLayout, WeiLayout, BiasLayout, OutLayout>>(
+        do_verification,
+        time_kernel,
+        conv_param,
+        in_g_n_c_wis_desc,
+        wei_g_k_c_xs_desc,
+        bias_g_k_desc,
+        out_g_n_k_wos_desc,
+        in_element_op,
+        wei_element_op,
+        out_element_op);
+}
--- a/example/44_conv2d_fwd_quantization/conv2d_fwd_xdl_perlayer_quantization_int8.cpp
+++ b/example/44_conv2d_fwd_quantization/conv2d_fwd_xdl_perlayer_quantization_int8.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/utility/algorithm.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+#include "ck/library/utility/convolution_parameter.hpp"
+#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
+
+using InDataType       = int8_t;
+using WeiDataType      = int8_t;
+using AccDataType      = int32_t;
+using CShuffleDataType = int32_t;
+using OutDataType      = int8_t;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough  = ck::tensor_operation::element_wise::PassThrough;
+using InElementOp  = PassThrough;
+using WeiElementOp = PassThrough;
+using ActivationOp = PassThrough;
+using OutElementOp = ck::tensor_operation::element_wise::Activation_Mul_Clamp<ActivationOp>;
+
+static constexpr auto ConvSpec =
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
+
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+template <ck::index_t NDimSpatial, typename InLayout, typename WeiLayout, typename OutLayout>
+using DeviceGroupedConvNDFwdInstance =
+    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
+        NDimSpatial,
+        InLayout,
+        WeiLayout,
+        ck::Tuple<>,
+        OutLayout,
+        InDataType,
+        WeiDataType,
+        AccDataType,
+        CShuffleDataType,
+        ck::Tuple<>,
+        OutDataType,
+        InElementOp,
+        WeiElementOp,
+        OutElementOp,
+        ConvSpec,    // ConvForwardSpecialization
+        GemmSpec,    // GemmSpecialization
+        1,           //
+        256,         // BlockSize
+        128,         // MPerBlock
+        256,         // NPerBlock
+        64,          // KPerBlock
+        16,          // AK1
+        16,          // BK1
+        32,          // MPerXdl
+        32,          // NPerXdl
+        2,           // MXdlPerWave
+        4,           // NXdlPerWave
+        S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
+        S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
+        2,           // ABlockTransferSrcVectorDim
+        16,          // ABlockTransferSrcScalarPerVector
+        16,          // ABlockTransferDstScalarPerVector_AK1
+        1,           // ABlockLdsExtraM
+        S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
+        S<1, 0, 2>,  // BBlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,  // BBlockTransferSrcAccessOrder
+        2,           // BBlockTransferSrcVectorDim
+        16,          // BBlockTransferSrcScalarPerVector
+        16,          // BBlockTransferDstScalarPerVector_BK1
+        1,           // BBlockLdsExtraN
+        1,
+        1,
+        S<1, 64, 1, 4>,
+        16>;
+
+template <ck::index_t NDimSpatial,
+          typename InDataType,
+          typename WeiDataType,
+          typename OutDataType,
+          typename InElementOp,
+          typename WeiElementOp,
+          typename OutElementOp,
+          typename DeviceConvNDFwdInstance>
+bool run_grouped_conv_fwd(bool do_verification,
+                          bool time_kernel,
+                          const ck::utils::conv::ConvParam& conv_param,
+                          const HostTensorDescriptor& in_g_n_c_wis_desc,
+                          const HostTensorDescriptor& wei_g_k_c_xs_desc,
+                          const HostTensorDescriptor& out_g_n_k_wos_desc,
+                          const InElementOp& in_element_op,
+                          const WeiElementOp& wei_element_op,
+                          const OutElementOp& out_element_op)
+{
+    Tensor<InDataType> in(in_g_n_c_wis_desc);
+    Tensor<WeiDataType> wei(wei_g_k_c_xs_desc);
+    Tensor<OutDataType> out_host(out_g_n_k_wos_desc);
+    Tensor<OutDataType> out_device(out_g_n_k_wos_desc);
+
+    std::cout << "in: " << in.mDesc << std::endl;
+    std::cout << "wei: " << wei.mDesc << std::endl;
+    std::cout << "out: " << out_host.mDesc << std::endl;
+
+    in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
+    wei.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
+
+    DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
+    DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize());
+    DeviceMem out_device_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize());
+
+    in_device_buf.ToDevice(in.mData.data());
+    wei_device_buf.ToDevice(wei.mData.data());
+
+    std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_strides{};
+    std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_strides{};
+    std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_strides{};
+    std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
+    std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
+    std::array<ck::index_t, NDimSpatial> input_left_pads{};
+    std::array<ck::index_t, NDimSpatial> input_right_pads{};
+
+    auto copy = [](auto& x, auto& y) { ck::ranges::copy(x, y.begin()); };
+
+    copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths);
+    copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides);
+    copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths);
+    copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides);
+    copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths);
+    copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides);
+    copy(conv_param.conv_filter_strides_, conv_filter_strides);
+    copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
+    copy(conv_param.input_left_pads_, input_left_pads);
+    copy(conv_param.input_right_pads_, input_right_pads);
+
+    // do Conv
+    auto conv     = DeviceConvNDFwdInstance{};
+    auto invoker  = conv.MakeInvoker();
+    auto argument = conv.MakeArgument(in_device_buf.GetDeviceBuffer(),
+                                      wei_device_buf.GetDeviceBuffer(),
+                                      {},
+                                      out_device_buf.GetDeviceBuffer(),
+                                      a_g_n_c_wis_lengths,
+                                      a_g_n_c_wis_strides,
+                                      b_g_k_c_xs_lengths,
+                                      b_g_k_c_xs_strides,
+                                      {},
+                                      {},
+                                      e_g_n_k_wos_lengths,
+                                      e_g_n_k_wos_strides,
+                                      conv_filter_strides,
+                                      conv_filter_dilations,
+                                      input_left_pads,
+                                      input_right_pads,
+                                      in_element_op,
+                                      wei_element_op,
+                                      out_element_op);
+
+    if(!conv.IsSupportedArgument(argument))
+    {
+        throw std::runtime_error(
+            "wrong! device_conv with the specified compilation parameters does "
+            "not support this Conv problem");
+    }
+
+    float avg_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+
+    std::size_t flop      = conv_param.GetFlops();
+    std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>();
+
+    float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
+    float gb_per_sec = num_btype / 1.E6 / avg_time;
+    std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << conv.GetTypeString() << std::endl;
+
+    bool pass = true;
+
+    if(do_verification)
+    {
+        auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
+                                                                     InDataType,
+                                                                     WeiDataType,
+                                                                     OutDataType,
+                                                                     InElementOp,
+                                                                     WeiElementOp,
+                                                                     OutElementOp>();
+
+        auto ref_invoker  = ref_conv.MakeInvoker();
+        auto ref_argument = ref_conv.MakeArgument(in,
+                                                  wei,
+                                                  out_host,
+                                                  conv_param.conv_filter_strides_,
+                                                  conv_param.conv_filter_dilations_,
+                                                  conv_param.input_left_pads_,
+                                                  conv_param.input_right_pads_,
+                                                  in_element_op,
+                                                  wei_element_op,
+                                                  out_element_op);
+
+        ref_invoker.Run(ref_argument);
+
+        out_device_buf.FromDevice(out_device.mData.data());
+
+        pass &=
+            ck::utils::check_err(out_device, out_host, "Error: incorrect results!", 1e-5f, 1e-4f);
+    }
+
+    return (pass ? 0 : 1);
+}
+
+int main()
+{
+    bool do_verification           = true;
+    bool time_kernel               = true;
+    const ck::index_t ndim_spatial = 2;
+
+    ck::utils::conv::ConvParam conv_param{
+        ndim_spatial, // n_dim
+        1,            // group
+        4,            // batch
+        64,           // output channels
+        32,           // input chanels
+        {3, 3},       // weight HW
+        {71, 71},     // x HW
+        {2, 2},       // strides
+        {1, 1},       // dilations
+        {1, 1},       // left_pads
+        {1, 1}        // right_pads
+    };
+
+    const auto in_element_op  = InElementOp{};
+    const auto wei_element_op = WeiElementOp{};
+    const auto out_element_op = OutElementOp{0.5f, ActivationOp{}};
+
+    using InLayout  = ck::tensor_layout::convolution::GNHWC;
+    using WeiLayout = ck::tensor_layout::convolution::GKYXC;
+    using OutLayout = ck::tensor_layout::convolution::GNHWK;
+
+    const auto in_g_n_c_wis_desc =
+        ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
+
+    const auto wei_g_k_c_xs_desc =
+        ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(conv_param);
+
+    const auto out_g_n_k_wos_desc =
+        ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
+
+    return run_grouped_conv_fwd<
+        ndim_spatial,
+        InDataType,
+        WeiDataType,
+        OutDataType,
+        InElementOp,
+        WeiElementOp,
+        OutElementOp,
+        DeviceGroupedConvNDFwdInstance<ndim_spatial, InLayout, WeiLayout, OutLayout>>(
+        do_verification,
+        time_kernel,
+        conv_param,
+        in_g_n_c_wis_desc,
+        wei_g_k_c_xs_desc,
+        out_g_n_k_wos_desc,
+        in_element_op,
+        wei_element_op,
+        out_element_op);
+}
--- a/example/44_elementwise_permute/CMakeLists.txt
+++ b/example/44_elementwise_permute/CMakeLists.txt
+add_example_executable(example_elementwise_permute_4D_fp16 elementwise_permute_4D_fp16.cpp)
+add_example_executable(example_elementwise_permute_4D_fp16_2d elementwise_permute_4D_fp16_2d.cpp)
--- a/example/44_elementwise_permute/elementwise_permute_4D_fp16.cpp
+++ b/example/44_elementwise_permute/elementwise_permute_4D_fp16.cpp
+#include <iostream>
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_elementwise.hpp"
+
+#include "ck/library/utility/algorithm.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using ADataType = F16;
+using BDataType = F16;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using DeviceElementwisePermuteInstance =
+    ck::tensor_operation::device::DeviceElementwise<ck::Tuple<ADataType>,
+                                                    ck::Tuple<BDataType>,
+                                                    PassThrough,
+                                                    4,
+                                                    8,
+                                                    ck::Sequence<8>,
+                                                    ck::Sequence<1>>;
+
+template <typename HostTensorA, typename HostTensorB, typename Functor>
+void host_elementwise4D(HostTensorB& B_nhwc, const HostTensorA& A_nchw, Functor functor)
+{
+    for(std::size_t n = 0; n < A_nchw.mDesc.GetLengths()[0]; ++n)
+        for(std::size_t c = 0; c < A_nchw.mDesc.GetLengths()[1]; ++c)
+            for(std::size_t h = 0; h < A_nchw.mDesc.GetLengths()[2]; ++h)
+                for(std::size_t w = 0; w < A_nchw.mDesc.GetLengths()[3]; ++w)
+                {
+                    auto a_val = A_nchw(n, c, h, w);
+                    functor(B_nhwc(n, h, w, c), a_val);
+                }
+}
+
+int main()
+{
+    bool do_verification = true;
+    bool time_kernel     = true;
+
+    std::vector<std::size_t> nchw = {16, 128, 32, 64};
+    std::vector<std::size_t> nhwc = {16, 32, 64, 128};
+    Tensor<ADataType> a(nchw);
+    Tensor<BDataType> b(nhwc);
+
+    a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+
+    DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
+    DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());
+
+    a_device_buf.ToDevice(a.mData.data());
+
+    std::array<const void*, 1> input = {a_device_buf.GetDeviceBuffer()};
+    std::array<void*, 1> output      = {b_device_buf.GetDeviceBuffer()};
+
+    std::array<ck::index_t, 4> ab_lengths;
+    std::array<ck::index_t, 4> a_strides = {static_cast<int>(nchw[1] * nchw[2] * nchw[3]),
+                                            static_cast<int>(nchw[2] * nchw[3]),
+                                            static_cast<int>(nchw[3]),
+                                            1};
+    std::array<ck::index_t, 4> b_strides = {static_cast<int>(nhwc[1] * nhwc[2] * nhwc[3]),
+                                            1,
+                                            static_cast<int>(nhwc[2] * nhwc[3]),
+                                            static_cast<int>(nhwc[3])};
+
+    ck::ranges::copy(nchw, ab_lengths.begin());
+
+    auto broadcastPermute = DeviceElementwisePermuteInstance{};
+    auto argument         = broadcastPermute.MakeArgumentPointer(
+        ab_lengths, {a_strides}, {b_strides}, input, output, PassThrough{});
+
+    if(!broadcastPermute.IsSupportedArgument(argument.get()))
+    {
+        throw std::runtime_error(
+            "The runtime parameters seems not supported by the device instance, exiting!");
+    };
+
+    std::cout << "A (nchw): " << a.mDesc << std::endl;
+    std::cout << "B (nhwc): " << b.mDesc << std::endl;
+
+    auto broadcastPermute_invoker_ptr = broadcastPermute.MakeInvokerPointer();
+    float ave_time =
+        broadcastPermute_invoker_ptr->Run(argument.get(), StreamConfig{nullptr, time_kernel});
+    std::size_t flop = std::size_t(2) * nchw[0] * nchw[1] * nchw[2] * nchw[3];
+
+    std::size_t num_btype = sizeof(ADataType) * (nchw[0] * nchw[1] * nchw[2] * nchw[3]) +
+                            sizeof(BDataType) * (nchw[0] * nchw[1] * nchw[2] * nchw[3]);
+
+    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
+              << std::endl;
+
+    bool pass = true;
+
+    if(do_verification)
+    {
+        b_device_buf.FromDevice(b.mData.data());
+        Tensor<BDataType> host_b(nhwc);
+        host_elementwise4D(host_b, a, PassThrough{});
+
+        pass &=
+            ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
+    }
+
+    return pass ? 0 : 1;
+}
--- a/example/44_elementwise_permute/elementwise_permute_4D_fp16_2d.cpp
+++ b/example/44_elementwise_permute/elementwise_permute_4D_fp16_2d.cpp
+#include <iostream>
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/device/device_elementwise_2d.hpp"
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+
+using F16 = ck::half_t;
+
+using ADataType = F16;
+using BDataType = F16;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using DeviceElementwisePermuteInstance =
+    ck::tensor_operation::device::DeviceElementwise<ck::Tuple<ADataType>,
+                                                    ck::Tuple<BDataType>,
+                                                    PassThrough,
+                                                    3, // NumDim_M
+                                                    1, // NumDim_N
+                                                    8,
+                                                    8,
+                                                    ck::Sequence<8>,
+                                                    ck::Sequence<8>>;
+
+template <typename HostTensorA, typename HostTensorB, typename Functor>
+void host_elementwise4D(HostTensorB& B_nhwc,
+                        const HostTensorA& A_nchw,
+                        const std::vector<std::size_t>& shape_nchw,
+                        Functor functor)
+{
+    for(std::size_t n = 0; n < shape_nchw[0]; ++n)
+        for(std::size_t c = 0; c < shape_nchw[1]; ++c)
+            for(std::size_t h = 0; h < shape_nchw[2]; ++h)
+                for(std::size_t w = 0; w < shape_nchw[3]; ++w)
+                {
+                    auto a_val = A_nchw(n, c, h, w);
+                    functor(B_nhwc(n, h, w, c), a_val);
+                }
+}
+
+int main()
+{
+    bool do_verification = true;
+    bool time_kernel     = true;
+
+    const int N = 120;
+    const int C = 128;
+    const int H = 32;
+    const int W = 1024;
+
+    /**const int N = 120;
+    const int H = 32;
+    const int W = 64;
+
+    const int C = 128;**/
+
+    std::vector<std::size_t> nchw = {N, C, H, W};
+    std::vector<std::size_t> nhwc = {N, H, W, C};
+
+    Tensor<ADataType> a(nchw);
+    Tensor<BDataType> b(nhwc);
+
+    a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+
+    DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
+    DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());
+
+    a_device_buf.ToDevice(a.mData.data());
+    // LogRangeAsType<float>(std::cout << "Tensor a  : ", a.mData, ",") << std::endl;
+
+    std::array<const void*, 1> input = {a_device_buf.GetDeviceBuffer()};
+    std::array<void*, 1> output      = {b_device_buf.GetDeviceBuffer()};
+
+    std::array<ck::index_t, 4> ab_lengths{N, H, W, C};
+
+    std::array<ck::index_t, 4> a_strides = {C * H * W, W, 1, H * W};
+    std::array<ck::index_t, 4> b_strides = {H * W * C, W * C, C, 1};
+
+    auto broadcastPermute = DeviceElementwisePermuteInstance{};
+    auto argument         = broadcastPermute.MakeArgumentPointer(
+        ab_lengths, {a_strides}, {b_strides}, input, output, PassThrough{});
+
+    if(!broadcastPermute.IsSupportedArgument(argument.get()))
+    {
+        throw std::runtime_error(
+            "The runtime parameters seems not supported by the device instance, exiting!");
+    };
+
+    std::cout << "A (nchw): " << a.mDesc << std::endl;
+    std::cout << "B (nhwc): " << b.mDesc << std::endl;
+
+    auto broadcastPermute_invoker_ptr = broadcastPermute.MakeInvokerPointer();
+    float ave_time =
+        broadcastPermute_invoker_ptr->Run(argument.get(), StreamConfig{nullptr, time_kernel});
+
+    std::size_t flop = std::size_t(2) * nchw[0] * nchw[1] * nchw[2] * nchw[3];
+
+    std::size_t num_btype = sizeof(ADataType) * (nchw[0] * nchw[1] * nchw[2] * nchw[3]) +
+                            sizeof(BDataType) * (nchw[0] * nchw[1] * nchw[2] * nchw[3]);
+
+    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
+              << std::endl;
+
+    bool pass = true;
+
+    if(do_verification)
+    {
+        b_device_buf.FromDevice(b.mData.data());
+        // LogRangeAsType<float>(std::cout << "Tensor b  : ", b.mData, ",") << std::endl;
+
+        Tensor<BDataType> host_b(nhwc);
+        host_elementwise4D<Tensor<ADataType>, Tensor<BDataType>, PassThrough>(
+            host_b, a, nchw, PassThrough{});
+
+        // LogRangeAsType<float>(std::cout << "Host b  : ", host_b.mData, ",") << std::endl;
+        pass &=
+            ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
+    }
+
+    return pass ? 0 : 1;
+}
--- a/example/45_elementwise_normalization/CMakeLists.txt
+++ b/example/45_elementwise_normalization/CMakeLists.txt
+add_example_executable(example_elementwise_layernorm_blockwise elementwise_layernorm_blockwise.cpp)
--- a/example/45_elementwise_normalization/elementwise_layernorm_blockwise.cpp
+++ b/example/45_elementwise_normalization/elementwise_layernorm_blockwise.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+#include <getopt.h>
+
+#include "ck/ck.hpp"
+#include "ck/utility/reduction_enums.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_elementwise_normalization_impl.hpp"
+#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_common_util.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp"
+
+using ADataType             = ck::half_t; // Input 1
+using BDataType             = ck::half_t; // Input 2
+using XDataType             = ck::half_t;
+using GammaDataType         = ck::half_t;
+using BetaDataType          = ck::half_t;
+using YDataType             = ck::half_t;
+using AccDataType           = float;
+using XElementwiseOperation = ck::tensor_operation::element_wise::Add;
+using YElementwiseOperation = ck::tensor_operation::element_wise::PassThrough;
+
+constexpr int Rank         = 2;
+constexpr int NumReduceDim = 1;
+
+// X = Elementwise(input1, input2, input3, ...)
+// Y = Layernorm(X, beta, gamma)
+using DeviceInstance = ck::tensor_operation::device::DeviceElementwiseNormalizationImpl<
+    ck::Tuple<ADataType, BDataType>,
+    GammaDataType,
+    BetaDataType,
+    AccDataType,
+    YDataType,
+    XElementwiseOperation,
+    YElementwiseOperation,
+    Rank,
+    NumReduceDim,
+    256, // BlockSize
+    8,   // ClusterM
+    32,  // ClusterK
+    1,   // SliceM
+    32,  // SliceK
+    1,   // SrcVecDim (0=M, 1=K)
+    8,   // SrcScalarPerVector
+    1,   // GammaVecDim (0=M, 1=K)
+    8,   // GammaScalarPerVector
+    1,   // BetaVecDim (0=M, 1=K)
+    8,   // BetaScalarPerVector
+    8>;  // OutScalarPerVector
+
+template <typename HostTensorA, typename HostTensorB, typename HostTensorC, typename Functor>
+void host_elementwise2D(HostTensorC& C,
+                        const HostTensorA& A,
+                        const HostTensorB& B,
+                        const std::vector<std::size_t>& shape,
+                        Functor functor)
+{
+    using ctype = ck::remove_reference_t<decltype(C(0, 0))>;
+
+    for(std::size_t m = 0; m < shape[0]; ++m)
+        for(std::size_t n = 0; n < shape[1]; ++n)
+        {
+            auto a_val  = A(m, n);
+            auto b_val  = B(m, n);
+            ctype c_val = 0;
+            functor(c_val, a_val, b_val);
+            C(m, n) = c_val;
+        }
+}
+
+int main()
+{
+    bool time_kernel = true;
+
+    ck::index_t M      = 48 * 256;
+    ck::index_t N      = 1024;
+    ck::index_t Stride = N;
+
+    auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) {
+        return HostTensorDescriptor(std::vector<std::size_t>({len}),
+                                    std::vector<std::size_t>({stride}));
+    };
+
+    auto f_host_tensor_descriptor2d = [](std::size_t row, std::size_t col, std::size_t stride) {
+        return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+                                    std::vector<std::size_t>({stride, 1}));
+    };
+
+    Tensor<ADataType> a(f_host_tensor_descriptor2d(M, N, Stride));
+    Tensor<BDataType> b(f_host_tensor_descriptor2d(M, N, Stride));
+    Tensor<GammaDataType> gamma(f_host_tensor_descriptor1d(N, 1));
+    Tensor<BetaDataType> beta(f_host_tensor_descriptor1d(N, 1));
+    Tensor<YDataType> y(f_host_tensor_descriptor2d(M, N, Stride));
+
+    a.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
+    b.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+    gamma.GenerateTensorValue(GeneratorTensor_2<GammaDataType>{-5, 5});
+    beta.GenerateTensorValue(GeneratorTensor_2<BetaDataType>{-5, 5});
+
+    DeviceMem a_dev(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
+    DeviceMem b_dev(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());
+    DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpaceSize());
+    DeviceMem beta_dev(sizeof(BetaDataType) * beta.mDesc.GetElementSpaceSize());
+    DeviceMem y_dev(sizeof(YDataType) * y.mDesc.GetElementSpaceSize());
+
+    a_dev.ToDevice(a.mData.data());
+    b_dev.ToDevice(b.mData.data());
+    gamma_dev.ToDevice(gamma.mData.data());
+    beta_dev.ToDevice(beta.mData.data());
+
+    std::array<const void*, 2> input = {a_dev.GetDeviceBuffer(), b_dev.GetDeviceBuffer()};
+
+    auto device_instance = DeviceInstance{};
+    auto argument_ptr    = device_instance.MakeArgumentPointer(
+        {M, N},
+        {
+            std::vector<ck::index_t>{a.mDesc.GetStrides().begin(), a.mDesc.GetStrides().end()},
+            std::vector<ck::index_t>{b.mDesc.GetStrides().begin(), b.mDesc.GetStrides().end()},
+        },
+        {0, 1},
+        {0, 1},
+        std::vector<ck::index_t>{y.mDesc.GetStrides().begin(), y.mDesc.GetStrides().end()},
+        {1},
+        1e-4,
+        input,
+        gamma_dev.GetDeviceBuffer(),
+        beta_dev.GetDeviceBuffer(),
+        y_dev.GetDeviceBuffer(),
+        XElementwiseOperation{},
+        YElementwiseOperation{});
+
+    if(!device_instance.IsSupportedArgument(argument_ptr.get()))
+    {
+        std::cout << "The runtime parameters are not supported" << std::endl;
+        return 1;
+    };
+
+    auto invoker_ptr = device_instance.MakeInvokerPointer();
+    float ela_time   = 0;
+    ela_time         = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+
+    float data_mem_size = M * N * sizeof(ADataType) + M * N * sizeof(BDataType) +
+                          M * N * sizeof(YDataType) + N * sizeof(GammaDataType) +
+                          N * sizeof(BetaDataType);
+    float bandwidth = data_mem_size * 1000 / ela_time / 1024 / 1024 / 1024;
+
+    std::cout << "Bandwidth is : " << bandwidth << "GB/s . " << std::endl;
+    std::cout << "Time elapase is : " << ela_time << " ms . " << std::endl;
+
+    bool pass = true;
+    {
+        std::vector<std::size_t> mn = {static_cast<unsigned long>(M),
+                                       static_cast<unsigned long>(N)};
+        Tensor<XDataType> x(f_host_tensor_descriptor2d(M, N, Stride));
+        host_elementwise2D<Tensor<ADataType>,
+                           Tensor<BDataType>,
+                           Tensor<XDataType>,
+                           XElementwiseOperation>(x, a, b, mn, XElementwiseOperation{});
+
+        Tensor<YDataType> host_y(f_host_tensor_descriptor2d(M, N, Stride));
+        using ReferenceInstance =
+            ck::tensor_operation::host::ReferenceLayernorm<XDataType,
+                                                           GammaDataType,
+                                                           BetaDataType,
+                                                           YDataType,
+                                                           AccDataType,
+                                                           YElementwiseOperation,
+                                                           Rank,
+                                                           NumReduceDim>;
+
+        ReferenceInstance ref;
+        auto ref_argument =
+            ref.MakeArgument(x, gamma, beta, host_y, YElementwiseOperation{}, {M, N}, {1}, 1e-4);
+        auto ref_invoker = ref.MakeInvoker();
+        ref_invoker.Run(ref_argument);
+
+        y_dev.FromDevice(y.mData.data());
+        pass &=
+            ck::utils::check_err(y.mData, host_y.mData, "Error: Incorrect results d1", 1e-3, 1e-3);
+        if(!(pass))
+        {
+            std::cout << "layernorm wrong" << std::endl;
+        }
+    }
+    return (pass ? 0 : 1);
+}