Merge remote-tracking branch 'origin/develop' into wavelet_model

95a83c6e · Adam Osewski · 5b7c2432 · 892a8d76 · 95a83c6e · 95a83c6e
Commit 95a83c6e authored Nov 18, 2022 by Adam Osewski
20 changed files
--- a/example/41_grouped_conv_conv_fwd/run_grouped_conv_conv_fwd_example.inc
+++ b/example/41_grouped_conv_conv_fwd/run_grouped_conv_conv_fwd_example.inc
@@ -97,7 +97,7 @@ bool run_grouped_conv_conv_fwd(bool do_verification,
    std::array<ck::index_t, NDimSpatial> input1_left_pads{};
    std::array<ck::index_t, NDimSpatial> input1_right_pads{};
-    auto copy = [](auto& x, auto& y) { std::copy(x.begin(), x.end(), y.begin()); };
+    auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); };
    copy(in0_g_n_c_wis_desc.GetLengths(), a0_g_n_c_wis_lengths);
    copy(in0_g_n_c_wis_desc.GetStrides(), a0_g_n_c_wis_strides);
@@ -120,18 +120,14 @@ bool run_grouped_conv_conv_fwd(bool do_verification,
    const ck::index_t gemm_batch = a0_g_n_c_wis_lengths[0];
    const ck::index_t gemm0_m_length =
-        e1_g_n_k_wos_lengths[1] * std::accumulate(e1_g_n_k_wos_lengths.begin() + 3,
+        e1_g_n_k_wos_lengths[1] *
-                                                  e1_g_n_k_wos_lengths.begin() + 3 + NDimSpatial,
+        ck::accumulate_n<ck::index_t>(
-                                                  ck::index_t{1},
+            e1_g_n_k_wos_lengths.begin() + 3, NDimSpatial, 1, std::multiplies<>{});
-                                                  std::multiplies<ck::index_t>{});
    const ck::index_t gemm0_n_length = b0_g_k_c_xs_lengths[1];
-    const ck::index_t gemm0_k_length =
+    const ck::index_t gemm0_k_length = ck::accumulate_n<ck::index_t>(
-        std::accumulate(b0_g_k_c_xs_lengths.begin() + 2,
+        b0_g_k_c_xs_lengths.begin() + 2, NDimSpatial + 1, 1, std::multiplies<>{});
-                        b0_g_k_c_xs_lengths.begin() + 2 + NDimSpatial + 1,
-                        ck::index_t{1},
-                        std::multiplies<ck::index_t>{});
    const ck::index_t gemm1_n_length = b1_g_k_c_xs_lengths[1];
@@ -261,7 +257,7 @@ bool run_grouped_conv_conv_fwd(bool do_verification,
 #endif
        return ck::utils::check_err(
-            out1_device.mData, out1_host.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
+            out1_device, out1_host, "Error: incorrect results!", 1e-5f, 1e-4f);
    }
    return true;

--- a/example/42_groupnorm/groupnorm_sigmoid_fp16.cpp
+++ b/example/42_groupnorm/groupnorm_sigmoid_fp16.cpp
@@ -100,9 +100,9 @@ int main(int argc, char* argv[])
    Tensor<GammaDataType> gamma({G, C});
    Tensor<BetaDataType> beta({G, C});
-    ck::utils::FillUniformDistribution<XDataType>{0.f, 1.f}(x.begin(), x.end());
+    ck::utils::FillUniformDistribution<XDataType>{0.f, 1.f}(x);
-    ck::utils::FillUniformDistribution<GammaDataType>{0.f, 1.f}(gamma.begin(), gamma.end());
+    ck::utils::FillUniformDistribution<GammaDataType>{0.f, 1.f}(gamma);
-    ck::utils::FillUniformDistribution<BetaDataType>{0.f, 1.f}(beta.begin(), beta.end());
+    ck::utils::FillUniformDistribution<BetaDataType>{0.f, 1.f}(beta);
    DeviceMem x_dev(sizeof(XDataType) * x.mDesc.GetElementSpaceSize());
    DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpaceSize());
@@ -128,6 +128,8 @@ int main(int argc, char* argv[])
        gamma_dev.GetDeviceBuffer(),
        beta_dev.GetDeviceBuffer(),
        y_dev.GetDeviceBuffer(),
+        nullptr,
+        nullptr,
        y_element_op);
    if(!device_instance.IsSupportedArgument(argument_ptr.get()))
@@ -165,7 +167,7 @@ int main(int argc, char* argv[])
        ref_invoker.Run(ref_argument);
        y_dev.FromDevice(y.mData.data());
-        pass &= ck::utils::check_err(y.mData, host_y.mData, "Error: Incorrect results", 1e-3, 1e-3);
+        pass &= ck::utils::check_err(y, host_y, "Error: Incorrect results", 1e-3, 1e-3);
    }
    return (pass ? 0 : 1);

--- a/example/44_conv2d_fwd_quant/CMakeLists.txt
+++ b/example/44_conv2d_fwd_quant/CMakeLists.txt
+add_example_executable(example_conv2d_fwd_xdl_perlayer_quantization_int8 conv2d_fwd_xdl_perlayer_quantization_int8.cpp)
+add_example_executable(example_conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8 conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp)
--- a/example/44_conv2d_fwd_quant/conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp
+++ b/example/44_conv2d_fwd_quant/conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/utility/algorithm.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/convolution_parameter.hpp"
+#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
+using InDataType       = int8_t;
+using WeiDataType      = int8_t;
+using BiasDataType     = int32_t;
+using AccDataType      = int32_t;
+using CShuffleDataType = int32_t;
+using OutDataType      = int8_t;
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+using PassThrough  = ck::tensor_operation::element_wise::PassThrough;
+using InElementOp  = PassThrough;
+using WeiElementOp = PassThrough;
+using ActivationOp = ck::tensor_operation::element_wise::Relu;
+using OutElementOp = ck::tensor_operation::element_wise::Add_Activation_Mul_Clamp<ActivationOp>;
+static constexpr auto ConvSpec =
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+template <ck::index_t NDimSpatial,
+          typename InLayout,
+          typename WeiLayout,
+          typename BiasLayout,
+          typename OutLayout>
+using DeviceGroupedConvNDFwdInstance =
+    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
+        NDimSpatial,
+        InLayout,
+        WeiLayout,
+        ck::Tuple<BiasLayout>,
+        OutLayout,
+        InDataType,
+        WeiDataType,
+        AccDataType,
+        CShuffleDataType,
+        ck::Tuple<BiasDataType>,
+        OutDataType,
+        InElementOp,
+        WeiElementOp,
+        OutElementOp,
+        ConvSpec,    // ConvForwardSpecialization
+        GemmSpec,    // GemmSpecialization
+        1,           //
+        256,         // BlockSize
+        128,         // MPerBlock
+        256,         // NPerBlock
+        64,          // KPerBlock
+        16,          // AK1
+        16,          // BK1
+        32,          // MPerXdl
+        32,          // NPerXdl
+        2,           // MXdlPerWave
+        4,           // NXdlPerWave
+        S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
+        S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
+        2,           // ABlockTransferSrcVectorDim
+        16,          // ABlockTransferSrcScalarPerVector
+        16,          // ABlockTransferDstScalarPerVector_AK1
+        1,           // ABlockLdsExtraM
+        S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
+        S<1, 0, 2>,  // BBlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,  // BBlockTransferSrcAccessOrder
+        2,           // BBlockTransferSrcVectorDim
+        16,          // BBlockTransferSrcScalarPerVector
+        16,          // BBlockTransferDstScalarPerVector_BK1
+        1,           // BBlockLdsExtraN
+        1,
+        1,
+        S<1, 64, 1, 4>,
+        8>;
+template <ck::index_t NDimSpatial,
+          typename InDataType,
+          typename WeiDataType,
+          typename OutDataType,
+          typename InElementOp,
+          typename WeiElementOp,
+          typename OutElementOp,
+          typename DeviceConvNDFwdInstance>
+bool run_grouped_conv_fwd(bool do_verification,
+                          bool time_kernel,
+                          const ck::utils::conv::ConvParam& conv_param,
+                          const HostTensorDescriptor& in_g_n_c_wis_desc,
+                          const HostTensorDescriptor& wei_g_k_c_xs_desc,
+                          const HostTensorDescriptor& bias_g_k_desc,
+                          const HostTensorDescriptor& out_g_n_k_wos_desc,
+                          const InElementOp& in_element_op,
+                          const WeiElementOp& wei_element_op,
+                          const OutElementOp& out_element_op)
+{
+    Tensor<InDataType> in(in_g_n_c_wis_desc);
+    Tensor<WeiDataType> wei(wei_g_k_c_xs_desc);
+    Tensor<BiasDataType> bias(bias_g_k_desc);
+    Tensor<OutDataType> out_host(out_g_n_k_wos_desc);
+    Tensor<OutDataType> out_device(out_g_n_k_wos_desc);
+    std::cout << "in: " << in.mDesc << std::endl;
+    std::cout << "wei: " << wei.mDesc << std::endl;
+    std::cout << "bias: " << bias.mDesc << std::endl;
+    std::cout << "out: " << out_host.mDesc << std::endl;
+    in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
+    wei.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
+    bias.GenerateTensorValue(GeneratorTensor_2<BiasDataType>{-5, 5});
+    DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
+    DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize());
+    DeviceMem bias_device_buf(sizeof(BiasDataType) * bias.mDesc.GetElementSpaceSize());
+    DeviceMem out_device_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize());
+    in_device_buf.ToDevice(in.mData.data());
+    wei_device_buf.ToDevice(wei.mData.data());
+    bias_device_buf.ToDevice(bias.mData.data());
+    std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_strides{};
+    std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_strides{};
+    std::array<ck::index_t, NDimSpatial + 3> d0_g_n_k_wos_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> d0_g_n_k_wos_strides{};
+    std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_strides{};
+    std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
+    std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
+    std::array<ck::index_t, NDimSpatial> input_left_pads{};
+    std::array<ck::index_t, NDimSpatial> input_right_pads{};
+    auto copy = [](auto& x, auto& y) { ck::ranges::copy(x, y.begin()); };
+    copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths);
+    copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides);
+    copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths);
+    copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides);
+    copy(bias_g_k_desc.GetLengths(), d0_g_n_k_wos_lengths);
+    copy(bias_g_k_desc.GetStrides(), d0_g_n_k_wos_strides);
+    copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths);
+    copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides);
+    copy(conv_param.conv_filter_strides_, conv_filter_strides);
+    copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
+    copy(conv_param.input_left_pads_, input_left_pads);
+    copy(conv_param.input_right_pads_, input_right_pads);
+    // do Conv
+    auto conv     = DeviceConvNDFwdInstance{};
+    auto invoker  = conv.MakeInvoker();
+    auto argument = conv.MakeArgument(
+        in_device_buf.GetDeviceBuffer(),
+        wei_device_buf.GetDeviceBuffer(),
+        std::array<const void*, 1>{bias_device_buf.GetDeviceBuffer()},
+        out_device_buf.GetDeviceBuffer(),
+        a_g_n_c_wis_lengths,
+        a_g_n_c_wis_strides,
+        b_g_k_c_xs_lengths,
+        b_g_k_c_xs_strides,
+        std::array<std::array<ck::index_t, NDimSpatial + 3>, 1>{{d0_g_n_k_wos_lengths}},
+        std::array<std::array<ck::index_t, NDimSpatial + 3>, 1>{{d0_g_n_k_wos_strides}},
+        e_g_n_k_wos_lengths,
+        e_g_n_k_wos_strides,
+        conv_filter_strides,
+        conv_filter_dilations,
+        input_left_pads,
+        input_right_pads,
+        in_element_op,
+        wei_element_op,
+        out_element_op);
+    if(!conv.IsSupportedArgument(argument))
+    {
+        throw std::runtime_error(
+            "wrong! device_conv with the specified compilation parameters does "
+            "not support this Conv problem");
+    }
+    float avg_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+    std::size_t flop      = conv_param.GetFlops();
+    std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>();
+    float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
+    float gb_per_sec = num_btype / 1.E6 / avg_time;
+    std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << conv.GetTypeString() << std::endl;
+    bool pass = true;
+    if(do_verification)
+    {
+        Tensor<CShuffleDataType> c_host(out_g_n_k_wos_desc);
+        auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
+                                                                     InDataType,
+                                                                     WeiDataType,
+                                                                     CShuffleDataType,
+                                                                     InElementOp,
+                                                                     WeiElementOp,
+                                                                     PassThrough>();
+        auto ref_invoker  = ref_conv.MakeInvoker();
+        auto ref_argument = ref_conv.MakeArgument(in,
+                                                  wei,
+                                                  c_host,
+                                                  conv_param.conv_filter_strides_,
+                                                  conv_param.conv_filter_dilations_,
+                                                  conv_param.input_left_pads_,
+                                                  conv_param.input_right_pads_,
+                                                  in_element_op,
+                                                  wei_element_op,
+                                                  PassThrough{});
+        ref_invoker.Run(ref_argument);
+        // TODO: implement elementwise operation for host
+        out_host.ForEach(
+            [&](auto&, auto idx) { out_element_op(out_host(idx), c_host(idx), bias(idx)); });
+        out_device_buf.FromDevice(out_device.mData.data());
+        pass &= ck::utils::check_err(
+            out_device.mData, out_host.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
+    }
+    return (pass ? 0 : 1);
+}
+int main()
+{
+    bool do_verification           = true;
+    bool time_kernel               = true;
+    const ck::index_t ndim_spatial = 2;
+    ck::utils::conv::ConvParam conv_param{
+        ndim_spatial, // n_dim
+        1,            // group
+        4,            // batch
+        64,           // output channels
+        32,           // input chanels
+        {3, 3},       // weight HW
+        {71, 71},     // x HW
+        {2, 2},       // strides
+        {1, 1},       // dilations
+        {1, 1},       // left_pads
+        {1, 1}        // right_pads
+    };
+    const auto in_element_op  = InElementOp{};
+    const auto wei_element_op = WeiElementOp{};
+    const auto out_element_op = OutElementOp{0.5f, ActivationOp{}};
+    using InLayout   = ck::tensor_layout::convolution::GNHWC;
+    using WeiLayout  = ck::tensor_layout::convolution::GKYXC;
+    using BiasLayout = ck::tensor_layout::convolution::G_K;
+    using OutLayout  = ck::tensor_layout::convolution::GNHWK;
+    const auto in_g_n_c_wis_desc =
+        ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
+    const auto wei_g_k_c_xs_desc =
+        ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(conv_param);
+    // TODO - make_bias_host_tensor_descriptor_g_n_k_wos_packed()
+    const auto bias_g_k_desc = HostTensorDescriptor({conv_param.G_,
+                                                     conv_param.N_,
+                                                     conv_param.K_,
+                                                     conv_param.output_spatial_lengths_[0],
+                                                     conv_param.output_spatial_lengths_[1]},
+                                                    {
+                                                        conv_param.K_, // g
+                                                        0,             // n
+                                                        1,             // k
+                                                        0,             // ho
+                                                        0              // wo
+                                                    });
+    const auto out_g_n_k_wos_desc =
+        ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
+    std::cout << out_g_n_k_wos_desc << std::endl;
+    return run_grouped_conv_fwd<
+        ndim_spatial,
+        InDataType,
+        WeiDataType,
+        OutDataType,
+        InElementOp,
+        WeiElementOp,
+        OutElementOp,
+        DeviceGroupedConvNDFwdInstance<ndim_spatial, InLayout, WeiLayout, BiasLayout, OutLayout>>(
+        do_verification,
+        time_kernel,
+        conv_param,
+        in_g_n_c_wis_desc,
+        wei_g_k_c_xs_desc,
+        bias_g_k_desc,
+        out_g_n_k_wos_desc,
+        in_element_op,
+        wei_element_op,
+        out_element_op);
+}
--- a/example/44_conv2d_fwd_quant/conv2d_fwd_xdl_perlayer_quantization_int8.cpp
+++ b/example/44_conv2d_fwd_quant/conv2d_fwd_xdl_perlayer_quantization_int8.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/utility/algorithm.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/convolution_parameter.hpp"
+#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
+using InDataType       = int8_t;
+using WeiDataType      = int8_t;
+using AccDataType      = int32_t;
+using CShuffleDataType = int32_t;
+using OutDataType      = int8_t;
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+using PassThrough  = ck::tensor_operation::element_wise::PassThrough;
+using InElementOp  = PassThrough;
+using WeiElementOp = PassThrough;
+using ActivationOp = PassThrough;
+using OutElementOp = ck::tensor_operation::element_wise::Activation_Mul_Clamp<ActivationOp>;
+static constexpr auto ConvSpec =
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+template <ck::index_t NDimSpatial, typename InLayout, typename WeiLayout, typename OutLayout>
+using DeviceGroupedConvNDFwdInstance =
+    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
+        NDimSpatial,
+        InLayout,
+        WeiLayout,
+        ck::Tuple<>,
+        OutLayout,
+        InDataType,
+        WeiDataType,
+        AccDataType,
+        CShuffleDataType,
+        ck::Tuple<>,
+        OutDataType,
+        InElementOp,
+        WeiElementOp,
+        OutElementOp,
+        ConvSpec,    // ConvForwardSpecialization
+        GemmSpec,    // GemmSpecialization
+        1,           //
+        256,         // BlockSize
+        128,         // MPerBlock
+        256,         // NPerBlock
+        64,          // KPerBlock
+        16,          // AK1
+        16,          // BK1
+        32,          // MPerXdl
+        32,          // NPerXdl
+        2,           // MXdlPerWave
+        4,           // NXdlPerWave
+        S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
+        S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
+        2,           // ABlockTransferSrcVectorDim
+        16,          // ABlockTransferSrcScalarPerVector
+        16,          // ABlockTransferDstScalarPerVector_AK1
+        1,           // ABlockLdsExtraM
+        S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
+        S<1, 0, 2>,  // BBlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,  // BBlockTransferSrcAccessOrder
+        2,           // BBlockTransferSrcVectorDim
+        16,          // BBlockTransferSrcScalarPerVector
+        16,          // BBlockTransferDstScalarPerVector_BK1
+        1,           // BBlockLdsExtraN
+        1,
+        1,
+        S<1, 64, 1, 4>,
+        16>;
+template <ck::index_t NDimSpatial,
+          typename InDataType,
+          typename WeiDataType,
+          typename OutDataType,
+          typename InElementOp,
+          typename WeiElementOp,
+          typename OutElementOp,
+          typename DeviceConvNDFwdInstance>
+bool run_grouped_conv_fwd(bool do_verification,
+                          bool time_kernel,
+                          const ck::utils::conv::ConvParam& conv_param,
+                          const HostTensorDescriptor& in_g_n_c_wis_desc,
+                          const HostTensorDescriptor& wei_g_k_c_xs_desc,
+                          const HostTensorDescriptor& out_g_n_k_wos_desc,
+                          const InElementOp& in_element_op,
+                          const WeiElementOp& wei_element_op,
+                          const OutElementOp& out_element_op)
+{
+    Tensor<InDataType> in(in_g_n_c_wis_desc);
+    Tensor<WeiDataType> wei(wei_g_k_c_xs_desc);
+    Tensor<OutDataType> out_host(out_g_n_k_wos_desc);
+    Tensor<OutDataType> out_device(out_g_n_k_wos_desc);
+    std::cout << "in: " << in.mDesc << std::endl;
+    std::cout << "wei: " << wei.mDesc << std::endl;
+    std::cout << "out: " << out_host.mDesc << std::endl;
+    in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
+    wei.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
+    DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
+    DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize());
+    DeviceMem out_device_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize());
+    in_device_buf.ToDevice(in.mData.data());
+    wei_device_buf.ToDevice(wei.mData.data());
+    std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_strides{};
+    std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_strides{};
+    std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_strides{};
+    std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
+    std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
+    std::array<ck::index_t, NDimSpatial> input_left_pads{};
+    std::array<ck::index_t, NDimSpatial> input_right_pads{};
+    auto copy = [](auto& x, auto& y) { ck::ranges::copy(x, y.begin()); };
+    copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths);
+    copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides);
+    copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths);
+    copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides);
+    copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths);
+    copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides);
+    copy(conv_param.conv_filter_strides_, conv_filter_strides);
+    copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
+    copy(conv_param.input_left_pads_, input_left_pads);
+    copy(conv_param.input_right_pads_, input_right_pads);
+    // do Conv
+    auto conv     = DeviceConvNDFwdInstance{};
+    auto invoker  = conv.MakeInvoker();
+    auto argument = conv.MakeArgument(in_device_buf.GetDeviceBuffer(),
+                                      wei_device_buf.GetDeviceBuffer(),
+                                      std::array<const void*, 0>{},
+                                      out_device_buf.GetDeviceBuffer(),
+                                      a_g_n_c_wis_lengths,
+                                      a_g_n_c_wis_strides,
+                                      b_g_k_c_xs_lengths,
+                                      b_g_k_c_xs_strides,
+                                      std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{{}},
+                                      std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{{}},
+                                      e_g_n_k_wos_lengths,
+                                      e_g_n_k_wos_strides,
+                                      conv_filter_strides,
+                                      conv_filter_dilations,
+                                      input_left_pads,
+                                      input_right_pads,
+                                      in_element_op,
+                                      wei_element_op,
+                                      out_element_op);
+    if(!conv.IsSupportedArgument(argument))
+    {
+        throw std::runtime_error(
+            "wrong! device_conv with the specified compilation parameters does "
+            "not support this Conv problem");
+    }
+    float avg_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+    std::size_t flop      = conv_param.GetFlops();
+    std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>();
+    float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
+    float gb_per_sec = num_btype / 1.E6 / avg_time;
+    std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << conv.GetTypeString() << std::endl;
+    bool pass = true;
+    if(do_verification)
+    {
+        auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
+                                                                     InDataType,
+                                                                     WeiDataType,
+                                                                     OutDataType,
+                                                                     InElementOp,
+                                                                     WeiElementOp,
+                                                                     OutElementOp>();
+        auto ref_invoker  = ref_conv.MakeInvoker();
+        auto ref_argument = ref_conv.MakeArgument(in,
+                                                  wei,
+                                                  out_host,
+                                                  conv_param.conv_filter_strides_,
+                                                  conv_param.conv_filter_dilations_,
+                                                  conv_param.input_left_pads_,
+                                                  conv_param.input_right_pads_,
+                                                  in_element_op,
+                                                  wei_element_op,
+                                                  out_element_op);
+        ref_invoker.Run(ref_argument);
+        out_device_buf.FromDevice(out_device.mData.data());
+        pass &= ck::utils::check_err(
+            out_device.mData, out_host.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
+    }
+    return (pass ? 0 : 1);
+}
+int main()
+{
+    bool do_verification           = true;
+    bool time_kernel               = true;
+    const ck::index_t ndim_spatial = 2;
+    ck::utils::conv::ConvParam conv_param{
+        ndim_spatial, // n_dim
+        1,            // group
+        4,            // batch
+        64,           // output channels
+        32,           // input chanels
+        {3, 3},       // weight HW
+        {71, 71},     // x HW
+        {2, 2},       // strides
+        {1, 1},       // dilations
+        {1, 1},       // left_pads
+        {1, 1}        // right_pads
+    };
+    const auto in_element_op  = InElementOp{};
+    const auto wei_element_op = WeiElementOp{};
+    const auto out_element_op = OutElementOp{0.5f, ActivationOp{}};
+    using InLayout  = ck::tensor_layout::convolution::GNHWC;
+    using WeiLayout = ck::tensor_layout::convolution::GKYXC;
+    using OutLayout = ck::tensor_layout::convolution::GNHWK;
+    const auto in_g_n_c_wis_desc =
+        ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
+    const auto wei_g_k_c_xs_desc =
+        ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(conv_param);
+    const auto out_g_n_k_wos_desc =
+        ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
+    return run_grouped_conv_fwd<
+        ndim_spatial,
+        InDataType,
+        WeiDataType,
+        OutDataType,
+        InElementOp,
+        WeiElementOp,
+        OutElementOp,
+        DeviceGroupedConvNDFwdInstance<ndim_spatial, InLayout, WeiLayout, OutLayout>>(
+        do_verification,
+        time_kernel,
+        conv_param,
+        in_g_n_c_wis_desc,
+        wei_g_k_c_xs_desc,
+        out_g_n_k_wos_desc,
+        in_element_op,
+        wei_element_op,
+        out_element_op);
+}
--- a/example/44_elementwise_permute/elementwise_permute_4D_fp16.cpp
+++ b/example/44_elementwise_permute/elementwise_permute_4D_fp16.cpp
@@ -5,6 +5,7 @@
 #include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_elementwise.hpp"
+#include "ck/library/utility/algorithm.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
@@ -69,7 +70,7 @@ int main()
                                            static_cast<int>(nhwc[2] * nhwc[3]),
                                            static_cast<int>(nhwc[3])};
-    std::copy(nchw.begin(), nchw.end(), ab_lengths.begin());
+    ck::ranges::copy(nchw, ab_lengths.begin());
    auto broadcastPermute = DeviceElementwisePermuteInstance{};
    auto argument         = broadcastPermute.MakeArgumentPointer(

--- a/example/45_elementwise_normalization/CMakeLists.txt
+++ b/example/45_elementwise_normalization/CMakeLists.txt
+add_example_executable(example_elementwise_layernorm_blockwise elementwise_layernorm_blockwise.cpp)
--- a/example/45_elementwise_normalization/elementwise_layernorm_blockwise.cpp
+++ b/example/45_elementwise_normalization/elementwise_layernorm_blockwise.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+#include <getopt.h>
+#include "ck/ck.hpp"
+#include "ck/utility/reduction_enums.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_elementwise_normalization_impl.hpp"
+#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_common_util.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp"
+using ADataType             = ck::half_t; // Input 1
+using BDataType             = ck::half_t; // Input 2
+using XDataType             = ck::half_t;
+using GammaDataType         = ck::half_t;
+using BetaDataType          = ck::half_t;
+using YDataType             = ck::half_t;
+using AccDataType           = float;
+using XElementwiseOperation = ck::tensor_operation::element_wise::Add;
+using YElementwiseOperation = ck::tensor_operation::element_wise::PassThrough;
+constexpr int Rank         = 2;
+constexpr int NumReduceDim = 1;
+// X = Elementwise(input1, input2, input3, ...)
+// Y = Layernorm(X, beta, gamma)
+using DeviceInstance = ck::tensor_operation::device::DeviceElementwiseNormalizationImpl<
+    ck::Tuple<ADataType, BDataType>,
+    GammaDataType,
+    BetaDataType,
+    AccDataType,
+    YDataType,
+    XElementwiseOperation,
+    YElementwiseOperation,
+    Rank,
+    NumReduceDim,
+    256, // BlockSize
+    8,   // ClusterM
+    32,  // ClusterK
+    1,   // SliceM
+    32,  // SliceK
+    1,   // SrcVecDim (0=M, 1=K)
+    8,   // SrcScalarPerVector
+    1,   // GammaVecDim (0=M, 1=K)
+    8,   // GammaScalarPerVector
+    1,   // BetaVecDim (0=M, 1=K)
+    8,   // BetaScalarPerVector
+    8>;  // OutScalarPerVector
+template <typename HostTensorA, typename HostTensorB, typename HostTensorC, typename Functor>
+void host_elementwise2D(HostTensorC& C,
+                        const HostTensorA& A,
+                        const HostTensorB& B,
+                        const std::vector<std::size_t>& shape,
+                        Functor functor)
+{
+    using ctype = ck::remove_reference_t<decltype(C(0, 0))>;
+    for(std::size_t m = 0; m < shape[0]; ++m)
+        for(std::size_t n = 0; n < shape[1]; ++n)
+        {
+            auto a_val  = A(m, n);
+            auto b_val  = B(m, n);
+            ctype c_val = 0;
+            functor(c_val, a_val, b_val);
+            C(m, n) = c_val;
+        }
+}
+int main()
+{
+    bool time_kernel = true;
+    ck::index_t M      = 48 * 256;
+    ck::index_t N      = 1024;
+    ck::index_t Stride = N;
+    auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) {
+        return HostTensorDescriptor(std::vector<std::size_t>({len}),
+                                    std::vector<std::size_t>({stride}));
+    };
+    auto f_host_tensor_descriptor2d = [](std::size_t row, std::size_t col, std::size_t stride) {
+        return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+                                    std::vector<std::size_t>({stride, 1}));
+    };
+    Tensor<ADataType> a(f_host_tensor_descriptor2d(M, N, Stride));
+    Tensor<BDataType> b(f_host_tensor_descriptor2d(M, N, Stride));
+    Tensor<GammaDataType> gamma(f_host_tensor_descriptor1d(N, 1));
+    Tensor<BetaDataType> beta(f_host_tensor_descriptor1d(N, 1));
+    Tensor<YDataType> y(f_host_tensor_descriptor2d(M, N, Stride));
+    a.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
+    b.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+    gamma.GenerateTensorValue(GeneratorTensor_2<GammaDataType>{-5, 5});
+    beta.GenerateTensorValue(GeneratorTensor_2<BetaDataType>{-5, 5});
+    DeviceMem a_dev(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
+    DeviceMem b_dev(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());
+    DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpaceSize());
+    DeviceMem beta_dev(sizeof(BetaDataType) * beta.mDesc.GetElementSpaceSize());
+    DeviceMem y_dev(sizeof(YDataType) * y.mDesc.GetElementSpaceSize());
+    a_dev.ToDevice(a.mData.data());
+    b_dev.ToDevice(b.mData.data());
+    gamma_dev.ToDevice(gamma.mData.data());
+    beta_dev.ToDevice(beta.mData.data());
+    std::array<const void*, 2> input = {a_dev.GetDeviceBuffer(), b_dev.GetDeviceBuffer()};
+    auto device_instance = DeviceInstance{};
+    auto argument_ptr    = device_instance.MakeArgumentPointer(
+        {M, N},
+        {
+            std::vector<ck::index_t>{a.mDesc.GetStrides().begin(), a.mDesc.GetStrides().end()},
+            std::vector<ck::index_t>{b.mDesc.GetStrides().begin(), b.mDesc.GetStrides().end()},
+        },
+        {0, 1},
+        {0, 1},
+        std::vector<ck::index_t>{y.mDesc.GetStrides().begin(), y.mDesc.GetStrides().end()},
+        {1},
+        1e-4,
+        input,
+        gamma_dev.GetDeviceBuffer(),
+        beta_dev.GetDeviceBuffer(),
+        y_dev.GetDeviceBuffer(),
+        XElementwiseOperation{},
+        YElementwiseOperation{});
+    if(!device_instance.IsSupportedArgument(argument_ptr.get()))
+    {
+        std::cout << "The runtime parameters are not supported" << std::endl;
+        return 1;
+    };
+    auto invoker_ptr = device_instance.MakeInvokerPointer();
+    float ela_time   = 0;
+    ela_time         = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+    float data_mem_size = M * N * sizeof(ADataType) + M * N * sizeof(BDataType) +
+                          M * N * sizeof(YDataType) + N * sizeof(GammaDataType) +
+                          N * sizeof(BetaDataType);
+    float bandwidth = data_mem_size * 1000 / ela_time / 1024 / 1024 / 1024;
+    std::cout << "Bandwidth is : " << bandwidth << "GB/s . " << std::endl;
+    std::cout << "Time elapase is : " << ela_time << " ms . " << std::endl;
+    bool pass = true;
+    {
+        std::vector<std::size_t> mn = {static_cast<unsigned long>(M),
+                                       static_cast<unsigned long>(N)};
+        Tensor<XDataType> x(f_host_tensor_descriptor2d(M, N, Stride));
+        host_elementwise2D<Tensor<ADataType>,
+                           Tensor<BDataType>,
+                           Tensor<XDataType>,
+                           XElementwiseOperation>(x, a, b, mn, XElementwiseOperation{});
+        Tensor<YDataType> host_y(f_host_tensor_descriptor2d(M, N, Stride));
+        using ReferenceInstance =
+            ck::tensor_operation::host::ReferenceLayernorm<XDataType,
+                                                           GammaDataType,
+                                                           BetaDataType,
+                                                           YDataType,
+                                                           AccDataType,
+                                                           YElementwiseOperation,
+                                                           Rank,
+                                                           NumReduceDim>;
+        ReferenceInstance ref;
+        auto ref_argument =
+            ref.MakeArgument(x, gamma, beta, host_y, YElementwiseOperation{}, {M, N}, {1}, 1e-4);
+        auto ref_invoker = ref.MakeInvoker();
+        ref_invoker.Run(ref_argument);
+        y_dev.FromDevice(y.mData.data());
+        pass &=
+            ck::utils::check_err(y.mData, host_y.mData, "Error: Incorrect results d1", 1e-3, 1e-3);
+        if(!(pass))
+        {
+            std::cout << "layernorm wrong" << std::endl;
+        }
+    }
+    return (pass ? 0 : 1);
+}
--- a/example/CMakeLists.txt
+++ b/example/CMakeLists.txt
@@ -12,6 +12,7 @@ function(add_example_executable EXAMPLE_NAME FILE_NAME)
    add_test(NAME ${EXAMPLE_NAME} COMMAND $<TARGET_FILE:${EXAMPLE_NAME}> ${ARGN})
    add_dependencies(examples ${EXAMPLE_NAME})
    add_dependencies(check ${EXAMPLE_NAME})
+    rocm_install(TARGETS ${EXAMPLE_NAME} COMPONENT examples)
 endfunction(add_example_executable EXAMPLE_NAME)
 function(add_example_executable_no_testing EXAMPLE_NAME FILE_NAME)
@@ -19,6 +20,7 @@ function(add_example_executable_no_testing EXAMPLE_NAME FILE_NAME)
    add_executable(${EXAMPLE_NAME} ${FILE_NAME})
    target_link_libraries(${EXAMPLE_NAME} PRIVATE utility)
    add_dependencies(examples ${EXAMPLE_NAME})
+    rocm_install(TARGETS ${EXAMPLE_NAME} COMPONENT examples)
 endfunction(add_example_executable_no_testing EXAMPLE_NAME)
 # add all example subdir

--- a/include/ck/ck.hpp
+++ b/include/ck/ck.hpp
@@ -131,8 +131,14 @@
 #define CK_EXPERIMENTAL_USE_MEMCPY_FOR_BIT_CAST 1
 // experimental feature: optimize for inter-wave scheduling policy
-#define CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING 0
+#define CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING 1
 #define CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING_MAC_CLUSTERS 1
+// this will let make_default_loop_scheduler() return interwave scheduling flag by default
+#define CK_EXPERIMENTAL_DEFAULT_TO_INTER_WAVE_SCHEDULING 0
+// experimental feature: add instances using interwave scheduling
+#define CK_EXPERIMENTAL_INTER_WAVE_INSTANCES 1
+// experimental feature: add instances using pipeline v2
+#define CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES 1
 // hack: have underlying assumption that need to be satsified, otherwise it's a bug
 // hack for forcing register to keep idx_diff_low_const in SGPR. idx_diff_low_const must be
@@ -149,21 +155,17 @@
 // workaround: compiler gnerating inefficient ds_write instructions
 #define CK_WORKAROUND_SWDEV_XXXXXX_INT8_DS_WRITE_ISSUE 1
-// (gfx908 only) workaround: compiler crash in fused kernels on mainline #9110; #10738 seems ok
-// error message was "fatal error: error in backend: Error while trying to spill VGPR0 from class
-// VGPR_32: Cannot scavenge register without an emergency spill slot!"
-// this fall back to less ideal way of handle NPadding in fused attention kernel
-#ifdef __gfx908__
-#define CK_WORKAROUND_SWDEV_XXXXXX_ATTN_KERNEL_CLANG_CANNOT_SCAVENGE_REGISTER 1
-#else
-// for __gfx90a__, ...
-#define CK_WORKAROUND_SWDEV_XXXXXX_ATTN_KERNEL_CLANG_CANNOT_SCAVENGE_REGISTER 0
-#endif // __gfx908__
 // workaround: verifaction failure, due to compiler regression, for conv bwd-data fp16 using some
 // tuning parameter
 #define CK_WORKAROUND_SWDEV_325164 0
+// workaround: a BF16 attention kernel for gfx908 is likely affected by a compiler issue
+#ifdef __gfx908__
+#define CK_WORKAROUND_SWDEV_XXXXXX_BF16_ATTEN_FWD_GFX908_ISSUE 1
+#else // __gfx90a__, ...
+#define CK_WORKAROUND_SWDEV_XXXXXX_BF16_ATTEN_FWD_GFX908_ISSUE 0
+#endif // __gfx908__
 namespace ck {
 enum struct InMemoryDataOperationEnum

--- a/include/ck/tensor_description/tensor_space_filling_curve.hpp
+++ b/include/ck/tensor_description/tensor_space_filling_curve.hpp
@@ -14,7 +14,8 @@ namespace ck {
 template <typename TensorLengths,
          typename DimAccessOrder,
-          typename ScalarsPerAccess> // # of scalars per access in each dimension
+          typename ScalarsPerAccess,
+          bool SnakeCurved = true> // # of scalars per access in each dimension
 struct SpaceFillingCurve
 {
    static constexpr index_t nDim = TensorLengths::Size();
@@ -136,9 +137,10 @@ struct SpaceFillingCurve
            Index ordered_idx;
            static_for<0, nDim, 1>{}([&](auto idim) {
-                ordered_idx(idim) = forward_sweep[idim] ? ordered_access_idx[idim]
+                ordered_idx(idim) =
-                                                        : ordered_access_lengths[idim] - 1 -
+                    !SnakeCurved || forward_sweep[idim]
-                                                              ordered_access_idx[idim];
+                        ? ordered_access_idx[idim]
+                        : ordered_access_lengths[idim] - 1 - ordered_access_idx[idim];
            });
            return container_reorder_given_old2new(ordered_idx, dim_access_order) *

--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
@@ -18,11 +18,11 @@ enum struct LoopScheduler
 constexpr LoopScheduler make_default_loop_scheduler()
 {
-#if CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING
+#if CK_EXPERIMENTAL_DEFAULT_TO_INTER_WAVE_SCHEDULING
    return LoopScheduler::Interwave;
 #else
    return LoopScheduler::Default;
-#endif // if CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING
+#endif // if CK_EXPERIMENTAL_DEFAULT_TO_INTER_WAVE_SCHEDULING
 }
 template <index_t MNXdlPerWave, index_t MNWaves, index_t MNPerXdl, typename TileDesc_K0_MN_K1>
@@ -151,6 +151,27 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
        return make_tuple(c_thread_m, c_thread_n);
    }
+    template <index_t m0, index_t n0, index_t xdlops_i, index_t blk_i>
+    __device__ static auto
+        CalculateCThreadOriginDataIndex8D(Number<m0>, Number<n0>, Number<xdlops_i>, Number<blk_i>)
+    {
+        const auto wave_idx = GetWaveIdx();
+        const auto waveId_m = wave_idx[I0];
+        const auto waveId_n = wave_idx[I1];
+        const auto blk_idx = xdlops_gemm.GetBeginOfThreadBlk4D(xdlops_i, blk_i);
+        return make_tuple(Number<m0>{},
+                          Number<n0>{},
+                          waveId_m,
+                          waveId_n,
+                          blk_idx[I0],
+                          blk_idx[I1],
+                          blk_idx[I2],
+                          blk_idx[I3]);
+    }
    __host__ __device__ BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1()
    {
        static_assert(AK0MK1BlockDesc::IsKnownAtCompileTime() &&
@@ -724,6 +745,21 @@ struct BlockwiseGemmXdlops_v2
        return make_tuple(c_thread_m, c_thread_n);
    }
+    template <index_t m0, index_t n0, index_t xdlops_i, index_t blk_i>
+    __device__ static auto
+        CalculateCThreadOriginDataIndex8D(Number<m0>, Number<n0>, Number<xdlops_i>, Number<blk_i>)
+    {
+        const auto wave_idx = GetWaveIdx();
+        const auto waveId_m = wave_idx[I0];
+        const auto waveId_n = wave_idx[I1];
+        const auto blk_idx = xdlops_gemm.GetBeginOfThreadBlk4D(xdlops_i, blk_i);
+        return make_tuple(
+            m0, n0, waveId_m, waveId_n, blk_idx[I0], blk_idx[I1], blk_idx[I2], blk_idx[I3]);
+    }
    using Tuple4 = decltype(CalculateAThreadOriginDataIndex());
    __host__ __device__ BlockwiseGemmXdlops_v2(Tuple4 a_origin = CalculateAThreadOriginDataIndex(),

--- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm.hpp
@@ -24,7 +24,8 @@ template <typename ALayout,
          typename B0ElementwiseOperation,
          typename Acc0ElementwiseOperation,
          typename B1ElementwiseOperation,
-          typename CElementwiseOperation>
+          typename CElementwiseOperation,
+          bool MaskOutUpperTriangle> // TODO: enum for mask type
 struct DeviceBatchedGemmSoftmaxGemm : public BaseOperator
 {
    virtual std::unique_ptr<BaseArgument>

--- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute.hpp
@@ -7,49 +7,60 @@
 #include <vector>
 #include "device_base.hpp"
+#include "ck/tensor_operation/gpu/device/masking_specialization.hpp"
 namespace ck {
 namespace tensor_operation {
 namespace device {
-template <typename ALayout,
+template <index_t NumDimG,
-          typename B0Layout,
+          index_t NumDimM,
-          typename B1Layout,
+          index_t NumDimN,
-          typename CPermuteNumDims_G_M_Gemm1N, // Sequence<>
+          index_t NumDimK,
+          index_t NumDimO,
          typename ADataType,
          typename B0DataType,
          typename B1DataType,
          typename CDataType,
+          typename Acc0BiasDataType,
+          typename Acc1BiasDataType,
          typename AElementwiseOperation,
          typename B0ElementwiseOperation,
          typename Acc0ElementwiseOperation,
          typename B1ElementwiseOperation,
-          typename CElementwiseOperation>
+          typename CElementwiseOperation,
+          MaskingSpecialization MaskingSpec>
 struct DeviceBatchedGemmSoftmaxGemmPermute : public BaseOperator
 {
-    virtual std::unique_ptr<BaseArgument>
+    static constexpr index_t NumAcc0Bias = Acc0BiasDataType::Size();
-    MakeArgumentPointer(const void* p_a,
+    static constexpr index_t NumAcc1Bias = Acc1BiasDataType::Size();
-                        const void* p_b0,
-                        const void* p_b1,
+    virtual std::unique_ptr<BaseArgument> MakeArgumentPointer(
-                        void* p_c,
+        const void* p_a,
-                        ck::index_t M,
+        const void* p_b0,
-                        ck::index_t N,
+        const void* p_b1,
-                        ck::index_t K,
+        void* p_c,
-                        ck::index_t O,
+        const std::array<void*, NumAcc0Bias> p_acc0_biases,
-                        ck::index_t Batch,
+        const std::array<void*, NumAcc1Bias> p_acc1_biases,
-                        std::vector<index_t> c_gs_ms_os_lengths,
+        const std::vector<index_t>& a_gs_ms_ks_lengths,
-                        std::vector<index_t> c_gs_ms_os_strides,
+        const std::vector<index_t>& a_gs_ms_ks_strides,
-                        ck::index_t StrideA,
+        const std::vector<index_t>& b_gs_ns_ks_lengths,
-                        ck::index_t StrideB0,
+        const std::vector<index_t>& b_gs_ns_ks_strides,
-                        ck::index_t StrideB1,
+        const std::vector<index_t>& b1_gs_gemm1ns_gemm1ks_lengths, // b1_gs_os_ns_lengths
-                        ck::index_t BatchStrideA,
+        const std::vector<index_t>& b1_gs_gemm1ns_gemm1ks_strides, // b1_gs_os_ns_strides
-                        ck::index_t BatchStrideB0,
+        const std::vector<index_t>& c_gs_ms_gemm1ns_lengths,       // c_gs_ms_os_lengths
-                        ck::index_t BatchStrideB1,
+        const std::vector<index_t>& c_gs_ms_gemm1ns_strides,       // c_gs_ms_os_strides
-                        AElementwiseOperation a_element_op,
+        const std::array<std::vector<index_t>, NumAcc0Bias> acc0_biases_gs_ms_ns_lengths,
-                        B0ElementwiseOperation b0_element_op,
+        const std::array<std::vector<index_t>, NumAcc0Bias> acc0_biases_gs_ms_ns_strides,
-                        Acc0ElementwiseOperation acc0_element_op,
+        const std::array<std::vector<index_t>, NumAcc1Bias>
-                        B1ElementwiseOperation b1_element_op,
+            acc1_biases_gs_ms_gemm1ns_lengths, // acc1_biases_gs_ms_os_lengths
-                        CElementwiseOperation c_element_op) = 0;
+        const std::array<std::vector<index_t>, NumAcc1Bias>
+            acc1_biases_gs_ms_gemm1ns_strides, // acc1_biases_gs_ms_os_strides
+        AElementwiseOperation a_element_op,
+        B0ElementwiseOperation b0_element_op,
+        Acc0ElementwiseOperation acc0_element_op,
+        B1ElementwiseOperation b1_element_op,
+        CElementwiseOperation c_element_op) = 0;
    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
 };

--- a/include/ck/tensor_operation/gpu/device/device_batchnorm_forward.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batchnorm_forward.hpp
@@ -13,31 +13,36 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
-template <index_t Rank, index_t NumBatchNormReduceDim>
+template <index_t Rank, index_t NumBatchNormReduceDim, typename YElementwiseOp>
 struct DeviceBatchNormFwd : public BaseOperator
 {
    virtual std::unique_ptr<BaseArgument> MakeArgumentPointer(
        const std::array<index_t, Rank> xyLengths,
        const std::array<index_t, Rank> xStrides,
        const std::array<index_t, Rank> yStrides,
+        const std::array<int, NumBatchNormReduceDim> reduceDims,
        const std::array<index_t, Rank - NumBatchNormReduceDim> bnScaleBiasMeanVarLengths,
-        const std::array<index_t, Rank - NumBatchNormReduceDim> bnScaleBiasMeanVarStrides,
+        const std::array<index_t, Rank - NumBatchNormReduceDim> bnScaleStrides,
+        const std::array<index_t, Rank - NumBatchNormReduceDim> bnBiasStrides,
+        const std::array<index_t, Rank - NumBatchNormReduceDim> bnMeanVarStrides,
        const void* p_x,
        const void* bnScale,
        const void* bnBias,
+        double epsilon,
+        const YElementwiseOp y_elementwise_op,
        void* p_y,
+        void* resultSaveMean,
+        void* resultSaveInvVariance,
        double exponentialAverageFactor,
        void* resultRunningMean,
-        void* resultRunningVariance,
+        void* resultRunningVariance) = 0;
-        double epsilon,
-        void* resultSaveMean,
-        void* resultSaveInvVariance) = 0;
    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
 };
-template <index_t Rank, index_t NumBatchNormReduceDim>
+template <index_t Rank, index_t NumBatchNormReduceDim, typename YElementwiseOp>
-using DeviceBatchNormFwdPtr = std::unique_ptr<DeviceBatchNormFwd<Rank, NumBatchNormReduceDim>>;
+using DeviceBatchNormFwdPtr =
+    std::unique_ptr<DeviceBatchNormFwd<Rank, NumBatchNormReduceDim, YElementwiseOp>>;
 } // namespace device
 } // namespace tensor_operation

--- a/include/ck/tensor_operation/gpu/device/device_batchnorm_infer.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batchnorm_infer.hpp
@@ -21,7 +21,9 @@ struct DeviceBatchNormInfer : public BaseOperator
        const std::array<index_t, Rank> xStrides,
        const std::array<index_t, Rank> yStrides,
        const std::array<index_t, Rank - NumBatchNormReduceDim> bnScaleBiasMeanVarLengths,
-        const std::array<index_t, Rank - NumBatchNormReduceDim> bnScaleBiasMeanVarStrides,
+        const std::array<index_t, Rank - NumBatchNormReduceDim> bnScaleStrides,
+        const std::array<index_t, Rank - NumBatchNormReduceDim> bnBiasStrides,
+        const std::array<index_t, Rank - NumBatchNormReduceDim> bnMeanVarStrides,
        const void* p_x,
        const void* bnScale,
        const void* bnBias,

--- a/include/ck/tensor_operation/gpu/device/device_elementwise_normalization.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_elementwise_normalization.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include <iostream>
+#include <vector>
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
+namespace ck {
+namespace tensor_operation {
+namespace device {
+template <typename InDataTypeTuple,
+          typename GammaDataType,
+          typename BetaDataType,
+          typename AccDataType,
+          typename YDataType,
+          typename XElementwiseOperation,
+          typename YElementwiseOperation,
+          index_t Rank,
+          index_t NumReduceDim>
+struct DeviceElementwiseNormalization : public BaseOperator
+{
+    static constexpr int NumInput = InDataTypeTuple::Size();
+    virtual std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(const std::vector<index_t> lengths,
+                        const std::array<std::vector<index_t>, NumInput> inStridesArray,
+                        const std::vector<index_t> gammaStrides,
+                        const std::vector<index_t> betaStrides,
+                        const std::vector<index_t> yStrides,
+                        const std::vector<index_t> reduceDims,
+                        AccDataType epsilon,
+                        const std::array<const void*, NumInput> in_dev_buffers,
+                        const void* p_gamma,
+                        const void* p_beta,
+                        void* p_y,
+                        XElementwiseOperation x_elementwise_op,
+                        YElementwiseOperation y_elementwise_op) = 0;
+    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
+};
+template <typename InDataTypeTuple,
+          typename GammaDataType,
+          typename BetaDataType,
+          typename AccDataType,
+          typename YDataType,
+          typename XElementwiseOperation,
+          typename YElementwiseOperation,
+          index_t Rank,
+          index_t NumReduceDim>
+using DeviceElementwiseNormalizationPtr =
+    std::unique_ptr<DeviceElementwiseNormalization<InDataTypeTuple,
+                                                   GammaDataType,
+                                                   BetaDataType,
+                                                   AccDataType,
+                                                   YDataType,
+                                                   XElementwiseOperation,
+                                                   YElementwiseOperation,
+                                                   Rank,
+                                                   NumReduceDim>>;
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_data_multiple_d.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_data_multiple_d.hpp
@@ -3,7 +3,7 @@
 #pragma once
-#include <vector>
+#include <array>
 #include "ck/tensor_operation/gpu/device/device_base.hpp"

--- a/include/ck/tensor_operation/gpu/device/device_conv_bwd_weight.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_conv_bwd_weight.hpp
@@ -3,7 +3,7 @@
 #pragma once
-#include <vector>
+#include <array>
 #include "ck/tensor_operation/gpu/device/device_base.hpp"
@@ -11,7 +11,7 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
-template <ck::index_t NumDimSpatial,
+template <ck::index_t NDimSpatial,
          typename InLayout,
          typename WeiLayout,
          typename OutLayout,
@@ -21,22 +21,23 @@ template <ck::index_t NumDimSpatial,
          typename InElementwiseOperation,
          typename WeiElementwiseOperation,
          typename OutElementwiseOperation>
-struct DeviceConvBwdWeight : public BaseOperator
+struct DeviceGroupedConvBwdWeight : public BaseOperator
 {
    virtual std::unique_ptr<BaseArgument>
    MakeArgumentPointer(const void* p_in,
                        void* p_wei,
                        const void* p_out,
+                        ck::index_t G,
                        ck::index_t N,
                        ck::index_t K,
                        ck::index_t C,
-                        std::vector<ck::index_t> input_spatial_lengths,
+                        std::array<ck::index_t, NDimSpatial> input_spatial_lengths,
-                        std::vector<ck::index_t> filter_spatial_lengths,
+                        std::array<ck::index_t, NDimSpatial> filter_spatial_lengths,
-                        std::vector<ck::index_t> output_spatial_lengths,
+                        std::array<ck::index_t, NDimSpatial> output_spatial_lengths,
-                        std::vector<ck::index_t> conv_filter_strides,
+                        std::array<ck::index_t, NDimSpatial> conv_filter_strides,
-                        std::vector<ck::index_t> conv_filter_dilations,
+                        std::array<ck::index_t, NDimSpatial> conv_filter_dilations,
-                        std::vector<ck::index_t> input_left_pads,
+                        std::array<ck::index_t, NDimSpatial> input_left_pads,
-                        std::vector<ck::index_t> input_right_pads,
+                        std::array<ck::index_t, NDimSpatial> input_right_pads,
                        InElementwiseOperation in_element_op,
                        WeiElementwiseOperation wei_element_op,
                        OutElementwiseOperation out_element_op,

--- a/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include <array>
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
+namespace ck {
+namespace tensor_operation {
+namespace device {
+// Convolution Forward:
+//   input : input image A[G, N, C, Hi, Wi],
+//   input : weight B[G, K, C, Y, X],
+//   output : output image E[G, N, K, Ho, Wo]
+//   C = a_op(A) * b_op(B)
+//   E = cde_op(C, D0, D1, ...)
+template <index_t NDimSpatial,
+          typename InLayout,
+          typename WeiLayout,
+          typename OutLayout,
+          typename InDataType,
+          typename WeiDataType,
+          typename OutDataType,
+          typename InElementwiseOperation,
+          typename WeiElementwiseOperation,
+          typename OutElementwiseOperation>
+struct DeviceGroupedConvFwd : public BaseOperator
+{
+    virtual std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(const void* p_in,  // input image
+                        const void* p_wei, // weight
+                        void* p_out,       // output image
+                        const std::array<index_t, NDimSpatial + 3>& in_g_n_c_wis_lengths,
+                        const std::array<index_t, NDimSpatial + 3>& in_g_n_c_wis_strides,
+                        const std::array<index_t, NDimSpatial + 3>& wei_g_k_c_xs_lengths,
+                        const std::array<index_t, NDimSpatial + 3>& wei_g_k_c_xs_strides,
+                        const std::array<index_t, NDimSpatial + 3>& out_g_n_k_wos_lengths,
+                        const std::array<index_t, NDimSpatial + 3>& out_g_n_k_wos_strides,
+                        const std::array<index_t, NDimSpatial>& conv_filter_strides,
+                        const std::array<index_t, NDimSpatial>& conv_filter_dilations,
+                        const std::array<index_t, NDimSpatial>& input_left_pads,
+                        const std::array<index_t, NDimSpatial>& input_right_pads,
+                        const InElementwiseOperation& in_element_op,
+                        const WeiElementwiseOperation& wei_element_op,
+                        const OutElementwiseOperation& out_element_op) = 0;
+    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
+};
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck