Merge branch 'develop' into gemm_layernorm_welford

24af0144 · Po Yen Chen · GitHub · 961f5e9e · b79bbbc2 · 24af0144
Commit 24af0144 (unverified), authored Nov 12, 2022 by Po Yen Chen; committed by GitHub on Nov 12, 2022.
20 changed files
--- a/profiler/include/profile_gemm_reduce_impl.hpp
+++ b/profiler/include/profile_gemm_reduce_impl.hpp
@@ -14,6 +14,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 namespace ck {
@@ -75,15 +76,15 @@ bool profile_gemm_reduce_impl(int do_verification,
    auto f_host_tensor_descriptor =
        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            using namespace ck::literals;
            if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
            {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
-                                            std::vector<std::size_t>({stride, 1}));
            }
            else
            {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
-                                            std::vector<std::size_t>({1, stride}));
            }
        };
@@ -91,16 +92,12 @@ bool profile_gemm_reduce_impl(int do_verification,
    Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
    Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
-    Tensor<ReduceDataType> reduce0_m_host_result(
+    Tensor<ReduceDataType> reduce0_m_host_result({M});
-        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
+    Tensor<ReduceDataType> reduce1_m_host_result({M});
-    Tensor<ReduceDataType> reduce1_m_host_result(
-        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
    Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
-    Tensor<ReduceDataType> reduce0_m_device_result(
+    Tensor<ReduceDataType> reduce0_m_device_result({M});
-        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
+    Tensor<ReduceDataType> reduce1_m_device_result({M});
-    Tensor<ReduceDataType> reduce1_m_device_result(
-        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
@@ -313,9 +310,9 @@ bool profile_gemm_reduce_impl(int do_verification,
                reduce0_device_buf.FromDevice(reduce0_m_device_result.mData.data());
                reduce1_device_buf.FromDevice(reduce1_m_device_result.mData.data());
-                ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
+                ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
-                ck::utils::check_err(reduce0_m_device_result.mData, reduce0_m_host_result.mData);
+                ck::utils::check_err(reduce0_m_device_result, reduce0_m_host_result);
-                ck::utils::check_err(reduce1_m_device_result.mData, reduce1_m_host_result.mData);
+                ck::utils::check_err(reduce1_m_device_result, reduce1_m_host_result);
                if(do_log)
                {

--- a/profiler/include/profile_gemm_splitk_impl.hpp
+++ b/profiler/include/profile_gemm_splitk_impl.hpp
@@ -18,6 +18,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 namespace ck {
@@ -46,15 +47,15 @@ bool profile_gemm_splitk_impl(int do_verification,
    auto f_host_tensor_descriptor =
        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            using namespace ck::literals;
            if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
            {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
-                                            std::vector<std::size_t>({stride, 1}));
            }
            else
            {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
-                                            std::vector<std::size_t>({1, stride}));
            }
        };
@@ -190,8 +191,7 @@ bool profile_gemm_splitk_impl(int do_verification,
            {
                c_device_buf.FromDevice(c_m_n_device_result.mData.data());
-                pass =
+                pass = pass & ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
-                    pass & ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
                if(do_log)
                {

--- a/profiler/include/profile_conv_bwd_weight_impl.hpp
+++ b/profiler/include/profile_conv_bwd_weight_impl.hpp
@@ -3,9 +3,10 @@
 #pragma once
-#include "ck/ck.hpp"
+#include <algorithm>
 #include <iomanip>
 #include <iostream>
+#include <iterator>
 #include <typeinfo>
 #include "ck/ck.hpp"
@@ -13,7 +14,7 @@
 #include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-#include "ck/library/tensor_operation_instance/gpu/convolution_backward_weight.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
@@ -26,32 +27,6 @@
 namespace ck {
 namespace profiler {
-template <typename DataType>
-void show_data_nhwc_layout(Tensor<DataType>& nhwc)
-{
-    std::cout << "[";
-    for(int n = 0; n < ck::type_convert<int>(nhwc.mDesc.GetLengths()[0]); n++)
-    {
-        std::cout << "[";
-        for(int hi = 0; hi < ck::type_convert<int>(nhwc.mDesc.GetLengths()[2]); hi++)
-        {
-            std::cout << "[";
-            for(int wi = 0; wi < ck::type_convert<int>(nhwc.mDesc.GetLengths()[3]); wi++)
-            {
-                std::cout << "[";
-                for(int c = 0; c < ck::type_convert<int>(nhwc.mDesc.GetLengths()[1]); c++)
-                {
-                    std::cout << static_cast<float>(nhwc(n, c, hi, wi)) << "  ";
-                }
-                std::cout << "]";
-            }
-            std::cout << "]";
-        }
-        std::cout << "]";
-    }
-    std::cout << "]";
-}
 template <ck::index_t NDimSpatial,
          typename InLayout,
          typename WeiLayout,
@@ -59,12 +34,12 @@ template <ck::index_t NDimSpatial,
          typename InDataType,
          typename WeiDataType,
          typename OutDataType>
-bool profile_conv_bwd_weight_impl(int do_verification,
+bool profile_grouped_conv_bwd_weight_impl(int do_verification,
-                                  int init_method,
+                                          int init_method,
-                                  bool do_log,
+                                          bool do_log,
-                                  bool time_kernel,
+                                          bool time_kernel,
-                                  const ck::utils::conv::ConvParam& conv_param,
+                                          const ck::utils::conv::ConvParam& conv_param,
-                                  ck::index_t split_k)
+                                          ck::index_t split_k)
 {
    using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
    using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
@@ -114,16 +89,14 @@ bool profile_conv_bwd_weight_impl(int do_verification,
    if(do_verification)
    {
-        auto ref_conv = ck::tensor_operation::host::ReferenceConvBwdWeight<NDimSpatial,
+        auto ref_conv     = ck::tensor_operation::host::ReferenceConvBwdWeight<NDimSpatial,
                                                                           InDataType,
                                                                           WeiDataType,
                                                                           OutDataType,
                                                                           InElementOp,
                                                                           WeiElementOp,
                                                                           OutElementOp>{};
+        auto ref_invoker  = ref_conv.MakeInvoker();
-        auto ref_invoker = ref_conv.MakeInvoker();
        auto ref_argument = ref_conv.MakeArgument(input,
                                                  weight_host_result,
                                                  output,
@@ -138,16 +111,16 @@ bool profile_conv_bwd_weight_impl(int do_verification,
        ref_invoker.Run(ref_argument);
    }
-    using DeviceOp = ck::tensor_operation::device::DeviceConvBwdWeight<NDimSpatial,
+    using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvBwdWeight<NDimSpatial,
-                                                                       InLayout,
+                                                                              InLayout,
-                                                                       WeiLayout,
+                                                                              WeiLayout,
-                                                                       OutLayout,
+                                                                              OutLayout,
-                                                                       InDataType,
+                                                                              InDataType,
-                                                                       WeiDataType,
+                                                                              WeiDataType,
-                                                                       OutDataType,
+                                                                              OutDataType,
-                                                                       InElementOp,
+                                                                              InElementOp,
-                                                                       WeiElementOp,
+                                                                              WeiElementOp,
-                                                                       OutElementOp>;
+                                                                              OutElementOp>;
    // get device op instances
    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
@@ -163,22 +136,41 @@ bool profile_conv_bwd_weight_impl(int do_verification,
    // profile device Conv instances
    bool all_pass = true;
+    std::array<ck::index_t, NDimSpatial> input_spatial_lengths{};
+    std::array<ck::index_t, NDimSpatial> filter_spatial_lengths{};
+    std::array<ck::index_t, NDimSpatial> output_spatial_lengths{};
+    std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
+    std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
+    std::array<ck::index_t, NDimSpatial> input_left_pads{};
+    std::array<ck::index_t, NDimSpatial> input_right_pads{};
+    auto range_copy = [](const auto& from, auto to) { std::copy(begin(from), end(from), to); };
+    range_copy(conv_param.input_spatial_lengths_, begin(input_spatial_lengths));
+    range_copy(conv_param.filter_spatial_lengths_, begin(filter_spatial_lengths));
+    range_copy(conv_param.output_spatial_lengths_, begin(output_spatial_lengths));
+    range_copy(conv_param.conv_filter_strides_, begin(conv_filter_strides));
+    range_copy(conv_param.conv_filter_dilations_, begin(conv_filter_dilations));
+    range_copy(conv_param.input_left_pads_, begin(input_left_pads));
+    range_copy(conv_param.input_right_pads_, begin(input_right_pads));
    for(auto& op_ptr : op_ptrs)
    {
        auto argument_ptr =
            op_ptr->MakeArgumentPointer(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
                                        static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
                                        static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
+                                        conv_param.G_,
                                        conv_param.N_,
                                        conv_param.K_,
                                        conv_param.C_,
-                                        conv_param.input_spatial_lengths_,
+                                        input_spatial_lengths,
-                                        conv_param.filter_spatial_lengths_,
+                                        filter_spatial_lengths,
-                                        conv_param.output_spatial_lengths_,
+                                        output_spatial_lengths,
-                                        conv_param.conv_filter_strides_,
+                                        conv_filter_strides,
-                                        conv_param.conv_filter_dilations_,
+                                        conv_filter_dilations,
-                                        conv_param.input_left_pads_,
+                                        input_left_pads,
-                                        conv_param.input_right_pads_,
+                                        input_right_pads,
                                        in_element_op,
                                        wei_element_op,
                                        out_element_op,
@@ -217,33 +209,29 @@ bool profile_conv_bwd_weight_impl(int do_verification,
            {
                wei_device_buf.FromDevice(weight_device_result.mData.data());
-                bool pass =
+                bool pass = ck::utils::check_err(weight_device_result, weight_host_result);
-                    ck::utils::check_err(weight_host_result.mData, weight_device_result.mData);
                if(!pass)
                {
-                    std::cout << "Fail info:" << op_ptr->GetTypeString() << std::endl;
+                    std::cout << "Fail info: " << op_ptr->GetTypeString() << std::endl;
                }
                all_pass &= pass;
                if(do_log)
                {
-                    std::cout << "in : ";
+                    LogRangeAsType<float>(std::cout << "output : ", output.mData, ",") << std::endl;
-                    show_data_nhwc_layout(output);
+                    ;
-                    std::cout << std::endl;
+                    LogRangeAsType<float>(
+                        std::cout << "weight (device): ", weight_device_result.mData, ",")
-                    std::cout << "wei: ";
+                        << std::endl;
-                    show_data_nhwc_layout(weight_host_result);
+                    ;
-                    std::cout << std::endl;
+                    LogRangeAsType<float>(
+                        std::cout << "weight (host): ", weight_host_result.mData, ",")
-                    std::cout << "out  : ";
+                        << std::endl;
-                    show_data_nhwc_layout(input);
+                    ;
-                    std::cout << std::endl;
+                    LogRangeAsType<float>(std::cout << "input: ", input.mData, ",") << std::endl;
+                    ;
-                    std::cout << "wei_device: ";
-                    show_data_nhwc_layout(weight_device_result);
-                    std::cout << std::endl;
                }
            }
        }

--- a/profiler/include/profile_grouped_conv_fwd_impl.hpp
+++ b/profiler/include/profile_grouped_conv_fwd_impl.hpp
@@ -9,11 +9,12 @@
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_dl.hpp"
+#include "ck/library/utility/algorithm.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
@@ -66,7 +67,7 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
    std::array<ck::index_t, NDimSpatial> input_left_pads{};
    std::array<ck::index_t, NDimSpatial> input_right_pads{};
-    auto copy = [](auto& x, auto& y) { std::copy(x.begin(), x.end(), y.begin()); };
+    auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); };
    copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths);
    copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides);
@@ -136,25 +137,6 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
        ref_invoker.Run(ref_argument);
    }
-    using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NDimSpatial,
-                                                                                 InLayout,
-                                                                                 WeiLayout,
-                                                                                 ck::Tuple<>,
-                                                                                 OutLayout,
-                                                                                 InDataType,
-                                                                                 WeiDataType,
-                                                                                 ck::Tuple<>,
-                                                                                 OutDataType,
-                                                                                 InElementOp,
-                                                                                 WeiElementOp,
-                                                                                 OutElementOp>;
-    // get device op instances
-    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
-        DeviceOp>::GetInstances();
-    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
    std::string best_op_name;
    float best_avg_time   = 0;
    float best_tflops     = 0;
@@ -163,29 +145,7 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
    // profile device op instances
    bool pass = true;
-    for(auto& op_ptr : op_ptrs)
+    auto run_impl = [&](auto& op_ptr, auto& argument_ptr) {
-    {
-        auto argument_ptr =
-            op_ptr->MakeArgumentPointer(in_device_buf.GetDeviceBuffer(),
-                                        wei_device_buf.GetDeviceBuffer(),
-                                        std::array<const void*, 0>{},
-                                        out_device_buf.GetDeviceBuffer(),
-                                        a_g_n_c_wis_lengths,
-                                        a_g_n_c_wis_strides,
-                                        b_g_k_c_xs_lengths,
-                                        b_g_k_c_xs_strides,
-                                        std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{{}},
-                                        std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{{}},
-                                        e_g_n_k_wos_lengths,
-                                        e_g_n_k_wos_strides,
-                                        conv_filter_strides,
-                                        conv_filter_dilations,
-                                        input_left_pads,
-                                        input_right_pads,
-                                        in_element_op,
-                                        wei_element_op,
-                                        out_element_op);
        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            // re-init output to zero before profiling next kernel
@@ -220,7 +180,7 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
            {
                out_device_buf.FromDevice(device_output.mData.data());
-                pass = pass & ck::utils::check_err(device_output.mData, host_output.mData);
+                pass = pass & ck::utils::check_err(device_output, host_output);
                if(do_log)
                {
@@ -237,6 +197,95 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
        {
            std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl;
        }
+    };
+    // xdl
+    {
+        using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NDimSpatial,
+                                                                                     InLayout,
+                                                                                     WeiLayout,
+                                                                                     ck::Tuple<>,
+                                                                                     OutLayout,
+                                                                                     InDataType,
+                                                                                     WeiDataType,
+                                                                                     ck::Tuple<>,
+                                                                                     OutDataType,
+                                                                                     InElementOp,
+                                                                                     WeiElementOp,
+                                                                                     OutElementOp>;
+        // get device op instances
+        const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+            DeviceOp>::GetInstances();
+        std::cout << "xdl found " << op_ptrs.size() << " instances" << std::endl;
+        for(auto& op_ptr : op_ptrs)
+        {
+            auto argument_ptr = op_ptr->MakeArgumentPointer(in_device_buf.GetDeviceBuffer(),
+                                                            wei_device_buf.GetDeviceBuffer(),
+                                                            {},
+                                                            out_device_buf.GetDeviceBuffer(),
+                                                            a_g_n_c_wis_lengths,
+                                                            a_g_n_c_wis_strides,
+                                                            b_g_k_c_xs_lengths,
+                                                            b_g_k_c_xs_strides,
+                                                            {},
+                                                            {},
+                                                            e_g_n_k_wos_lengths,
+                                                            e_g_n_k_wos_strides,
+                                                            conv_filter_strides,
+                                                            conv_filter_dilations,
+                                                            input_left_pads,
+                                                            input_right_pads,
+                                                            in_element_op,
+                                                            wei_element_op,
+                                                            out_element_op);
+            run_impl(op_ptr, argument_ptr);
+        }
+    }
+    // dl
+    {
+        using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwd<NDimSpatial,
+                                                                            InLayout,
+                                                                            WeiLayout,
+                                                                            OutLayout,
+                                                                            InDataType,
+                                                                            WeiDataType,
+                                                                            OutDataType,
+                                                                            InElementOp,
+                                                                            WeiElementOp,
+                                                                            OutElementOp>;
+        // get device op instances
+        const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+            DeviceOp>::GetInstances();
+        std::cout << "dl found " << op_ptrs.size() << " instances" << std::endl;
+        for(auto& op_ptr : op_ptrs)
+        {
+            auto argument_ptr = op_ptr->MakeArgumentPointer(in_device_buf.GetDeviceBuffer(),
+                                                            wei_device_buf.GetDeviceBuffer(),
+                                                            out_device_buf.GetDeviceBuffer(),
+                                                            a_g_n_c_wis_lengths,
+                                                            a_g_n_c_wis_strides,
+                                                            b_g_k_c_xs_lengths,
+                                                            b_g_k_c_xs_strides,
+                                                            e_g_n_k_wos_lengths,
+                                                            e_g_n_k_wos_strides,
+                                                            conv_filter_strides,
+                                                            conv_filter_dilations,
+                                                            input_left_pads,
+                                                            input_right_pads,
+                                                            in_element_op,
+                                                            wei_element_op,
+                                                            out_element_op);
+            run_impl(op_ptr, argument_ptr);
+        }
    }
    std::cout << "Best configuration parameters:"

--- a/profiler/include/profile_grouped_gemm_impl.hpp
+++ b/profiler/include/profile_grouped_gemm_impl.hpp
@@ -17,6 +17,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 namespace ck {
@@ -45,15 +46,15 @@ bool profile_grouped_gemm_impl(int do_verification,
    auto f_host_tensor_descriptor =
        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            using namespace ck::literals;
            if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
            {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
-                                            std::vector<std::size_t>({stride, 1}));
            }
            else
            {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
-                                            std::vector<std::size_t>({1, stride}));
            }
        };
@@ -257,8 +258,7 @@ bool profile_grouped_gemm_impl(int do_verification,
                                                              c_element_op);
                    ref_invoker.Run(ref_argument);
-                    pass = pass && ck::utils::check_err(c_m_n_device_results[i].mData,
+                    pass = pass && ck::utils::check_err(c_m_n_device_results[i], c_m_n_host_result);
-                                                        c_m_n_host_result.mData);
                    if(do_log)
                    {

--- a/profiler/include/profile_groupnorm_impl.hpp
+++ b/profiler/include/profile_groupnorm_impl.hpp
@@ -7,7 +7,7 @@
 #include "ck/ck.hpp"
-#include "ck/library/tensor_operation_instance/gpu/layernorm.hpp"
+#include "ck/library/tensor_operation_instance/gpu/normalization.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
@@ -75,14 +75,14 @@ bool profile_groupnorm_impl(int do_verification,
    beta_dev.ToDevice(beta.mData.data());
    // add device normalization instances
-    using DeviceOp = ck::tensor_operation::device::DeviceLayernorm<XDataType,
+    using DeviceOp = ck::tensor_operation::device::DeviceNormalization<XDataType,
-                                                                   GammaDataType,
+                                                                       GammaDataType,
-                                                                   BetaDataType,
+                                                                       BetaDataType,
-                                                                   AccDataType,
+                                                                       AccDataType,
-                                                                   YDataType,
+                                                                       YDataType,
-                                                                   PassThrough,
+                                                                       PassThrough,
-                                                                   5,
+                                                                       5,
-                                                                   3>;
+                                                                       3>;
    // get device op instances
    const auto instance_ptrs =
@@ -126,6 +126,8 @@ bool profile_groupnorm_impl(int do_verification,
            gamma_dev.GetDeviceBuffer(),
            beta_dev.GetDeviceBuffer(),
            y_dev.GetDeviceBuffer(),
+            nullptr,
+            nullptr,
            PassThrough{});
        if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
@@ -163,8 +165,7 @@ bool profile_groupnorm_impl(int do_verification,
        {
            y_dev.FromDevice(y.mData.data());
-            bool pass =
+            bool pass = ck::utils::check_err(y, host_y, "Error: Incorrect results", 1e-3, 1e-3);
-                ck::utils::check_err(y.mData, host_y.mData, "Error: Incorrect results", 1e-3, 1e-3);
            if(do_log)
            {
@@ -196,7 +197,7 @@ bool profile_groupnorm_impl(int do_verification,
    if(num_kernel == 0)
    {
-        std::cout << "Error: No kernel is tested" << std::endl;
+        std::cout << "Error: No kernel is applicable" << std::endl;
        return false;
    }

--- a/profiler/include/profile_layernorm_impl.hpp
+++ b/profiler/include/profile_layernorm_impl.hpp
@@ -6,9 +6,7 @@
 #include <iomanip>
 #include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/gpu/normalization.hpp"
-#include "ck/library/tensor_operation_instance/gpu/layernorm.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
@@ -24,35 +22,36 @@ template <typename XDataType,
          typename AccDataType,
          typename YDataType,
          index_t Rank>
-void profile_layernorm_impl(int do_verification,
+bool profile_layernorm_impl(int do_verification,
                            int init_method,
                            bool do_log,
                            bool time_kernel,
-                            std::vector<index_t> length,
+                            std::vector<index_t> length)
-                            std::vector<index_t> strideXY,
-                            std::vector<index_t> strideGamma,
-                            std::vector<index_t> strideBeta)
 {
    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
    if(length.size() < 2)
-        return;
+        return false;
-    // Assume normalize dimension except for first dimension
+    // Assume normalize dimension except for batch (first) dimension
    std::vector<index_t> reduce_length{length.begin() + 1, length.end()};
    std::vector<index_t> reduce_dim;
    for(int i = 1; i < Rank; ++i)
        reduce_dim.push_back(i);
    Tensor<XDataType> x(length);
-    Tensor<GammaDataType> gamma(reduce_length, strideGamma);
+    Tensor<GammaDataType> gamma(reduce_length);
-    Tensor<BetaDataType> beta(reduce_length, strideBeta);
+    Tensor<BetaDataType> beta(reduce_length);
-    Tensor<YDataType> y(length, strideXY);
+    Tensor<YDataType> y(length);
-    Tensor<YDataType> host_y(length, strideXY);
+    Tensor<YDataType> host_y(length);
+    std::vector<index_t> strideXY =
+        std::vector<ck::index_t>{x.mDesc.GetStrides().begin(), x.mDesc.GetStrides().end()};
+    std::vector<index_t> strideGammaBeta = strideXY;
+    strideGammaBeta[0]                   = 0;
    switch(init_method)
    {
-    // case 0: break;
    case 0:
        x.GenerateTensorValue(GeneratorTensor_1<XDataType>{});
        gamma.GenerateTensorValue(GeneratorTensor_1<GammaDataType>{});
@@ -84,14 +83,14 @@ void profile_layernorm_impl(int do_verification,
    constexpr int NumReduceDim = Rank - 1;
    // add device normalization instances
-    using DeviceOp = ck::tensor_operation::device::DeviceLayernorm<XDataType,
+    using DeviceOp = ck::tensor_operation::device::DeviceNormalization<XDataType,
-                                                                   GammaDataType,
+                                                                       GammaDataType,
-                                                                   BetaDataType,
+                                                                       BetaDataType,
-                                                                   AccDataType,
+                                                                       AccDataType,
-                                                                   YDataType,
+                                                                       YDataType,
-                                                                   PassThrough,
+                                                                       PassThrough,
-                                                                   Rank,
+                                                                       Rank,
-                                                                   NumReduceDim>;
+                                                                       NumReduceDim>;
    // get device op instances
    const auto instance_ptrs =
@@ -122,12 +121,14 @@ void profile_layernorm_impl(int do_verification,
        ref_invoker.Run(ref_argument);
    }
+    int num_kernel = 0;
    for(auto& inst_ptr : instance_ptrs)
    {
        auto argument_ptr = inst_ptr->MakeArgumentPointer(length,
                                                          strideXY,
-                                                          strideGamma,
+                                                          strideGammaBeta,
-                                                          strideBeta,
+                                                          strideGammaBeta,
                                                          strideXY,
                                                          reduce_dim,
                                                          1e-4,
@@ -135,12 +136,21 @@ void profile_layernorm_impl(int do_verification,
                                                          gamma_dev.GetDeviceBuffer(),
                                                          beta_dev.GetDeviceBuffer(),
                                                          y_dev.GetDeviceBuffer(),
+                                                          nullptr,
+                                                          nullptr,
                                                          PassThrough{});
-        if(!inst_ptr->IsSupportedArgument(argument_ptr.get()))
+        if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
        {
-            std::cout << inst_ptr->GetTypeString() << " skipped due to unsupported argument: ";
+            ++num_kernel;
-            LogRange(std::cout << "input lengths = ", length, ", ") << std::endl;
+        }
+        else
+        {
+            if(time_kernel)
+            {
+                std::cout << inst_ptr->GetTypeString() << " skipped due to unsupported argument: ";
+                LogRange(std::cout << "input lengths = ", length, ", ") << std::endl;
+            }
            continue;
        }
@@ -156,8 +166,9 @@ void profile_layernorm_impl(int do_verification,
        float gb_per_sec = num_bytes / 1.E6 / avg_time;
-        std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, "
+        if(time_kernel)
-                  << inst_ptr->GetTypeString() << std::endl;
+            std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, "
+                      << inst_ptr->GetTypeString() << std::endl;
        if(avg_time < best_avg_time)
        {
@@ -184,20 +195,32 @@ void profile_layernorm_impl(int do_verification,
            {
                std::cout << inst_ptr->GetTypeString() << " failed verification: ";
                LogRange(std::cout << "lengths = [", length, ", ") << "]." << std::endl;
-                return;
+                return false;
            }
            else
            {
-                std::cout << "pass" << std::endl;
+                if(time_kernel)
+                    std::cout << "pass" << std::endl;
            }
        }
    }
-    LogRange(std::cout << "length = ", length, ",") << ", ";
+    if(time_kernel)
-    LogRange(std::cout << "stride = ", strideXY, ",") << ", ";
+    {
-    LogRange(std::cout << "reduce dims ", reduce_dim, ",") << std::endl;
+        LogRange(std::cout << "length = ", length, ",") << ", ";
-    std::cout << "best perf = " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s, "
+        LogRange(std::cout << "stride = ", strideXY, ",") << ", ";
-              << best_instance_name << std::endl;
+        LogRange(std::cout << "reduce dims ", reduce_dim, ",") << std::endl;
+        std::cout << "best perf = " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s, "
+                  << best_instance_name << std::endl;
+    }
+    if(num_kernel == 0)
+    {
+        std::cout << "Error: No kernel is applicable" << std::endl;
+        return false;
+    }
+    return true;
 }
 } // namespace profiler

--- a/profiler/include/profile_reduce_impl.hpp
+++ b/profiler/include/profile_reduce_impl.hpp
@@ -18,57 +18,61 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
-template <int Rank, int NumReduceDim, int ReduceOpId, bool PropagateNan, bool UseIndex>
+template <index_t Rank,
+          index_t NumReduceDim,
+          ReduceTensorOp ReduceOpId,
+          bool PropagateNan,
+          bool UseIndex>
 struct ReduceDescription
 {
-    static constexpr int Rank_         = Rank;
+    static constexpr index_t Rank_              = Rank;
-    static constexpr int NumReduceDim_ = NumReduceDim;
+    static constexpr index_t NumReduceDim_      = NumReduceDim;
-    static constexpr int ReduceOpId_   = ReduceOpId;
+    static constexpr ReduceTensorOp ReduceOpId_ = ReduceOpId;
-    static constexpr int PropagateNan_ = PropagateNan;
+    static constexpr bool PropagateNan_         = PropagateNan;
-    static constexpr int UseIndex_     = UseIndex;
+    static constexpr bool UseIndex_             = UseIndex;
 };
 using reduce_description_instances =
-    std::tuple<ReduceDescription<4, 3, 0, false, false>, // for ADD
+    std::tuple<ReduceDescription<4, 3, ReduceTensorOp::ADD, false, false>, // for ADD
-               ReduceDescription<4, 4, 0, false, false>,
+               ReduceDescription<4, 4, ReduceTensorOp::ADD, false, false>,
-               ReduceDescription<4, 1, 0, false, false>,
+               ReduceDescription<4, 1, ReduceTensorOp::ADD, false, false>,
-               ReduceDescription<2, 1, 0, false, false>,
+               ReduceDescription<2, 1, ReduceTensorOp::ADD, false, false>,
-               ReduceDescription<4, 3, 5, false, false>, // for AVG
+               ReduceDescription<4, 3, ReduceTensorOp::AVG, false, false>, // for AVG
-               ReduceDescription<4, 4, 5, false, false>,
+               ReduceDescription<4, 4, ReduceTensorOp::AVG, false, false>,
-               ReduceDescription<4, 1, 5, false, false>,
+               ReduceDescription<4, 1, ReduceTensorOp::AVG, false, false>,
-               ReduceDescription<2, 1, 5, false, false>,
+               ReduceDescription<2, 1, ReduceTensorOp::AVG, false, false>,
-               ReduceDescription<4, 3, 7, false, false>, // for NORM2
+               ReduceDescription<4, 3, ReduceTensorOp::NORM2, false, false>, // for NORM2
-               ReduceDescription<4, 4, 7, false, false>,
+               ReduceDescription<4, 4, ReduceTensorOp::NORM2, false, false>,
-               ReduceDescription<4, 1, 7, false, false>,
+               ReduceDescription<4, 1, ReduceTensorOp::NORM2, false, false>,
-               ReduceDescription<2, 1, 7, false, false>,
+               ReduceDescription<2, 1, ReduceTensorOp::NORM2, false, false>,
-               ReduceDescription<4, 3, 2, false, false>, // for MIN
+               ReduceDescription<4, 3, ReduceTensorOp::MIN, false, false>, // for MIN
-               ReduceDescription<4, 4, 2, false, false>,
+               ReduceDescription<4, 4, ReduceTensorOp::MIN, false, false>,
-               ReduceDescription<4, 1, 2, false, false>,
+               ReduceDescription<4, 1, ReduceTensorOp::MIN, false, false>,
-               ReduceDescription<2, 1, 2, false, false>,
+               ReduceDescription<2, 1, ReduceTensorOp::MIN, false, false>,
-               ReduceDescription<4, 3, 3, false, false>, // for MAX
+               ReduceDescription<4, 3, ReduceTensorOp::MAX, false, false>, // for MAX
-               ReduceDescription<4, 4, 3, false, false>,
+               ReduceDescription<4, 4, ReduceTensorOp::MAX, false, false>,
-               ReduceDescription<4, 1, 3, false, false>,
+               ReduceDescription<4, 1, ReduceTensorOp::MAX, false, false>,
-               ReduceDescription<2, 1, 3, false, false>,
+               ReduceDescription<2, 1, ReduceTensorOp::MAX, false, false>,
-               ReduceDescription<4, 3, 4, false, false>, // for AMAX
+               ReduceDescription<4, 3, ReduceTensorOp::AMAX, false, false>, // for AMAX
-               ReduceDescription<4, 4, 4, false, false>,
+               ReduceDescription<4, 4, ReduceTensorOp::AMAX, false, false>,
-               ReduceDescription<4, 1, 4, false, false>,
+               ReduceDescription<4, 1, ReduceTensorOp::AMAX, false, false>,
-               ReduceDescription<2, 1, 4, false, false>,
+               ReduceDescription<2, 1, ReduceTensorOp::AMAX, false, false>,
-               ReduceDescription<4, 3, 2, false, true>, // for MIN
+               ReduceDescription<4, 3, ReduceTensorOp::MIN, false, true>, // for MIN
-               ReduceDescription<4, 4, 2, false, true>,
+               ReduceDescription<4, 4, ReduceTensorOp::MIN, false, true>,
-               ReduceDescription<4, 1, 2, false, true>,
+               ReduceDescription<4, 1, ReduceTensorOp::MIN, false, true>,
-               ReduceDescription<2, 1, 2, false, true>,
+               ReduceDescription<2, 1, ReduceTensorOp::MIN, false, true>,
-               ReduceDescription<4, 3, 3, false, true>, // for MAX
+               ReduceDescription<4, 3, ReduceTensorOp::MAX, false, true>, // for MAX
-               ReduceDescription<4, 4, 3, false, true>,
+               ReduceDescription<4, 4, ReduceTensorOp::MAX, false, true>,
-               ReduceDescription<4, 1, 3, false, true>,
+               ReduceDescription<4, 1, ReduceTensorOp::MAX, false, true>,
-               ReduceDescription<2, 1, 3, false, true>,
+               ReduceDescription<2, 1, ReduceTensorOp::MAX, false, true>,
-               ReduceDescription<4, 3, 4, false, true>, // for AMAX
+               ReduceDescription<4, 3, ReduceTensorOp::AMAX, false, true>, // for AMAX
-               ReduceDescription<4, 4, 4, false, true>,
+               ReduceDescription<4, 4, ReduceTensorOp::AMAX, false, true>,
-               ReduceDescription<4, 1, 4, false, true>,
+               ReduceDescription<4, 1, ReduceTensorOp::AMAX, false, true>,
-               ReduceDescription<2, 1, 4, false, true>>;
+               ReduceDescription<2, 1, ReduceTensorOp::AMAX, false, true>>;
 template <typename DescriptionType>
 bool description_match(const DescriptionType& description,
@@ -78,9 +82,8 @@ bool description_match(const DescriptionType& description,
                       bool PropagateNan,
                       bool UseIndex)
 {
-    if(description.Rank_ != Rank || description.ReduceOpId_ != static_cast<int>(ReduceOpId) ||
+    if(description.Rank_ != Rank || description.ReduceOpId_ != ReduceOpId ||
-       description.PropagateNan_ != static_cast<int>(PropagateNan) ||
+       description.PropagateNan_ != PropagateNan || description.UseIndex_ != UseIndex)
-       description.UseIndex_ != static_cast<int>(UseIndex))
        return (false);
    if(DescriptionType::NumReduceDim_ != reduceDims.size())
@@ -99,11 +102,10 @@ bool description_match(const DescriptionType& description,
 namespace ck {
 namespace profiler {
-template <index_t Rank, index_t NumReduceDim>
+template <int Rank, int NumReduceDim>
-static inline std::vector<int> get_invariant_dims(const std::vector<int>& reduceDims)
+static inline std::array<int, Rank - NumReduceDim>
+get_invariant_dims(const std::array<int, NumReduceDim>& reduceDims)
 {
-    assert(NumReduceDim == reduceDims.size());
    int reduceFlag = 0;
    // flag the bits for the reduceDims
@@ -112,13 +114,15 @@ static inline std::vector<int> get_invariant_dims(const std::vector<int>& reduce
        reduceFlag |= 1 << reduceDims[i];
    };
-    std::vector<int> invariantDims;
+    std::array<int, Rank - NumReduceDim> invariantDims;
    // collect invariant dimensions
+    int dim = 0;
    for(int i = 0; i < Rank; i++)
        if((reduceFlag & (1 << i)) == 0)
        {
-            invariantDims.push_back(i);
+            invariantDims[dim] = i;
+            dim++;
        };
    return invariantDims;
@@ -137,7 +141,7 @@ bool profile_reduce_impl_impl(bool do_verification,
                              bool do_dumpout,
                              bool time_kernel,
                              const std::vector<size_t>& inLengths,
-                              const std::vector<int>& reduceDims,
+                              const std::array<int, NumReduceDim>& reduceDims,
                              float alpha,
                              float beta)
 {
@@ -145,6 +149,8 @@ bool profile_reduce_impl_impl(bool do_verification,
    using namespace ck::tensor_operation::device::instance;
    using ck::host_common::dumpBufferToFile;
+    constexpr index_t NumOutDim = (Rank - NumReduceDim == 0) ? 1 : Rank - NumReduceDim;
    constexpr bool op_support_indices =
        (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
         ReduceOpId == ReduceTensorOp::AMAX);
@@ -279,28 +285,32 @@ bool profile_reduce_impl_impl(bool do_verification,
            reduce_unary_operator<ReduceOpId, true, true>::GetElementwiseOperator(
                static_cast<int32_t>(reduce_total_length));
-        using DeviceReduceInstPtr0 =
+        using DeviceReduceInstPtr =
-            DeviceReducePtr<InElementwiseOperation, AccElementwiseOperation>;
+            DeviceReducePtr<Rank, NumReduceDim, InElementwiseOperation, AccElementwiseOperation>;
-        std::vector<DeviceReduceInstPtr0> reduce0_ptrs;
+        std::vector<DeviceReduceInstPtr> reduce_ptrs;
        add_device_reduce_instance_threadwise<InDataType,
                                              AccDataType,
                                              OutDataType,
                                              Rank,
                                              NumReduceDim,
-                                              ReduceOpId,
+                                              ReduceOperation,
+                                              InElementwiseOperation,
+                                              AccElementwiseOperation,
                                              PropagateNan,
-                                              UseIndex>(reduce0_ptrs);
+                                              UseIndex>(reduce_ptrs);
        add_device_reduce_instance_blockwise<InDataType,
                                             AccDataType,
                                             OutDataType,
                                             Rank,
                                             NumReduceDim,
-                                             ReduceOpId,
+                                             ReduceOperation,
+                                             InElementwiseOperation,
+                                             AccElementwiseOperation,
                                             PropagateNan,
-                                             UseIndex>(reduce0_ptrs);
+                                             UseIndex>(reduce_ptrs);
        if constexpr(use_atomic_add)
        {
@@ -309,12 +319,14 @@ bool profile_reduce_impl_impl(bool do_verification,
                                                             OutDataType,
                                                             Rank,
                                                             NumReduceDim,
-                                                             ReduceOpId,
+                                                             ReduceOperation,
+                                                             InElementwiseOperation,
+                                                             AccElementwiseOperation,
                                                             PropagateNan,
-                                                             UseIndex>(reduce0_ptrs);
+                                                             UseIndex>(reduce_ptrs);
        }
-        if(reduce0_ptrs.empty())
+        if(reduce_ptrs.empty())
        {
            throw std::runtime_error("Wrong! No device REDUCE instance found");
        };
@@ -342,22 +354,22 @@ bool profile_reduce_impl_impl(bool do_verification,
                           acc_elementwise_op);
        };
-        std::vector<ck::index_t> i_inLengths;
+        std::array<index_t, Rank> arrInLengths;
-        std::vector<ck::index_t> i_inStrides;
+        std::array<index_t, Rank> arrInStrides;
-        std::vector<ck::index_t> i_outLengths;
+        std::array<index_t, NumOutDim> arrOutLengths;
-        std::vector<ck::index_t> i_outStrides;
+        std::array<index_t, NumOutDim> arrOutStrides;
-        i_inLengths.assign(inLengths.begin(), inLengths.end());
+        std::copy(inLengths.begin(), inLengths.end(), arrInLengths.begin());
-        i_inStrides.assign(inStrides.begin(), inStrides.end());
+        std::copy(inStrides.begin(), inStrides.end(), arrInStrides.begin());
-        i_outLengths.assign(outLengths.begin(), outLengths.end());
+        std::copy(outLengths.begin(), outLengths.end(), arrOutLengths.begin());
-        i_outStrides.assign(outStrides.begin(), outStrides.end());
+        std::copy(outStrides.begin(), outStrides.end(), arrOutStrides.begin());
-        for(auto& reduce_ptr : reduce0_ptrs)
+        for(auto& reduce_ptr : reduce_ptrs)
        {
-            auto argument_ptr = reduce_ptr->MakeArgumentPointer(i_inLengths,
+            auto argument_ptr = reduce_ptr->MakeArgumentPointer(arrInLengths,
-                                                                i_inStrides,
+                                                                arrInStrides,
-                                                                i_outLengths,
+                                                                arrOutLengths,
-                                                                i_outStrides,
+                                                                arrOutStrides,
                                                                reduceDims,
                                                                alpha,
                                                                beta,
@@ -399,13 +411,12 @@ bool profile_reduce_impl_impl(bool do_verification,
                bool single_pass;
                out_dev.FromDevice(out.mData.data());
-                single_pass = ck::utils::check_err(out.mData, out_ref.mData);
+                single_pass = ck::utils::check_err(out, out_ref);
                if(OutputIndex)
                {
                    out_indices_dev.FromDevice(out_indices.mData.data());
-                    single_pass = single_pass &&
+                    single_pass = single_pass && ck::utils::check_err(out_indices, out_indices_ref);
-                                  ck::utils::check_err(out_indices.mData, out_indices_ref.mData);
                };
                if(!single_pass)
@@ -478,22 +489,25 @@ bool profile_reduce_impl(bool do_verification,
               descType{}, inLengths.size(), reduceDims, ReduceOpId, PropagateNan, UseIndex))
            return;
-        pass = pass &&
+        std::array<ck::index_t, descType::NumReduceDim_> arrReduceDims;
-               profile_reduce_impl_impl<InDataType,
-                                        AccDataType,
+        std::copy(reduceDims.begin(), reduceDims.end(), arrReduceDims.begin());
-                                        OutDataType,
-                                        descType::Rank_,
+        pass = pass && profile_reduce_impl_impl<InDataType,
-                                        descType::NumReduceDim_,
+                                                AccDataType,
-                                        static_cast<ReduceTensorOp>(descType::ReduceOpId_),
+                                                OutDataType,
-                                        static_cast<bool>(descType::PropagateNan_),
+                                                descType::Rank_,
-                                        static_cast<bool>(descType::UseIndex_)>(do_verification,
+                                                descType::NumReduceDim_,
-                                                                                init_method,
+                                                static_cast<ReduceTensorOp>(descType::ReduceOpId_),
-                                                                                do_dumpout,
+                                                descType::PropagateNan_,
-                                                                                time_kernel,
+                                                descType::UseIndex_>(do_verification,
-                                                                                inLengths,
+                                                                     init_method,
-                                                                                reduceDims,
+                                                                     do_dumpout,
-                                                                                alpha,
+                                                                     time_kernel,
-                                                                                beta);
+                                                                     inLengths,
+                                                                     arrReduceDims,
+                                                                     alpha,
+                                                                     beta);
        matched = true;
    });

--- a/profiler/include/profile_normalization_impl.hpp
+++ b/profiler/include/profile_normalization_impl.hpp
@@ -3,55 +3,27 @@
 #pragma once
+#include <algorithm>
 #include <iomanip>
+#include <iostream>
+#include <string>
+#include <vector>
 #include "ck/ck.hpp"
 #include "ck/library/utility/check_err.hpp"
-#include "ck/library/utility/convolution_parameter.hpp"
 #include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/fill.hpp"
 #include "ck/library/utility/host_tensor.hpp"
-#include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax.hpp"
 #include "ck/tensor_operation/gpu/device/device_softmax.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/utility/data_type.hpp"
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-namespace {
-using F16         = ck::half_t;
-using F32         = float;
-using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-} // namespace
-void add_device_softmax_f16_f16_rank3_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3>>&);
-void add_device_softmax_f16_f16_rank4_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4>>&);
-void add_device_softmax_f32_f32_rank3_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3>>&);
-void add_device_softmax_f32_f32_rank4_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4>>&);
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
 namespace ck {
 namespace profiler {
-enum struct NormType
+enum struct SoftmaxDataType
-{
-    BATCHNORM,
-    SOFTMAX,
-};
-enum struct NormDataType
 {
    F32_F32, // in, out
    F16_F16,
@@ -60,7 +32,7 @@ enum struct NormDataType
 };
 // clang-format off
-template <typename NormDataType> std::string type_to_string();
+template <typename SoftmaxDataType> std::string type_to_string();
 template <> std::string type_to_string<float>()   { return "f32"; }
 template <> std::string type_to_string<half_t>()  { return "f16"; }
 template <> std::string type_to_string<bhalf_t>() { return "bf16"; }
@@ -69,16 +41,15 @@ template <> std::string type_to_string<int32_t>() { return "int32"; }
 // clang-format on
 template <typename InDataType, typename AccDataType, typename OutDataType, index_t Rank>
-void profile_normalization_impl(int do_verification,
+bool profile_softmax_impl(int do_verification,
-                                int init_method,
+                          int init_method,
-                                bool do_log,
+                          bool do_log,
-                                bool time_kernel,
+                          bool time_kernel,
-                                std::vector<index_t> in_length,
+                          std::vector<index_t> in_length,
-                                std::vector<index_t> in_strides,
+                          std::vector<index_t> in_strides,
-                                std::vector<index_t> reduce_dims,
+                          std::vector<index_t> reduce_dims,
-                                AccDataType alpha,
+                          AccDataType alpha,
-                                AccDataType beta,
+                          AccDataType beta)
-                                NormType norm_type)
 {
    if(Rank != in_length.size())
    {
@@ -88,62 +59,46 @@ void profile_normalization_impl(int do_verification,
    Tensor<InDataType> in = in_strides.empty() ? Tensor<InDataType>(in_length)
                                               : Tensor<InDataType>(in_length, in_strides);
    Tensor<OutDataType> out(in.mDesc);
+    Tensor<OutDataType> prior_out(in.mDesc);
    switch(init_method)
    {
-    // case 0: break;
+    case 0: break;
-    case 0:
-        in.GenerateTensorValue(GeneratorTensor_1<InDataType>{});
-        out.GenerateTensorValue(GeneratorTensor_1<OutDataType>{});
-        break;
    case 1:
-        in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
+        ck::utils::FillUniformDistributionIntegerValue<InDataType>{-5.f, 5.f}(in.begin(), in.end());
-        out.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
+        ck::utils::FillUniformDistributionIntegerValue<OutDataType>{-5.f, 5.f}(prior_out.begin(),
+                                                                               prior_out.end());
        break;
    default:
-        in.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
+        ck::utils::FillUniformDistribution<InDataType>{0.0f, 1.0f}(in);
-        out.GenerateTensorValue(GeneratorTensor_3<OutDataType>{-0.5, 0.5});
+        ck::utils::FillUniformDistribution<OutDataType>{-0.5f, 0.5f}(prior_out);
    }
-    Tensor<OutDataType> out_ref(out);
+    Tensor<OutDataType> out_ref(prior_out);
+    if(do_verification)
+    {
+        using ReferenceSoftmax =
+            tensor_operation::host::ReferenceSoftmax<InDataType, OutDataType, AccDataType>;
+        ReferenceSoftmax{}.MakeInvoker().Run({in, out_ref, alpha, beta, reduce_dims});
+    }
-    DeviceMem in_dev(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
+    DeviceMem in_dev(in.GetElementSpaceSizeInBytes());
-    DeviceMem out_dev(sizeof(OutDataType) * out.mDesc.GetElementSpaceSize());
+    DeviceMem out_dev(out.GetElementSpaceSizeInBytes());
-    in_dev.ToDevice(in.mData.data());
+    in_dev.ToDevice(in.data());
-    out_dev.ToDevice(out.mData.data());
-    std::vector<index_t> i_in_lengths(in.mDesc.GetLengths().begin(), in.mDesc.GetLengths().end());
+    std::vector<index_t> in_tensor_lengths(in.GetLengths().begin(), in.GetLengths().end());
-    std::vector<index_t> i_in_strides(in.mDesc.GetStrides().begin(), in.mDesc.GetStrides().end());
+    std::vector<index_t> in_tensor_strides(in.GetStrides().begin(), in.GetStrides().end());
    // add device softmax instances
    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-    using DeviceOpPtr = tensor_operation::device::
+    using DeviceOp    = tensor_operation::device::
-        DeviceSoftmaxPtr<InDataType, AccDataType, OutDataType, PassThrough, PassThrough, Rank>;
+        DeviceSoftmax<InDataType, AccDataType, OutDataType, PassThrough, PassThrough, Rank>;
-    std::vector<DeviceOpPtr> instances;
-    if(norm_type == NormType::SOFTMAX)
+    // get device op instances
-    {
+    const auto instances = tensor_operation::device::instance::DeviceOperationInstanceFactory<
-        if constexpr(is_same<InDataType, half_t>::value && is_same<OutDataType, half_t>::value &&
+        DeviceOp>::GetInstances();
-                     is_same<AccDataType, float>::value)
+    std::cout << "found " << instances.size() << " instances" << std::endl;
-        {
-            if constexpr(Rank == 3)
-                tensor_operation::device::instance::add_device_softmax_f16_f16_rank3_instances(
-                    instances);
-            else if constexpr(Rank == 4)
-                tensor_operation::device::instance::add_device_softmax_f16_f16_rank4_instances(
-                    instances);
-        }
-        else if constexpr(is_same<InDataType, float>::value && is_same<OutDataType, float>::value &&
-                          is_same<AccDataType, float>::value)
-        {
-            if constexpr(Rank == 3)
-                tensor_operation::device::instance::add_device_softmax_f32_f32_rank3_instances(
-                    instances);
-            else if constexpr(Rank == 4)
-                tensor_operation::device::instance::add_device_softmax_f32_f32_rank4_instances(
-                    instances);
-        }
-    }
    if(instances.size() <= 0)
    {
@@ -153,21 +108,19 @@ void profile_normalization_impl(int do_verification,
    std::string best_instance_name;
    float best_avg_time   = std::numeric_limits<float>::max();
    float best_gb_per_sec = 0;
+    std::vector<bool> instance_pass;
-    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
    for(auto& inst_ptr : instances)
    {
        // Is this user's responsibility to check if problem mismatches kernel instance (ie. rank 3
        // problem to rank 4 kernel) other than invoking IsSupportedArgument()?
-        if(!(inst_ptr->GetRank() == static_cast<index_t>(i_in_lengths.size()) &&
+        if(!(inst_ptr->GetNumReduceDim() == static_cast<index_t>(reduce_dims.size())))
-             inst_ptr->GetNumReduceDim() == static_cast<index_t>(reduce_dims.size())))
        {
            continue;
        }
-        auto argument_ptr = inst_ptr->MakeArgumentPointer(i_in_lengths,
+        auto argument_ptr = inst_ptr->MakeArgumentPointer(in_tensor_lengths,
-                                                          i_in_strides,
+                                                          in_tensor_strides,
                                                          reduce_dims,
                                                          &alpha,
                                                          &beta,
@@ -181,45 +134,42 @@ void profile_normalization_impl(int do_verification,
            std::cout << inst_ptr->GetTypeString() << " skipped due to unsupported argument: ";
            LogRange(std::cout << "input lengths = [", in_length, ", ")
                << "], "
-                << "scaler = [" << alpha << ", " << beta << "]." << std::endl;
+                << "scaler = [" << alpha << ", " << beta << "]";
-            return;
+            LogRange(std::cout << ", reduce dims = [", reduce_dims, ", ") << "]." << std::endl;
+            instance_pass.push_back(true);
+            continue;
        }
+        out_dev.ToDevice(prior_out.data());
        auto invoker_ptr = inst_ptr->MakeInvokerPointer();
+        float avg_time   = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
-        float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+        if(time_kernel)
+        {
-        std::size_t num_bytes =
+            std::size_t num_bytes =
-            in.mDesc.GetElementSize() * sizeof(InDataType) +
+                in.GetElementSize() * sizeof(InDataType) +
-            (beta == 0.0f ? 1 : 2) * out.mDesc.GetElementSize() * sizeof(OutDataType);
+                (beta == 0.0f ? 1 : 2) * out.GetElementSize() * sizeof(OutDataType);
+            float gb_per_sec = num_bytes / 1.E6 / avg_time;
-        float gb_per_sec = num_bytes / 1.E6 / avg_time;
-        std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, "
+            std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, "
-                  << inst_ptr->GetTypeString() << std::endl;
+                      << inst_ptr->GetTypeString() << std::endl;
-        if(avg_time < best_avg_time)
+            if(avg_time < best_avg_time)
-        {
+            {
-            best_instance_name = inst_ptr->GetTypeString();
+                best_instance_name = inst_ptr->GetTypeString();
-            best_avg_time      = avg_time;
+                best_avg_time      = avg_time;
-            best_gb_per_sec    = gb_per_sec;
+                best_gb_per_sec    = gb_per_sec;
+            }
        }
        if(do_verification)
        {
-            // TODO: factory method to dynamically switch between different reference normalizations
+            out_dev.FromDevice(out.data());
-            using ReferenceFactory =
+            bool pass = true;
-                tensor_operation::host::ReferenceSoftmax<InDataType, OutDataType, AccDataType>;
-            ReferenceFactory{}.MakeInvoker().Run({in, out_ref, alpha, beta, reduce_dims});
-            out_dev.FromDevice(out.mData.data());
-            bool pass;
            if(std::is_same<InDataType, int8_t>::value)
            {
-                pass = ck::utils::check_err(
+                pass = pass && ck::utils::check_err(
-                    out.mData, out_ref.mData, "Error: Incorrect results!", 0, 1);
+                                   out.mData, out_ref.mData, "Error: Incorrect results!", 0, 1);
                if(do_log)
                {
                    LogRangeAsType<int>(std::cout << "in  : ", in.mData, ",") << std::endl;
@@ -230,7 +180,7 @@ void profile_normalization_impl(int do_verification,
            }
            else
            {
-                pass = ck::utils::check_err(out.mData, out_ref.mData);
+                pass = pass && ck::utils::check_err(out.mData, out_ref.mData);
                if(do_log)
                {
                    LogRangeAsType<float>(std::cout << "in  : ", in.mData, ",") << std::endl;
@@ -247,16 +197,22 @@ void profile_normalization_impl(int do_verification,
                    << "], "
                    << "scaler = [" << alpha << ", " << beta << "]." << std::endl;
            }
+            instance_pass.push_back(pass);
        }
    }
-    std::cout << "Best Perf for datatype = " << type_to_string<InDataType>() << "_"
+    if(time_kernel)
-              << type_to_string<OutDataType>() << ", ";
+    {
-    LogRange(std::cout << "length = ", i_in_lengths, ",") << ", ";
+        std::cout << "Best Perf for datatype = " << type_to_string<InDataType>() << "_"
-    LogRange(std::cout << "stride = ", i_in_strides, ",") << ", ";
+                  << type_to_string<OutDataType>() << ", ";
-    LogRange(std::cout << "reduce dims ", reduce_dims, ",") << ", ";
+        LogRange(std::cout << "length = ", in_tensor_lengths, ",") << ", ";
-    std::cout << "alpha = " << alpha << ", "
+        LogRange(std::cout << "stride = ", in_tensor_strides, ",") << ", ";
-              << "beta = " << beta << ", " << best_avg_time << " ms, " << best_gb_per_sec
+        LogRange(std::cout << "reduce dims ", reduce_dims, ",") << ", ";
-              << " GB/s, " << best_instance_name << std::endl;
+        std::cout << "alpha = " << alpha << ", "
+                  << "beta = " << beta << ", " << best_avg_time << " ms, " << best_gb_per_sec
+                  << " GB/s, " << best_instance_name << std::endl;
+    }
+    return std::all_of(
+        std::begin(instance_pass), std::end(instance_pass), [](bool p) { return p; });
 }
 } // namespace profiler

--- a/profiler/src/profile_conv_bwd_weight.cpp
+++ b/profiler/src/profile_conv_bwd_weight.cpp
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include <cstdlib>
+#include <initializer_list>
 #include <iostream>
 #include <numeric>
-#include <initializer_list>
-#include <cstdlib>
-#include "profiler/include/profile_conv_bwd_weight_impl.hpp"
+#include "profiler/include/profile_grouped_conv_bwd_weight_impl.hpp"
 namespace {
 enum struct ConvLayout
 {
-    NCHW_KCYX_NKHW, // 0
+    GNCHW_GKCYX_GNKHW, // 0
-    NHWC_KYXC_NHWK, // 1
+    GNHWC_GKYXC_GNHWK, // 1
 };
 enum struct ConvDataType
@@ -25,24 +25,25 @@ enum struct ConvDataType
 static void print_helper_msg()
 {
-    std::cout
+    std::cout << "arg1: tensor operation (conv_bwd_weight: Convolution Backward Weight\n"
-        << "arg1: tensor operation (conv_bwd_weight: Convolution Backward Weight\n"
+              << "arg2: data type (0: Input fp32, Weight fp32, Output fp32\n"
-        << "arg2: data type (0: Input fp32, Weight fp32, Output fp32\n"
+              << "                 1: Input fp16, Weight fp16, Output fp16\n"
-        << "                 1: Input fp16, Weight fp16, Output fp16\n"
+              << "                 2: Input bf16, Weight fp32, Output bf16)\n"
-        << "                 2: Input bf16, Weight fp32, Output bf16)\n"
+              << "arg3: tensor layout (0: Input[G, N, C, Hi, Wi], Weight[G, K, C, Y, X], Output[G, "
-        << "arg3: tensor layout (0: Input[N, C, Hi, Wi], Weight[K, C, Y, X], Output[N, K, Ho, Wo]\n"
+                 "N, K, Ho, Wo]\n"
-        << "                     1: Input[N, Hi, Wi, C], Weight[K, Y, X, C], Output[N, Ho, Wo, K]\n"
+              << "                     1: Input[G, N, Hi, Wi, C], Weight[G, K, Y, X, C], Output[G, "
-        << "arg4: verification (0: no, 1: yes)\n"
+                 "N, Ho, Wo, K]\n"
-        << "arg5: initialization (0: no init, 1: integer value, 2: decimal value)\n"
+              << "arg4: verification (0: no, 1: yes)\n"
-        << "arg6: print tensor value (0: no; 1: yes)\n"
+              << "arg5: initialization (0: no init, 1: integer value, 2: decimal value)\n"
-        << "arg7: time kernel (0: no, 1: yes)\n"
+              << "arg6: print tensor value (0: no; 1: yes)\n"
-        << ck::utils::conv::get_conv_param_parser_helper_msg() << " SplitK\n"
+              << "arg7: time kernel (0: no, 1: yes)\n"
-        << std::endl;
+              << ck::utils::conv::get_conv_param_parser_helper_msg() << " SplitK\n"
+              << std::endl;
 }
 } // namespace
-int profile_conv_bwd_weight(int argc, char* argv[])
+int profile_grouped_conv_bwd_weight(int argc, char* argv[])
 {
    // 8 for control, 1 for num_dim_spatial
    if(argc < 9)
@@ -75,17 +76,17 @@ int profile_conv_bwd_weight(int argc, char* argv[])
    using F16  = ck::half_t;
    using BF16 = ck::bhalf_t;
-    using NWC   = ck::tensor_layout::convolution::NWC;
+    using GNWC   = ck::tensor_layout::convolution::GNWC;
-    using NHWC  = ck::tensor_layout::convolution::NHWC;
+    using GNHWC  = ck::tensor_layout::convolution::GNHWC;
-    using NDHWC = ck::tensor_layout::convolution::NDHWC;
+    using GNDHWC = ck::tensor_layout::convolution::GNDHWC;
-    using KXC   = ck::tensor_layout::convolution::KXC;
+    using GKXC   = ck::tensor_layout::convolution::GKXC;
-    using KYXC  = ck::tensor_layout::convolution::KYXC;
+    using GKYXC  = ck::tensor_layout::convolution::GKYXC;
-    using KZYXC = ck::tensor_layout::convolution::KZYXC;
+    using GKZYXC = ck::tensor_layout::convolution::GKZYXC;
-    using NWK   = ck::tensor_layout::convolution::NWK;
+    using GNWK   = ck::tensor_layout::convolution::GNWK;
-    using NHWK  = ck::tensor_layout::convolution::NHWK;
+    using GNHWK  = ck::tensor_layout::convolution::GNHWK;
-    using NDHWK = ck::tensor_layout::convolution::NDHWK;
+    using GNDHWK = ck::tensor_layout::convolution::GNDHWK;
    constexpr auto I1 = ck::Number<1>{};
    constexpr auto I2 = ck::Number<2>{};
@@ -108,64 +109,64 @@ int profile_conv_bwd_weight(int argc, char* argv[])
        using WeiDataType = decltype(wei_type);
        using OutDataType = decltype(out_type);
-        bool pass = ck::profiler::profile_conv_bwd_weight_impl<NDimSpatial,
+        bool pass = ck::profiler::profile_grouped_conv_bwd_weight_impl<NDimSpatial,
-                                                               InLayout,
+                                                                       InLayout,
-                                                               WeiLayout,
+                                                                       WeiLayout,
-                                                               OutLayout,
+                                                                       OutLayout,
-                                                               InDataType,
+                                                                       InDataType,
-                                                               WeiDataType,
+                                                                       WeiDataType,
-                                                               OutDataType>(
+                                                                       OutDataType>(
            do_verification, init_method, do_log, time_kernel, params, split_k);
        return pass ? 0 : 1;
    };
-    if(num_dim_spatial == 1 && layout == ConvLayout::NHWC_KYXC_NHWK)
+    if(num_dim_spatial == 1 && layout == ConvLayout::GNHWC_GKYXC_GNHWK)
    {
        if(data_type == ConvDataType::F32_F32_F32)
        {
-            return profile(I1, NWC{}, KXC{}, NWK{}, F32{}, F32{}, F32{});
+            return profile(I1, GNWC{}, GKXC{}, GNWK{}, F32{}, F32{}, F32{});
        }
        else if(data_type == ConvDataType::F16_F16_F16)
        {
-            return profile(I1, NWC{}, KXC{}, NWK{}, F16{}, F16{}, F16{});
+            return profile(I1, GNWC{}, GKXC{}, GNWK{}, F16{}, F16{}, F16{});
        }
        else if(data_type == ConvDataType::BF16_F32_BF16)
        {
            // fp32 atomic add is used for weight tensor in bf16 kernel
-            return profile(I1, NWC{}, KXC{}, NWK{}, BF16{}, F32{}, BF16{});
+            return profile(I1, GNWC{}, GKXC{}, GNWK{}, BF16{}, F32{}, BF16{});
        }
    }
-    else if(num_dim_spatial == 2 && layout == ConvLayout::NHWC_KYXC_NHWK)
+    else if(num_dim_spatial == 2 && layout == ConvLayout::GNHWC_GKYXC_GNHWK)
    {
        if(data_type == ConvDataType::F32_F32_F32)
        {
-            return profile(I2, NHWC{}, KYXC{}, NHWK{}, F32{}, F32{}, F32{});
+            return profile(I2, GNHWC{}, GKYXC{}, GNHWK{}, F32{}, F32{}, F32{});
        }
        else if(data_type == ConvDataType::F16_F16_F16)
        {
-            return profile(I2, NHWC{}, KYXC{}, NHWK{}, F16{}, F16{}, F16{});
+            return profile(I2, GNHWC{}, GKYXC{}, GNHWK{}, F16{}, F16{}, F16{});
        }
        else if(data_type == ConvDataType::BF16_F32_BF16)
        {
            // fp32 atomic add is used for weight tensor in bf16 kernel
-            return profile(I2, NHWC{}, KYXC{}, NHWK{}, BF16{}, F32{}, BF16{});
+            return profile(I2, GNHWC{}, GKYXC{}, GNHWK{}, BF16{}, F32{}, BF16{});
        }
    }
-    else if(num_dim_spatial == 3 && layout == ConvLayout::NHWC_KYXC_NHWK)
+    else if(num_dim_spatial == 3 && layout == ConvLayout::GNHWC_GKYXC_GNHWK)
    {
        if(data_type == ConvDataType::F32_F32_F32)
        {
-            return profile(I3, NDHWC{}, KZYXC{}, NDHWK{}, F32{}, F32{}, F32{});
+            return profile(I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, F32{}, F32{}, F32{});
        }
        else if(data_type == ConvDataType::F16_F16_F16)
        {
-            return profile(I3, NDHWC{}, KZYXC{}, NDHWK{}, F16{}, F16{}, F16{});
+            return profile(I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, F16{}, F16{}, F16{});
        }
        else if(data_type == ConvDataType::BF16_F32_BF16)
        {
            // fp32 atomic add is used for weight tensor in bf16 kernel
-            return profile(I3, NDHWC{}, KZYXC{}, NDHWK{}, BF16{}, F32{}, BF16{});
+            return profile(I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, BF16{}, F32{}, BF16{});
        }
    }

--- a/profiler/src/profile_layernorm.cpp
+++ b/profiler/src/profile_layernorm.cpp
@@ -12,8 +12,7 @@ using ck::index_t;
 struct LayernormArgParser
 {
-    std::unordered_map<std::string, std::vector<int>> long_opts = {
+    std::unordered_map<std::string, std::vector<int>> long_opts = {{"length", {}}};
-        {"length", {}}, {"strideXY", {}}, {"strideGamma", {}}, {"strideBeta", {}}};
    bool parse_opt(int argc, char* argv[], const std::string& key, int i)
    {
@@ -52,9 +51,6 @@ void print_help_layernorm()
              << "arg4: print tensor value (0: no; 1: yes)\n"
              << "arg5: time kernel (0=no, 1=yes)\n"
              << "--length: tensor extents (e.g, --length 1024 1024) \n"
-              << "--strideXY: tensor strides (e.g, --strideXY 1024 1)\n"
-              << "--strideGamma: tensor strides (e.g, --strideGamma 1)\n"
-              << "--strideBeta: tensor strides (e.g, --strideBeta 1)\n"
              << std::endl;
 }
@@ -77,10 +73,7 @@ int profile_layernorm(int argc, char* argv[])
    // parse the long options
    arg_parser(argc, argv);
-    const std::vector<index_t> length      = arg_parser.long_opts["length"];
+    const std::vector<index_t> length = arg_parser.long_opts["length"];
-    const std::vector<index_t> strideXY    = arg_parser.long_opts["strideXY"];
-    const std::vector<index_t> strideGamma = arg_parser.long_opts["strideGamma"];
-    const std::vector<index_t> strideBeta  = arg_parser.long_opts["strideBeta"];
    using F16          = ck::half_t;
    using F32          = float;
@@ -88,25 +81,13 @@ int profile_layernorm(int argc, char* argv[])
    if(data_type == ck::DataTypeEnum::Half)
    {
-        ck::profiler::profile_layernorm_impl<F16, F16, F16, F32, F16, rank>(do_verification,
+        ck::profiler::profile_layernorm_impl<F16, F16, F16, F32, F16, rank>(
-                                                                            init_method,
+            do_verification, init_method, do_log, time_kernel, length);
-                                                                            do_log,
-                                                                            time_kernel,
-                                                                            length,
-                                                                            strideXY,
-                                                                            strideGamma,
-                                                                            strideBeta);
    }
    else if(data_type == ck::DataTypeEnum::Float)
    {
-        ck::profiler::profile_layernorm_impl<F32, F32, F32, F32, F32, rank>(do_verification,
+        ck::profiler::profile_layernorm_impl<F32, F32, F32, F32, F32, rank>(
-                                                                            init_method,
+            do_verification, init_method, do_log, time_kernel, length);
-                                                                            do_log,
-                                                                            time_kernel,
-                                                                            length,
-                                                                            strideXY,
-                                                                            strideGamma,
-                                                                            strideBeta);
    }
    else
    {

--- a/profiler/src/profile_normalization.cpp
+++ b/profiler/src/profile_normalization.cpp
@@ -5,17 +5,13 @@
 #include <vector>
 #include <unordered_map>
-#include "profiler/include/profile_normalization_impl.hpp"
+#include "profiler/include/profile_softmax_impl.hpp"
 using ck::index_t;
-using ck::profiler::NormDataType;
+using ck::profiler::SoftmaxDataType;
-using ck::profiler::NormType;
 struct ArgParser
 {
-    std::unordered_map<std::string, NormType> norm_dict = {{"batchnorm", NormType::BATCHNORM},
-                                                           {"softmax", NormType::SOFTMAX}};
    std::unordered_map<std::string, std::vector<int>> long_opts = {
        {"length", {}}, {"stride", {}}, {"reduce", {}}, {"alpha", {}}, {"beta", {}}};
@@ -50,7 +46,7 @@ struct ArgParser
 void print_help()
 {
-    std::cout << "arg1: tensor operation (batchnorm/softmax)\n"
+    std::cout << "arg1: tensor operation (softmax)\n"
              << "arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n"
              << "arg3: verification (0: no; 1: yes)\n"
              << "arg4: initialization (0: no init; 1: integer value; 2: decimal value)\n"
@@ -64,7 +60,7 @@ void print_help()
              << std::endl;
 }
-int profile_normalization(int argc, char* argv[])
+int profile_softmax(int argc, char* argv[])
 {
    if(argc <= 2)
    {
@@ -75,12 +71,11 @@ int profile_normalization(int argc, char* argv[])
    ArgParser arg_parser;
    // short unnamed options
-    const NormType norm_type     = arg_parser.norm_dict[argv[1]];
+    const SoftmaxDataType data_type = static_cast<SoftmaxDataType>(std::stoi(argv[2]));
-    const NormDataType data_type = static_cast<NormDataType>(std::stoi(argv[2]));
+    const bool do_verification      = std::stoi(argv[3]);
-    const bool do_verification   = std::stoi(argv[3]);
+    const int init_method           = std::stoi(argv[4]);
-    const int init_method        = std::stoi(argv[4]);
+    const bool do_log               = std::stoi(argv[5]);
-    const bool do_log            = std::stoi(argv[5]);
+    const bool time_kernel          = std::stoi(argv[6]);
-    const bool time_kernel       = std::stoi(argv[6]);
    // parse the long options
    arg_parser(argc, argv);
@@ -91,68 +86,64 @@ int profile_normalization(int argc, char* argv[])
        arg_parser.long_opts["alpha"].empty() ? 1 : arg_parser.long_opts["alpha"][0];
    const index_t beta = arg_parser.long_opts["beta"].empty() ? 0 : arg_parser.long_opts["beta"][0];
+    // Rank 3
    if(length.size() == 3)
    {
-        if(data_type == NormDataType::F16_F16)
+        if(data_type == SoftmaxDataType::F16_F16)
        {
-            ck::profiler::profile_normalization_impl<ck::half_t, float, ck::half_t, 3>(
+            ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 3>(do_verification,
-                do_verification,
+                                                                                 init_method,
-                init_method,
+                                                                                 do_log,
-                do_log,
+                                                                                 time_kernel,
-                time_kernel,
+                                                                                 length,
-                length,
+                                                                                 stride,
-                stride,
+                                                                                 reduce,
-                reduce,
+                                                                                 float(alpha),
-                float(alpha),
+                                                                                 float(beta));
-                float(beta),
-                norm_type);
        }
-        else if(data_type == NormDataType::F32_F32)
+        else if(data_type == SoftmaxDataType::F32_F32)
        {
-            ck::profiler::profile_normalization_impl<float, float, float, 3>(do_verification,
+            ck::profiler::profile_softmax_impl<float, float, float, 3>(do_verification,
-                                                                             init_method,
+                                                                       init_method,
-                                                                             do_log,
+                                                                       do_log,
-                                                                             time_kernel,
+                                                                       time_kernel,
-                                                                             length,
+                                                                       length,
-                                                                             stride,
+                                                                       stride,
-                                                                             reduce,
+                                                                       reduce,
-                                                                             float(alpha),
+                                                                       float(alpha),
-                                                                             float(beta),
+                                                                       float(beta));
-                                                                             norm_type);
        }
        else
        {
            throw std::runtime_error("not implemented yet");
        }
    }
+    // Rank 4
    else if(length.size() == 4)
    {
-        if(data_type == NormDataType::F16_F16)
+        if(data_type == SoftmaxDataType::F16_F16)
        {
-            ck::profiler::profile_normalization_impl<ck::half_t, float, ck::half_t, 4>(
+            ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 4>(do_verification,
-                do_verification,
+                                                                                 init_method,
-                init_method,
+                                                                                 do_log,
-                do_log,
+                                                                                 time_kernel,
-                time_kernel,
+                                                                                 length,
-                length,
+                                                                                 stride,
-                stride,
+                                                                                 reduce,
-                reduce,
+                                                                                 float(alpha),
-                float(alpha),
+                                                                                 float(beta));
-                float(beta),
-                norm_type);
        }
-        else if(data_type == NormDataType::F32_F32)
+        else if(data_type == SoftmaxDataType::F32_F32)
        {
-            ck::profiler::profile_normalization_impl<float, float, float, 4>(do_verification,
+            ck::profiler::profile_softmax_impl<float, float, float, 4>(do_verification,
-                                                                             init_method,
+                                                                       init_method,
-                                                                             do_log,
+                                                                       do_log,
-                                                                             time_kernel,
+                                                                       time_kernel,
-                                                                             length,
+                                                                       length,
-                                                                             stride,
+                                                                       stride,
-                                                                             reduce,
+                                                                       reduce,
-                                                                             float(alpha),
+                                                                       float(alpha),
-                                                                             float(beta),
+                                                                       float(beta));
-                                                                             norm_type);
        }
        else
        {

--- a/profiler/src/profiler.cpp
+++ b/profiler/src/profiler.cpp
@@ -18,9 +18,9 @@ int profile_conv_fwd(int, char*[]);
 int profile_conv_fwd_bias_relu(int, char*[]);
 int profile_conv_fwd_bias_relu_add(int, char*[]);
 int profile_conv_bwd_data(int, char*[]);
-int profile_conv_bwd_weight(int, char*[]);
 int profile_grouped_conv_fwd(int, char*[]);
-int profile_normalization(int, char*[]);
+int profile_grouped_conv_bwd_weight(int, char*[]);
+int profile_softmax(int, char*[]);
 int profile_layernorm(int, char*[]);
 int profile_groupnorm(int, char*[]);
 int profile_reduce(int, char*[]);
@@ -43,8 +43,9 @@ static void print_helper_message()
           "                        conv_fwd_bias_relu: ForwardConvolution+Bias+ReLU\n"
           "                        conv_fwd_bias_relu_add: ForwardConvolution+Bias+ReLU+Add\n"
           "                        conv_bwd_data: Convolution Backward Data\n"
-           "                        conv_bwd_weight: Convolution Backward Weight\n"
           "                        grouped_conv_fwd: Grouped Convolution Forward\n"
+           "                        grouped_conv_bwd_weight: Grouped Convolution Backward Weight\n"
+           "                        softmax: Softmax\n"
           "                        reduce: Reduce\n");
    // clang-format on
 }
@@ -117,21 +118,21 @@ int main(int argc, char* argv[])
    {
        return profile_conv_bwd_data(argc, argv);
    }
-    else if(strcmp(argv[1], "conv_bwd_weight") == 0)
-    {
-        return profile_conv_bwd_weight(argc, argv);
-    }
    else if(strcmp(argv[1], "grouped_conv_fwd") == 0)
    {
        return profile_grouped_conv_fwd(argc, argv);
    }
+    else if(strcmp(argv[1], "conv_bwd_weight") == 0)
+    {
+        return profile_grouped_conv_bwd_weight(argc, argv);
+    }
    else if(strcmp(argv[1], "reduce") == 0)
    {
        return profile_reduce(argc, argv);
    }
-    else if(strcmp(argv[1], "batchnorm") == 0 || strcmp(argv[1], "softmax") == 0)
+    else if(strcmp(argv[1], "softmax") == 0)
    {
-        return profile_normalization(argc, argv);
+        return profile_softmax(argc, argv);
    }
    else if(strcmp(argv[1], "layernorm") == 0)
    {

--- a/script/cmake-ck-dev.sh
+++ b/script/cmake-ck-dev.sh
@@ -11,7 +11,7 @@ cmake
 -D CMAKE_CXX_FLAGS="-O3 -ftemplate-backtrace-limit=0 -gline-tables-only -save-temps=$PWD"         \
 -D CMAKE_BUILD_TYPE=Release                                                                       \
 -D BUILD_DEV=ON                                                                                   \
-D GPU_TARGETS=gfx908;gfx90a                                                                      \
+-D GPU_TARGETS="gfx908;gfx90a"                                                                    \
 -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON                                                                 \
 -D USE_BITINT_EXTENSION_INT4=OFF                                                                  \
 ${MY_PROJECT_SOURCE}

--- a/script/cmake-ck-release.sh
+++ b/script/cmake-ck-release.sh
@@ -11,7 +11,7 @@ cmake
 -D CMAKE_CXX_FLAGS="-O3"                                                                          \
 -D CMAKE_BUILD_TYPE=Release                                                                       \
 -D BUILD_DEV=OFF                                                                                  \
-D GPU_TARGETS=gfx908;gfx90a                                                                      \
+-D GPU_TARGETS="gfx908;gfx90a"                                                                      \
 -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON                                                                 \
 -D USE_BITINT_EXTENSION_INT4=OFF                                                                  \
 ${MY_PROJECT_SOURCE}

--- a/script/process_perf_data.py
+++ b/script/process_perf_data.py
@@ -81,7 +81,7 @@ def parse_logfile(logfile):
    StrideA=[]
    StrideB=[]
    StrideC=[]
-    if 'perf_gemm' in logfile:
+    if 'perf_gemm.log' in logfile:
        for line in open(logfile):
            if 'Best Perf' in line:
                lst=line.split()
@@ -120,14 +120,14 @@ def parse_logfile(logfile):
        res = [x for _,x in sorted(zip(tests,tflops))]
        #sorted_kernels = [x for _,x in sorted(zip(tests,kernels))]
        test_list=list(range(1,len(tests)+1))
-    #parse conv_fwd performance tests:
+    #parse conv_fwd and conv_bwd performance tests:
-    elif 'conv_fwd' in logfile:
+    elif 'conv_fwd' in logfile or 'conv_bwd_data' in logfile:
        for line in open(logfile):
            if 'tflops:' in line:
                lst=line.split()
                res.append(lst[1])
    #parse all other performance tests:
-    elif 'resnet50' in logfile or 'batched_gemm' in logfile or 'grouped_gemm' in logfile or 'conv_bwd_data' in logfile or 'gemm_bilinear' in logfile or 'reduction' in logfile:
+    elif 'resnet50' in logfile or 'batched_gemm' in logfile or 'grouped_gemm' in logfile  or 'gemm_bilinear' in logfile or 'reduction' in logfile:
        for line in open(logfile):
            if 'Best Perf' in line:
                lst=line.split()
@@ -149,7 +149,7 @@ def store_new_test_result(table_name, test_results, testlist, branch_name, node_
    df=pd.DataFrame(data=[params],columns=['Branch_ID','Node_ID','GPU_arch','Compute Units','ROCM_version','HIP_version','Environment','Datetime'])
    df_add=pd.DataFrame(data=[test_results],columns=testlist)
    df=pd.concat([df,df_add],axis=1)
-    print("new test results dataframe:",df)
+    #print("new test results dataframe:",df)
    df.to_sql(table_name,connection,if_exists='append',index=False)
    return 0
@@ -165,7 +165,7 @@ def compare_test_to_baseline(baseline,test,testlist):
                print("test # ",i,"shows regression by {:.3f}%".format(
                    (float(test[i])-base_list[i])/base_list[i]*100))
                regression=1
-            ave_perf=ave_perf+float(test[i])/base_list[i]
+            if base_list[i]>0: ave_perf=ave_perf+float(test[i])/base_list[i]
        if regression==0:
            print("no regressions found")
        ave_perf=ave_perf/len(base_list)
@@ -248,7 +248,7 @@ def main():
        conn = sqlEngine.connect()
        #save gemm performance tests:
-        if 'perf_gemm' in filename:
+        if 'perf_gemm.log' in filename:
            #write the ck_gemm_test_params table only needed once the test set changes
            #post_test_params(test_list,conn)
            for i in range(1,len(results)+1):

--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -6,11 +6,10 @@ include(googletest)
 add_custom_target(tests)
 function(add_test_executable TEST_NAME)
    message("adding test ${TEST_NAME}")
    add_executable(${TEST_NAME} ${ARGN})
-    add_test(NAME ${TEST_NAME} COMMAND $<TARGET_FILE:${TEST_NAME}> )
+    add_test(NAME ${TEST_NAME} COMMAND $<TARGET_FILE:${TEST_NAME}>)
    add_dependencies(tests ${TEST_NAME})
    add_dependencies(check ${TEST_NAME})
    rocm_install(TARGETS ${TEST_NAME} COMPONENT tests)
@@ -23,14 +22,14 @@ function(add_gtest_executable TEST_NAME)
    add_executable(${TEST_NAME} ${ARGN})
    add_dependencies(tests ${TEST_NAME})
    add_dependencies(check ${TEST_NAME})
    # suppress gtest warnings
    target_compile_options(${TEST_NAME} PRIVATE -Wno-global-constructors -Wno-undef)
    target_link_libraries(${TEST_NAME} PRIVATE gtest_main)
-    gtest_discover_tests(${TEST_NAME})
+    add_test(NAME ${TEST_NAME} COMMAND $<TARGET_FILE:${TEST_NAME}> )
    rocm_install(TARGETS ${TEST_NAME} COMPONENT tests)
 endfunction(add_gtest_executable TEST_NAME)
 add_subdirectory(magic_number_division)
 add_subdirectory(space_filling_curve)
 add_subdirectory(conv_util)
@@ -42,14 +41,15 @@ add_subdirectory(batched_gemm)
 add_subdirectory(batched_gemm_reduce)
 add_subdirectory(batched_gemm_gemm)
 add_subdirectory(batched_gemm_softmax_gemm)
-add_subdirectory(batched_gemm_masking_scale_softmax_gemm_permute)
+add_subdirectory(batched_gemm_softmax_gemm_permute)
 add_subdirectory(grouped_gemm)
 add_subdirectory(reduce)
 add_subdirectory(convnd_fwd)
-add_subdirectory(convnd_bwd_weight)
 add_subdirectory(convnd_bwd_data)
 add_subdirectory(grouped_convnd_fwd)
+add_subdirectory(grouped_convnd_bwd_weight)
 add_subdirectory(block_to_ctile_map)
 add_subdirectory(softmax)
-add_subdirectory(layernorm)
+add_subdirectory(normalization)
 add_subdirectory(data_type)
+add_subdirectory(elementwise_normalization)
--- a/test/batched_gemm/CMakeLists.txt
+++ b/test/batched_gemm/CMakeLists.txt
@@ -2,3 +2,14 @@ add_test_executable(test_batched_gemm_fp16 batched_gemm_fp16.cpp)
 target_link_libraries(test_batched_gemm_fp16 PRIVATE utility)
 target_link_libraries(test_batched_gemm_fp16 PRIVATE device_batched_gemm_instance)
+add_test_executable(test_batched_gemm_fp32 batched_gemm_fp32.cpp)
+target_link_libraries(test_batched_gemm_fp32 PRIVATE utility)
+target_link_libraries(test_batched_gemm_fp32 PRIVATE device_batched_gemm_instance)
+add_test_executable(test_batched_gemm_bf16 batched_gemm_bf16.cpp)
+target_link_libraries(test_batched_gemm_bf16 PRIVATE utility)
+target_link_libraries(test_batched_gemm_bf16 PRIVATE device_batched_gemm_instance)
+add_test_executable(test_batched_gemm_int8 batched_gemm_int8.cpp)
+target_link_libraries(test_batched_gemm_int8 PRIVATE utility)
+target_link_libraries(test_batched_gemm_int8 PRIVATE device_batched_gemm_instance)
--- a/test/batched_gemm/batched_gemm_bf16.cpp
+++ b/test/batched_gemm/batched_gemm_bf16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include <iostream>
+#include "profiler/include/profile_batched_gemm_impl.hpp"
+namespace { // file-local aliases consumed by main() below
+using ADataType = ck::bhalf_t; // element type passed as the 1st template argument of the profiler call
+using BDataType = ck::bhalf_t; // element type passed as the 2nd template argument
+using CDataType = ck::bhalf_t; // element type passed as the 3rd template argument
+using Row = ck::tensor_layout::gemm::RowMajor; // shorthand layout tag
+using Col = ck::tensor_layout::gemm::ColumnMajor; // shorthand layout tag
+} // namespace
+int main() // Runs profile_batched_gemm_impl for four layout combinations with ck::bhalf_t data; returns 0 only if every executed case passes.
+{
+    int M          = 256;
+    int N          = 256;
+    int K          = 128;
+    int BatchCount = 3;
+    bool pass = true; // combined with `&&` below, so after the first failing case the remaining calls are skipped
+    pass = pass && // case 1: layout tags Row, Row, Row (presumably A, B, C order -- confirm against the profiler header)
+           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Row, Row>(
+               true, 1, false, 1, M, N, K, K, N, N, M * K, K * N, M * N, BatchCount); // NOTE(review): leading literals look like verification/init/log/timing switches, K,N,N like row strides and M*K,K*N,M*N like per-batch strides -- confirm
+    pass = pass && // case 2: layout tags Row, Col, Row; stride arguments become K, K, N
+           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Col, Row>(
+               true, 1, false, 1, M, N, K, K, K, N, M * K, K * N, M * N, BatchCount);
+    pass = pass && // case 3: layout tags Col, Row, Row; stride arguments become M, N, N
+           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Row, Row>(
+               true, 1, false, 1, M, N, K, M, N, N, M * K, K * N, M * N, BatchCount);
+    pass = pass && // case 4: layout tags Col, Col, Row; stride arguments become M, K, N
+           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Col, Row>(
+               true, 1, false, 1, M, N, K, M, K, N, M * K, K * N, M * N, BatchCount);
+    std::cout << "test BatchedGEMM bf16: " << (pass ? "Pass" : "Fail") << std::endl; // one-line human-readable verdict
+    return pass ? 0 : 1; // process exit status: 0 on pass, 1 on fail
+}
--- a/test/batched_gemm/batched_gemm_fp32.cpp
+++ b/test/batched_gemm/batched_gemm_fp32.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include <iostream>
+#include "profiler/include/profile_batched_gemm_impl.hpp"
+namespace { // file-local aliases consumed by main() below
+using ADataType = float; // element type passed as the 1st template argument of the profiler call
+using BDataType = float; // element type passed as the 2nd template argument
+using CDataType = float; // element type passed as the 3rd template argument
+using Row = ck::tensor_layout::gemm::RowMajor; // shorthand layout tag
+using Col = ck::tensor_layout::gemm::ColumnMajor; // shorthand layout tag
+} // namespace
+int main() // Runs profile_batched_gemm_impl for four layout combinations with float data; returns 0 only if every executed case passes.
+{
+    int M          = 256;
+    int N          = 256;
+    int K          = 128;
+    int BatchCount = 3;
+    bool pass = true; // combined with `&&` below, so after the first failing case the remaining calls are skipped
+    pass = pass && // case 1: layout tags Row, Row, Row (presumably A, B, C order -- confirm against the profiler header)
+           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Row, Row>(
+               true, 1, false, 1, M, N, K, K, N, N, M * K, K * N, M * N, BatchCount); // NOTE(review): leading literals look like verification/init/log/timing switches, K,N,N like row strides and M*K,K*N,M*N like per-batch strides -- confirm
+    pass = pass && // case 2: layout tags Row, Col, Row; stride arguments become K, K, N
+           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Col, Row>(
+               true, 1, false, 1, M, N, K, K, K, N, M * K, K * N, M * N, BatchCount);
+    pass = pass && // case 3: layout tags Col, Row, Row; stride arguments become M, N, N
+           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Row, Row>(
+               true, 1, false, 1, M, N, K, M, N, N, M * K, K * N, M * N, BatchCount);
+    pass = pass && // case 4: layout tags Col, Col, Row; stride arguments become M, K, N
+           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Col, Row>(
+               true, 1, false, 1, M, N, K, M, K, N, M * K, K * N, M * N, BatchCount);
+    std::cout << "test BatchedGEMM fp32: " << (pass ? "Pass" : "Fail") << std::endl; // one-line human-readable verdict
+    return pass ? 0 : 1; // process exit status: 0 on pass, 1 on fail
+}