Merge branch 'develop' into modified_grouped_gemm_addressing_method

f5de8b57 · Chao Liu · GitHub · e83c7061 · fa9a0a5c · f5de8b57
Unverified Commit f5de8b57 authored Jun 30, 2022 by Chao Liu Committed by GitHub Jun 30, 2022
16 changed files
--- a/library/src/tensor_operation_instance/gpu/normalization/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/normalization/CMakeLists.txt
+# device_normalization_instance: object library built from the softmax instance sources listed below
+set(DEVICE_NORMALIZATION_INSTANCE_SOURCE
+    device_softmax_f32_f32_instance.cpp
+    device_softmax_f16_f16_instance.cpp
+)
+add_library(device_normalization_instance OBJECT ${DEVICE_NORMALIZATION_INSTANCE_SOURCE}) # OBJECT: sources are compiled, but not archived or linked into a library file
+set_target_properties(device_normalization_instance PROPERTIES POSITION_INDEPENDENT_CODE ON) # compile the objects as position-independent code
+clang_tidy_check(device_normalization_instance) # clang_tidy_check is a project helper defined elsewhere (not visible here)
--- a/library/src/tensor_operation_instance/gpu/normalization/device_softmax_f16_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_softmax_f16_f16_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/device_operation_instance.hpp"
+#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
+#include "ck/utility/data_type.hpp"
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_normalization_instance {
+using F16 = ck::half_t; // used as InDataType and OutDataType in the tuple below
+using F32 = float; // used as AccDataType in the tuple below
+template <index_t Rank, index_t Reduce> // Rank and Reduce fill the Rank / NumReduceDim columns of every entry
+using device_softmax_f16_f16_instances = std::tuple< // F16-in / F16-out DeviceSoftmax configurations; each entry has MThreadClusterSize * KThreadClusterSize == BlockSize (256)
+    // clang-format off
+        // InDataType, AccDataType, OutDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>
+        DeviceSoftmax<F16, F32, F16, Rank, Reduce, 256, 8, 32, 1, 8, 1, 1, 1>, // fallback kernel: the only entry with InSrcVectorSize = OutDstVectorSize = 1
+        DeviceSoftmax<F16, F32, F16, Rank, Reduce, 256, 8, 32, 1, 8, 1, 8, 8>,
+        DeviceSoftmax<F16, F32, F16, Rank, Reduce, 256, 4, 64, 1, 8, 1, 8, 8>,
+        DeviceSoftmax<F16, F32, F16, Rank, Reduce, 256, 2, 128, 1, 8, 1, 8, 8>,
+        DeviceSoftmax<F16, F32, F16, Rank, Reduce, 256, 2, 128, 1, 16, 1, 8, 8>,
+        DeviceSoftmax<F16, F32, F16, Rank, Reduce, 256, 2, 128, 1, 32, 1, 8, 8>,
+        DeviceSoftmax<F16, F32, F16, Rank, Reduce, 256, 1, 256, 1, 8, 1, 8, 8>,
+        DeviceSoftmax<F16, F32, F16, Rank, Reduce, 256, 1, 256, 1, 16, 1, 8, 8>,
+        DeviceSoftmax<F16, F32, F16, Rank, Reduce, 256, 1, 256, 1, 32, 1, 8, 8>
+    // clang-format on
+    >;
+void add_device_softmax_f16_f16_rank3_instances(std::vector<DeviceNormalizationPtr>& instances) // instances: caller-owned list handed to add_device_operation_instances
+{ // Registers the F16 softmax tuple for Rank 3 with each NumReduceDim in {1, 2}; add_device_operation_instances is defined elsewhere and presumably appends one entry per tuple element
+    add_device_operation_instances(instances, device_softmax_f16_f16_instances<3, 1>{}); // Rank 3, NumReduceDim 1
+    add_device_operation_instances(instances, device_softmax_f16_f16_instances<3, 2>{}); // Rank 3, NumReduceDim 2
+}
+void add_device_softmax_f16_f16_rank4_instances(std::vector<DeviceNormalizationPtr>& instances) // instances: caller-owned list handed to add_device_operation_instances
+{ // Registers the F16 softmax tuple for Rank 4 with each NumReduceDim in {1, 2, 3}; add_device_operation_instances is defined elsewhere and presumably appends one entry per tuple element
+    add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 1>{}); // Rank 4, NumReduceDim 1
+    add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 2>{}); // Rank 4, NumReduceDim 2
+    add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 3>{}); // Rank 4, NumReduceDim 3
+}
+} // namespace device_normalization_instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/normalization/device_softmax_f32_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_softmax_f32_f32_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/device_operation_instance.hpp"
+#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
+#include "ck/utility/data_type.hpp"
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_normalization_instance {
+using F32 = float; // used as InDataType, AccDataType and OutDataType in the tuple below
+template <index_t Rank, index_t Reduce> // Rank and Reduce fill the Rank / NumReduceDim columns of every entry
+using device_softmax_f32_f32_instances = std::tuple< // F32-in / F32-out DeviceSoftmax configurations; each entry has MThreadClusterSize * KThreadClusterSize == BlockSize (256)
+    // clang-format off
+        // InDataType, AccDataType, OutDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>
+        DeviceSoftmax<F32, F32, F32, Rank, Reduce, 256, 8, 32, 1, 8, 1, 1, 1>, // fallback kernel: the only entry with InSrcVectorSize = OutDstVectorSize = 1
+        DeviceSoftmax<F32, F32, F32, Rank, Reduce, 256, 8, 32, 1, 8, 1, 4, 4>,
+        DeviceSoftmax<F32, F32, F32, Rank, Reduce, 256, 4, 64, 1, 8, 1, 4, 4>,
+        DeviceSoftmax<F32, F32, F32, Rank, Reduce, 256, 2, 128, 1, 8, 1, 4, 4>,
+        DeviceSoftmax<F32, F32, F32, Rank, Reduce, 256, 2, 128, 1, 16, 1, 4, 4>,
+        DeviceSoftmax<F32, F32, F32, Rank, Reduce, 256, 2, 128, 1, 32, 1, 4, 4>,
+        DeviceSoftmax<F32, F32, F32, Rank, Reduce, 256, 1, 256, 1, 8, 1, 4, 4>,
+        DeviceSoftmax<F32, F32, F32, Rank, Reduce, 256, 1, 256, 1, 16, 1, 4, 4>,
+        DeviceSoftmax<F32, F32, F32, Rank, Reduce, 256, 1, 256, 1, 32, 1, 4, 4>
+    // clang-format on
+    >;
+void add_device_softmax_f32_f32_rank3_instances(std::vector<DeviceNormalizationPtr>& instances) // instances: caller-owned list handed to add_device_operation_instances
+{ // Registers the F32 softmax tuple for Rank 3 with each NumReduceDim in {1, 2}; add_device_operation_instances is defined elsewhere and presumably appends one entry per tuple element
+    add_device_operation_instances(instances, device_softmax_f32_f32_instances<3, 1>{}); // Rank 3, NumReduceDim 1
+    add_device_operation_instances(instances, device_softmax_f32_f32_instances<3, 2>{}); // Rank 3, NumReduceDim 2
+}
+void add_device_softmax_f32_f32_rank4_instances(std::vector<DeviceNormalizationPtr>& instances) // instances: caller-owned list handed to add_device_operation_instances
+{ // Registers the F32 softmax tuple for Rank 4 with each NumReduceDim in {1, 2, 3}; add_device_operation_instances is defined elsewhere and presumably appends one entry per tuple element
+    add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 1>{}); // Rank 4, NumReduceDim 1
+    add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 2>{}); // Rank 4, NumReduceDim 2
+    add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 3>{}); // Rank 4, NumReduceDim 3
+}
+} // namespace device_normalization_instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/profiler/CMakeLists.txt
+++ b/profiler/CMakeLists.txt
@@ -22,6 +22,7 @@ set(PROFILER_SOURCE
    src/profile_conv_bwd_weight.cpp
    src/profile_batched_gemm_reduce.cpp
    src/profile_gemm_add_add_fastgelu.cpp
+    src/profile_normalization.cpp
 )
 add_executable(ckProfiler ${PROFILER_SOURCE})
@@ -46,4 +47,5 @@ target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_instance)
 target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_add_instance)
 target_link_libraries(ckProfiler PRIVATE device_convnd_bwd_data_instance)
 target_link_libraries(ckProfiler PRIVATE device_conv2d_bwd_weight_instance)
+target_link_libraries(ckProfiler PRIVATE device_normalization_instance)
 target_link_libraries(ckProfiler PRIVATE device_reduce_instance)
--- a/profiler/include/profile_batched_gemm_reduce_impl.hpp
+++ b/profiler/include/profile_batched_gemm_reduce_impl.hpp
@@ -6,7 +6,7 @@
 #include "ck/ck.hpp"
 #include "ck/utility/reduction_operator.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/device_batched_gemm_reduce.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_reduce.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/library/utility/check_err.hpp"
@@ -21,32 +21,28 @@ namespace tensor_operation {
 namespace device {
 namespace device_gemm_instance {
-using F32            = float;
+using F32                 = float;
-using F16            = ck::half_t;
+using F16                 = ck::half_t;
-using DPtrsGlobal    = ck::Tuple<F32*, F32*>;
+using ReducePtrsGlobal    = ck::Tuple<F32*, F32*>;
-using Identity       = ck::tensor_operation::element_wise::PassThrough;
+using Identity            = ck::tensor_operation::element_wise::PassThrough;
-using Square         = ck::tensor_operation::element_wise::UnarySquare;
+using Square              = ck::tensor_operation::element_wise::UnarySquare;
-using DInElementOps  = ck::Tuple<Identity, Square>;
+using ReduceInElementOps  = ck::Tuple<Identity, Square>;
-using DOutElementOps = ck::Tuple<Identity, Identity>;
+using ReduceOutElementOps = ck::Tuple<Identity, Identity>;
-using DeviceBatchedGemmReduceNoOpPtr = ck::tensor_operation::device::DeviceBatchedGemmReducePtr<
+using DeviceGemmReduceNoOpPtr =
-    ck::tensor_operation::element_wise::PassThrough,
+    ck::tensor_operation::device::DeviceGemmReducePtr<0, ReducePtrsGlobal::Size()>;
-    ck::tensor_operation::element_wise::PassThrough,
-    ck::tensor_operation::element_wise::PassThrough,
-    DInElementOps,
-    DOutElementOps>;
 void add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn_instances(
-    std::vector<DeviceBatchedGemmReduceNoOpPtr>&);
+    std::vector<DeviceGemmReduceNoOpPtr>&);
 void add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gnk_gmn_instances(
-    std::vector<DeviceBatchedGemmReduceNoOpPtr>&);
+    std::vector<DeviceGemmReduceNoOpPtr>&);
 void add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gkn_gmn_instances(
-    std::vector<DeviceBatchedGemmReduceNoOpPtr>&);
+    std::vector<DeviceGemmReduceNoOpPtr>&);
 void add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_instances(
-    std::vector<DeviceBatchedGemmReduceNoOpPtr>&);
+    std::vector<DeviceGemmReduceNoOpPtr>&);
 } // namespace device_gemm_instance
 } // namespace device
@@ -59,7 +55,7 @@ namespace profiler {
 template <typename ADataType,
          typename BDataType,
          typename CDataType,
-          typename DDataType,
+          typename ReduceDataType,
          typename ALayout,
          typename BLayout,
          typename CLayout>
@@ -99,16 +95,16 @@ bool profile_batched_gemm_reduce_impl(int do_verification,
    Tensor<CDataType> c_g_m_n_host_result(
        f_host_tensor_descriptor(BatchCount, M, N, StrideC, CLayout{}));
-    Tensor<DDataType> d0_g_m_host_result(HostTensorDescriptor(std::vector<std::size_t>(
+    Tensor<ReduceDataType> d0_g_m_host_result(HostTensorDescriptor(std::vector<std::size_t>(
        {static_cast<std::size_t>(BatchCount), static_cast<std::size_t>(M)})));
-    Tensor<DDataType> d1_g_m_host_result(HostTensorDescriptor(std::vector<std::size_t>(
+    Tensor<ReduceDataType> d1_g_m_host_result(HostTensorDescriptor(std::vector<std::size_t>(
        {static_cast<std::size_t>(BatchCount), static_cast<std::size_t>(M)})));
    Tensor<CDataType> c_g_m_n_device_result(
        f_host_tensor_descriptor(BatchCount, M, N, StrideC, CLayout{}));
-    Tensor<DDataType> d0_g_m_device_result(HostTensorDescriptor(std::vector<std::size_t>(
+    Tensor<ReduceDataType> d0_g_m_device_result(HostTensorDescriptor(std::vector<std::size_t>(
        {static_cast<std::size_t>(BatchCount), static_cast<std::size_t>(M)})));
-    Tensor<DDataType> d1_g_m_device_result(HostTensorDescriptor(std::vector<std::size_t>(
+    Tensor<ReduceDataType> d1_g_m_device_result(HostTensorDescriptor(std::vector<std::size_t>(
        {static_cast<std::size_t>(BatchCount), static_cast<std::size_t>(M)})));
    std::cout << "a_g_m_k: " << a_g_m_k.mDesc << std::endl;
@@ -135,20 +131,23 @@ bool profile_batched_gemm_reduce_impl(int do_verification,
    using AElementOp            = ck::tensor_operation::element_wise::PassThrough;
    using BElementOp            = ck::tensor_operation::element_wise::PassThrough;
    using CElementOp            = ck::tensor_operation::element_wise::PassThrough;
-    using D0ReduceOp            = ck::reduce::Add;
+    using ReduceOp0             = ck::reduce::Add;
-    using D1ReduceOp            = ck::reduce::Add;
+    using ReduceOp1             = ck::reduce::Add;
    using UnaryIdenticElementOp = ck::tensor_operation::element_wise::PassThrough;
    using UnarySquareElementOp  = ck::tensor_operation::element_wise::UnarySquare;
-    using DxsInElementOps       = ck::Tuple<UnaryIdenticElementOp, UnarySquareElementOp>;
-    using DxsOutElementOps      = ck::Tuple<UnaryIdenticElementOp, UnaryIdenticElementOp>;
-    const auto a_element_op       = AElementOp{};
+    auto a_element_op                     = AElementOp{};
-    const auto b_element_op       = BElementOp{};
+    auto b_element_op                     = BElementOp{};
-    const auto c_element_op       = CElementOp{};
+    auto c_element_op                     = CElementOp{};
-    const auto dxs_in_element_op  = DxsInElementOps{};
+    std::array<void*, 3> gemm_element_ops = {&a_element_op, &b_element_op, &c_element_op};
-    const auto dxs_out_element_op = DxsOutElementOps{};
-    const auto d0_reduce_op       = D0ReduceOp{};
+    const auto reduce0_op = ReduceOp0{};
-    const auto d1_reduce_op       = D1ReduceOp{};
+    const auto reduce1_op = ReduceOp1{};
+    auto passthrough                            = UnaryIdenticElementOp{};
+    auto square                                 = UnarySquareElementOp{};
+    std::array<void*, 2> reduce_in_element_ops  = {&passthrough, &square};
+    std::array<void*, 2> reduce_out_element_ops = {&passthrough, &passthrough};
    if(do_verification)
    {
@@ -160,6 +159,8 @@ bool profile_batched_gemm_reduce_impl(int do_verification,
                                                             BElementOp,
                                                             CElementOp>;
+        using ReduceAccDataType = ReduceDataType;
        auto ref_batched_gemm = ReferenceBatchedGemmInstance{};
        auto ref_invoker      = ref_batched_gemm.MakeInvoker();
@@ -172,21 +173,22 @@ bool profile_batched_gemm_reduce_impl(int do_verification,
        {
            for(int m = 0; m < M; ++m)
            {
-                float d0_acc = d0_reduce_op.GetIdentityValue<float>();
+                auto reduce0_acc = reduce0_op.GetIdentityValue<ReduceAccDataType>();
-                float d1_acc = d1_reduce_op.GetIdentityValue<float>();
+                auto reduce1_acc = reduce1_op.GetIdentityValue<ReduceAccDataType>();
                for(int n = 0; n < N; ++n)
                {
-                    float d0_val = ck::type_convert<float>(c_g_m_n_host_result(batch, m, n));
+                    ReduceAccDataType d0_val =
-                    float d1_val;
+                        ck::type_convert<ReduceAccDataType>(c_g_m_n_host_result(batch, m, n));
+                    ReduceAccDataType d1_val;
-                    UnarySquareElementOp{}(d1_val, d0_val);
+                    square(d1_val, d0_val);
-                    d0_reduce_op(d0_acc, d0_val);
+                    reduce0_op(reduce0_acc, d0_val);
-                    d1_reduce_op(d1_acc, d1_val);
+                    reduce1_op(reduce1_acc, d1_val);
                }
-                d0_g_m_host_result(batch, m) = ck::type_convert<DDataType>(d0_acc);
+                d0_g_m_host_result(batch, m) = ck::type_convert<ReduceDataType>(reduce0_acc);
-                d1_g_m_host_result(batch, m) = ck::type_convert<DDataType>(d1_acc);
+                d1_g_m_host_result(batch, m) = ck::type_convert<ReduceDataType>(reduce1_acc);
            }
        }
    }
@@ -194,17 +196,19 @@ bool profile_batched_gemm_reduce_impl(int do_verification,
    DeviceMem a_device_buf(sizeof(ADataType) * a_g_m_k.mDesc.GetElementSpace());
    DeviceMem b_device_buf(sizeof(BDataType) * b_g_k_n.mDesc.GetElementSpace());
    DeviceMem c_device_buf(sizeof(CDataType) * c_g_m_n_device_result.mDesc.GetElementSpace());
-    DeviceMem d0_device_buf(sizeof(DDataType) * d0_g_m_device_result.mDesc.GetElementSpace());
+    DeviceMem reduce0_device_buf(sizeof(ReduceDataType) *
-    DeviceMem d1_device_buf(sizeof(DDataType) * d1_g_m_device_result.mDesc.GetElementSpace());
+                                 d0_g_m_device_result.mDesc.GetElementSpace());
+    DeviceMem reduce1_device_buf(sizeof(ReduceDataType) *
+                                 d1_g_m_device_result.mDesc.GetElementSpace());
-    auto dxs_global = ck::make_tuple(static_cast<DDataType*>(d0_device_buf.GetDeviceBuffer()),
+    std::array<void*, 2> p_reduces = {reduce0_device_buf.GetDeviceBuffer(),
-                                     static_cast<DDataType*>(d1_device_buf.GetDeviceBuffer()));
+                                      reduce1_device_buf.GetDeviceBuffer()};
    a_device_buf.ToDevice(a_g_m_k.mData.data());
    b_device_buf.ToDevice(b_g_k_n.mData.data());
    // add device GEMM instances
-    std::vector<ck::tensor_operation::device::device_gemm_instance::DeviceBatchedGemmReduceNoOpPtr>
+    std::vector<ck::tensor_operation::device::device_gemm_instance::DeviceGemmReduceNoOpPtr>
        gemm_ptrs;
    if constexpr(is_same<ADataType, half_t>::value && is_same<BDataType, half_t>::value &&
@@ -257,31 +261,32 @@ bool profile_batched_gemm_reduce_impl(int do_verification,
    // profile device GEMM instances
    for(auto& gemm_ptr : gemm_ptrs)
    {
-        auto argument_ptr =
+        auto argument_ptr = gemm_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
-            gemm_ptr->MakeArgumentPointer(static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
+                                                          b_device_buf.GetDeviceBuffer(),
-                                          static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
+                                                          nullptr,
-                                          static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
+                                                          {},
-                                          &dxs_global,
+                                                          c_device_buf.GetDeviceBuffer(),
-                                          M,
+                                                          p_reduces,
-                                          N,
+                                                          M,
-                                          K,
+                                                          N,
-                                          StrideA,
+                                                          K,
-                                          StrideB,
+                                                          StrideA,
-                                          StrideC,
+                                                          StrideB,
-                                          a_element_op,
+                                                          StrideC,
-                                          b_element_op,
+                                                          {},
-                                          c_element_op,
+                                                          gemm_element_ops,
-                                          dxs_in_element_op,
+                                                          {},
-                                          dxs_out_element_op,
+                                                          reduce_in_element_ops,
-                                          BatchCount);
+                                                          reduce_out_element_ops,
+                                                          BatchCount);
        auto invoker_ptr = gemm_ptr->MakeInvokerPointer();
        if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            // init D0, D1 (the reduce0/reduce1 device buffers) to 0
-            d0_device_buf.SetZero();
+            reduce0_device_buf.SetZero();
-            d1_device_buf.SetZero();
+            reduce1_device_buf.SetZero();
            float ave_time =
                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
@@ -311,8 +316,8 @@ bool profile_batched_gemm_reduce_impl(int do_verification,
            if(do_verification)
            {
                c_device_buf.FromDevice(c_g_m_n_device_result.mData.data());
-                d0_device_buf.FromDevice(d0_g_m_device_result.mData.data());
+                reduce0_device_buf.FromDevice(d0_g_m_device_result.mData.data());
-                d1_device_buf.FromDevice(d1_g_m_device_result.mData.data());
+                reduce1_device_buf.FromDevice(d1_g_m_device_result.mData.data());
                float c_error  = check_error(c_g_m_n_host_result, c_g_m_n_device_result);
                float d0_error = check_error(d0_g_m_host_result, d0_g_m_device_result);

--- a/profiler/include/profile_gemm_bias_add_reduce_impl.hpp
+++ b/profiler/include/profile_gemm_bias_add_reduce_impl.hpp
@@ -21,33 +21,28 @@ namespace tensor_operation {
 namespace device {
 namespace device_gemm_instance {
-using F32            = float;
+using F32                 = float;
-using F16            = ck::half_t;
+using F16                 = ck::half_t;
-using DPtrsGlobal    = ck::Tuple<F32*, F32*>;
+using ReducePtrsGlobal    = ck::Tuple<F32*, F32*>;
-using Div            = ck::tensor_operation::element_wise::UnaryDivide;
+using Div                 = ck::tensor_operation::element_wise::UnaryDivide;
-using Identity       = ck::tensor_operation::element_wise::PassThrough;
+using Identity            = ck::tensor_operation::element_wise::PassThrough;
-using Square         = ck::tensor_operation::element_wise::UnarySquare;
+using Square              = ck::tensor_operation::element_wise::UnarySquare;
-using DInElementOps  = ck::Tuple<Identity, Square>;
+using ReduceInElementOps  = ck::Tuple<Identity, Square>;
-using DOutElementOps = ck::Tuple<Div, Div>;
+using ReduceOutElementOps = ck::Tuple<Div, Div>;
-using DeviceGemmBiasAddReduceNoOpPtr = ck::tensor_operation::device::DeviceGemmBiasAddReducePtr<
+using DeviceGemmBiasAddReduceNoOpPtr =
-    ck::tensor_operation::element_wise::PassThrough,
+    ck::tensor_operation::device::DeviceGemmReducePtr<1, ReducePtrsGlobal::Size()>;
-    ck::tensor_operation::element_wise::PassThrough,
-    ck::tensor_operation::element_wise::PassThrough,
+void add_device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_mk_kn_mn_instances(
-    ck::tensor_operation::element_wise::PassThrough,
-    DInElementOps,
-    DOutElementOps>;
-void add_device_gemm_bias_add_reduce_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_mk_kn_mn_instances(
    std::vector<DeviceGemmBiasAddReduceNoOpPtr>&);
-void add_device_gemm_bias_add_reduce_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_mk_nk_mn_instances(
+void add_device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_mk_nk_mn_instances(
    std::vector<DeviceGemmBiasAddReduceNoOpPtr>&);
-void add_device_gemm_bias_add_reduce_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_km_kn_mn_instances(
+void add_device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_km_kn_mn_instances(
    std::vector<DeviceGemmBiasAddReduceNoOpPtr>&);
-void add_device_gemm_bias_add_reduce_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_km_nk_mn_instances(
+void add_device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_km_nk_mn_instances(
    std::vector<DeviceGemmBiasAddReduceNoOpPtr>&);
 } // namespace device_gemm_instance
@@ -61,9 +56,9 @@ namespace profiler {
 template <typename ADataType,
          typename BDataType,
          typename CDataType,
-          typename C0DataType,
+          typename BiasDataType,
-          typename C1DataType,
+          typename D0DataType,
-          typename DDataType,
+          typename ReduceDataType,
          typename ALayout,
          typename BLayout,
          typename CLayout>
@@ -77,7 +72,7 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
                                       int StrideA,
                                       int StrideB,
                                       int StrideC,
-                                       int StrideC1)
+                                       int StrideD0)
 {
    auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) {
        return HostTensorDescriptor(std::vector<std::size_t>({len}),
@@ -102,24 +97,24 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
    Tensor<BDataType> b_k_n(f_host_tensor_descriptor2d(K, N, StrideB, BLayout{}));
    Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{}));
-    Tensor<C0DataType> bias_n(f_host_tensor_descriptor1d(N, 1));
+    Tensor<BiasDataType> bias_n(f_host_tensor_descriptor1d(N, 1));
-    Tensor<C1DataType> c1_m_n(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{}));
+    Tensor<D0DataType> d0_m_n(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{}));
-    Tensor<DDataType> d0_m_host_result(
+    Tensor<ReduceDataType> reduce0_m_host_result(
        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
-    Tensor<DDataType> d1_m_host_result(
+    Tensor<ReduceDataType> reduce1_m_host_result(
        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
    Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{}));
-    Tensor<DDataType> d0_m_device_result(
+    Tensor<ReduceDataType> reduce0_m_device_result(
        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
-    Tensor<DDataType> d1_m_device_result(
+    Tensor<ReduceDataType> reduce1_m_device_result(
        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
    std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
-    std::cout << "d0_m: " << d0_m_host_result.mDesc << std::endl;
+    std::cout << "reduce0_m: " << reduce0_m_host_result.mDesc << std::endl;
-    std::cout << "d1_m: " << d1_m_host_result.mDesc << std::endl;
+    std::cout << "reduce1_m: " << reduce1_m_host_result.mDesc << std::endl;
    std::size_t num_thread = 1;
    switch(init_method)
@@ -130,50 +125,53 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5}, num_thread);
        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5}, num_thread);
        bias_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5}, num_thread);
-        c1_m_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5}, num_thread);
+        d0_m_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5}, num_thread);
        break;
    default:
        std::srand(0);
        a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0}, num_thread);
        b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5}, num_thread);
        bias_n.GenerateTensorValue(GeneratorTensor_3<ADataType>{-0.5, 0.5}, num_thread);
-        c1_m_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5}, num_thread);
+        d0_m_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5}, num_thread);
    }
    using PassThrough           = ck::tensor_operation::element_wise::PassThrough;
    using AElementOp            = PassThrough;
    using BElementOp            = PassThrough;
    using CElementOp            = PassThrough;
-    using C1ElementOp           = PassThrough;
+    using D0ElementOp           = PassThrough;
-    using D0ReduceOp            = ck::reduce::Add;
+    using ReduceOp0             = ck::reduce::Add;
-    using D1ReduceOp            = ck::reduce::Add;
+    using ReduceOp1             = ck::reduce::Add;
    using UnaryDivElementOp     = ck::tensor_operation::element_wise::UnaryDivide;
    using UnaryIdenticElementOp = ck::tensor_operation::element_wise::PassThrough;
    using UnarySquareElementOp  = ck::tensor_operation::element_wise::UnarySquare;
-    using DxsInElementOps       = ck::Tuple<UnaryIdenticElementOp, UnarySquareElementOp>;
-    using DxsOutElementOps      = ck::Tuple<UnaryDivElementOp, UnaryDivElementOp>;
-    const auto a_element_op  = AElementOp{};
+    auto a_element_op                     = AElementOp{};
-    const auto b_element_op  = BElementOp{};
+    auto b_element_op                     = BElementOp{};
-    const auto c_element_op  = CElementOp{};
+    auto c_element_op                     = CElementOp{};
-    const auto c1_element_op = C1ElementOp{};
+    std::array<void*, 3> gemm_element_ops = {&a_element_op, &b_element_op, &c_element_op};
-    const auto d0_reduce_op  = D0ReduceOp{};
-    const auto d1_reduce_op  = D1ReduceOp{};
-    auto dxs_in_element_op  = DxsInElementOps{};
+    auto d0_element_op    = D0ElementOp{};
-    auto dxs_out_element_op = DxsOutElementOps{N, N};
+    const auto reduce0_op = ReduceOp0{};
+    const auto reduce1_op = ReduceOp1{};
+    auto passthrough                            = UnaryIdenticElementOp{};
+    auto square                                 = UnarySquareElementOp{};
+    auto div                                    = UnaryDivElementOp{N};
+    std::array<void*, 2> reduce_in_element_ops  = {&passthrough, &square};
+    std::array<void*, 2> reduce_out_element_ops = {&div, &div};
    if(do_verification)
    {
        using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
                                                                                BDataType,
                                                                                CDataType,
-                                                                                DDataType,
+                                                                                ReduceDataType,
                                                                                AElementOp,
                                                                                BElementOp,
                                                                                CElementOp>;
-        using ReduceAccDataType = DDataType;
+        using ReduceAccDataType = ReduceDataType;
        auto ref_gemm    = ReferenceGemmInstance{};
        auto ref_invoker = ref_gemm.MakeInvoker();
@@ -189,53 +187,53 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
                ReduceAccDataType acc = static_cast<ReduceAccDataType>(c_m_n_host_result(m, n)) +
                                        static_cast<ReduceAccDataType>(bias_n(n));
-                ReduceAccDataType c1 = static_cast<ReduceAccDataType>(c1_m_n(m, n));
+                ReduceAccDataType d0 = static_cast<ReduceAccDataType>(d0_m_n(m, n));
                c_element_op(acc, acc);
-                c1_element_op(c1, c1);
+                d0_element_op(d0, d0);
-                acc += c1;
+                acc += d0;
                c_m_n_host_result(m, n) = static_cast<CDataType>(acc);
            }
        for(int m = 0; m < M; ++m)
        {
-            auto d0_acc = d0_reduce_op.GetIdentityValue<ReduceAccDataType>();
+            auto reduce0_acc = reduce0_op.GetIdentityValue<ReduceAccDataType>();
-            auto d1_acc = d1_reduce_op.GetIdentityValue<ReduceAccDataType>();
+            auto reduce1_acc = reduce1_op.GetIdentityValue<ReduceAccDataType>();
            for(int n = 0; n < N; ++n)
            {
-                ReduceAccDataType c_val =
+                ReduceAccDataType d0_val =
                    ck::type_convert<ReduceAccDataType>(c_m_n_host_result(m, n));
-                ReduceAccDataType d0_val;
                ReduceAccDataType d1_val;
-                dxs_in_element_op(ck::Number<0>{})(d0_val, c_val);
+                square(d1_val, d0_val);
-                dxs_in_element_op(ck::Number<1>{})(d1_val, c_val);
+                reduce0_op(reduce0_acc, d0_val);
-                d0_reduce_op(d0_acc, d0_val);
+                reduce1_op(reduce1_acc, d1_val);
-                d1_reduce_op(d1_acc, d1_val);
            }
-            dxs_out_element_op(ck::Number<0>{})(d0_acc, d0_acc);
+            div(reduce0_acc, reduce0_acc);
-            dxs_out_element_op(ck::Number<1>{})(d1_acc, d1_acc);
+            div(reduce1_acc, reduce1_acc);
-            d0_m_host_result(m) = ck::type_convert<DDataType>(d0_acc);
+            reduce0_m_host_result(m) = ck::type_convert<ReduceDataType>(reduce0_acc);
-            d1_m_host_result(m) = ck::type_convert<DDataType>(d1_acc);
+            reduce1_m_host_result(m) = ck::type_convert<ReduceDataType>(reduce1_acc);
        }
    }
    DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
    DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
    DeviceMem c_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpace());
-    DeviceMem bias_device_buf(sizeof(C0DataType) * bias_n.mDesc.GetElementSpace());
+    DeviceMem bias_device_buf(sizeof(BiasDataType) * bias_n.mDesc.GetElementSpace());
-    DeviceMem c1_device_buf(sizeof(C1DataType) * c1_m_n.mDesc.GetElementSpace());
+    DeviceMem d0_device_buf(sizeof(D0DataType) * d0_m_n.mDesc.GetElementSpace());
-    DeviceMem d0_device_buf(sizeof(DDataType) * d0_m_device_result.mDesc.GetElementSpace());
+    DeviceMem reduce0_device_buf(sizeof(ReduceDataType) *
-    DeviceMem d1_device_buf(sizeof(DDataType) * d1_m_device_result.mDesc.GetElementSpace());
+                                 reduce0_m_device_result.mDesc.GetElementSpace());
+    DeviceMem reduce1_device_buf(sizeof(ReduceDataType) *
+                                 reduce1_m_device_result.mDesc.GetElementSpace());
-    auto dxs_global = ck::make_tuple(static_cast<DDataType*>(d0_device_buf.GetDeviceBuffer()),
+    std::array<void*, 2> p_reduces = {reduce0_device_buf.GetDeviceBuffer(),
-                                     static_cast<DDataType*>(d1_device_buf.GetDeviceBuffer()));
+                                      reduce1_device_buf.GetDeviceBuffer()};
    a_device_buf.ToDevice(a_m_k.mData.data());
    b_device_buf.ToDevice(b_k_n.mData.data());
    bias_device_buf.ToDevice(bias_n.mData.data());
-    c1_device_buf.ToDevice(c1_m_n.mData.data());
+    d0_device_buf.ToDevice(d0_m_n.mData.data());
    // add device GEMM instances
    std::vector<ck::tensor_operation::device::device_gemm_instance::DeviceGemmBiasAddReduceNoOpPtr>
@@ -249,7 +247,7 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
                     is_same<CLayout, tensor_layout::gemm::RowMajor>::value)
        {
            ck::tensor_operation::device::device_gemm_instance::
-                add_device_gemm_bias_add_reduce_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_mk_kn_mn_instances(
+                add_device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_mk_kn_mn_instances(
                    gemm_ptrs);
        }
        else if constexpr(is_same<ALayout, tensor_layout::gemm::RowMajor>::value &&
@@ -257,7 +255,7 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
                          is_same<CLayout, tensor_layout::gemm::RowMajor>::value)
        {
            ck::tensor_operation::device::device_gemm_instance::
-                add_device_gemm_bias_add_reduce_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_mk_nk_mn_instances(
+                add_device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_mk_nk_mn_instances(
                    gemm_ptrs);
        }
        else if constexpr(is_same<ALayout, tensor_layout::gemm::ColumnMajor>::value &&
@@ -265,7 +263,7 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
                          is_same<CLayout, tensor_layout::gemm::RowMajor>::value)
        {
            ck::tensor_operation::device::device_gemm_instance::
-                add_device_gemm_bias_add_reduce_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_km_kn_mn_instances(
+                add_device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_km_kn_mn_instances(
                    gemm_ptrs);
        }
        else if constexpr(is_same<ALayout, tensor_layout::gemm::ColumnMajor>::value &&
@@ -273,7 +271,7 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
                          is_same<CLayout, tensor_layout::gemm::RowMajor>::value)
        {
            ck::tensor_operation::device::device_gemm_instance::
-                add_device_gemm_bias_add_reduce_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_km_nk_mn_instances(
+                add_device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_km_nk_mn_instances(
                    gemm_ptrs);
        }
    }
@@ -291,34 +289,31 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
    // profile device GEMM instances
    for(auto& gemm_ptr : gemm_ptrs)
    {
-        auto argument_ptr = gemm_ptr->MakeArgumentPointer(
+        auto argument_ptr = gemm_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
-            static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
+                                                          b_device_buf.GetDeviceBuffer(),
-            static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
+                                                          bias_device_buf.GetDeviceBuffer(),
-            static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
+                                                          {d0_device_buf.GetDeviceBuffer()},
-            static_cast<C0DataType*>(bias_device_buf.GetDeviceBuffer()),
+                                                          c_device_buf.GetDeviceBuffer(),
-            static_cast<C1DataType*>(c1_device_buf.GetDeviceBuffer()),
+                                                          p_reduces,
-            &dxs_global,
+                                                          M,
-            M,
+                                                          N,
-            N,
+                                                          K,
-            K,
+                                                          StrideA,
-            StrideA,
+                                                          StrideB,
-            StrideB,
+                                                          StrideC,
-            StrideC,
+                                                          {StrideD0},
-            StrideC1,
+                                                          gemm_element_ops,
-            a_element_op,
+                                                          {&d0_element_op},
-            b_element_op,
+                                                          reduce_in_element_ops,
-            c_element_op,
+                                                          reduce_out_element_ops);
-            c1_element_op,
-            dxs_in_element_op,
-            dxs_out_element_op);
        auto invoker_ptr = gemm_ptr->MakeInvokerPointer();
        if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            // init DO, D1 to 0
-            d0_device_buf.SetZero();
+            reduce0_device_buf.SetZero();
-            d1_device_buf.SetZero();
+            reduce1_device_buf.SetZero();
            float ave_time =
                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
@@ -328,9 +323,9 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
            std::size_t flop = std::size_t(2) * M * N * K + std::size_t(2) * M * N;
            std::size_t num_byte = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
-                                   sizeof(CDataType) * M * N + sizeof(C0DataType) * M * N +
+                                   sizeof(CDataType) * M * N + sizeof(BiasDataType) * M * N +
-                                   sizeof(C1DataType) * M * N + sizeof(DDataType) * M +
+                                   sizeof(D0DataType) * M * N + sizeof(ReduceDataType) * M +
-                                   sizeof(DDataType) * M;
+                                   sizeof(ReduceDataType) * M;
            float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
@@ -350,12 +345,12 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
            if(do_verification)
            {
                c_device_buf.FromDevice(c_m_n_device_result.mData.data());
-                d0_device_buf.FromDevice(d0_m_device_result.mData.data());
+                reduce0_device_buf.FromDevice(reduce0_m_device_result.mData.data());
-                d1_device_buf.FromDevice(d1_m_device_result.mData.data());
+                reduce1_device_buf.FromDevice(reduce1_m_device_result.mData.data());
                ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
-                ck::utils::check_err(d0_m_device_result.mData, d0_m_host_result.mData);
+                ck::utils::check_err(reduce0_m_device_result.mData, reduce0_m_host_result.mData);
-                ck::utils::check_err(d1_m_device_result.mData, d1_m_host_result.mData);
+                ck::utils::check_err(reduce1_m_device_result.mData, reduce1_m_host_result.mData);
                if(do_log)
                {
@@ -365,13 +360,17 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
                        << std::endl;
                    LogRangeAsType<float>(std::cout << "c_device: ", c_m_n_device_result.mData, ",")
                        << std::endl;
-                    LogRangeAsType<float>(std::cout << "d0_host: ", d0_m_host_result.mData, ",")
+                    LogRangeAsType<float>(
+                        std::cout << "d0_host: ", reduce0_m_host_result.mData, ",")
                        << std::endl;
-                    LogRangeAsType<float>(std::cout << "d0_device: ", d0_m_device_result.mData, ",")
+                    LogRangeAsType<float>(
+                        std::cout << "d0_device: ", reduce0_m_device_result.mData, ",")
                        << std::endl;
-                    LogRangeAsType<float>(std::cout << "d1_host: ", d1_m_host_result.mData, ",")
+                    LogRangeAsType<float>(
+                        std::cout << "d1_host: ", reduce1_m_host_result.mData, ",")
                        << std::endl;
-                    LogRangeAsType<float>(std::cout << "d1_device: ", d1_m_device_result.mData, ",")
+                    LogRangeAsType<float>(
+                        std::cout << "d1_device: ", reduce1_m_device_result.mData, ",")
                        << std::endl;
                }
            }

--- a/profiler/include/profile_gemm_reduce_impl.hpp
+++ b/profiler/include/profile_gemm_reduce_impl.hpp
@@ -21,21 +21,17 @@ namespace tensor_operation {
 namespace device {
 namespace device_gemm_instance {
-using F32            = float;
+using F32                 = float;
-using F16            = ck::half_t;
+using F16                 = ck::half_t;
-using DPtrsGlobal    = ck::Tuple<F32*, F32*>;
+using ReducePtrsGlobal    = ck::Tuple<F32*, F32*>;
-using Div            = ck::tensor_operation::element_wise::UnaryDivide;
+using Div                 = ck::tensor_operation::element_wise::UnaryDivide;
-using Identity       = ck::tensor_operation::element_wise::PassThrough;
+using Identity            = ck::tensor_operation::element_wise::PassThrough;
-using Square         = ck::tensor_operation::element_wise::UnarySquare;
+using Square              = ck::tensor_operation::element_wise::UnarySquare;
-using DInElementOps  = ck::Tuple<Identity, Square>;
+using ReduceInElementOps  = ck::Tuple<Identity, Square>;
-using DOutElementOps = ck::Tuple<Div, Div>;
+using ReduceOutElementOps = ck::Tuple<Div, Div>;
-using DeviceGemmReduceNoOpPtr = ck::tensor_operation::device::DeviceGemmReducePtr<
+using DeviceGemmReduceNoOpPtr =
-    ck::tensor_operation::element_wise::PassThrough,
+    ck::tensor_operation::device::DeviceGemmReducePtr<0, ReducePtrsGlobal::Size()>;
-    ck::tensor_operation::element_wise::PassThrough,
-    ck::tensor_operation::element_wise::PassThrough,
-    DInElementOps,
-    DOutElementOps>;
 void add_device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instances(
    std::vector<DeviceGemmReduceNoOpPtr>&);
@@ -60,7 +56,7 @@ namespace profiler {
 template <typename ADataType,
          typename BDataType,
          typename CDataType,
-          typename DDataType,
+          typename ReduceDataType,
          typename ALayout,
          typename BLayout,
          typename CLayout>
@@ -95,22 +91,22 @@ bool profile_gemm_reduce_impl(int do_verification,
    Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
    Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
-    Tensor<DDataType> d0_m_host_result(
+    Tensor<ReduceDataType> reduce0_m_host_result(
        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
-    Tensor<DDataType> d1_m_host_result(
+    Tensor<ReduceDataType> reduce1_m_host_result(
        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
    Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
-    Tensor<DDataType> d0_m_device_result(
+    Tensor<ReduceDataType> reduce0_m_device_result(
        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
-    Tensor<DDataType> d1_m_device_result(
+    Tensor<ReduceDataType> reduce1_m_device_result(
        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
    std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
-    std::cout << "d0_m: " << d0_m_host_result.mDesc << std::endl;
+    std::cout << "reduce0_m: " << reduce0_m_host_result.mDesc << std::endl;
-    std::cout << "d1_m: " << d1_m_host_result.mDesc << std::endl;
+    std::cout << "reduce1_m: " << reduce1_m_host_result.mDesc << std::endl;
    std::size_t num_thread = 1;
    switch(init_method)
@@ -130,34 +126,37 @@ bool profile_gemm_reduce_impl(int do_verification,
    using AElementOp            = ck::tensor_operation::element_wise::PassThrough;
    using BElementOp            = ck::tensor_operation::element_wise::PassThrough;
    using CElementOp            = ck::tensor_operation::element_wise::PassThrough;
-    using D0ReduceOp            = ck::reduce::Add;
+    using ReduceOp0             = ck::reduce::Add;
-    using D1ReduceOp            = ck::reduce::Add;
+    using ReduceOp1             = ck::reduce::Add;
-    using UnaryDivElementOp     = ck::tensor_operation::element_wise::UnaryDivide;
    using UnaryIdenticElementOp = ck::tensor_operation::element_wise::PassThrough;
    using UnarySquareElementOp  = ck::tensor_operation::element_wise::UnarySquare;
-    using DxsInElementOps       = ck::Tuple<UnaryIdenticElementOp, UnarySquareElementOp>;
+    using UnaryDivElementOp     = ck::tensor_operation::element_wise::UnaryDivide;
-    using DxsOutElementOps      = ck::Tuple<UnaryDivElementOp, UnaryDivElementOp>;
+    auto a_element_op                     = AElementOp{};
+    auto b_element_op                     = BElementOp{};
+    auto c_element_op                     = CElementOp{};
+    std::array<void*, 3> gemm_element_ops = {&a_element_op, &b_element_op, &c_element_op};
-    const auto a_element_op = AElementOp{};
+    const auto reduce0_op = ReduceOp0{};
-    const auto b_element_op = BElementOp{};
+    const auto reduce1_op = ReduceOp1{};
-    const auto c_element_op = CElementOp{};
-    const auto d0_reduce_op = D0ReduceOp{};
-    const auto d1_reduce_op = D1ReduceOp{};
-    auto dxs_in_element_op  = DxsInElementOps{};
+    auto passthrough                            = UnaryIdenticElementOp{};
-    auto dxs_out_element_op = DxsOutElementOps{N, N};
+    auto square                                 = UnarySquareElementOp{};
+    auto div                                    = UnaryDivElementOp{N};
+    std::array<void*, 2> reduce_in_element_ops  = {&passthrough, &square};
+    std::array<void*, 2> reduce_out_element_ops = {&div, &div};
    if(do_verification)
    {
        using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
                                                                                BDataType,
                                                                                CDataType,
-                                                                                DDataType,
+                                                                                ReduceDataType,
                                                                                AElementOp,
                                                                                BElementOp,
                                                                                CElementOp>;
-        using ReduceAccDataType = DDataType;
+        using ReduceAccDataType = ReduceDataType;
        auto ref_gemm    = ReferenceGemmInstance{};
        auto ref_invoker = ref_gemm.MakeInvoker();
@@ -169,37 +168,37 @@ bool profile_gemm_reduce_impl(int do_verification,
        for(int m = 0; m < M; ++m)
        {
-            auto d0_acc = d0_reduce_op.GetIdentityValue<ReduceAccDataType>();
+            auto reduce0_acc = reduce0_op.GetIdentityValue<ReduceAccDataType>();
-            auto d1_acc = d1_reduce_op.GetIdentityValue<ReduceAccDataType>();
+            auto reduce1_acc = reduce1_op.GetIdentityValue<ReduceAccDataType>();
            for(int n = 0; n < N; ++n)
            {
-                ReduceAccDataType c_val =
+                ReduceAccDataType d0_val =
                    ck::type_convert<ReduceAccDataType>(c_m_n_host_result(m, n));
-                ReduceAccDataType d0_val;
                ReduceAccDataType d1_val;
-                dxs_in_element_op(ck::Number<0>{})(d0_val, c_val);
+                square(d1_val, d0_val);
-                dxs_in_element_op(ck::Number<1>{})(d1_val, c_val);
+                reduce0_op(reduce0_acc, d0_val);
-                d0_reduce_op(d0_acc, d0_val);
+                reduce1_op(reduce1_acc, d1_val);
-                d1_reduce_op(d1_acc, d1_val);
            }
-            dxs_out_element_op(ck::Number<0>{})(d0_acc, d0_acc);
+            div(reduce0_acc, reduce0_acc);
-            dxs_out_element_op(ck::Number<1>{})(d1_acc, d1_acc);
+            div(reduce1_acc, reduce1_acc);
-            d0_m_host_result(m) = ck::type_convert<DDataType>(d0_acc);
+            reduce0_m_host_result(m) = ck::type_convert<ReduceDataType>(reduce0_acc);
-            d1_m_host_result(m) = ck::type_convert<DDataType>(d1_acc);
+            reduce1_m_host_result(m) = ck::type_convert<ReduceDataType>(reduce1_acc);
        }
    }
    DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
    DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
    DeviceMem c_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpace());
-    DeviceMem d0_device_buf(sizeof(DDataType) * d0_m_device_result.mDesc.GetElementSpace());
+    DeviceMem reduce0_device_buf(sizeof(ReduceDataType) *
-    DeviceMem d1_device_buf(sizeof(DDataType) * d1_m_device_result.mDesc.GetElementSpace());
+                                 reduce0_m_device_result.mDesc.GetElementSpace());
+    DeviceMem reduce1_device_buf(sizeof(ReduceDataType) *
+                                 reduce1_m_device_result.mDesc.GetElementSpace());
-    auto dxs_global = ck::make_tuple(static_cast<DDataType*>(d0_device_buf.GetDeviceBuffer()),
+    std::array<void*, 2> p_reduces = {reduce0_device_buf.GetDeviceBuffer(),
-                                     static_cast<DDataType*>(d1_device_buf.GetDeviceBuffer()));
+                                      reduce1_device_buf.GetDeviceBuffer()};
    a_device_buf.ToDevice(a_m_k.mData.data());
    b_device_buf.ToDevice(b_k_n.mData.data());
@@ -258,30 +257,31 @@ bool profile_gemm_reduce_impl(int do_verification,
    // profile device GEMM instances
    for(auto& gemm_ptr : gemm_ptrs)
    {
-        auto argument_ptr =
+        auto argument_ptr = gemm_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
-            gemm_ptr->MakeArgumentPointer(static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
+                                                          b_device_buf.GetDeviceBuffer(),
-                                          static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
+                                                          nullptr,
-                                          static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
+                                                          {},
-                                          &dxs_global,
+                                                          c_device_buf.GetDeviceBuffer(),
-                                          M,
+                                                          p_reduces,
-                                          N,
+                                                          M,
-                                          K,
+                                                          N,
-                                          StrideA,
+                                                          K,
-                                          StrideB,
+                                                          StrideA,
-                                          StrideC,
+                                                          StrideB,
-                                          a_element_op,
+                                                          StrideC,
-                                          b_element_op,
+                                                          {},
-                                          c_element_op,
+                                                          gemm_element_ops,
-                                          dxs_in_element_op,
+                                                          {},
-                                          dxs_out_element_op);
+                                                          reduce_in_element_ops,
+                                                          reduce_out_element_ops);
        auto invoker_ptr = gemm_ptr->MakeInvokerPointer();
        if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            // init DO, D1 to 0
-            d0_device_buf.SetZero();
+            reduce0_device_buf.SetZero();
-            d1_device_buf.SetZero();
+            reduce1_device_buf.SetZero();
            float ave_time =
                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
@@ -311,12 +311,12 @@ bool profile_gemm_reduce_impl(int do_verification,
            if(do_verification)
            {
                c_device_buf.FromDevice(c_m_n_device_result.mData.data());
-                d0_device_buf.FromDevice(d0_m_device_result.mData.data());
+                reduce0_device_buf.FromDevice(reduce0_m_device_result.mData.data());
-                d1_device_buf.FromDevice(d1_m_device_result.mData.data());
+                reduce1_device_buf.FromDevice(reduce1_m_device_result.mData.data());
                ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
-                ck::utils::check_err(d0_m_device_result.mData, d0_m_host_result.mData);
+                ck::utils::check_err(reduce0_m_device_result.mData, reduce0_m_host_result.mData);
-                ck::utils::check_err(d1_m_device_result.mData, d1_m_host_result.mData);
+                ck::utils::check_err(reduce1_m_device_result.mData, reduce1_m_host_result.mData);
                if(do_log)
                {
@@ -326,13 +326,17 @@ bool profile_gemm_reduce_impl(int do_verification,
                        << std::endl;
                    LogRangeAsType<float>(std::cout << "c_device: ", c_m_n_device_result.mData, ",")
                        << std::endl;
-                    LogRangeAsType<float>(std::cout << "d0_host: ", d0_m_host_result.mData, ",")
+                    LogRangeAsType<float>(
+                        std::cout << "d0_host: ", reduce0_m_host_result.mData, ",")
                        << std::endl;
-                    LogRangeAsType<float>(std::cout << "d0_device: ", d0_m_device_result.mData, ",")
+                    LogRangeAsType<float>(
+                        std::cout << "d0_device: ", reduce0_m_device_result.mData, ",")
                        << std::endl;
-                    LogRangeAsType<float>(std::cout << "d1_host: ", d1_m_host_result.mData, ",")
+                    LogRangeAsType<float>(
+                        std::cout << "d1_host: ", reduce1_m_host_result.mData, ",")
                        << std::endl;
-                    LogRangeAsType<float>(std::cout << "d1_device: ", d1_m_device_result.mData, ",")
+                    LogRangeAsType<float>(
+                        std::cout << "d1_device: ", reduce1_m_device_result.mData, ",")
                        << std::endl;
                }
            }

--- a/profiler/include/profile_grouped_gemm_impl.hpp
+++ b/profiler/include/profile_grouped_gemm_impl.hpp
@@ -232,6 +232,10 @@ void profile_grouped_gemm_impl(int do_verification,
        auto invoker_ptr = gemm_ptr->MakeInvokerPointer();
+        DeviceMem gemm_desc_workspace(gemm_ptr->GetWorkSpaceSize(argument_ptr.get()));
+        gemm_ptr->SetWorkSpacePointer(argument_ptr.get(), gemm_desc_workspace.GetDeviceBuffer());
        if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            std::string gemm_name = gemm_ptr->GetTypeString();

--- a/profiler/include/profile_normalization_impl.hpp
+++ b/profiler/include/profile_normalization_impl.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include <iomanip>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/conv_util.hpp"
+#include "ck/library/host_tensor/device_memory.hpp"
+#include "ck/library/host_tensor/host_tensor.hpp"
+#include "ck/library/host_tensor/host_tensor_generator.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp"
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_normalization_instance {
+// Forward declarations of the softmax instance factories, one per data type pair
+// (f16->f16, f32->f32) and tensor rank (3, 4). Each takes the instance vector by
+// non-const reference; the definitions are not part of this header
+// (presumably they live in the device_normalization_instance library - confirm).
+void add_device_softmax_f16_f16_rank3_instances(std::vector<DeviceNormalizationPtr>&);
+void add_device_softmax_f16_f16_rank4_instances(std::vector<DeviceNormalizationPtr>&);
+void add_device_softmax_f32_f32_rank3_instances(std::vector<DeviceNormalizationPtr>&);
+void add_device_softmax_f32_f32_rank4_instances(std::vector<DeviceNormalizationPtr>&);
+} // namespace device_normalization_instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
+namespace ck {
+namespace profiler {
+// Kind of normalization the profiler is asked to run.
+// Within this header only SOFTMAX has device instances registered by
+// profile_normalization_impl(); the other values are accepted but yield no instances.
+enum struct NormType
+{
+    LAYERNORM,
+    BATCHNORM,
+    SOFTMAX,
+};
+// Input/output data type pairs; each enumerator is named <input type>_<output type>.
+enum struct NormDataType
+{
+    F32_F32, // in, out
+    F16_F16,
+    BF16_BF16,
+    INT8_INT8,
+};
+// clang-format off
+// Returns the short textual name of a data type, used when printing the profiler summary.
+// Only the specializations below are defined; any other type fails at link time.
+// The specializations are marked `inline` because they are defined in a header: a full
+// specialization of a function template is an ordinary function, so without `inline` every
+// translation unit including this header would emit its own definition (ODR violation).
+// The template parameter is named T so that it does not shadow the NormDataType enum.
+template <typename T> std::string type_to_string();
+template <> inline std::string type_to_string<float>()   { return "f32"; }
+template <> inline std::string type_to_string<half_t>()  { return "f16"; }
+template <> inline std::string type_to_string<bhalf_t>() { return "bf16"; }
+template <> inline std::string type_to_string<int8_t>()  { return "int8"; }
+template <> inline std::string type_to_string<int32_t>() { return "int32"; }
+// clang-format on
+// Runs every registered device normalization instance that matches the problem, prints the
+// timing of each one and finally a "Best Perf" summary line.
+//
+// Parameters:
+//   do_verification - when non-zero, each instance's output is compared against
+//                     tensor_operation::host::ReferenceSoftmax.
+//   init_method     - 0: GeneratorTensor_1, 1: GeneratorTensor_2{-5, 5},
+//                     anything else: GeneratorTensor_3 (in: [0.0, 1.0], out: [-0.5, 0.5]).
+//   do_log          - when true (and verifying), dumps input, reference and device output.
+//   time_kernel     - forwarded to StreamConfig for the kernel launch.
+//   in_length       - tensor lengths; only sizes 3 and 4 have instances registered here.
+//   in_strides      - tensor strides; when empty the tensor is built from the lengths alone.
+//   reduce_dims     - dimensions to reduce over, forwarded to the instance.
+//   alpha, beta     - scalers, passed to the instance by pointer.
+//   norm_type       - only NormType::SOFTMAX registers instances in this function.
+//
+// Throws std::runtime_error when no instance was registered for the requested
+// norm_type / data types / rank.
+template <typename InDataType, typename AccDataType, typename OutDataType>
+void profile_normalization_impl(int do_verification,
+                                int init_method,
+                                bool do_log,
+                                bool time_kernel,
+                                std::vector<index_t> in_length,
+                                std::vector<index_t> in_strides,
+                                std::vector<index_t> reduce_dims,
+                                AccDataType alpha,
+                                AccDataType beta,
+                                NormType norm_type)
+{
+    Tensor<InDataType> in = in_strides.empty() ? Tensor<InDataType>(in_length)
+                                               : Tensor<InDataType>(in_length, in_strides);
+    Tensor<OutDataType> out(in.mDesc);
+    switch(init_method)
+    {
+    case 0:
+        in.GenerateTensorValue(GeneratorTensor_1<InDataType>{});
+        out.GenerateTensorValue(GeneratorTensor_1<OutDataType>{});
+        break;
+    case 1:
+        in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
+        out.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
+        break;
+    default:
+        in.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
+        out.GenerateTensorValue(GeneratorTensor_3<OutDataType>{-0.5, 0.5});
+    }
+    // The reference starts from the same initial output contents as the device buffer.
+    Tensor<OutDataType> out_ref(out);
+    DeviceMem in_dev(sizeof(InDataType) * in.mDesc.GetElementSpace());
+    DeviceMem out_dev(sizeof(OutDataType) * out.mDesc.GetElementSpace());
+    in_dev.ToDevice(in.mData.data());
+    out_dev.ToDevice(out.mData.data());
+    // Re-read lengths/strides from the descriptor so that defaulted strides are included.
+    std::vector<index_t> i_in_lengths(in.mDesc.GetLengths().begin(), in.mDesc.GetLengths().end());
+    std::vector<index_t> i_in_strides(in.mDesc.GetStrides().begin(), in.mDesc.GetStrides().end());
+    // add device normalization instances
+    std::vector<tensor_operation::device::DeviceNormalizationPtr> instances;
+    if(norm_type == NormType::SOFTMAX)
+    {
+        if constexpr(is_same<InDataType, half_t>::value && is_same<OutDataType, half_t>::value &&
+                     is_same<AccDataType, float>::value)
+        {
+            if(in_length.size() == 3)
+                tensor_operation::device::device_normalization_instance::
+                    add_device_softmax_f16_f16_rank3_instances(instances);
+            if(in_length.size() == 4)
+                tensor_operation::device::device_normalization_instance::
+                    add_device_softmax_f16_f16_rank4_instances(instances);
+        }
+        else if constexpr(is_same<InDataType, float>::value && is_same<OutDataType, float>::value &&
+                          is_same<AccDataType, float>::value)
+        {
+            if(in_length.size() == 3)
+                tensor_operation::device::device_normalization_instance::
+                    add_device_softmax_f32_f32_rank3_instances(instances);
+            if(in_length.size() == 4)
+                tensor_operation::device::device_normalization_instance::
+                    add_device_softmax_f32_f32_rank4_instances(instances);
+        }
+    }
+    if(instances.empty())
+    {
+        throw std::runtime_error("wrong! no device normalization instance found");
+    }
+    std::string best_instance_name;
+    float best_avg_time   = std::numeric_limits<float>::max();
+    float best_gb_per_sec = 0;
+    for(auto& inst_ptr : instances)
+    {
+        // Is this user's responsibility to check if problem mismatches kernel instance (ie. rank 3
+        // problem to rank 4 kernel) other than invoking IsSupportedArgument()?
+        if(!(inst_ptr->GetRank() == static_cast<index_t>(i_in_lengths.size()) &&
+             inst_ptr->GetNumReduceDim() == static_cast<index_t>(reduce_dims.size())))
+        {
+            continue;
+        }
+        auto argument_ptr = inst_ptr->MakeArgumentPointer(i_in_lengths,
+                                                          i_in_strides,
+                                                          reduce_dims,
+                                                          &alpha,
+                                                          &beta,
+                                                          in_dev.GetDeviceBuffer(),
+                                                          out_dev.GetDeviceBuffer());
+        if(!inst_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            std::cout << inst_ptr->GetTypeString() << " skipped due to unsupported argument: ";
+            LogRange(std::cout << "input lengths = [", in_length, ", ")
+                << "], "
+                << "scaler = [" << alpha << ", " << beta << "]." << std::endl;
+            // Skip only this instance: the remaining instances may still support the
+            // problem, and the summary below must still be printed.
+            continue;
+        }
+        auto invoker_ptr = inst_ptr->MakeInvokerPointer();
+        float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+        // Input is read once; the output is counted twice when beta != 0.
+        std::size_t num_bytes =
+            in.mDesc.GetElementSize() * sizeof(InDataType) +
+            (beta == 0.0f ? 1 : 2) * out.mDesc.GetElementSize() * sizeof(OutDataType);
+        // avg_time is reported in ms below, so bytes / 1e6 / ms yields GB/s.
+        float gb_per_sec = num_bytes / 1.E6 / avg_time;
+        std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, "
+                  << inst_ptr->GetTypeString() << std::endl;
+        if(avg_time < best_avg_time)
+        {
+            best_instance_name = inst_ptr->GetTypeString();
+            best_avg_time      = avg_time;
+            best_gb_per_sec    = gb_per_sec;
+        }
+        if(do_verification)
+        {
+            // NOTE(review): out_ref and out_dev are not re-initialised between instances (or
+            // between timed repetitions). If a non-zero beta makes the result depend on the
+            // previous output contents, later comparisons may use compounded values - confirm.
+            // TODO: factory method to dynamically switch between different reference normalizations
+            using ReferenceFactory =
+                tensor_operation::host::ReferenceSoftmax<InDataType, OutDataType, AccDataType>;
+            ReferenceFactory{}.MakeInvoker().Run({in, out_ref, alpha, beta, reduce_dims});
+            out_dev.FromDevice(out.mData.data());
+            bool pass;
+            if constexpr(std::is_same<InDataType, int8_t>::value)
+            {
+                pass = ck::utils::check_err(
+                    out.mData, out_ref.mData, "Error: Incorrect results!", 0, 1);
+                if(do_log)
+                {
+                    LogRangeAsType<int>(std::cout << "in  : ", in.mData, ",") << std::endl;
+                    LogRangeAsType<int>(std::cout << "out_ref  : ", out_ref.mData, ",")
+                        << std::endl;
+                    LogRangeAsType<int>(std::cout << "out  : ", out.mData, ",") << std::endl;
+                }
+            }
+            else
+            {
+                pass = ck::utils::check_err(out.mData, out_ref.mData);
+                if(do_log)
+                {
+                    LogRangeAsType<float>(std::cout << "in  : ", in.mData, ",") << std::endl;
+                    LogRangeAsType<float>(std::cout << "out_ref  : ", out_ref.mData, ",")
+                        << std::endl;
+                    LogRangeAsType<float>(std::cout << "out  : ", out.mData, ",") << std::endl;
+                }
+            }
+            if(!pass)
+            {
+                std::cout << inst_ptr->GetTypeString() << " failed verification: ";
+                LogRange(std::cout << "input lengths = [", in_length, ", ")
+                    << "], "
+                    << "scaler = [" << alpha << ", " << beta << "]." << std::endl;
+            }
+        }
+    }
+    std::cout << "Best Perf for datatype = " << type_to_string<InDataType>() << "_"
+              << type_to_string<OutDataType>() << ", ";
+    LogRange(std::cout << "length = ", i_in_lengths, ",") << ", ";
+    LogRange(std::cout << "stride = ", i_in_strides, ",") << ", ";
+    LogRange(std::cout << "reduce dims ", reduce_dims, ",") << ", ";
+    std::cout << "alpha = " << alpha << ", "
+              << "beta = " << beta << ", " << best_avg_time << " ms, " << best_gb_per_sec
+              << " GB/s, " << best_instance_name << std::endl;
+}
+} // namespace profiler
+} // namespace ck
--- a/profiler/src/profile_normalization.cpp
+++ b/profiler/src/profile_normalization.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include <iostream>
+#include <vector>
+#include <unordered_map>
+#include "profiler/include/profile_normalization_impl.hpp"
+using ck::index_t;
+using ck::profiler::NormDataType;
+using ck::profiler::NormType;
+// Minimal command-line parser for the normalization profiler.
+//
+// `norm_dict` translates the operation name given on the command line into a NormType, and
+// `long_opts` collects the integer values that follow each recognised "--<key>" option.
+struct ArgParser
+{
+    // Operation name -> normalization type.
+    std::unordered_map<std::string, NormType> norm_dict = {{"layernorm", NormType::LAYERNORM},
+                                                           {"batchnorm", NormType::BATCHNORM},
+                                                           {"softmax", NormType::SOFTMAX}};
+    // Values gathered per long option; an option that never appears keeps an empty vector.
+    std::unordered_map<std::string, std::vector<int>> long_opts = {
+        {"length", {}}, {"stride", {}}, {"reduce", {}}, {"alpha", {}}, {"beta", {}}};
+    // Returns true when `token` begins a new option instead of being a value.
+    // A '-' immediately followed by a digit is a negative number (e.g. "-1"), not an option;
+    // without this distinction "--beta -1" would silently collect no value at all.
+    static bool is_option_token(const char* token)
+    {
+        if(token[0] != '-')
+        {
+            return false;
+        }
+        const char next = token[1];
+        return !(next >= '0' && next <= '9');
+    }
+    // If argv[i] is exactly "--<key>", appends every value token that follows it (up to the
+    // next option token or the end of argv) to long_opts[key] and returns true.
+    // Returns false, leaving long_opts untouched, when argv[i] is not that option.
+    // Values are converted with std::stoi, so a fractional value such as "0.5" is read as 0
+    // and a non-numeric value makes std::stoi throw.
+    bool parse_opt(int argc, char* argv[], const std::string& key, int i)
+    {
+        if(std::string("--") + key != argv[i])
+        {
+            return false;
+        }
+        for(int j = i + 1; j < argc && !is_option_token(argv[j]); ++j)
+        {
+            long_opts[key].push_back(std::stoi(argv[j]));
+        }
+        return true;
+    }
+    // Scans argv once per known option and records the values of its first occurrence.
+    void operator()(int argc, char* argv[])
+    {
+        for(auto& kv : long_opts)
+        {
+            for(int i = 1; i < argc; i++)
+            {
+                // Only the first occurrence of each option is honoured.
+                if(parse_opt(argc, argv, kv.first, i))
+                    break;
+            }
+        }
+    }
+};
+// Prints the usage text for the normalization profiler to std::cout: the six positional
+// arguments followed by the supported long options.
+void print_help()
+{
+    std::cout << "arg1: tensor operation (layernorm/batchnorm/softmax)\n"
+              << "arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n"
+              << "arg3: verification (0: no; 1: yes)\n"
+              << "arg4: initialization (0: no init; 1: integer value; 2: decimal value)\n"
+              << "arg5: print tensor value (0: no; 1: yes)\n"
+              << "arg6: time kernel (0=no, 1=yes)\n"
+              << "--length: tensor extents (e.g, --length 8 4 256) \n"
+              << "--stride: tensor strides (e.g, --stride 1024 256 1)\n"
+              << "--reduce: to-reduce dimensions (e.g, --reduce 2)\n"
+              << "--alpha: alpha scaling value\n"
+              << "--beta: beta scaling value\n"
+              << std::endl;
+}
+// Entry point of the normalization profiler.
+//
+// Expects argv[1] = operation name, argv[2] = data type, argv[3] = verification flag,
+// argv[4] = init method, argv[5] = log flag, argv[6] = time-kernel flag, followed by the
+// optional long options handled by ArgParser (--length, --stride, --reduce, --alpha, --beta).
+// Prints the usage text and returns 0 when the positional arguments are incomplete or the
+// operation name is unknown; throws std::runtime_error for a data type that is not handled
+// here; otherwise runs the profiler and returns 0.
+int profile_normalization(int argc, char* argv[])
+{
+    // argv[1] through argv[6] are all read below, so at least seven entries are required.
+    if(argc < 7)
+    {
+        print_help();
+        return 0;
+    }
+    ArgParser arg_parser;
+    // short unnamed options
+    // Look the name up with find(): operator[] would silently insert a default-constructed
+    // NormType for an unrecognised operation name.
+    const auto norm_entry = arg_parser.norm_dict.find(argv[1]);
+    if(norm_entry == arg_parser.norm_dict.end())
+    {
+        print_help();
+        return 0;
+    }
+    const NormType norm_type     = norm_entry->second;
+    const NormDataType data_type = static_cast<NormDataType>(std::stoi(argv[2]));
+    const bool do_verification   = std::stoi(argv[3]);
+    const int init_method        = std::stoi(argv[4]);
+    const bool do_log            = std::stoi(argv[5]);
+    const bool time_kernel       = std::stoi(argv[6]);
+    // parse the long options
+    arg_parser(argc, argv);
+    const std::vector<index_t> length = arg_parser.long_opts["length"];
+    const std::vector<index_t> stride = arg_parser.long_opts["stride"];
+    const std::vector<index_t> reduce = arg_parser.long_opts["reduce"];
+    // alpha defaults to 1 and beta to 0 when the option is absent or has no value.
+    const index_t alpha =
+        arg_parser.long_opts["alpha"].empty() ? 1 : arg_parser.long_opts["alpha"][0];
+    const index_t beta = arg_parser.long_opts["beta"].empty() ? 0 : arg_parser.long_opts["beta"][0];
+    if(data_type == NormDataType::F16_F16)
+    {
+        ck::profiler::profile_normalization_impl<ck::half_t, float, ck::half_t>(do_verification,
+                                                                                init_method,
+                                                                                do_log,
+                                                                                time_kernel,
+                                                                                length,
+                                                                                stride,
+                                                                                reduce,
+                                                                                float(alpha),
+                                                                                float(beta),
+                                                                                norm_type);
+    }
+    else if(data_type == NormDataType::F32_F32)
+    {
+        ck::profiler::profile_normalization_impl<float, float, float>(do_verification,
+                                                                      init_method,
+                                                                      do_log,
+                                                                      time_kernel,
+                                                                      length,
+                                                                      stride,
+                                                                      reduce,
+                                                                      float(alpha),
+                                                                      float(beta),
+                                                                      norm_type);
+    }
+    else
+    {
+        throw std::runtime_error("not implemented yet");
+    }
+    return 0;
+}
+// hijack main() for quick debugging
+// int main(int argc, char* argv[])
+// {
+//     profile_normalization(argc, argv);
+//     return 0;
+// }
--- a/profiler/src/profiler.cpp
+++ b/profiler/src/profiler.cpp
@@ -20,6 +20,7 @@ int profile_conv_fwd_bias_relu_add(int, char*[]);
 int profile_convnd_fwd(int argc, char* argv[]);
 int profile_convnd_bwd_data(int, char*[], int);
 int profile_conv_bwd_weight(int, char*[]);
+int profile_normalization(int, char*[]);
 int profile_reduce(int, char*[]);
 static void print_helper_message()
@@ -130,6 +131,11 @@ int main(int argc, char* argv[])
    {
        return profile_gemm_add_add_fastgelu(argc, argv);
    }
+    else if(strcmp(argv[1], "batchnorm") == 0 || strcmp(argv[1], "layernorm") == 0 ||
+            strcmp(argv[1], "softmax") == 0)
+    {
+        return profile_normalization(argc, argv);
+    }
    else
    {
        print_helper_message();

--- a/test/softmax/CMakeLists.txt
+++ b/test/softmax/CMakeLists.txt
@@ -2,7 +2,10 @@ add_custom_target(test_softmax)
 add_gtest_executable(test_softmax_fp32 test_softmax_fp32.cpp)
 add_gtest_executable(test_softmax_fp16 test_softmax_fp16.cpp)
+add_gtest_executable(test_softmax_int8 test_softmax_int8.cpp)
 target_link_libraries(test_softmax_fp32 PRIVATE host_tensor)
 target_link_libraries(test_softmax_fp16 PRIVATE host_tensor)
+target_link_libraries(test_softmax_int8 PRIVATE host_tensor)
 add_dependencies(test_softmax test_softmax_fp32)
 add_dependencies(test_softmax test_softmax_fp16)
\ No newline at end of file
+add_dependencies(test_softmax test_softmax_int8)
\ No newline at end of file
--- a/test/softmax/test_softmax_fp16.cpp
+++ b/test/softmax/test_softmax_fp16.cpp
@@ -15,14 +15,19 @@ class TestSoftmaxFP16 : public ck::TestSoftmax<Tuple>
 // clang-format off
 using KernelTypes = ::testing::Types<
 // InDataType, AccDataType, OutDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>
+    std::tuple<ck::half_t, float, float, I<3>, I<1>, I<256>, I<8>, I<32>, I<1>, I<8>, I<1>, I<8>, I<4>>, // mixed precision
    std::tuple<ck::half_t, float, ck::half_t, I<3>, I<1>, I<256>, I<8>, I<32>, I<1>, I<8>, I<1>, I<8>, I<8>>,
    std::tuple<ck::half_t, float, ck::half_t, I<3>, I<1>, I<256>, I<4>, I<64>, I<1>, I<8>, I<1>, I<8>, I<8>>,
    std::tuple<ck::half_t, float, ck::half_t, I<3>, I<1>, I<256>, I<2>, I<128>, I<1>, I<8>, I<1>, I<8>, I<8>>,
    std::tuple<ck::half_t, float, ck::half_t, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<8>, I<1>, I<8>, I<8>>,
+    std::tuple<ck::half_t, float, ck::half_t, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<16>, I<1>, I<8>, I<8>>,
+    std::tuple<ck::half_t, float, ck::half_t, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<32>, I<1>, I<8>, I<8>>,
    std::tuple<ck::half_t, float, ck::half_t, I<3>, I<2>, I<256>, I<8>, I<32>, I<1>, I<8>, I<1>, I<8>, I<8>>,
    std::tuple<ck::half_t, float, ck::half_t, I<3>, I<2>, I<256>, I<4>, I<64>, I<1>, I<8>, I<1>, I<8>, I<8>>,
    std::tuple<ck::half_t, float, ck::half_t, I<3>, I<2>, I<256>, I<2>, I<128>, I<1>, I<8>, I<1>, I<8>, I<8>>,
-    std::tuple<ck::half_t, float, ck::half_t, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<8>, I<1>, I<8>, I<8>>
+    std::tuple<ck::half_t, float, ck::half_t, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<8>, I<1>, I<8>, I<8>>,
+    std::tuple<ck::half_t, float, ck::half_t, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<16>, I<1>, I<8>, I<8>>,
+    std::tuple<ck::half_t, float, ck::half_t, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<32>, I<1>, I<8>, I<8>>
    >;
 // clang-format on
 TYPED_TEST_SUITE(TestSoftmaxFP16, KernelTypes);

--- a/test/softmax/test_softmax_fp32.cpp
+++ b/test/softmax/test_softmax_fp32.cpp
@@ -15,14 +15,19 @@ class TestSoftmaxFP32 : public ck::TestSoftmax<Tuple>
 // clang-format off
 using KernelTypes = ::testing::Types<
 // InDataType, AccDataType, OutDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>
+    std::tuple<float, float, ck::half_t, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<8>, I<1>, I<4>, I<8>>, // mixed precision
    std::tuple<float, float, float, I<3>, I<1>, I<256>, I<8>, I<32>, I<1>, I<4>, I<1>, I<4>, I<4>>,
    std::tuple<float, float, float, I<3>, I<1>, I<256>, I<4>, I<64>, I<1>, I<4>, I<1>, I<4>, I<4>>,
    std::tuple<float, float, float, I<3>, I<1>, I<256>, I<2>, I<128>, I<1>, I<4>, I<1>, I<4>, I<4>>,
    std::tuple<float, float, float, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<4>, I<1>, I<4>, I<4>>,
+    std::tuple<float, float, float, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<8>, I<1>, I<4>, I<4>>,
+    std::tuple<float, float, float, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<16>, I<1>, I<4>, I<4>>,
    std::tuple<float, float, float, I<3>, I<2>, I<256>, I<8>, I<32>, I<1>, I<4>, I<1>, I<4>, I<4>>,
    std::tuple<float, float, float, I<3>, I<2>, I<256>, I<4>, I<64>, I<1>, I<4>, I<1>, I<4>, I<4>>,
    std::tuple<float, float, float, I<3>, I<2>, I<256>, I<2>, I<128>, I<1>, I<4>, I<1>, I<4>, I<4>>,
-    std::tuple<float, float, float, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<4>, I<1>, I<4>, I<4>>
+    std::tuple<float, float, float, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<4>, I<1>, I<4>, I<4>>,
+    std::tuple<float, float, float, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<8>, I<1>, I<4>, I<4>>,
+    std::tuple<float, float, float, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<16>, I<1>, I<4>, I<4>>
    >;
 // clang-format on
 TYPED_TEST_SUITE(TestSoftmaxFP32, KernelTypes);

--- a/test/softmax/test_softmax_int8.cpp
+++ b/test/softmax/test_softmax_int8.cpp
+#include "gtest/gtest.h"
+#include "test_softmax_util.hpp"
+// Shorthand used in the kernel tuples below: I<N> names the type ck::Number<N>.
+template <ck::index_t N>
+using I = ck::Number<N>;
+// Typed-test fixture for the int8 softmax kernels. It adds nothing of its own; everything
+// comes from ck::TestSoftmax<Tuple>, where Tuple is one entry of KernelTypes.
+template <typename Tuple>
+class TestSoftmaxINT8 : public ck::TestSoftmax<Tuple>
+{
+};
+// Kernel configurations the typed test suite is instantiated with. Each tuple lists its
+// fields in the order given by the column legend inside the list.
+// clang-format off
+using KernelTypes = ::testing::Types<
+// InDataType, AccDataType, OutDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>
+    // NumReduceDim = 1
+    std::tuple<int8_t, float, int8_t, I<3>, I<1>, I<256>, I<8>, I<32>, I<1>, I<16>, I<1>, I<16>, I<16>>,
+    std::tuple<int8_t, float, int8_t, I<3>, I<1>, I<256>, I<4>, I<64>, I<1>, I<16>, I<1>, I<16>, I<16>>,
+    std::tuple<int8_t, float, int8_t, I<3>, I<1>, I<256>, I<2>, I<128>, I<1>, I<16>, I<1>, I<16>, I<16>>,
+    std::tuple<int8_t, float, int8_t, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<16>, I<1>, I<16>, I<16>>,
+    std::tuple<int8_t, float, int8_t, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<32>, I<1>, I<16>, I<16>>,
+    std::tuple<int8_t, float, int8_t, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<64>, I<1>, I<16>, I<16>>,
+    // NumReduceDim = 2
+    std::tuple<int8_t, float, int8_t, I<3>, I<2>, I<256>, I<8>, I<32>, I<1>, I<16>, I<1>, I<16>, I<16>>,
+    std::tuple<int8_t, float, int8_t, I<3>, I<2>, I<256>, I<4>, I<64>, I<1>, I<16>, I<1>, I<16>, I<16>>,
+    std::tuple<int8_t, float, int8_t, I<3>, I<2>, I<256>, I<2>, I<128>, I<1>, I<16>, I<1>, I<16>, I<16>>,
+    std::tuple<int8_t, float, int8_t, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<16>, I<1>, I<16>, I<16>>,
+    std::tuple<int8_t, float, int8_t, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<32>, I<1>, I<16>, I<16>>,
+    std::tuple<int8_t, float, int8_t, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<64>, I<1>, I<16>, I<16>>
+    >;
+// clang-format on
+// Instantiate the fixture once per entry of KernelTypes; the single test case calls the
+// Run() member inherited from ck::TestSoftmax.
+TYPED_TEST_SUITE(TestSoftmaxINT8, KernelTypes);
+TYPED_TEST(TestSoftmaxINT8, Test_INT8) { this->Run(); }
--- a/test/softmax/test_softmax_util.hpp
+++ b/test/softmax/test_softmax_util.hpp
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
 #include <vector>
 #include <iostream>
 #include <gtest/gtest.h>
@@ -16,6 +18,18 @@
 namespace ck {
+// Formats the elements of `range` as a single string, separated by ", " (e.g. "1, 8, 128").
+// An empty range yields an empty string.
+template <typename Range>
+std::string serialize_range(const Range& range)
+{
+    std::stringstream ss;
+    // Emit the separator before every element except the first instead of trimming a
+    // trailing ", " afterwards: trimming would step before begin() on an empty result.
+    bool is_first = true;
+    for(const auto& element : range)
+    {
+        if(!is_first)
+        {
+            ss << ", ";
+        }
+        ss << element;
+        is_first = false;
+    }
+    return ss.str();
+}
 template <typename Tuple>
 class TestSoftmax : public ::testing::Test
 {
@@ -80,23 +94,43 @@ class TestSoftmax : public ::testing::Test
        auto argument_ptr    = device_instance.MakeArgumentPointer(i_in_lengths,
                                                                i_in_strides,
                                                                reduce_dims,
-                                                                alpha,
+                                                                &alpha,
-                                                                beta,
+                                                                &beta,
                                                                in_dev.GetDeviceBuffer(),
                                                                out_dev.GetDeviceBuffer());
        if(!device_instance.IsSupportedArgument(argument_ptr.get()))
        {
-            FAIL() << "Unsupported argument";
+            // std::cout << "Skipped due to unsupported argument: "
+            //           << "input lengths = [" << serialize_range(in_length) << "], "
+            //           << "scaler = [" << alpha << ", " << beta << "]." << std::endl;
+            return;
        }
        auto invoker_ptr = device_instance.MakeInvokerPointer();
        invoker_ptr->Run(argument_ptr.get());
-        ref_instance_invoker_.Run({in, out_ref, alpha, beta, Rank, reduce_dims});
+        ref_instance_invoker_.Run({in, out_ref, alpha, beta, reduce_dims});
        out_dev.FromDevice(out.mData.data());
-        EXPECT_TRUE(ck::utils::check_err(out.mData, out_ref.mData));
+        bool pass;
+        if(std::is_same<InDataType, int8_t>::value)
+        {
+            EXPECT_TRUE(pass = ck::utils::check_err(
+                            out.mData, out_ref.mData, "Error: Incorrect results!", 0, 1));
+        }
+        else
+        {
+            EXPECT_TRUE(pass = ck::utils::check_err(out.mData, out_ref.mData));
+        }
+        if(!pass)
+        {
+            FAIL() << "Failure in input lengths = [" << serialize_range(in_length) << "], "
+                   << "scaler = [" << alpha << ", " << beta << "].";
+        }
    }
    void Run()
@@ -105,13 +139,14 @@ class TestSoftmax : public ::testing::Test
        {
            for(auto scale : this->scales_)
            {
-                this->RunSingle(in_length, std::get<0>(scale), std::get<1>(scale));
+                this->RunSingle(in_length, scale[0], scale[1]);
            }
        }
    }
-    std::vector<std::vector<index_t>> in_lengths_ = {{1, 8, 128}, {2, 128, 1024}, {3, 9, 1032}};
+    std::vector<std::vector<index_t>> in_lengths_ = {
-    std::vector<std::tuple<AccDataType, AccDataType>> scales_ = {{1, 0}, {2, 2}, {0, 1}};
+        {1, 8, 128}, {2, 128, 1024}, {3, 9, 1032}, {4, 4, 2048}, {8, 1, 8192}};
+    std::vector<std::vector<AccDataType>> scales_ = {{1, 0}, {1, 1}, {0, 1}, {2, 2}};
    typename ReferenceInstance::Invoker ref_instance_invoker_;
 };