Commit 86f8ac01 authored by Jakub Piasecki

Merge remote-tracking branch 'origin/develop' into jakpias/pool1d_fwd

parents 3f6360d0 ab60b390
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
using BF16 = ck::bhalf_t;
using F16 = ck::half_t;
using F32 = float;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using Empty_Tuple = ck::Tuple<>;
using namespace ck::tensor_layout::convolution;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
static constexpr auto ConvFwdDefault =
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
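// Filter3x3: specialization assumed (from its name) to cover only 3x3 filter windows.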
static constexpr auto ConvFwd3x3 = ConvolutionForwardSpecialization::Filter3x3;
static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
template <index_t NDimSpatial,
typename ALayout,
typename BLayout,
typename DsLayout,
typename ELayout,
ConvolutionForwardSpecialization ConvSpec>
using device_grouped_conv_fwd_xdl_merged_groups_bf16_instances = std::tuple<
// clang-format off
//########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| ACompute| BCompute| BlockGemm| NumGroups|
//########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| Type| Type| Pipeline| ToMerge|
//########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| | | Scheduler| |
//########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
// Instances with NumGroupsPerBatch > 1
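// The trailing template argument (8, 16, 32) is the "NumGroups ToMerge" column from the header above.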
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, BF16, BF16, F32, BF16, DsLayout, BF16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 64, 16, 16, 4, 4, 16, 16, 4, 1, S< 4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S< 4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 4>, 1, BF16, BF16, LoopScheduler::Default, 8>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, BF16, BF16, F32, BF16, DsLayout, BF16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 64, 16, 16, 4, 4, 16, 16, 4, 1, S< 4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S< 4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 4>, 1, BF16, BF16, LoopScheduler::Default, 16>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, BF16, BF16, F32, BF16, DsLayout, BF16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 64, 16, 16, 4, 4, 16, 16, 4, 1, S< 4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S< 4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 4>, 1, BF16, BF16, LoopScheduler::Default, 32>
// clang-format on
>;
template <index_t NDimSpatial,
typename ALayout,
typename BLayout,
typename DsLayout,
typename ELayout,
ConvolutionForwardSpecialization ConvSpec>
using device_grouped_conv_fwd_xdl_merged_groups_f16_instances = std::tuple<
// clang-format off
//########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
//########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
// Instances with NumGroupsPerBatch > 1
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 64, 16, 16, 4, 4, 16, 16, 4, 1, S< 4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S< 4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 4>, 1, F16, F16, LoopScheduler::Default, 8>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 64, 16, 16, 4, 4, 16, 16, 4, 1, S< 4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S< 4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 4>, 1, F16, F16, LoopScheduler::Default, 16>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 64, 16, 16, 4, 4, 16, 16, 4, 1, S< 4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S< 4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 4>, 1, F16, F16, LoopScheduler::Default, 32>
// clang-format on
>;
template <index_t NDimSpatial,
typename ALayout,
typename BLayout,
typename DsLayout,
typename ELayout,
ConvolutionForwardSpecialization ConvSpec>
using device_grouped_conv_fwd_xdl_merged_groups_f32_instances = std::tuple<
// clang-format off
//########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
//########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
// Instances with NumGroupsPerBatch > 1
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F32, F32, F32, F32, DsLayout, F32, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 64, 16, 16, 4, 4, 16, 16, 4, 1, S< 4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S< 4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 4>, 1, F32, F32, LoopScheduler::Default, 8>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F32, F32, F32, F32, DsLayout, F32, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 64, 16, 16, 4, 4, 16, 16, 4, 1, S< 4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S< 4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 4>, 1, F32, F32, LoopScheduler::Default, 16>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F32, F32, F32, F32, DsLayout, F32, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 64, 16, 16, 4, 4, 16, 16, 4, 1, S< 4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S< 4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 4>, 1, F32, F32, LoopScheduler::Default, 32>
// clang-format on
>;
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
@@ -18,7 +18,6 @@
 #ifdef CK_USE_XDL
 #include "grouped_convolution_forward_xdl.inc"
 #include "grouped_convolution_forward_xdl_large_tensor.inc"
-#include "grouped_convolution_forward_xdl_merged_groups.inc"
 #include "grouped_convolution_forward_comp_xdl.inc"
 #include "grouped_convolution_forward_mem_inter_xdl.inc"
 #include "grouped_convolution_forward_mem_intra_xdl.inc"
@@ -203,8 +202,6 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
 add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instances(op_ptrs);
 add_device_grouped_conv2d_fwd_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_instances(
 op_ptrs);
-add_device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instances(
-op_ptrs);
 add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instances(op_ptrs);
 add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_mem_intra_instances(
 op_ptrs);
@@ -220,8 +217,6 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
 add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instances(op_ptrs);
 add_device_grouped_conv2d_fwd_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f16_instances(
 op_ptrs);
-add_device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f16_instances(
-op_ptrs);
 add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_instances(op_ptrs);
 add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_mem_intra_instances(
 op_ptrs);
@@ -239,8 +234,6 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
 add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_instances(op_ptrs);
 add_device_grouped_conv2d_fwd_xdl_large_tensor_nhwgc_gkyxc_nhwgk_bf16_instances(
 op_ptrs);
-add_device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instances(
-op_ptrs);
 add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instances(op_ptrs);
 add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instances(
 op_ptrs);
@@ -300,8 +293,6 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
 add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances(op_ptrs);
 add_device_grouped_conv3d_fwd_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_instances(
 op_ptrs);
-add_device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instances(
-op_ptrs);
 add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_comp_instances(op_ptrs);
 add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_intra_instances(
 op_ptrs);
@@ -358,8 +349,6 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
 add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instances(op_ptrs);
 add_device_grouped_conv3d_fwd_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f16_instances(
 op_ptrs);
-add_device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f16_instances(
-op_ptrs);
 add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_instances(op_ptrs);
 add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_intra_instances(
 op_ptrs);
@@ -377,8 +366,6 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
 add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instances(op_ptrs);
 add_device_grouped_conv3d_fwd_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_bf16_instances(
 op_ptrs);
-add_device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instances(
-op_ptrs);
 add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instances(op_ptrs);
 add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_intra_instances(
 op_ptrs);
...
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// grouped conv2d forward, NHWGC/GKYXC/NHWGK
#ifdef CK_ENABLE_BF16
void add_device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instances(
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
NHWGC,
GKYXC,
Empty_Tuple,
NHWGK,
BF16,
BF16,
Empty_Tuple,
BF16,
PassThrough,
PassThrough,
PassThrough>>>& instances);
#endif
#ifdef CK_ENABLE_FP16
void add_device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f16_instances(
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
NHWGC,
GKYXC,
Empty_Tuple,
NHWGK,
F16,
F16,
Empty_Tuple,
F16,
PassThrough,
PassThrough,
PassThrough>>>& instances);
#endif
#ifdef CK_ENABLE_FP32
void add_device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instances(
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
NHWGC,
GKYXC,
Empty_Tuple,
NHWGK,
F32,
F32,
Empty_Tuple,
F32,
PassThrough,
PassThrough,
PassThrough>>>& instances);
#endif
#ifdef CK_ENABLE_BF16
// grouped conv3d forward, NDHWGC/GKZYXC/NDHWGK
void add_device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instances(
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
NDHWGC,
GKZYXC,
Empty_Tuple,
NDHWGK,
BF16,
BF16,
Empty_Tuple,
BF16,
PassThrough,
PassThrough,
PassThrough>>>& instances);
#endif
#ifdef CK_ENABLE_FP16
void add_device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f16_instances(
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
NDHWGC,
GKZYXC,
Empty_Tuple,
NDHWGK,
F16,
F16,
Empty_Tuple,
F16,
PassThrough,
PassThrough,
PassThrough>>>& instances);
#endif
#ifdef CK_ENABLE_FP32
void add_device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instances(
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
NDHWGC,
GKZYXC,
Empty_Tuple,
NDHWGK,
F32,
F32,
Empty_Tuple,
F32,
PassThrough,
PassThrough,
PassThrough>>>& instances);
#endif
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
@@ -14,11 +14,6 @@ add_instance_library(device_grouped_conv2d_fwd_instance
 xdl/large_tensor/device_grouped_conv2d_fwd_xdl_large_tensor_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
 xdl/large_tensor/device_grouped_conv2d_fwd_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f16_instance.cpp
 xdl/large_tensor/device_grouped_conv2d_fwd_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_instance.cpp
-# merged groups
-# NHWGC, GKYXC, NHWGK
-xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
-xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f16_instance.cpp
-xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instance.cpp
 #mem
 # NHWGC, GKYXC, NHWGK
 xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instance.cpp
...
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
void add_device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instances(
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
NHWGC,
GKYXC,
Empty_Tuple,
NHWGK,
BF16,
BF16,
Empty_Tuple,
BF16,
PassThrough,
PassThrough,
PassThrough>>>& instances)
{
add_device_operation_instances(
instances,
device_grouped_conv_fwd_xdl_merged_groups_bf16_instances<2,
NHWGC,
GKYXC,
Empty_Tuple,
NHWGK,
ConvFwdDefault>{});
add_device_operation_instances(
instances,
device_grouped_conv_fwd_xdl_merged_groups_bf16_instances<2,
NHWGC,
GKYXC,
Empty_Tuple,
NHWGK,
ConvFwd3x3>{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
void add_device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f16_instances(
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
NHWGC,
GKYXC,
Empty_Tuple,
NHWGK,
F16,
F16,
Empty_Tuple,
F16,
PassThrough,
PassThrough,
PassThrough>>>& instances)
{
add_device_operation_instances(
instances,
device_grouped_conv_fwd_xdl_merged_groups_f16_instances<2,
NHWGC,
GKYXC,
Empty_Tuple,
NHWGK,
ConvFwdDefault>{});
add_device_operation_instances(
instances,
device_grouped_conv_fwd_xdl_merged_groups_f16_instances<2,
NHWGC,
GKYXC,
Empty_Tuple,
NHWGK,
ConvFwd3x3>{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
void add_device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instances(
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
NHWGC,
GKYXC,
Empty_Tuple,
NHWGK,
F32,
F32,
Empty_Tuple,
F32,
PassThrough,
PassThrough,
PassThrough>>>& instances)
{
add_device_operation_instances(
instances,
device_grouped_conv_fwd_xdl_merged_groups_f32_instances<2,
NHWGC,
GKYXC,
Empty_Tuple,
NHWGK,
ConvFwdDefault>{});
add_device_operation_instances(
instances,
device_grouped_conv_fwd_xdl_merged_groups_f32_instances<2,
NHWGC,
GKYXC,
Empty_Tuple,
NHWGK,
ConvFwd3x3>{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
@@ -13,10 +13,6 @@ set(GROUPED_CONV3D_FWD
 xdl/large_tensor/device_grouped_conv3d_fwd_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
 xdl/large_tensor/device_grouped_conv3d_fwd_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
-xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
-xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
-xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
 xdl/mem/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_inter_instance.cpp
 xdl/mem/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_inter_instance.cpp
 xdl/mem/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_inter_instance.cpp
...
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
void add_device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instances(
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
NDHWGC,
GKZYXC,
Empty_Tuple,
NDHWGK,
BF16,
BF16,
Empty_Tuple,
BF16,
PassThrough,
PassThrough,
PassThrough>>>& instances)
{
add_device_operation_instances(
instances,
device_grouped_conv_fwd_xdl_merged_groups_bf16_instances<3,
NDHWGC,
GKZYXC,
Empty_Tuple,
NDHWGK,
ConvFwdDefault>{});
add_device_operation_instances(
instances,
device_grouped_conv_fwd_xdl_merged_groups_bf16_instances<3,
NDHWGC,
GKZYXC,
Empty_Tuple,
NDHWGK,
ConvFwd3x3>{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
void add_device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f16_instances(
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
NDHWGC,
GKZYXC,
Empty_Tuple,
NDHWGK,
F16,
F16,
Empty_Tuple,
F16,
PassThrough,
PassThrough,
PassThrough>>>& instances)
{
add_device_operation_instances(
instances,
device_grouped_conv_fwd_xdl_merged_groups_f16_instances<3,
NDHWGC,
GKZYXC,
Empty_Tuple,
NDHWGK,
ConvFwdDefault>{});
add_device_operation_instances(
instances,
device_grouped_conv_fwd_xdl_merged_groups_f16_instances<3,
NDHWGC,
GKZYXC,
Empty_Tuple,
NDHWGK,
ConvFwd3x3>{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
void add_device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instances(
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
NDHWGC,
GKZYXC,
Empty_Tuple,
NDHWGK,
F32,
F32,
Empty_Tuple,
F32,
PassThrough,
PassThrough,
PassThrough>>>& instances)
{
add_device_operation_instances(
instances,
device_grouped_conv_fwd_xdl_merged_groups_f32_instances<3,
NDHWGC,
GKZYXC,
Empty_Tuple,
NDHWGK,
ConvFwdDefault>{});
add_device_operation_instances(
instances,
device_grouped_conv_fwd_xdl_merged_groups_f32_instances<3,
NDHWGC,
GKZYXC,
Empty_Tuple,
NDHWGK,
ConvFwd3x3>{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
@@ -143,6 +143,12 @@ def parse_logfile(logfile):
         if 'Best Perf' in line:
             lst=line.split()
             res.append(lst[36])
+    elif 'perf_fmha' in logfile:
+        for line in open(logfile):
+            if 'TFlops' in line:
+                lst=line.split()
+                line_dict=dict(zip(lst[1:],lst))
+                res.append(line_dict['TFlops,'])
     return res
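The added branch relies on dict(zip(lst[1:], lst)), which maps every whitespace-separated token to the token that precedes it, so line_dict['TFlops,'] returns the number printed immediately before the "TFlops," label. A minimal sketch of the idiom; the sample log line below is hypothetical, only the "<value> TFlops," ordering is assumed:

# Hypothetical fmha log line; only the "<value> TFlops," ordering is assumed.
line = "fmha_fwd: 1.23 ms, 245.67 TFlops, 512.00 GB/s"
lst = line.split()
# zip(lst[1:], lst) pairs each token with the token that precedes it,
# so the dict maps a label to the value printed just ahead of it.
line_dict = dict(zip(lst[1:], lst))
print(line_dict['TFlops,'])  # -> '245.67'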
@@ -304,6 +310,14 @@ def main():
         for i in range(1,len(results)+1):
             testlist.append("Test%i"%i)
         table_name="ck_mixed_gemm_tflops"
+    if 'fmha_fwd' in filename:
+        for i in range(1,len(results)+1):
+            testlist.append("Test%i"%i)
+        table_name="ck_fmha_fwd_tflops"
+    if 'fmha_bwd' in filename:
+        for i in range(1,len(results)+1):
+            testlist.append("Test%i"%i)
+        table_name="ck_fmha_bwd_tflops"
     tflops_base = get_baseline(table_name,conn)
     store_new_test_result(table_name, results, testlist, branch_name, node_id, gpu_arch, compute_units, rocm_vers, hip_vers, environment, conn)
...
@@ -13,3 +13,20 @@
 python3 process_perf_data.py perf_gemm.log
 python3 process_perf_data.py perf_resnet50_N256.log
 python3 process_perf_data.py perf_resnet50_N4.log
+file=./perf_fmha_fwd_gfx942.log
+if [ -e "$file" ]; then
+python3 process_perf_data.py perf_fmha_fwd_gfx942.log
+fi
+file=./perf_fmha_bwd_gfx942.log
+if [ -e "$file" ]; then
+python3 process_perf_data.py perf_fmha_bwd_gfx942.log
+fi
+file=./perf_fmha_fwd_gfx90a.log
+if [ -e "$file" ]; then
+python3 process_perf_data.py perf_fmha_fwd_gfx90a.log
+fi
+file=./perf_fmha_bwd_gfx90a.log
+if [ -e "$file" ]; then
+python3 process_perf_data.py perf_fmha_bwd_gfx90a.log
+fi
@@ -21,3 +21,20 @@ python3 process_perf_data.py perf_gemm_bilinear.log
 python3 process_perf_data.py perf_reduction.log
 python3 process_perf_data.py perf_splitK_gemm.log
 python3 process_perf_data.py perf_onnx_gemm.log
+file=./perf_fmha_fwd_gfx942.log
+if [ -e "$file" ]; then
+python3 process_perf_data.py perf_fmha_fwd_gfx942.log
+fi
+file=./perf_fmha_bwd_gfx942.log
+if [ -e "$file" ]; then
+python3 process_perf_data.py perf_fmha_bwd_gfx942.log
+fi
+file=./perf_fmha_fwd_gfx90a.log
+if [ -e "$file" ]; then
+python3 process_perf_data.py perf_fmha_fwd_gfx90a.log
+fi
+file=./perf_fmha_bwd_gfx90a.log
+if [ -e "$file" ]; then
+python3 process_perf_data.py perf_fmha_bwd_gfx90a.log
+fi
#!/bin/bash
## The following will be used for CI
set -x
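## Each call below is: test_reduce_with_index -D <inLengths> -R <reduceDims> <data type> <init method>.
## Judging from the section labels in this script, the data-type codes are 0=fp32, 6=fp64, 1=fp16, 3=int8, 5=bf16;
## the final argument (2) selects the init method ("scope integer value" in the tests' usage text).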
## for float
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2,3 0 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2 0 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,3 0 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,2,3 0 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1,2,3 0 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0 0 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1 0 2
bin/test_reduce_with_index -D 64,4,280,82 -R 2 0 2
bin/test_reduce_with_index -D 64,4,280,82 -R 3 0 2
## for float64
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2,3 6 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2 6 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,3 6 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,2,3 6 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1,2,3 6 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0 6 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1 6 2
bin/test_reduce_with_index -D 64,4,280,82 -R 2 6 2
bin/test_reduce_with_index -D 64,4,280,82 -R 3 6 2
## for float16
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2,3 1 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2 1 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,3 1 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,2,3 1 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1,2,3 1 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0 1 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1 1 2
bin/test_reduce_with_index -D 64,4,280,82 -R 2 1 2
bin/test_reduce_with_index -D 64,4,280,82 -R 3 1 2
## for int8_t
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2,3 3 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2 3 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,3 3 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,2,3 3 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1,2,3 3 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0 3 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1 3 2
bin/test_reduce_with_index -D 64,4,280,82 -R 2 3 2
bin/test_reduce_with_index -D 64,4,280,82 -R 3 3 2
## for bfloat16
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2,3 5 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2 5 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,3 5 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,2,3 5 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1,2,3 5 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0 5 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1 5 2
bin/test_reduce_with_index -D 64,4,280,82 -R 2 5 2
bin/test_reduce_with_index -D 64,4,280,82 -R 3 5 2
set +x
-add_test_executable(test_reduce_no_index reduce_no_index.cpp)
+add_gtest_executable(test_reduce_no_index reduce_no_index.cpp)
-add_test_executable(test_reduce_with_index reduce_with_index.cpp)
+add_gtest_executable(test_reduce_with_index reduce_with_index.cpp)
 target_link_libraries(test_reduce_no_index PRIVATE utility device_reduce_instance)
 target_link_libraries(test_reduce_with_index PRIVATE utility device_reduce_instance)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

#include <getopt.h>
#include "ck/library/utility/host_common_util.hpp"
#include "profiler/profile_reduce_impl.hpp"
#include <gtest/gtest.h>

using namespace ck;

struct ReduceParam
{
    bool do_verification{true};
    bool propagateNan{false};
    bool useIndex{false};
    bool time_kernel{false};
    bool do_dumpout{false};
    int init_method{2};
    float alpha{1.0f};
    float beta{0.0f};
    std::vector<size_t> inLengths{64, 4, 280, 82};
    std::vector<int> reduceDims{0, 1, 2, 3};
};

std::vector<std::vector<int>> SetGenericReduceDim()
{
    return {{0, 1, 2, 3}, {0, 1, 2}, {0, 1, 3}, {0, 2, 3}, {1, 2, 3}, {0}, {1}, {2}, {3}};
}

template <typename T>
class ReduceWithIndexTest : public ::testing::Test
{
    protected:
    using InDataType  = std::tuple_element_t<0, T>;
    using AccDataType = std::tuple_element_t<1, T>;
    using OutDataType = std::tuple_element_t<2, T>;

    static std::vector<ReduceParam> params;

    static void SetUpTestSuite()
    {
        // set testcase variables
        ReduceParam set;
        const auto setReduceDim = SetGenericReduceDim();

        for(std::size_t i(0); i < setReduceDim.size(); ++i)
        {
            set.reduceDims = setReduceDim[i];
            params.emplace_back(set);
        }
    }

    template <ReduceTensorOp ReduceOpIdType>
    void Run()
    {
        for(auto param : this->params)
        {
            bool success =
                ck::profiler::profile_reduce_impl<InDataType, AccDataType, OutDataType>(
                    param.do_verification,
                    param.init_method,
                    param.do_dumpout,
                    param.time_kernel,
                    param.inLengths,
                    param.reduceDims,
                    ReduceOpIdType,
                    param.propagateNan,
                    param.useIndex,
                    param.alpha,
                    param.beta);
            EXPECT_TRUE(success);
        }
    }
};

template <typename T>
std::vector<ReduceParam> ReduceWithIndexTest<T>::params = {};

using Reduce_float_types       = ::testing::Types<std::tuple<float, float, float>>;
using Reduce_double_types      = ::testing::Types<std::tuple<double, double, double>>;
using Reduce_int8t_types       = ::testing::Types<std::tuple<int8_t, int8_t, int8_t>>;
using Reduce_half_types        = ::testing::Types<std::tuple<ck::half_t, ck::half_t, ck::half_t>>;
using Reduce_bhalf_float_Types = ::testing::Types<std::tuple<ck::bhalf_t, float, ck::bhalf_t>>;

template <typename TType>
class ReduceWithNoIndexFloat : public ReduceWithIndexTest<TType>
{
};

template <typename TType>
class ReduceWithNoIndexDouble : public ReduceWithIndexTest<TType>
{
};

template <typename TType>
class ReduceWithNoIndexInt8 : public ReduceWithIndexTest<TType>
{
};

template <typename TType>
class ReduceWithNoIndexHalf : public ReduceWithIndexTest<TType>
{
};

template <typename TType>
class ReduceWithNoIndexBHalfFloat : public ReduceWithIndexTest<TType>
{
};

TYPED_TEST_SUITE(ReduceWithNoIndexFloat, Reduce_float_types);
TYPED_TEST_SUITE(ReduceWithNoIndexDouble, Reduce_double_types);
TYPED_TEST_SUITE(ReduceWithNoIndexInt8, Reduce_int8t_types);
TYPED_TEST_SUITE(ReduceWithNoIndexHalf, Reduce_half_types);
TYPED_TEST_SUITE(ReduceWithNoIndexBHalfFloat, Reduce_bhalf_float_Types);

TYPED_TEST(ReduceWithNoIndexFloat, ReduceWithNoIndexTestFloat_AMAX)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::AMAX>();
}

TYPED_TEST(ReduceWithNoIndexFloat, ReduceWithNoIndexTestFloat_MIN)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::MIN>();
}

TYPED_TEST(ReduceWithNoIndexFloat, ReduceWithNoIndexTestFloat_MAX)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::MAX>();
}

TYPED_TEST(ReduceWithNoIndexDouble, ReduceWithNoIndexTestDouble_AMAX)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::AMAX>();
}

TYPED_TEST(ReduceWithNoIndexDouble, ReduceWithNoIndexTestDouble_MIN)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::MIN>();
}

TYPED_TEST(ReduceWithNoIndexDouble, ReduceWithNoIndexTestDouble_MAX)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::MAX>();
}

TYPED_TEST(ReduceWithNoIndexInt8, ReduceWithNoIndexTestInt8_AMAX)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::AMAX>();
}

TYPED_TEST(ReduceWithNoIndexInt8, ReduceWithNoIndexTestInt8_MIN)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::MIN>();
}

TYPED_TEST(ReduceWithNoIndexInt8, ReduceWithNoIndexTestInt8_MAX)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::MAX>();
}

TYPED_TEST(ReduceWithNoIndexHalf, ReduceWithNoIndexTestHalf_AMAX)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::AMAX>();
}

TYPED_TEST(ReduceWithNoIndexHalf, ReduceWithNoIndexTestHalf_MIN)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::MIN>();
}

TYPED_TEST(ReduceWithNoIndexHalf, ReduceWithNoIndexTestHalf_MAX)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::MAX>();
}

TYPED_TEST(ReduceWithNoIndexBHalfFloat, ReduceWithNoIndexTesBtHalfFloat_AMAX)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::AMAX>();
}

TYPED_TEST(ReduceWithNoIndexBHalfFloat, ReduceWithNoIndexTestBHalfFloat_MIN)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::MIN>();
}

TYPED_TEST(ReduceWithNoIndexBHalfFloat, ReduceWithNoIndexTestBHalfFloat_MAX)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::MAX>();
}

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

#include <getopt.h>
#include "ck/library/utility/host_common_util.hpp"
#include "profiler/profile_reduce_impl.hpp"
#include <gtest/gtest.h>

using namespace ck;

struct ReduceParam
{
    bool do_verification{true};
    bool propagateNan{false};
    bool useIndex{false};
    bool time_kernel{false};
    bool do_dumpout{false};
    int init_method{2};
    float alpha{1.0f};
    float beta{0.0f};
    std::vector<size_t> inLengths{64, 4, 280, 82};
    std::vector<int> reduceDims{0, 1, 2, 3};
};

std::vector<std::vector<int>> SetGenericReduceDim()
{
    return {{0, 1, 2, 3}, {0, 1, 2}, {0, 1, 3}, {0, 2, 3}, {1, 2, 3}, {0}, {1}, {2}, {3}};
}

template <typename T>
class ReduceWithIndexTest : public ::testing::Test
{
    protected:
    using InDataType  = std::tuple_element_t<0, T>;
    using AccDataType = std::tuple_element_t<1, T>;
    using OutDataType = std::tuple_element_t<2, T>;

    static std::vector<ReduceParam> params;

    static void SetUpTestSuite()
    {
        // set testcase variables
        ReduceParam set;
        const auto setReduceDim = SetGenericReduceDim();

        for(std::size_t i(0); i < setReduceDim.size(); ++i)
        {
            set.reduceDims = setReduceDim[i];
            params.emplace_back(set);
        }
    }

    template <ReduceTensorOp ReduceOpIdType>
    void Run()
    {
        for(auto param : this->params)
        {
            bool success =
                ck::profiler::profile_reduce_impl<InDataType, AccDataType, OutDataType>(
                    param.do_verification,
                    param.init_method,
                    param.do_dumpout,
                    param.time_kernel,
                    param.inLengths,
                    param.reduceDims,
                    ReduceOpIdType,
                    param.propagateNan,
                    param.useIndex,
                    param.alpha,
                    param.beta);
            EXPECT_TRUE(success);
        }
    }
};

template <typename T>
std::vector<ReduceParam> ReduceWithIndexTest<T>::params = {};

using Reduce_float_types       = ::testing::Types<std::tuple<float, float, float>>;
using Reduce_double_types      = ::testing::Types<std::tuple<double, double, double>>;
using Reduce_int8t_types       = ::testing::Types<std::tuple<int8_t, int8_t, int8_t>>;
using Reduce_half_types        = ::testing::Types<std::tuple<ck::half_t, ck::half_t, ck::half_t>>;
using Reduce_bhalf_float_Types = ::testing::Types<std::tuple<ck::bhalf_t, float, ck::bhalf_t>>;

template <typename TType>
class ReduceWithIndexFloat : public ReduceWithIndexTest<TType>
{
};

template <typename TType>
class ReduceWithIndexDouble : public ReduceWithIndexTest<TType>
{
};

template <typename TType>
class ReduceWithIndexInt8 : public ReduceWithIndexTest<TType>
{
};

template <typename TType>
class ReduceWithIndexHalf : public ReduceWithIndexTest<TType>
{
};

template <typename TType>
class ReduceWithIndexBHalfFloat : public ReduceWithIndexTest<TType>
{
};

TYPED_TEST_SUITE(ReduceWithIndexFloat, Reduce_float_types);
TYPED_TEST_SUITE(ReduceWithIndexDouble, Reduce_double_types);
TYPED_TEST_SUITE(ReduceWithIndexInt8, Reduce_int8t_types);
TYPED_TEST_SUITE(ReduceWithIndexHalf, Reduce_half_types);
TYPED_TEST_SUITE(ReduceWithIndexBHalfFloat, Reduce_bhalf_float_Types);

TYPED_TEST(ReduceWithIndexFloat, ReduceWithIndexTestFloat_AMAX)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::AMAX>();
}

TYPED_TEST(ReduceWithIndexFloat, ReduceWithIndexTestFloat_MIN)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::MIN>();
}

TYPED_TEST(ReduceWithIndexFloat, ReduceWithIndexTestFloat_MAX)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::MAX>();
}

TYPED_TEST(ReduceWithIndexDouble, ReduceWithIndexTestDouble_AMAX)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::AMAX>();
}

TYPED_TEST(ReduceWithIndexDouble, ReduceWithIndexTestDouble_MIN)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::MIN>();
}

TYPED_TEST(ReduceWithIndexDouble, ReduceWithIndexTestDouble_MAX)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::MAX>();
}

TYPED_TEST(ReduceWithIndexInt8, ReduceWithIndexTestInt8_AMAX)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::AMAX>();
}

TYPED_TEST(ReduceWithIndexInt8, ReduceWithIndexTestInt8_MIN)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::MIN>();
}

TYPED_TEST(ReduceWithIndexInt8, ReduceWithIndexTestInt8_MAX)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::MAX>();
}

TYPED_TEST(ReduceWithIndexHalf, ReduceWithIndexTestHalf_AMAX)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::AMAX>();
}

TYPED_TEST(ReduceWithIndexHalf, ReduceWithIndexTestHalf_MIN)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::MIN>();
}

TYPED_TEST(ReduceWithIndexHalf, ReduceWithIndexTestHalf_MAX)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::MAX>();
}

TYPED_TEST(ReduceWithIndexBHalfFloat, ReduceWithIndexTesBtHalfFloat_AMAX)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::AMAX>();
}

TYPED_TEST(ReduceWithIndexBHalfFloat, ReduceWithIndexTestBHalfFloat_MIN)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::MIN>();
}

TYPED_TEST(ReduceWithIndexBHalfFloat, ReduceWithIndexTestBHalfFloat_MAX)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::MAX>();
}