Merge remote-tracking branch 'origin/develop' into improve_pipeline

d0b9a467 · Chao Liu · c8f6d5d1 · 8eca05a6 · d0b9a467 · d0b9a467
Commit d0b9a467 authored May 04, 2022 by Chao Liu
17 changed files
--- a/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instance.cpp
@@ -2,7 +2,7 @@
 #include "config.hpp"
 #include "device_gemm_reduce_xdl_cshuffle.hpp"
 #include "element_wise_operation.hpp"
-#include "element_wise_reduce_operation.hpp"
+#include "reduction_operator.hpp"
 #include "device_operation_instance.hpp"
 namespace ck {
@@ -19,9 +19,9 @@ using Col = ck::tensor_layout::gemm::ColumnMajor;
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
-using PassThrough     = ck::tensor_operation::element_wise::PassThrough;
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-using ReduceSum       = ck::tensor_operation::element_wise::ReduceSum;
+using ReduceSum   = ck::reduce::Add<F32>;
-using ReduceSquareSum = ck::tensor_operation::element_wise::ReduceSquareSum;
+using Square      = ck::tensor_operation::element_wise::UnarySquare<F32, F32, false>;
 static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
@@ -30,33 +30,31 @@ static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecializa
 // d1[m] = reduce1(c[m, n])
 using device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instances = std::tuple<
    // clang-format off
-        //###########################| ALayout| BLayout| CLayout|AData| BData| CData|  GemmAcc| CShuffle| ReduceAcc| DData|           A|           B|           C|        D0|              D1|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|              CReduce| CReduceThreadLds2VGprCopy| CReduceThreadVgpr2GlobalCopy|
+        //###########################| ALayout| BLayout| CLayout|AData| BData| CData|  GemmAcc| CShuffle| ReduceAcc| DData|           A|           B|           C|        D0|      D1|   D1EleOp|   GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|              CReduce| CReduceThreadLds2VGprCopy| CReduceThreadVgpr2GlobalCopy|
        //###########################|        |        |        | Type|  Type|  Type| DataType| DataType|  DataType|  Type| Elementwise| Elementwise| Elementwise|    Reduce|          Reduce| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|            _MBlock_MPerBlock| ScalarPerVector| ThreadClusterLengths|     SrcDstScalarPerVector|        SrcDstScalarPerVector|
        //###########################|        |        |        |     |      |      |         |         |          |      |   Operation|   Operation|   Operation| Operation|       Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|            _NBlock_NPerBlock|      _NPerBlock| _MPerBlock_NPerBlock|                _NPerBlock|                   _MPerBlock|
        //###########################|        |        |        |     |      |      |         |         |          |      |            |            |            |          |                |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                     |                          |                             |
-        DeviceGemmReduce_Xdl_CShuffle<     Col,      Row,    Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSquareSum,    GemmDefault,        1,   256,   256,   128,    32,   2,   2,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,     false,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,     false,           1,           1,               S<1, 32, 1, 8>,               8,             S<64, 4>,                         4,                            1>,
+        DeviceGemmReduce_Xdl_CShuffle<     Col,      Row,    Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSum, Square,    GemmDefault,        1,   256,   256,   128,    32,   2,   2,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,     false,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,     false,           1,           1,               S<1, 32, 1, 8>,               8,             S<64, 4>,                         4,                            1>,
-        DeviceGemmReduce_Xdl_CShuffle<     Col,      Row,    Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSquareSum,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,      true,           1,           1,               S<1, 32, 1, 8>,               8,             S<64, 4>,                         4,                            1>,
+        DeviceGemmReduce_Xdl_CShuffle<     Col,      Row,    Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSum, Square,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,      true,           1,           1,               S<1, 32, 1, 8>,               8,             S<64, 4>,                         4,                            1>,
-        DeviceGemmReduce_Xdl_CShuffle<     Col,      Row,    Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSquareSum,    GemmDefault,        1,   256,   128,   256,    32,   2,   2,   32,   32,    2,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,     false,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,     false,           1,           1,               S<1, 32, 1, 8>,               8,             S<64, 4>,                         4,                            1>,
+        DeviceGemmReduce_Xdl_CShuffle<     Col,      Row,    Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSum, Square,    GemmDefault,        1,   256,   128,   256,    32,   2,   2,   32,   32,    2,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,     false,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,     false,           1,           1,               S<1, 32, 1, 8>,               8,             S<64, 4>,                         4,                            1>,
-        DeviceGemmReduce_Xdl_CShuffle<     Col,      Row,    Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSquareSum,    GemmDefault,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,      true,           1,           1,               S<1, 32, 1, 8>,               8,             S<64, 4>,                         4,                            1>,
+        DeviceGemmReduce_Xdl_CShuffle<     Col,      Row,    Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSum, Square,    GemmDefault,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,      true,           1,           1,               S<1, 32, 1, 8>,               8,             S<64, 4>,                         4,                            1>,
-        DeviceGemmReduce_Xdl_CShuffle<     Col,      Row,    Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSquareSum,    GemmDefault,        1,   128,   128,   128,    32,   2,   2,   32,   32,    4,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,     false,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,     false,           1,           1,               S<1, 16, 1, 8>,               8,             S<32, 4>,                         4,                            1>,
+        DeviceGemmReduce_Xdl_CShuffle<     Col,      Row,    Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSum, Square,    GemmDefault,        1,   128,   128,   128,    32,   2,   2,   32,   32,    4,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,     false,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,     false,           1,           1,               S<1, 16, 1, 8>,               8,             S<32, 4>,                         4,                            1>,
-        DeviceGemmReduce_Xdl_CShuffle<     Col,      Row,    Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSquareSum,    GemmDefault,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,      true,           1,           1,               S<1, 16, 1, 8>,               8,             S<32, 4>,                         4,                            1>,
+        DeviceGemmReduce_Xdl_CShuffle<     Col,      Row,    Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSum, Square,    GemmDefault,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,      true,           1,           1,               S<1, 16, 1, 8>,               8,             S<32, 4>,                         4,                            1>,
-        DeviceGemmReduce_Xdl_CShuffle<     Col,      Row,    Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSquareSum,    GemmDefault,        1,   256,   128,   128,    32,   2,   2,   32,   32,    2,    2,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,     false,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,     false,           1,           1,               S<1, 32, 1, 8>,               8,             S<64, 4>,                         4,                            1>,
+        DeviceGemmReduce_Xdl_CShuffle<     Col,      Row,    Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSum, Square,    GemmDefault,        1,   256,   128,   128,    32,   2,   2,   32,   32,    2,    2,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,     false,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,     false,           1,           1,               S<1, 32, 1, 8>,               8,             S<64, 4>,                         4,                            1>,
-        DeviceGemmReduce_Xdl_CShuffle<     Col,      Row,    Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSquareSum,    GemmDefault,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,      true,           1,           1,               S<1, 32, 1, 8>,               8,             S<64, 4>,                         4,                            1>,
+        DeviceGemmReduce_Xdl_CShuffle<     Col,      Row,    Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSum, Square,    GemmDefault,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,      true,           1,           1,               S<1, 32, 1, 8>,               8,             S<64, 4>,                         4,                            1>,
-        DeviceGemmReduce_Xdl_CShuffle<     Col,      Row,    Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSquareSum,    GemmDefault,        1,   128,   128,    64,    32,   2,   2,   32,   32,    2,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,     false,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,     false,           1,           1,               S<1, 32, 1, 4>,               8,             S<64, 2>,                         4,                            1>,
+        DeviceGemmReduce_Xdl_CShuffle<     Col,      Row,    Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSum, Square,    GemmDefault,        1,   128,   128,    64,    32,   2,   2,   32,   32,    2,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,     false,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,     false,           1,           1,               S<1, 32, 1, 4>,               8,             S<64, 2>,                         4,                            1>,
-        DeviceGemmReduce_Xdl_CShuffle<     Col,      Row,    Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSquareSum,    GemmDefault,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,      true,           1,           1,               S<1, 32, 1, 4>,               8,             S<64, 2>,                         4,                            1>,
+        DeviceGemmReduce_Xdl_CShuffle<     Col,      Row,    Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSum, Square,    GemmDefault,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,      true,           1,           1,               S<1, 32, 1, 4>,               8,             S<64, 2>,                         4,                            1>,
-        DeviceGemmReduce_Xdl_CShuffle<     Col,      Row,    Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSquareSum,    GemmDefault,        1,   128,    64,   128,    32,   2,   2,   32,   32,    2,    2,     S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,     false,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,     false,           1,           1,               S<1, 16, 1, 8>,               8,             S<32, 4>,                         4,                            1>,
+        DeviceGemmReduce_Xdl_CShuffle<     Col,      Row,    Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSum, Square,    GemmDefault,        1,   128,    64,   128,    32,   2,   2,   32,   32,    2,    2,     S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,     false,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,     false,           1,           1,               S<1, 16, 1, 8>,               8,             S<32, 4>,                         4,                            1>,
-        DeviceGemmReduce_Xdl_CShuffle<     Col,      Row,    Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSquareSum,    GemmDefault,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,      true,           1,           1,               S<1, 16, 1, 8>,               8,             S<32, 4>,                         4,                            1>,
+        DeviceGemmReduce_Xdl_CShuffle<     Col,      Row,    Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSum, Square,    GemmDefault,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,      true,           1,           1,               S<1, 16, 1, 8>,               8,             S<32, 4>,                         4,                            1>,
-        DeviceGemmReduce_Xdl_CShuffle<     Col,      Row,    Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSquareSum,    GemmDefault,        1,   256,   128,    64,    32,   2,   2,   32,   32,    2,    1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,     false,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,     false,           1,           1,               S<1, 16, 1, 4>,               8,             S<64, 4>,                         4,                            1>,
+        DeviceGemmReduce_Xdl_CShuffle<     Col,      Row,    Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSum, Square,    GemmDefault,        1,   256,   128,    64,    32,   2,   2,   32,   32,    2,    1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,     false,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,     false,           1,           1,               S<1, 16, 1, 4>,               8,             S<64, 4>,                         4,                            1>,
-        DeviceGemmReduce_Xdl_CShuffle<     Col,      Row,    Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSquareSum,    GemmDefault,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,      true,           1,           1,               S<1, 16, 1, 4>,               8,             S<64, 4>,                         4,                            1>,
+        DeviceGemmReduce_Xdl_CShuffle<     Col,      Row,    Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSum, Square,    GemmDefault,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,      true,           1,           1,               S<1, 16, 1, 4>,               8,             S<64, 4>,                         4,                            1>,
-        DeviceGemmReduce_Xdl_CShuffle<     Col,      Row,    Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSquareSum,    GemmDefault,        1,   256,    64,   128,    32,   2,   2,   32,   32,    1,    2,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,     false,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,     false,           1,           1,               S<1, 32, 1, 8>,               8,             S<64, 4>,                         4,                            1>,
+        DeviceGemmReduce_Xdl_CShuffle<     Col,      Row,    Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSum, Square,    GemmDefault,        1,   256,    64,   128,    32,   2,   2,   32,   32,    1,    2,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,     false,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,     false,           1,           1,               S<1, 32, 1, 8>,               8,             S<64, 4>,                         4,                            1>,
-        DeviceGemmReduce_Xdl_CShuffle<     Col,      Row,    Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSquareSum,    GemmDefault,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,      true,           1,           1,               S<1, 32, 1, 8>,               8,             S<64, 4>,                         4,                            1>
+        DeviceGemmReduce_Xdl_CShuffle<     Col,      Row,    Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSum, Square,    GemmDefault,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,      true,           1,           1,               S<1, 32, 1, 8>,               8,             S<64, 4>,                         4,                            1>
    // clang-format on
    >;
 void add_device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instances(
-    std::vector<
+    std::vector<DeviceGemmReducePtr<PassThrough, PassThrough, PassThrough, Square>>& instances)
-        DeviceGemmReducePtr<PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSquareSum>>&
-        instances)
 {
    add_device_operation_instances(
        instances, device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instances{});

--- a/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instance.cpp
@@ -2,7 +2,7 @@
 #include "config.hpp"
 #include "device_gemm_reduce_xdl_cshuffle.hpp"
 #include "element_wise_operation.hpp"
-#include "element_wise_reduce_operation.hpp"
+#include "reduction_operator.hpp"
 #include "device_operation_instance.hpp"
 namespace ck {
@@ -19,9 +19,9 @@ using Col = ck::tensor_layout::gemm::ColumnMajor;
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
-using PassThrough     = ck::tensor_operation::element_wise::PassThrough;
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-using ReduceSum       = ck::tensor_operation::element_wise::ReduceSum;
+using ReduceSum   = ck::reduce::Add<F32>;
-using ReduceSquareSum = ck::tensor_operation::element_wise::ReduceSquareSum;
+using Square      = ck::tensor_operation::element_wise::UnarySquare<F32, F32, false>;
 static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
@@ -30,33 +30,31 @@ static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecializa
 // d1[m] = reduce1(c[m, n])
 using device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instances = std::tuple<
    // clang-format off
-        //###########################| ALayout| BLayout| CLayout|AData| BData| CData|  GemmAcc| CShuffle| ReduceAcc| DData|           A|           B|           C|        D0|              D1|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|              CReduce| CReduceThreadLds2VGprCopy| CReduceThreadVgpr2GlobalCopy|
+        //###########################| ALayout| BLayout| CLayout|AData| BData| CData|  GemmAcc| CShuffle| ReduceAcc| DData|           A|           B|           C|        D0|     D1|   D1EleOp|    GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|              CReduce| CReduceThreadLds2VGprCopy| CReduceThreadVgpr2GlobalCopy|
        //###########################|        |        |        | Type|  Type|  Type| DataType| DataType|  DataType|  Type| Elementwise| Elementwise| Elementwise|    Reduce|          Reduce| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|            _MBlock_MPerBlock| ScalarPerVector| ThreadClusterLengths|     SrcDstScalarPerVector|        SrcDstScalarPerVector|
        //###########################|        |        |        |     |      |      |         |         |          |      |   Operation|   Operation|   Operation| Operation|       Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|            _NBlock_NPerBlock|      _NPerBlock| _MPerBlock_NPerBlock|                _NPerBlock|                   _MPerBlock|
        //###########################|        |        |        |     |      |      |         |         |          |      |            |            |            |          |                |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                     |                          |                             |
-        DeviceGemmReduce_Xdl_CShuffle<     Col,      Col,    Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSquareSum,    GemmDefault,        1,   256,   256,   128,    32,   2,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,     false,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,               S<1, 32, 1, 8>,               8,             S<64, 4>,                         4,                            1>,
+        DeviceGemmReduce_Xdl_CShuffle<     Col,      Col,    Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSum, Square,    GemmDefault,        1,   256,   256,   128,    32,   2,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,     false,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,               S<1, 32, 1, 8>,               8,             S<64, 4>,                         4,                            1>,
-        DeviceGemmReduce_Xdl_CShuffle<     Col,      Col,    Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSquareSum,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,               S<1, 32, 1, 8>,               8,             S<64, 4>,                         4,                            1>,
+        DeviceGemmReduce_Xdl_CShuffle<     Col,      Col,    Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSum, Square,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,               S<1, 32, 1, 8>,               8,             S<64, 4>,                         4,                            1>,
-        DeviceGemmReduce_Xdl_CShuffle<     Col,      Col,    Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSquareSum,    GemmDefault,        1,   256,   128,   256,    32,   2,   8,   32,   32,    2,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,     false,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,               S<1, 32, 1, 8>,               8,             S<64, 4>,                         4,                            1>,
+        DeviceGemmReduce_Xdl_CShuffle<     Col,      Col,    Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSum, Square,    GemmDefault,        1,   256,   128,   256,    32,   2,   8,   32,   32,    2,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,     false,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,               S<1, 32, 1, 8>,               8,             S<64, 4>,                         4,                            1>,
-        DeviceGemmReduce_Xdl_CShuffle<     Col,      Col,    Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSquareSum,    GemmDefault,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,               S<1, 32, 1, 8>,               8,             S<64, 4>,                         4,                            1>,
+        DeviceGemmReduce_Xdl_CShuffle<     Col,      Col,    Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSum, Square,    GemmDefault,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,               S<1, 32, 1, 8>,               8,             S<64, 4>,                         4,                            1>,
-        DeviceGemmReduce_Xdl_CShuffle<     Col,      Col,    Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSquareSum,    GemmDefault,        1,   128,   128,   128,    32,   2,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,     false,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,               S<1, 16, 1, 8>,               8,             S<32, 4>,                         4,                            1>,
+        DeviceGemmReduce_Xdl_CShuffle<     Col,      Col,    Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSum, Square,    GemmDefault,        1,   128,   128,   128,    32,   2,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,     false,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,               S<1, 16, 1, 8>,               8,             S<32, 4>,                         4,                            1>,
-        DeviceGemmReduce_Xdl_CShuffle<     Col,      Col,    Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSquareSum,    GemmDefault,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,               S<1, 16, 1, 8>,               8,             S<32, 4>,                         4,                            1>,
+        DeviceGemmReduce_Xdl_CShuffle<     Col,      Col,    Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSum, Square,    GemmDefault,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,               S<1, 16, 1, 8>,               8,             S<32, 4>,                         4,                            1>,
-        DeviceGemmReduce_Xdl_CShuffle<     Col,      Col,    Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSquareSum,    GemmDefault,        1,   256,   128,   128,    32,   2,   8,   32,   32,    2,    2,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,     false,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,               S<1, 32, 1, 8>,               8,             S<64, 4>,                         4,                            1>,
+        DeviceGemmReduce_Xdl_CShuffle<     Col,      Col,    Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSum, Square,    GemmDefault,        1,   256,   128,   128,    32,   2,   8,   32,   32,    2,    2,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,     false,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,               S<1, 32, 1, 8>,               8,             S<64, 4>,                         4,                            1>,
-        DeviceGemmReduce_Xdl_CShuffle<     Col,      Col,    Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSquareSum,    GemmDefault,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,               S<1, 32, 1, 8>,               8,             S<64, 4>,                         4,                            1>,
+        DeviceGemmReduce_Xdl_CShuffle<     Col,      Col,    Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSum, Square,    GemmDefault,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,               S<1, 32, 1, 8>,               8,             S<64, 4>,                         4,                            1>,
-        DeviceGemmReduce_Xdl_CShuffle<     Col,      Col,    Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSquareSum,    GemmDefault,        1,   128,   128,    64,    32,   2,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,     false,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,               S<1, 32, 1, 4>,               8,             S<64, 2>,                         4,                            1>,
+        DeviceGemmReduce_Xdl_CShuffle<     Col,      Col,    Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSum, Square,    GemmDefault,        1,   128,   128,    64,    32,   2,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,     false,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,               S<1, 32, 1, 4>,               8,             S<64, 2>,                         4,                            1>,
-        DeviceGemmReduce_Xdl_CShuffle<     Col,      Col,    Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSquareSum,    GemmDefault,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,               S<1, 32, 1, 4>,               8,             S<64, 2>,                         4,                            1>,
+        DeviceGemmReduce_Xdl_CShuffle<     Col,      Col,    Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSum, Square,    GemmDefault,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,               S<1, 32, 1, 4>,               8,             S<64, 2>,                         4,                            1>,
-        DeviceGemmReduce_Xdl_CShuffle<     Col,      Col,    Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSquareSum,    GemmDefault,        1,   128,    64,   128,    32,   2,   8,   32,   32,    2,    2,     S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,     false,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,               S<1, 16, 1, 8>,               8,             S<32, 4>,                         4,                            1>,
+        DeviceGemmReduce_Xdl_CShuffle<     Col,      Col,    Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSum, Square,    GemmDefault,        1,   128,    64,   128,    32,   2,   8,   32,   32,    2,    2,     S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,     false,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,               S<1, 16, 1, 8>,               8,             S<32, 4>,                         4,                            1>,
-        DeviceGemmReduce_Xdl_CShuffle<     Col,      Col,    Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSquareSum,    GemmDefault,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,               S<1, 16, 1, 8>,               8,             S<32, 4>,                         4,                            1>,
+        DeviceGemmReduce_Xdl_CShuffle<     Col,      Col,    Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSum, Square,    GemmDefault,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,               S<1, 16, 1, 8>,               8,             S<32, 4>,                         4,                            1>,
-        DeviceGemmReduce_Xdl_CShuffle<     Col,      Col,    Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSquareSum,    GemmDefault,        1,   256,   128,    64,    32,   2,   8,   32,   32,    2,    1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,     false,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,               S<1, 16, 1, 4>,               8,             S<64, 4>,                         4,                            1>,
+        DeviceGemmReduce_Xdl_CShuffle<     Col,      Col,    Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSum, Square,    GemmDefault,        1,   256,   128,    64,    32,   2,   8,   32,   32,    2,    1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,     false,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,               S<1, 16, 1, 4>,               8,             S<64, 4>,                         4,                            1>,
-        DeviceGemmReduce_Xdl_CShuffle<     Col,      Col,    Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSquareSum,    GemmDefault,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,               S<1, 16, 1, 4>,               8,             S<64, 4>,                         4,                            1>,
+        DeviceGemmReduce_Xdl_CShuffle<     Col,      Col,    Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSum, Square,    GemmDefault,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,               S<1, 16, 1, 4>,               8,             S<64, 4>,                         4,                            1>,
-        DeviceGemmReduce_Xdl_CShuffle<     Col,      Col,    Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSquareSum,    GemmDefault,        1,   256,    64,   128,    32,   2,   8,   32,   32,    1,    2,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,     false,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,               S<1, 32, 1, 8>,               8,             S<64, 4>,                         4,                            1>,
+        DeviceGemmReduce_Xdl_CShuffle<     Col,      Col,    Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSum, Square,    GemmDefault,        1,   256,    64,   128,    32,   2,   8,   32,   32,    1,    2,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,     false,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,               S<1, 32, 1, 8>,               8,             S<64, 4>,                         4,                            1>,
-        DeviceGemmReduce_Xdl_CShuffle<     Col,      Col,    Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSquareSum,    GemmDefault,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,               S<1, 32, 1, 8>,               8,             S<64, 4>,                         4,                            1>
+        DeviceGemmReduce_Xdl_CShuffle<     Col,      Col,    Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSum, Square,    GemmDefault,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,               S<1, 32, 1, 8>,               8,             S<64, 4>,                         4,                            1>
    // clang-format on
    >;
 void add_device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instances(
-    std::vector<
+    std::vector<DeviceGemmReducePtr<PassThrough, PassThrough, PassThrough, Square>>& instances)
-        DeviceGemmReducePtr<PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSquareSum>>&
-        instances)
 {
    add_device_operation_instances(
        instances, device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instances{});

--- a/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp
@@ -2,7 +2,7 @@
 #include "config.hpp"
 #include "device_gemm_reduce_xdl_cshuffle.hpp"
 #include "element_wise_operation.hpp"
-#include "element_wise_reduce_operation.hpp"
+#include "reduction_operator.hpp"
 #include "device_operation_instance.hpp"
 namespace ck {
@@ -19,9 +19,9 @@ using Col = ck::tensor_layout::gemm::ColumnMajor;
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
-using PassThrough     = ck::tensor_operation::element_wise::PassThrough;
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-using ReduceSum       = ck::tensor_operation::element_wise::ReduceSum;
+using ReduceSum   = ck::reduce::Add<F32>;
-using ReduceSquareSum = ck::tensor_operation::element_wise::ReduceSquareSum;
+using Square      = ck::tensor_operation::element_wise::UnarySquare<F32, F32, false>;
 static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
@@ -30,33 +30,31 @@ static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecializa
 // d1[m] = reduce1(c[m, n])
 using device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instances = std::tuple<
    // clang-format off
-        //###########################| ALayout| BLayout| CLayout| AData| BData| CData|  GemmAcc| CShuffle| ReduceAcc| DData|           A|           B|           C|        D0|              D1|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|              CReduce| CReduceThreadLds2VGprCopy| CReduceThreadVgpr2GlobalCopy|
+        //###########################| ALayout| BLayout| CLayout| AData| BData| CData|  GemmAcc| CShuffle| ReduceAcc| DData|           A|           B|           C|        D0|     D1|   D1EleOp|   GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|              CReduce| CReduceThreadLds2VGprCopy| CReduceThreadVgpr2GlobalCopy|
        //###########################|        |        |        |  Type|  Type|  Type| DataType| DataType|  DataType|  Type| Elementwise| Elementwise| Elementwise|    Reduce|          Reduce| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|            _MBlock_MPerBlock| ScalarPerVector| ThreadClusterLengths|     SrcDstScalarPerVector|        SrcDstScalarPerVector|
        //###########################|        |        |        |      |      |      |         |         |          |      |   Operation|   Operation|   Operation| Operation|       Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|            _NBlock_NPerBlock|      _NPerBlock| _MPerBlock_NPerBlock|                _NPerBlock|                   _MPerBlock|
        //###########################|        |        |        |      |      |      |         |         |          |      |            |            |            |          |                |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                     |                          |                             |
-        DeviceGemmReduce_Xdl_CShuffle<     Row,      Row,    Row,   F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSquareSum,    GemmDefault,        1,   256,   256,   128,    32,   8,   2,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,     false,           1,           1,               S<1, 32, 1, 8>,               8,             S<64, 4>,                         4,                            1>,
+        DeviceGemmReduce_Xdl_CShuffle<     Row,      Row,    Row,   F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSum, Square,    GemmDefault,        1,   256,   256,   128,    32,   8,   2,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,     false,           1,           1,               S<1, 32, 1, 8>,               8,             S<64, 4>,                         4,                            1>,
-        DeviceGemmReduce_Xdl_CShuffle<     Row,      Row,    Row,   F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSquareSum,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,      true,           1,           1,               S<1, 32, 1, 8>,               8,             S<64, 4>,                         4,                            1>,
+        DeviceGemmReduce_Xdl_CShuffle<     Row,      Row,    Row,   F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSum, Square,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,      true,           1,           1,               S<1, 32, 1, 8>,               8,             S<64, 4>,                         4,                            1>,
-        DeviceGemmReduce_Xdl_CShuffle<     Row,      Row,    Row,   F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSquareSum,    GemmDefault,        1,   256,   128,   256,    32,   8,   2,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,     false,           1,           1,               S<1, 32, 1, 8>,               8,             S<64, 4>,                         4,                            1>,
+        DeviceGemmReduce_Xdl_CShuffle<     Row,      Row,    Row,   F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSum, Square,    GemmDefault,        1,   256,   128,   256,    32,   8,   2,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,     false,           1,           1,               S<1, 32, 1, 8>,               8,             S<64, 4>,                         4,                            1>,
-        DeviceGemmReduce_Xdl_CShuffle<     Row,      Row,    Row,   F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSquareSum,    GemmDefault,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,      true,           1,           1,               S<1, 32, 1, 8>,               8,             S<64, 4>,                         4,                            1>,
+        DeviceGemmReduce_Xdl_CShuffle<     Row,      Row,    Row,   F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSum, Square,    GemmDefault,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,      true,           1,           1,               S<1, 32, 1, 8>,               8,             S<64, 4>,                         4,                            1>,
-        DeviceGemmReduce_Xdl_CShuffle<     Row,      Row,    Row,   F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSquareSum,    GemmDefault,        1,   128,   128,   128,    32,   8,   2,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,     false,           1,           1,               S<1, 16, 1, 8>,               8,             S<32, 4>,                         4,                            1>,
+        DeviceGemmReduce_Xdl_CShuffle<     Row,      Row,    Row,   F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSum, Square,    GemmDefault,        1,   128,   128,   128,    32,   8,   2,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,     false,           1,           1,               S<1, 16, 1, 8>,               8,             S<32, 4>,                         4,                            1>,
-        DeviceGemmReduce_Xdl_CShuffle<     Row,      Row,    Row,   F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSquareSum,    GemmDefault,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,      true,           1,           1,               S<1, 16, 1, 8>,               8,             S<32, 4>,                         4,                            1>,
+        DeviceGemmReduce_Xdl_CShuffle<     Row,      Row,    Row,   F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSum, Square,    GemmDefault,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,      true,           1,           1,               S<1, 16, 1, 8>,               8,             S<32, 4>,                         4,                            1>,
-        DeviceGemmReduce_Xdl_CShuffle<     Row,      Row,    Row,   F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSquareSum,    GemmDefault,        1,   256,   128,   128,    32,   8,   2,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,     false,           1,           1,               S<1, 32, 1, 8>,               8,             S<64, 4>,                         4,                            1>,
+        DeviceGemmReduce_Xdl_CShuffle<     Row,      Row,    Row,   F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSum, Square,    GemmDefault,        1,   256,   128,   128,    32,   8,   2,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,     false,           1,           1,               S<1, 32, 1, 8>,               8,             S<64, 4>,                         4,                            1>,
-        DeviceGemmReduce_Xdl_CShuffle<     Row,      Row,    Row,   F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSquareSum,    GemmDefault,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,      true,           1,           1,               S<1, 32, 1, 8>,               8,             S<64, 4>,                         4,                            1>,
+        DeviceGemmReduce_Xdl_CShuffle<     Row,      Row,    Row,   F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSum, Square,    GemmDefault,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,      true,           1,           1,               S<1, 32, 1, 8>,               8,             S<64, 4>,                         4,                            1>,
-        DeviceGemmReduce_Xdl_CShuffle<     Row,      Row,    Row,   F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSquareSum,    GemmDefault,        1,   128,   128,    64,    32,   8,   2,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,     false,           1,           1,               S<1, 32, 1, 4>,               8,             S<64, 2>,                         4,                            1>,
+        DeviceGemmReduce_Xdl_CShuffle<     Row,      Row,    Row,   F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSum, Square,    GemmDefault,        1,   128,   128,    64,    32,   8,   2,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,     false,           1,           1,               S<1, 32, 1, 4>,               8,             S<64, 2>,                         4,                            1>,
-        DeviceGemmReduce_Xdl_CShuffle<     Row,      Row,    Row,   F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSquareSum,    GemmDefault,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,      true,           1,           1,               S<1, 32, 1, 4>,               8,             S<64, 2>,                         4,                            1>,
+        DeviceGemmReduce_Xdl_CShuffle<     Row,      Row,    Row,   F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSum, Square,    GemmDefault,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,      true,           1,           1,               S<1, 32, 1, 4>,               8,             S<64, 2>,                         4,                            1>,
-        DeviceGemmReduce_Xdl_CShuffle<     Row,      Row,    Row,   F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSquareSum,    GemmDefault,        1,   128,    64,   128,    32,   8,   2,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,     false,           1,           1,               S<1, 16, 1, 8>,               8,             S<32, 4>,                         4,                            1>,
+        DeviceGemmReduce_Xdl_CShuffle<     Row,      Row,    Row,   F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSum, Square,    GemmDefault,        1,   128,    64,   128,    32,   8,   2,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,     false,           1,           1,               S<1, 16, 1, 8>,               8,             S<32, 4>,                         4,                            1>,
-        DeviceGemmReduce_Xdl_CShuffle<     Row,      Row,    Row,   F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSquareSum,    GemmDefault,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,      true,           1,           1,               S<1, 16, 1, 8>,               8,             S<32, 4>,                         4,                            1>,
+        DeviceGemmReduce_Xdl_CShuffle<     Row,      Row,    Row,   F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSum, Square,    GemmDefault,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,      true,           1,           1,               S<1, 16, 1, 8>,               8,             S<32, 4>,                         4,                            1>,
-        DeviceGemmReduce_Xdl_CShuffle<     Row,      Row,    Row,   F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSquareSum,    GemmDefault,        1,   256,   128,    64,    32,   8,   2,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,     false,           1,           1,               S<1, 16, 1, 4>,               8,             S<64, 4>,                         4,                            1>,
+        DeviceGemmReduce_Xdl_CShuffle<     Row,      Row,    Row,   F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSum, Square,    GemmDefault,        1,   256,   128,    64,    32,   8,   2,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,     false,           1,           1,               S<1, 16, 1, 4>,               8,             S<64, 4>,                         4,                            1>,
-        DeviceGemmReduce_Xdl_CShuffle<     Row,      Row,    Row,   F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSquareSum,    GemmDefault,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,      true,           1,           1,               S<1, 16, 1, 4>,               8,             S<64, 4>,                         4,                            1>,
+        DeviceGemmReduce_Xdl_CShuffle<     Row,      Row,    Row,   F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSum, Square,    GemmDefault,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,      true,           1,           1,               S<1, 16, 1, 4>,               8,             S<64, 4>,                         4,                            1>,
-        DeviceGemmReduce_Xdl_CShuffle<     Row,      Row,    Row,   F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSquareSum,    GemmDefault,        1,   256,    64,   128,    32,   8,   2,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,     false,           1,           1,               S<1, 32, 1, 8>,               8,             S<64, 4>,                         4,                            1>,
+        DeviceGemmReduce_Xdl_CShuffle<     Row,      Row,    Row,   F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSum, Square,    GemmDefault,        1,   256,    64,   128,    32,   8,   2,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,     false,           1,           1,               S<1, 32, 1, 8>,               8,             S<64, 4>,                         4,                            1>,
-        DeviceGemmReduce_Xdl_CShuffle<     Row,      Row,    Row,   F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSquareSum,    GemmDefault,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,      true,           1,           1,               S<1, 32, 1, 8>,               8,             S<64, 4>,                         4,                            1>
+        DeviceGemmReduce_Xdl_CShuffle<     Row,      Row,    Row,   F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSum, Square,    GemmDefault,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,      true,           1,           1,               S<1, 32, 1, 8>,               8,             S<64, 4>,                         4,                            1>
    // clang-format on
    >;
 void add_device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instances(
-    std::vector<
+    std::vector<DeviceGemmReducePtr<PassThrough, PassThrough, PassThrough, Square>>& instances)
-        DeviceGemmReducePtr<PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSquareSum>>&
-        instances)
 {
    add_device_operation_instances(
        instances, device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instances{});

--- a/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp
@@ -2,7 +2,7 @@
 #include "config.hpp"
 #include "device_gemm_reduce_xdl_cshuffle.hpp"
 #include "element_wise_operation.hpp"
-#include "element_wise_reduce_operation.hpp"
+#include "reduction_operator.hpp"
 #include "device_operation_instance.hpp"
 namespace ck {
@@ -19,9 +19,9 @@ using Col = ck::tensor_layout::gemm::ColumnMajor;
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
-using PassThrough     = ck::tensor_operation::element_wise::PassThrough;
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-using ReduceSum       = ck::tensor_operation::element_wise::ReduceSum;
+using ReduceSum   = ck::reduce::Add<F32>;
-using ReduceSquareSum = ck::tensor_operation::element_wise::ReduceSquareSum;
+using Square      = ck::tensor_operation::element_wise::UnarySquare<F32, F32, false>;
 static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
@@ -30,30 +30,28 @@ static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecializa
 // d1[m] = reduce1(c[m, n])
 using device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instances = std::tuple<
    // clang-format off
-        //###########################| ALayout| BLayout| CLayout|AData| BData| CData|  GemmAcc| CShuffle| ReduceAcc| DData|           A|           B|           C|        D0|              D1|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|              CReduce| CReduceThreadLds2VGprCopy| CReduceThreadVgpr2GlobalCopy|
+        //###########################| ALayout| BLayout| CLayout|AData| BData| CData|  GemmAcc| CShuffle| ReduceAcc| DData|           A|           B|           C|        D0|    D1|    D1EleOp|    GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|              CReduce| CReduceThreadLds2VGprCopy| CReduceThreadVgpr2GlobalCopy|
         //###########################|        |        |        | Type|  Type|  Type| DataType| DataType|  DataType|  Type| Elementwise| Elementwise| Elementwise|    Reduce|          Reduce| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|            _MBlock_MPerBlock| ScalarPerVector| ThreadClusterLengths|     SrcDstScalarPerVector|        SrcDstScalarPerVector|
        //###########################|        |        |        |     |      |      |         |         |          |      |   Operation|   Operation|   Operation| Operation|       Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|            _NBlock_NPerBlock|      _NPerBlock| _MPerBlock_NPerBlock|                _NPerBlock|                   _MPerBlock|
        //###########################|        |        |        |     |      |      |         |         |          |      |            |            |            |          |                |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                     |                          |                             |
-        DeviceGemmReduce_Xdl_CShuffle<     Row,     Col,     Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSquareSum,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,               S<1, 32, 1, 8>,               8,             S<64, 4>,                         4,                            1>,
+        DeviceGemmReduce_Xdl_CShuffle<     Row,     Col,     Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSum, Square,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,               S<1, 32, 1, 8>,               8,             S<64, 4>,                         4,                            1>,
-        DeviceGemmReduce_Xdl_CShuffle<     Row,     Col,     Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSquareSum,    GemmDefault,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,               S<1, 32, 1, 8>,               8,             S<64, 4>,                         4,                            1>,
+        DeviceGemmReduce_Xdl_CShuffle<     Row,     Col,     Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSum, Square,    GemmDefault,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,               S<1, 32, 1, 8>,               8,             S<64, 4>,                         4,                            1>,
-        DeviceGemmReduce_Xdl_CShuffle<     Row,     Col,     Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSquareSum,    GemmDefault,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,               S<1, 16, 1, 8>,               8,             S<32, 4>,                         4,                            1>,
+        DeviceGemmReduce_Xdl_CShuffle<     Row,     Col,     Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSum, Square,    GemmDefault,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,               S<1, 16, 1, 8>,               8,             S<32, 4>,                         4,                            1>,
-        DeviceGemmReduce_Xdl_CShuffle<     Row,     Col,     Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSquareSum,    GemmDefault,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,               S<1, 32, 1, 8>,               8,             S<64, 4>,                         4,                            1>,
+        DeviceGemmReduce_Xdl_CShuffle<     Row,     Col,     Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSum, Square,    GemmDefault,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,               S<1, 32, 1, 8>,               8,             S<64, 4>,                         4,                            1>,
-        DeviceGemmReduce_Xdl_CShuffle<     Row,     Col,     Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSquareSum,    GemmDefault,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,               S<1, 32, 1, 4>,               8,             S<64, 2>,                         4,                            1>,
+        DeviceGemmReduce_Xdl_CShuffle<     Row,     Col,     Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSum, Square,    GemmDefault,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,               S<1, 32, 1, 4>,               8,             S<64, 2>,                         4,                            1>,
-        DeviceGemmReduce_Xdl_CShuffle<     Row,     Col,     Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSquareSum,    GemmDefault,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,               S<1, 16, 1, 8>,               8,             S<32, 4>,                         4,                            1>,
+        DeviceGemmReduce_Xdl_CShuffle<     Row,     Col,     Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSum, Square,    GemmDefault,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,               S<1, 16, 1, 8>,               8,             S<32, 4>,                         4,                            1>,
-        DeviceGemmReduce_Xdl_CShuffle<     Row,     Col,     Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSquareSum,    GemmDefault,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,               S<1, 16, 1, 4>,               8,             S<32, 2>,                         4,                            1>,
+        DeviceGemmReduce_Xdl_CShuffle<     Row,     Col,     Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSum, Square,    GemmDefault,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,               S<1, 16, 1, 4>,               8,             S<32, 2>,                         4,                            1>,
-        DeviceGemmReduce_Xdl_CShuffle<     Row,     Col,     Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSquareSum,    GemmDefault,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,               S<1, 32, 1, 8>,               8,             S<64, 4>,                         4,                            1>,
+        DeviceGemmReduce_Xdl_CShuffle<     Row,     Col,     Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSum, Square,    GemmDefault,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,               S<1, 32, 1, 8>,               8,             S<64, 4>,                         4,                            1>,
-        DeviceGemmReduce_Xdl_CShuffle<     Row,     Col,     Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSquareSum,    GemmDefault,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,               S<1, 32, 1, 8>,               8,             S<64, 4>,                         4,                            1>,
+        DeviceGemmReduce_Xdl_CShuffle<     Row,     Col,     Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSum, Square,    GemmDefault,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,               S<1, 32, 1, 8>,               8,             S<64, 4>,                         4,                            1>,
-        DeviceGemmReduce_Xdl_CShuffle<     Row,     Col,     Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSquareSum,    GemmDefault,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,               S<1, 32, 1, 4>,               8,             S<64, 2>,                         4,                            1>,
+        DeviceGemmReduce_Xdl_CShuffle<     Row,     Col,     Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSum, Square,    GemmDefault,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,               S<1, 32, 1, 4>,               8,             S<64, 2>,                         4,                            1>,
-        DeviceGemmReduce_Xdl_CShuffle<     Row,     Col,     Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSquareSum,    GemmDefault,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,               S<1, 16, 1, 8>,               8,             S<32, 4>,                         4,                            1>,
+        DeviceGemmReduce_Xdl_CShuffle<     Row,     Col,     Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSum, Square,    GemmDefault,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,               S<1, 16, 1, 8>,               8,             S<32, 4>,                         4,                            1>,
-        DeviceGemmReduce_Xdl_CShuffle<     Row,     Col,     Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSquareSum,    GemmDefault,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,               S<1, 16, 1, 4>,               8,             S<32, 2>,                         4,                            1>,
+        DeviceGemmReduce_Xdl_CShuffle<     Row,     Col,     Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSum, Square,    GemmDefault,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,               S<1, 16, 1, 4>,               8,             S<32, 2>,                         4,                            1>,
-        DeviceGemmReduce_Xdl_CShuffle<     Row,     Col,     Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSquareSum,    GemmDefault,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,               S<1, 16, 1, 4>,               8,             S<32, 2>,                         4,                            1>
+        DeviceGemmReduce_Xdl_CShuffle<     Row,     Col,     Row,  F16,   F16,   F16,      F32,      F32,       F32,   F32, PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSum, Square,    GemmDefault,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,               S<1, 16, 1, 4>,               8,             S<32, 2>,                         4,                            1>
    // clang-format on
    >;
 void add_device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instances(
-    std::vector<
+    std::vector<DeviceGemmReducePtr<PassThrough, PassThrough, PassThrough, Square>>& instances)
-        DeviceGemmReducePtr<PassThrough, PassThrough, PassThrough, ReduceSum, ReduceSquareSum>>&
-        instances)
 {
    add_device_operation_instances(
        instances, device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instances{});

--- a/profiler/include/profile_batched_gemm_reduce_impl.hpp
+++ b/profiler/include/profile_batched_gemm_reduce_impl.hpp
@@ -8,7 +8,7 @@
 #include "tensor_layout.hpp"
 #include "device_tensor.hpp"
 #include "element_wise_operation.hpp"
-#include "element_wise_reduce_operation.hpp"
+#include "reduction_operator.hpp"
 #include "device_gemm_reduce.hpp"
 #include "reference_batched_gemm.hpp"
@@ -21,8 +21,7 @@ using DeviceGemmReduceNoOpPtr = ck::tensor_operation::device::DeviceGemmReducePt
    ck::tensor_operation::element_wise::PassThrough,
    ck::tensor_operation::element_wise::PassThrough,
    ck::tensor_operation::element_wise::PassThrough,
-    ck::tensor_operation::element_wise::ReduceSum,
+    ck::tensor_operation::element_wise::UnarySquare<float, float, false>>;
-    ck::tensor_operation::element_wise::ReduceSquareSum>;
 void add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn_instances(
    std::vector<DeviceGemmReduceNoOpPtr>&);
@@ -120,17 +119,19 @@ bool profile_batched_gemm_reduce_impl(int do_verification,
        b_g_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5}, num_thread);
    }
-    using AElementOp = ck::tensor_operation::element_wise::PassThrough;
+    using AElementOp  = ck::tensor_operation::element_wise::PassThrough;
-    using BElementOp = ck::tensor_operation::element_wise::PassThrough;
+    using BElementOp  = ck::tensor_operation::element_wise::PassThrough;
-    using CElementOp = ck::tensor_operation::element_wise::PassThrough;
+    using CElementOp  = ck::tensor_operation::element_wise::PassThrough;
-    using D0ReduceOp = ck::tensor_operation::element_wise::ReduceSum;
+    using D0ReduceOp  = ck::reduce::Add<float>;
-    using D1ReduceOp = ck::tensor_operation::element_wise::ReduceSquareSum;
+    using D1ReduceOp  = ck::reduce::Add<float>;
+    using D1ElementOp = ck::tensor_operation::element_wise::UnarySquare<float, float, false>;
-    const auto a_element_op = AElementOp{};
+    const auto a_element_op  = AElementOp{};
-    const auto b_element_op = BElementOp{};
+    const auto b_element_op  = BElementOp{};
-    const auto c_element_op = CElementOp{};
+    const auto c_element_op  = CElementOp{};
-    const auto d0_reduce_op = D0ReduceOp{};
+    const auto d0_reduce_op  = D0ReduceOp{};
-    const auto d1_reduce_op = D1ReduceOp{};
+    const auto d1_reduce_op  = D1ReduceOp{};
+    const auto d1_element_op = D1ElementOp{};
    if(do_verification)
    {
@@ -154,17 +155,21 @@ bool profile_batched_gemm_reduce_impl(int do_verification,
        {
            for(int m = 0; m < M; ++m)
            {
-                float d0_acc = d0_reduce_op.GetReduceZeroValue();
+                float d0_acc = d0_reduce_op.GetReductionZeroVal();
-                float d1_acc = d1_reduce_op.GetReduceZeroValue();
+                float d1_acc = d1_reduce_op.GetReductionZeroVal();
                for(int n = 0; n < N; ++n)
                {
-                    d0_reduce_op.Reduce(d0_acc, c_g_m_n_host_result(batch, m, n));
+                    float d0_val = ck::type_convert<float>(c_g_m_n_host_result(batch, m, n));
-                    d1_reduce_op.Reduce(d1_acc, c_g_m_n_host_result(batch, m, n));
+                    float d1_val;
+                    d1_element_op(d1_val, d0_val);
+                    d0_reduce_op(d0_acc, d0_val);
+                    d1_reduce_op(d1_acc, d1_val);
                }
-                d0_g_m_host_result(batch, m) = d0_acc;
+                d0_g_m_host_result(batch, m) = ck::type_convert<DDataType>(d0_acc);
-                d1_g_m_host_result(batch, m) = d1_acc;
+                d1_g_m_host_result(batch, m) = ck::type_convert<DDataType>(d1_acc);
            }
        }
    }
@@ -247,8 +252,7 @@ bool profile_batched_gemm_reduce_impl(int do_verification,
                                          a_element_op,
                                          b_element_op,
                                          c_element_op,
-                                          d0_reduce_op,
+                                          d1_element_op,
-                                          d1_reduce_op,
                                          BatchCount);
        auto invoker_ptr = gemm_ptr->MakeInvokerPointer();

--- a/profiler/include/profile_gemm_reduce_impl.hpp
+++ b/profiler/include/profile_gemm_reduce_impl.hpp
@@ -7,7 +7,7 @@
 #include "tensor_layout.hpp"
 #include "device_tensor.hpp"
 #include "element_wise_operation.hpp"
-#include "element_wise_reduce_operation.hpp"
+#include "reduction_operator.hpp"
 #include "device_gemm_reduce.hpp"
 #include "reference_gemm.hpp"
@@ -20,8 +20,7 @@ using DeviceGemmReduceNoOpPtr = ck::tensor_operation::device::DeviceGemmReducePt
    ck::tensor_operation::element_wise::PassThrough,
    ck::tensor_operation::element_wise::PassThrough,
    ck::tensor_operation::element_wise::PassThrough,
-    ck::tensor_operation::element_wise::ReduceSum,
+    ck::tensor_operation::element_wise::UnarySquare<float, float, false>>;
-    ck::tensor_operation::element_wise::ReduceSquareSum>;
 void add_device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instances(
    std::vector<DeviceGemmReduceNoOpPtr>&);
@@ -113,17 +112,19 @@ bool profile_gemm_reduce_impl(int do_verification,
        b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5}, num_thread);
    }
-    using AElementOp = ck::tensor_operation::element_wise::PassThrough;
+    using AElementOp  = ck::tensor_operation::element_wise::PassThrough;
-    using BElementOp = ck::tensor_operation::element_wise::PassThrough;
+    using BElementOp  = ck::tensor_operation::element_wise::PassThrough;
-    using CElementOp = ck::tensor_operation::element_wise::PassThrough;
+    using CElementOp  = ck::tensor_operation::element_wise::PassThrough;
-    using D0ReduceOp = ck::tensor_operation::element_wise::ReduceSum;
+    using D0ReduceOp  = ck::reduce::Add<float>;
-    using D1ReduceOp = ck::tensor_operation::element_wise::ReduceSquareSum;
+    using D1ReduceOp  = ck::reduce::Add<float>;
+    using D1ElementOp = ck::tensor_operation::element_wise::UnarySquare<float, float, false>;
-    const auto a_element_op = AElementOp{};
+    const auto a_element_op  = AElementOp{};
-    const auto b_element_op = BElementOp{};
+    const auto b_element_op  = BElementOp{};
-    const auto c_element_op = CElementOp{};
+    const auto c_element_op  = CElementOp{};
-    const auto d0_reduce_op = D0ReduceOp{};
+    const auto d0_reduce_op  = D0ReduceOp{};
-    const auto d1_reduce_op = D1ReduceOp{};
+    const auto d1_reduce_op  = D1ReduceOp{};
+    const auto d1_element_op = D1ElementOp{};
    if(do_verification)
    {
@@ -140,17 +141,21 @@ bool profile_gemm_reduce_impl(int do_verification,
        for(int m = 0; m < M; ++m)
        {
-            float d0_acc = d0_reduce_op.GetReduceZeroValue();
+            float d0_acc = d0_reduce_op.GetReductionZeroVal();
-            float d1_acc = d1_reduce_op.GetReduceZeroValue();
+            float d1_acc = d1_reduce_op.GetReductionZeroVal();
            for(int n = 0; n < N; ++n)
            {
-                d0_reduce_op.Reduce(d0_acc, c_m_n_host_result(m, n));
+                float d0_val = ck::type_convert<float>(c_m_n_host_result(m, n));
-                d1_reduce_op.Reduce(d1_acc, c_m_n_host_result(m, n));
+                float d1_val;
+                d1_element_op(d1_val, d0_val);
+                d0_reduce_op(d0_acc, d0_val);
+                d1_reduce_op(d1_acc, d1_val);
            }
-            d0_m_host_result(m) = d0_acc;
+            d0_m_host_result(m) = ck::type_convert<DDataType>(d0_acc);
-            d1_m_host_result(m) = d1_acc;
+            d1_m_host_result(m) = ck::type_convert<DDataType>(d1_acc);
        }
    }
@@ -232,8 +237,7 @@ bool profile_gemm_reduce_impl(int do_verification,
                                          a_element_op,
                                          b_element_op,
                                          c_element_op,
-                                          d0_reduce_op,
+                                          d1_element_op);
-                                          d1_reduce_op);
        auto invoker_ptr = gemm_ptr->MakeInvokerPointer();

--- a/script/clang-format-overwrite.sh
+++ b/script/clang-format-overwrite.sh
-find . -name deps -prune -o -name build -prune -o -iname '*.h' -o -iname '*.hpp' -o -iname '*.cpp' -o -iname '*.h.in' -o -iname '*.hpp.in' -o -iname '*.cpp.in' -o -iname '*.cl' -o -iname '*.cuh' -o -iname '*.cu' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-10 -i -style=file {}'
+#find . -name deps -prune -o -name build -prune -o -iname '*.h' -o -iname '*.hpp' -o -iname '*.cpp' -o -iname '*.h.in' -o -iname '*.hpp.in' -o -iname '*.cpp.in' -o -iname '*.cl' -o -iname '*.cuh' -o -iname '*.cu' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-10 -i -style=file {}'
+# Reformat only changed, non-deleted C++ sources reported by git. The suffix pattern is grouped and
+# anchored so it matches a real .cpp/.hpp extension (optionally followed by .in) rather than any
+# path that merely contains ".cpp" or "hpp" somewhere in its name.
+git status --porcelain | awk '$1 != "D" && match($2, "\\.(cpp|hpp)(\\.in)?$") {print $2}' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-10 -i -style=file {}'
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -24,6 +24,7 @@ include_directories(BEFORE
 add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure -C ${CMAKE_CFG_INTDIR})
 add_custom_target(tests)
 function(add_test_executable TEST_NAME)
    message("adding test ${TEST_NAME}")
    add_executable(${TEST_NAME} ${ARGN})
@@ -32,6 +33,20 @@ function(add_test_executable TEST_NAME)
    add_dependencies(check ${TEST_NAME})
 endfunction(add_test_executable TEST_NAME)
+include(GoogleTest)
+# add_gtest_executable(<TEST_NAME> <source>...)
+# Builds the executable TEST_NAME from the remaining arguments (ARGN), makes the
+# `tests` and `check` targets depend on it, links it against gtest_main and calls
+# gtest_discover_tests() on it.
+function(add_gtest_executable TEST_NAME)
+    message("adding gtest ${TEST_NAME}")
+    add_executable(${TEST_NAME} ${ARGN})
+    # Building `tests` or `check` also builds this executable.
+    add_dependencies(tests ${TEST_NAME})
+    add_dependencies(check ${TEST_NAME})
+    # suppress gtest warnings
+    target_compile_options(${TEST_NAME} PRIVATE -Wno-global-constructors)
+    # gtest_main is linked in, so the test source is not expected to define main() itself.
+    target_link_libraries(${TEST_NAME} PRIVATE gtest_main)
+    gtest_discover_tests(${TEST_NAME})
+endfunction(add_gtest_executable TEST_NAME)
 add_subdirectory(magic_number_division)
 add_subdirectory(space_filling_curve)
 add_subdirectory(conv_util)

--- a/test/conv_util/CMakeLists.txt
+++ b/test/conv_util/CMakeLists.txt
-add_test_executable(test_conv_util conv_util.cpp)
+add_gtest_executable(test_conv_util conv_util.cpp)
 target_link_libraries(test_conv_util PRIVATE host_tensor conv_fwd_util)
--- a/test/conv_util/conv_util.cpp
+++ b/test/conv_util/conv_util.cpp
 #include <iostream>
 #include <string>
 #include <vector>
+#include "gtest/gtest.h"
 #include "config.hpp"
 #include "conv_fwd_util.hpp"
@@ -9,196 +10,194 @@
 namespace {
-bool test_conv_params_get_output_spatial_lengths()
+class TestConvUtil : public ::testing::Test
 {
-    bool res{true};
+    public:
-    // -------------------------- default 2D ------------------------------------
+    void SetNDParams(std::size_t ndims)
+    {
+        conv_params.num_dim_spatial        = ndims;
+        conv_params.filter_spatial_lengths = std::vector<ck::index_t>(ndims, 3);
+        conv_params.input_spatial_lengths  = std::vector<ck::index_t>(ndims, 71);
+        conv_params.conv_filter_strides    = std::vector<ck::index_t>(ndims, 2);
+        conv_params.conv_filter_dilations  = std::vector<ck::index_t>(ndims, 1);
+        conv_params.input_left_pads        = std::vector<ck::index_t>(ndims, 1);
+        conv_params.input_right_pads       = std::vector<ck::index_t>(ndims, 1);
+    }
+    protected:
+    // -------  default 2D -------
    // input NCHW {128,192,71,71},
    // weights KCYX {256,192,3,3},
    // stride {2,2},
    // dilations {1,1},
    // padding {{1,1}, {1,1}}
    ck::utils::conv::ConvParams conv_params;
+};
+} // namespace
+TEST_F(TestConvUtil, ConvParamsGetOutputSpatialLengths2D) // uses the fixture's default-constructed conv_params (the 2D defaults); no shadowing local
+{
    std::vector<ck::index_t> out_spatial_len = conv_params.GetOutputSpatialLengths();
-    res                                      = ck::utils::check_err(out_spatial_len,
+    EXPECT_TRUE(ck::utils::check_err(out_spatial_len,
-                               std::vector<ck::index_t>{36, 36},
+                                     std::vector<ck::index_t>{36, 36},
-                               "Error: ConvParams 2D default constructor.");
+                                     "Error: ConvParams 2D default constructor."));
    conv_params.conv_filter_strides = std::vector<ck::index_t>{1, 1};
    out_spatial_len                 = conv_params.GetOutputSpatialLengths();
-    res                             = ck::utils::check_err(
+    EXPECT_TRUE(ck::utils::check_err(
-        out_spatial_len, std::vector<ck::index_t>{71, 71}, "Error: ConvParams 2D stride {1,1}.");
+        out_spatial_len, std::vector<ck::index_t>{71, 71}, "Error: ConvParams 2D stride {1,1}."));
    conv_params.conv_filter_strides = std::vector<ck::index_t>{2, 2};
    conv_params.input_left_pads     = std::vector<ck::index_t>{2, 2};
    conv_params.input_right_pads    = std::vector<ck::index_t>{2, 2};
    out_spatial_len                 = conv_params.GetOutputSpatialLengths();
-    res                             = ck::utils::check_err(out_spatial_len,
+    EXPECT_TRUE(ck::utils::check_err(out_spatial_len,
-                               std::vector<ck::index_t>{37, 37},
+                                     std::vector<ck::index_t>{37, 37},
-                               "Error: ConvParams 2D padding left/right {2,2}.");
+                                     "Error: ConvParams 2D padding left/right {2,2}."));
    conv_params.conv_filter_dilations = std::vector<ck::index_t>{2, 2};
    out_spatial_len                   = conv_params.GetOutputSpatialLengths();
-    res                               = ck::utils::check_err(
+    EXPECT_TRUE(ck::utils::check_err(
-        out_spatial_len, std::vector<ck::index_t>{36, 36}, "Error: ConvParams 2D dilation {2,2}.");
+        out_spatial_len, std::vector<ck::index_t>{36, 36}, "Error: ConvParams 2D dilation {2,2}."));
    conv_params.conv_filter_strides   = std::vector<ck::index_t>{3, 3};
    conv_params.input_left_pads       = std::vector<ck::index_t>{1, 1};
    conv_params.input_right_pads      = std::vector<ck::index_t>{1, 1};
    conv_params.conv_filter_dilations = std::vector<ck::index_t>{2, 2};
    out_spatial_len                   = conv_params.GetOutputSpatialLengths();
-    res =
+    EXPECT_TRUE(
        ck::utils::check_err(out_spatial_len,
                             std::vector<ck::index_t>{23, 23},
-                             "Error: ConvParams 2D strides{3,3}, padding {1,1}, dilations {2,2}.");
+                             "Error: ConvParams 2D strides{3,3}, padding {1,1}, dilations {2,2}."));
+}
-    // -------------------------- 1D ------------------------------------
+TEST_F(TestConvUtil, ConvParamsGetOutputSpatialLengths1D)
-    conv_params.num_dim_spatial        = 1;
+{
-    conv_params.filter_spatial_lengths = std::vector<ck::index_t>{3};
+    SetNDParams(1);
-    conv_params.input_spatial_lengths  = std::vector<ck::index_t>{71};
-    conv_params.conv_filter_strides    = std::vector<ck::index_t>{2};
-    conv_params.conv_filter_dilations  = std::vector<ck::index_t>{1};
-    conv_params.input_left_pads        = std::vector<ck::index_t>{1};
-    conv_params.input_right_pads       = std::vector<ck::index_t>{1};
-    out_spatial_len = conv_params.GetOutputSpatialLengths();
+    std::vector<ck::index_t> out_spatial_len = conv_params.GetOutputSpatialLengths();
-    res             = ck::utils::check_err(
+    EXPECT_TRUE(ck::utils::check_err(
-        out_spatial_len, std::vector<ck::index_t>{36}, "Error: ConvParams 1D.");
+        out_spatial_len, std::vector<ck::index_t>{36}, "Error: ConvParams 1D."));
    conv_params.conv_filter_strides = std::vector<ck::index_t>{1};
    out_spatial_len                 = conv_params.GetOutputSpatialLengths();
-    res                             = ck::utils::check_err(
+    EXPECT_TRUE(ck::utils::check_err(
-        out_spatial_len, std::vector<ck::index_t>{71}, "Error: ConvParams 1D stride {1}.");
+        out_spatial_len, std::vector<ck::index_t>{71}, "Error: ConvParams 1D stride {1}."));
    conv_params.conv_filter_strides = std::vector<ck::index_t>{2};
    conv_params.input_left_pads     = std::vector<ck::index_t>{2};
    conv_params.input_right_pads    = std::vector<ck::index_t>{2};
    out_spatial_len                 = conv_params.GetOutputSpatialLengths();
-    res                             = ck::utils::check_err(out_spatial_len,
+    EXPECT_TRUE(ck::utils::check_err(out_spatial_len,
-                               std::vector<ck::index_t>{37},
+                                     std::vector<ck::index_t>{37},
-                               "Error: ConvParams 1D padding left/right {2}.");
+                                     "Error: ConvParams 1D padding left/right {2}."));
    conv_params.conv_filter_dilations = std::vector<ck::index_t>{2};
    out_spatial_len                   = conv_params.GetOutputSpatialLengths();
-    res                               = ck::utils::check_err(
+    EXPECT_TRUE(ck::utils::check_err(
-        out_spatial_len, std::vector<ck::index_t>{36}, "Error: ConvParams 1D dilation {2}.");
+        out_spatial_len, std::vector<ck::index_t>{36}, "Error: ConvParams 1D dilation {2}."));
    conv_params.conv_filter_strides   = std::vector<ck::index_t>{3};
    conv_params.input_left_pads       = std::vector<ck::index_t>{1};
    conv_params.input_right_pads      = std::vector<ck::index_t>{1};
    conv_params.conv_filter_dilations = std::vector<ck::index_t>{2};
    out_spatial_len                   = conv_params.GetOutputSpatialLengths();
-    res                               = ck::utils::check_err(out_spatial_len,
+    EXPECT_TRUE(
-                               std::vector<ck::index_t>{23},
+        ck::utils::check_err(out_spatial_len,
-                               "Error: ConvParams 1D strides{3}, padding {1}, dilations {2}.");
+                             std::vector<ck::index_t>{23},
+                             "Error: ConvParams 1D strides{3}, padding {1}, dilations {2}."));
-    // -------------------------- 3D ------------------------------------
+}
-    conv_params.num_dim_spatial        = 3;
-    conv_params.filter_spatial_lengths = std::vector<ck::index_t>{3, 3, 3};
+TEST_F(TestConvUtil, ConvParamsGetOutputSpatialLengths3D)
-    conv_params.input_spatial_lengths  = std::vector<ck::index_t>{71, 71, 71};
+{
-    conv_params.conv_filter_strides    = std::vector<ck::index_t>{2, 2, 2};
+    SetNDParams(3);
-    conv_params.conv_filter_dilations  = std::vector<ck::index_t>{1, 1, 1};
-    conv_params.input_left_pads        = std::vector<ck::index_t>{1, 1, 1};
+    std::vector<ck::index_t> out_spatial_len = conv_params.GetOutputSpatialLengths();
-    conv_params.input_right_pads       = std::vector<ck::index_t>{1, 1, 1};
+    EXPECT_TRUE(ck::utils::check_err(
+        out_spatial_len, std::vector<ck::index_t>{36, 36, 36}, "Error: ConvParams 3D."));
-    out_spatial_len = conv_params.GetOutputSpatialLengths();
-    res             = ck::utils::check_err(
-        out_spatial_len, std::vector<ck::index_t>{36, 36, 36}, "Error: ConvParams 3D.");
    conv_params.conv_filter_strides = std::vector<ck::index_t>{1, 1, 1};
    out_spatial_len                 = conv_params.GetOutputSpatialLengths();
-    res                             = ck::utils::check_err(out_spatial_len,
+    EXPECT_TRUE(ck::utils::check_err(out_spatial_len,
-                               std::vector<ck::index_t>{71, 71, 71},
+                                     std::vector<ck::index_t>{71, 71, 71},
-                               "Error: ConvParams 3D stride {1, 1, 1}.");
+                                     "Error: ConvParams 3D stride {1, 1, 1}."));
    conv_params.conv_filter_strides = std::vector<ck::index_t>{2, 2, 2};
    conv_params.input_left_pads     = std::vector<ck::index_t>{2, 2, 2};
    conv_params.input_right_pads    = std::vector<ck::index_t>{2, 2, 2};
    out_spatial_len                 = conv_params.GetOutputSpatialLengths();
-    res                             = ck::utils::check_err(out_spatial_len,
+    EXPECT_TRUE(ck::utils::check_err(out_spatial_len,
-                               std::vector<ck::index_t>{37, 37, 37},
+                                     std::vector<ck::index_t>{37, 37, 37},
-                               "Error: ConvParams 3D padding left/right {2, 2, 2}.");
+                                     "Error: ConvParams 3D padding left/right {2, 2, 2}."));
    conv_params.conv_filter_dilations = std::vector<ck::index_t>{2, 2, 2};
    out_spatial_len                   = conv_params.GetOutputSpatialLengths();
-    res                               = ck::utils::check_err(out_spatial_len,
+    EXPECT_TRUE(ck::utils::check_err(out_spatial_len,
-                               std::vector<ck::index_t>{36, 36, 36},
+                                     std::vector<ck::index_t>{36, 36, 36},
-                               "Error: ConvParams 3D dilation {2, 2, 2}.");
+                                     "Error: ConvParams 3D dilation {2, 2, 2}."));
    conv_params.conv_filter_strides   = std::vector<ck::index_t>{3, 3, 3};
    conv_params.input_left_pads       = std::vector<ck::index_t>{1, 1, 1};
    conv_params.input_right_pads      = std::vector<ck::index_t>{1, 1, 1};
    conv_params.conv_filter_dilations = std::vector<ck::index_t>{2, 2, 2};
    out_spatial_len                   = conv_params.GetOutputSpatialLengths();
-    res                               = ck::utils::check_err(
+    EXPECT_TRUE(ck::utils::check_err(
        out_spatial_len,
        std::vector<ck::index_t>{23, 23, 23},
-        "Error: ConvParams 3D strides{3, 3, 3}, padding {1, 1, 1}, dilations {2, 2, 2}.");
+        "Error: ConvParams 3D strides{3, 3, 3}, padding {1, 1, 1}, dilations {2, 2, 2}."));
-    return res;
 }
-bool test_get_host_tensor_descriptor()
+TEST(ConvUtil, GetHostTensorDescriptor)
 {
-    bool res{true};
    namespace tl = ck::tensor_layout::convolution;
    std::vector<std::size_t> dims{2, 3, 4, 5};
    HostTensorDescriptor h = ck::utils::conv::get_host_tensor_descriptor(dims, tl::NHWC{});
-    res =
+    EXPECT_TRUE(ck::utils::check_err(
-        ck::utils::check_err(h.GetLengths(), {2, 3, 4, 5}, "Error: wrong NHWC dimensions lengths!");
+        h.GetLengths(), {2, 3, 4, 5}, "Error: wrong NHWC dimensions lengths!"));
-    res = ck::utils::check_err(
+    EXPECT_TRUE(ck::utils::check_err(
-        h.GetStrides(), {3 * 4 * 5, 1, 3 * 5, 3}, "Error: wrong NHWC dimensions strides!");
+        h.GetStrides(), {3 * 4 * 5, 1, 3 * 5, 3}, "Error: wrong NHWC dimensions strides!"));
    h = ck::utils::conv::get_host_tensor_descriptor(dims, tl::NCHW{});
-    res =
+    EXPECT_TRUE(ck::utils::check_err(
-        ck::utils::check_err(h.GetLengths(), {2, 3, 4, 5}, "Error: wrong NCHW dimensions lengths!");
+        h.GetLengths(), {2, 3, 4, 5}, "Error: wrong NCHW dimensions lengths!"));
-    res = ck::utils::check_err(
+    EXPECT_TRUE(ck::utils::check_err(
-        h.GetStrides(), {3 * 4 * 5, 4 * 5, 5, 1}, "Error: wrong NCHW dimensions strides!");
+        h.GetStrides(), {3 * 4 * 5, 4 * 5, 5, 1}, "Error: wrong NCHW dimensions strides!"));
    dims = std::vector<std::size_t>{2, 3, 4};
    h    = ck::utils::conv::get_host_tensor_descriptor(dims, tl::NWC{});
-    res  = ck::utils::check_err(h.GetLengths(), {2, 3, 4}, "Error: wrong NWC dimensions lengths!");
+    EXPECT_TRUE(
-    res =
+        ck::utils::check_err(h.GetLengths(), {2, 3, 4}, "Error: wrong NWC dimensions lengths!"));
-        ck::utils::check_err(h.GetStrides(), {3 * 4, 1, 3}, "Error: wrong NWC dimensions strides!");
+    EXPECT_TRUE(ck::utils::check_err(
+        h.GetStrides(), {3 * 4, 1, 3}, "Error: wrong NWC dimensions strides!"));
-    h   = ck::utils::conv::get_host_tensor_descriptor(dims, tl::NCW{});
+    h = ck::utils::conv::get_host_tensor_descriptor(dims, tl::NCW{});
-    res = ck::utils::check_err(h.GetLengths(), {2, 3, 4}, "Error: wrong NCW dimensions lengths!");
+    EXPECT_TRUE(
-    res =
+        ck::utils::check_err(h.GetLengths(), {2, 3, 4}, "Error: wrong NCW dimensions lengths!"));
-        ck::utils::check_err(h.GetStrides(), {3 * 4, 4, 1}, "Error: wrong NCW dimensions strides!");
+    EXPECT_TRUE(ck::utils::check_err(
+        h.GetStrides(), {3 * 4, 4, 1}, "Error: wrong NCW dimensions strides!"));
    dims = std::vector<std::size_t>{2, 3, 4, 5, 6};
    h    = ck::utils::conv::get_host_tensor_descriptor(dims, tl::NDHWC{});
-    res  = ck::utils::check_err(h.GetLengths(), dims, "Error: wrong NDHWC dimensions lengths!");
+    EXPECT_TRUE(
-    res  = ck::utils::check_err(h.GetStrides(),
+        ck::utils::check_err(h.GetLengths(), dims, "Error: wrong NDHWC dimensions lengths!"));
-                               {3 * 4 * 5 * 6, // N
+    EXPECT_TRUE(ck::utils::check_err(h.GetStrides(),
-                                1,             // C
+                                     {3 * 4 * 5 * 6, // N
-                                3 * 5 * 6,     // D
+                                      1,             // C
-                                3 * 6,         // H
+                                      3 * 5 * 6,     // D
-                                3},            // W
+                                      3 * 6,         // H
-                               "Error: wrong NDHWC dimensions strides!");
+                                      3},            // W
+                                     "Error: wrong NDHWC dimensions strides!"));
-    h   = ck::utils::conv::get_host_tensor_descriptor(dims, tl::NCDHW{});
-    res = ck::utils::check_err(h.GetLengths(), dims, "Error: wrong NCDHW dimensions lengths!");
+    h = ck::utils::conv::get_host_tensor_descriptor(dims, tl::NCDHW{});
-    res = ck::utils::check_err(h.GetStrides(),
+    EXPECT_TRUE(
-                               {3 * 4 * 5 * 6, // N
+        ck::utils::check_err(h.GetLengths(), dims, "Error: wrong NCDHW dimensions lengths!"));
-                                4 * 5 * 6,     // C
+    EXPECT_TRUE(ck::utils::check_err(h.GetStrides(),
-                                5 * 6,         // D
+                                     {3 * 4 * 5 * 6, // N
-                                6,             // H
+                                      4 * 5 * 6,     // C
-                                1},            // W
+                                      5 * 6,         // D
-                               "Error: wrong NCDHW dimensions strides!");
+                                      6,             // H
+                                      1},            // W
-    return res;
+                                     "Error: wrong NCDHW dimensions strides!"));
-}
-} // namespace
-int main(void)
-{
-    bool res = test_conv_params_get_output_spatial_lengths();
-    std::cout << "test_conv_params_get_output_spatial_lengths ..... "
-              << (res ? "SUCCESS" : "FAILURE") << std::endl;
-    res = test_get_host_tensor_descriptor();
-    std::cout << "test_get_host_tensor_descriptor ..... " << (res ? "SUCCESS" : "FAILURE")
-              << std::endl;
-    return res ? 0 : 1;
 }
--- a/test/convnd_fwd/CMakeLists.txt
+++ b/test/convnd_fwd/CMakeLists.txt
 add_custom_target(test_convnd_fwd)
-add_test_executable(test_conv1d_fwd conv1d_fwd.cpp)
+add_gtest_executable(test_conv1d_fwd conv1d_fwd.cpp)
 target_link_libraries(test_conv1d_fwd PRIVATE host_tensor device_conv1d_fwd_instance conv_fwd_util)
-target_link_libraries(test_conv1d_fwd PRIVATE )
 add_dependencies(test_convnd_fwd test_conv1d_fwd)
-add_test_executable(test_conv2d_fwd conv2d_fwd.cpp)
+add_gtest_executable(test_conv2d_fwd conv2d_fwd.cpp)
 target_link_libraries(test_conv2d_fwd PRIVATE host_tensor device_conv2d_fwd_instance conv_fwd_util)
 add_dependencies(test_convnd_fwd test_conv2d_fwd)
-add_test_executable(test_conv3d_fwd conv3d_fwd.cpp)
+add_gtest_executable(test_conv3d_fwd conv3d_fwd.cpp)
 target_link_libraries(test_conv3d_fwd PRIVATE host_tensor device_conv3d_fwd_instance conv_fwd_util)
 add_dependencies(test_convnd_fwd test_conv3d_fwd)
--- a/test/convnd_fwd/conv1d_fwd.cpp
+++ b/test/convnd_fwd/conv1d_fwd.cpp
@@ -2,6 +2,7 @@
 #include <stdexcept>
 #include <tuple>
 #include <vector>
+#include "gtest/gtest.h"
 #include "data_type.hpp"
 #include "element_wise_operation.hpp"
@@ -10,7 +11,8 @@
 namespace {
-bool test_conv1D_nwc()
+template <typename T>
+bool test_conv1d_nwc_instances(const std::vector<test::conv::DeviceConvFwdNoOpPtr>& conv_ptrs)
 {
    using namespace std::placeholders;
    using namespace ck::utils;
@@ -18,31 +20,24 @@ bool test_conv1D_nwc()
    ck::utils::conv::ConvParams params;
    params.num_dim_spatial        = 1;
-    params.N                      = 2;
-    params.K                      = 16;
-    params.C                      = 4;
    params.filter_spatial_lengths = std::vector<ck::index_t>{3};
-    params.input_spatial_lengths  = std::vector<ck::index_t>{16};
+    params.input_spatial_lengths  = std::vector<ck::index_t>{71};
-    params.conv_filter_strides    = std::vector<ck::index_t>{1};
+    params.conv_filter_strides    = std::vector<ck::index_t>{2};
    params.conv_filter_dilations  = std::vector<ck::index_t>{1};
    params.input_left_pads        = std::vector<ck::index_t>{1};
    params.input_right_pads       = std::vector<ck::index_t>{1};
-    std::vector<test::conv::DeviceConvFwdNoOpPtr> conv_ptrs;
+    conv::ConvFwdOpInstance<T, T, T, ctl::NWC, ctl::KCX, ctl::NWK> conv_instance(params);
-    test::conv::get_test_convolution_fwd_instance<1>(conv_ptrs);
-    conv::ConvFwdOpInstance<float, float, float, ctl::NWC, ctl::KCX, ctl::NWK> conv_instance(
-        params);
-    auto reference_conv_fwd_fun = std::bind(
+    auto reference_conv_fwd_fun =
-        conv::run_reference_convolution_forward<1, float, float, float>, params, _1, _2, _3);
+        std::bind(conv::run_reference_convolution_forward<1, T, T, T>, params, _1, _2, _3);
-    OpInstanceRunEngine<float, float, float> run_engine(conv_instance, reference_conv_fwd_fun);
+    OpInstanceRunEngine<T, T, T> run_engine(conv_instance, reference_conv_fwd_fun);
-    run_engine.SetAtol(1e-5);
-    run_engine.SetRtol(1e-4);
    return run_engine.Test(conv_ptrs);
 }
-template <typename T>
+} // anonymous namespace
-bool test_conv1d_nwc_instances(const std::vector<test::conv::DeviceConvFwdNoOpPtr>& conv_ptrs)
+TEST(Conv1DFwdNWC, TestConv1D)
 {
    using namespace std::placeholders;
    using namespace ck::utils;
@@ -50,65 +45,49 @@ bool test_conv1d_nwc_instances(const std::vector<test::conv::DeviceConvFwdNoOpPt
    ck::utils::conv::ConvParams params;
    params.num_dim_spatial        = 1;
+    params.N                      = 2;
+    params.K                      = 16;
+    params.C                      = 4;
    params.filter_spatial_lengths = std::vector<ck::index_t>{3};
-    params.input_spatial_lengths  = std::vector<ck::index_t>{71};
+    params.input_spatial_lengths  = std::vector<ck::index_t>{16};
-    params.conv_filter_strides    = std::vector<ck::index_t>{2};
+    params.conv_filter_strides    = std::vector<ck::index_t>{1};
    params.conv_filter_dilations  = std::vector<ck::index_t>{1};
    params.input_left_pads        = std::vector<ck::index_t>{1};
    params.input_right_pads       = std::vector<ck::index_t>{1};
-    conv::ConvFwdOpInstance<T, T, T, ctl::NWC, ctl::KCX, ctl::NWK> conv_instance(params);
+    std::vector<test::conv::DeviceConvFwdNoOpPtr> conv_ptrs;
+    test::conv::get_test_convolution_fwd_instance<1>(conv_ptrs);
-    auto reference_conv_fwd_fun =
+    conv::ConvFwdOpInstance<float, float, float, ctl::NWC, ctl::KCX, ctl::NWK> conv_instance(
-        std::bind(conv::run_reference_convolution_forward<1, T, T, T>, params, _1, _2, _3);
+        params);
-    OpInstanceRunEngine<T, T, T> run_engine(conv_instance, reference_conv_fwd_fun);
-    return run_engine.Test(conv_ptrs);
-}
-bool test_conv1d_nwc_bf16_instances()
+    auto reference_conv_fwd_fun = std::bind(
-{
+        conv::run_reference_convolution_forward<1, float, float, float>, params, _1, _2, _3);
-    return test_conv1d_nwc_instances<ck::bhalf_t>(
+    OpInstanceRunEngine<float, float, float> run_engine(conv_instance, reference_conv_fwd_fun);
-        ck::utils::conv::ConvolutionFwdInstances<ck::bhalf_t, ck::bhalf_t, ck::bhalf_t>::Get<1>());
+    run_engine.SetAtol(1e-5);
+    run_engine.SetRtol(1e-4);
+    EXPECT_TRUE(run_engine.Test(conv_ptrs));
 }
-bool test_conv1d_nwc_f16_instances()
+TEST(Conv1DFwdNWC, Bf16Instances) // runs the shared 1D NWC helper over the ck::bhalf_t instance list
 {
-    return test_conv1d_nwc_instances<ck::half_t>(
+    EXPECT_TRUE(test_conv1d_nwc_instances<ck::bhalf_t>(
-        ck::utils::conv::ConvolutionFwdInstances<ck::half_t, ck::half_t, ck::half_t>::Get<1>());
+        ck::utils::conv::ConvolutionFwdInstances<ck::bhalf_t, ck::bhalf_t, ck::bhalf_t>::Get<1>()));
 }
-bool test_conv1d_nwc_f32_instances()
+TEST(Conv1DFwdNWC, F16Instances)
 {
-    return test_conv1d_nwc_instances<float>(
+    EXPECT_TRUE(test_conv1d_nwc_instances<ck::half_t>(
-        ck::utils::conv::ConvolutionFwdInstances<float, float, float>::Get<1>());
+        ck::utils::conv::ConvolutionFwdInstances<ck::half_t, ck::half_t, ck::half_t>::Get<1>()));
 }
-bool test_conv1d_nwc_int8_instances()
+TEST(Conv1DFwdNWC, F32Instances)
 {
-    return test_conv1d_nwc_instances<int8_t>(
+    EXPECT_TRUE(test_conv1d_nwc_instances<float>(
-        ck::utils::conv::ConvolutionFwdInstances<int8_t, int8_t, int8_t>::Get<1>());
+        ck::utils::conv::ConvolutionFwdInstances<float, float, float>::Get<1>()));
 }
-} // anonymous namespace
+TEST(Conv1DFwdNWC, Int8Instances)
-int main()
 {
-    bool res{true};
+    EXPECT_TRUE(test_conv1d_nwc_instances<int8_t>(
-    res = test_conv1D_nwc();
+        ck::utils::conv::ConvolutionFwdInstances<int8_t, int8_t, int8_t>::Get<1>()));
-    std::cout << "test_conv1D_nwc ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;
-    res = test_conv1d_nwc_bf16_instances();
-    std::cout << "\nTestConv1DNWCBF16Instances ..... " << (res ? "SUCCESS" : "FAILURE")
-              << std::endl;
-    res = test_conv1d_nwc_f16_instances();
-    std::cout << "\ntest_conv1d_nwc_f16_instances ..... " << (res ? "SUCCESS" : "FAILURE")
-              << std::endl;
-    res = test_conv1d_nwc_f32_instances();
-    std::cout << "\ntest_conv1d_nwc_f32_instances ..... " << (res ? "SUCCESS" : "FAILURE")
-              << std::endl;
-    res = test_conv1d_nwc_int8_instances();
-    std::cout << "\ntest_conv1d_nwc_int8_instances ..... " << (res ? "SUCCESS" : "FAILURE")
-              << std::endl;
-    return res ? 0 : 1;
 }
--- a/test/convnd_fwd/conv2d_fwd.cpp
+++ b/test/convnd_fwd/conv2d_fwd.cpp
@@ -2,6 +2,7 @@
 #include <iostream>
 #include <tuple>
 #include <vector>
+#include "gtest/gtest.h"
 #include "data_type.hpp"
 #include "element_wise_operation.hpp"
@@ -10,30 +11,6 @@
 namespace {
-bool test_conv2d_nhwc()
-{
-    using namespace std::placeholders;
-    using namespace ck::utils;
-    ck::utils::conv::ConvParams params;
-    params.N                     = 2;
-    params.K                     = 16;
-    params.C                     = 4;
-    params.input_spatial_lengths = std::vector<ck::index_t>{16, 16};
-    params.conv_filter_strides   = std::vector<ck::index_t>{1, 1};
-    std::vector<test::conv::DeviceConvFwdNoOpPtr> conv_ptrs;
-    test::conv::get_test_convolution_fwd_instance<2>(conv_ptrs);
-    conv::ConvFwdOpInstance<float, float, float> conv_instance(params);
-    auto reference_conv_fwd_fun = std::bind(
-        conv::run_reference_convolution_forward<2, float, float, float>, params, _1, _2, _3);
-    OpInstanceRunEngine<float, float, float> run_engine(conv_instance, reference_conv_fwd_fun);
-    run_engine.SetAtol(1e-5);
-    run_engine.SetRtol(1e-4);
-    return run_engine.Test(conv_ptrs);
-}
 template <typename T>
 bool test_conv2d_nhwc_instances(const std::vector<test::conv::DeviceConvFwdNoOpPtr>& conv_ptrs)
 {
@@ -57,50 +34,58 @@ bool test_conv2d_nhwc_instances(const std::vector<test::conv::DeviceConvFwdNoOpP
    return run_engine.Test(conv_ptrs);
 }
-bool test_conv2d_nhwc_bf16_instances()
+} // anonymous namespace
+TEST(Conv2DFwdNHWC, TestConv2D)
 {
-    return test_conv2d_nhwc_instances<ck::bhalf_t>(
+    using namespace std::placeholders;
-        ck::utils::conv::ConvolutionFwdInstances<ck::bhalf_t, ck::bhalf_t, ck::bhalf_t>::Get<2>());
+    using namespace ck::utils;
+    ck::utils::conv::ConvParams params;
+    params.N                     = 2;
+    params.K                     = 16;
+    params.C                     = 4;
+    params.input_spatial_lengths = std::vector<ck::index_t>{16, 16};
+    params.conv_filter_strides   = std::vector<ck::index_t>{1, 1};
+    std::vector<test::conv::DeviceConvFwdNoOpPtr> conv_ptrs;
+    test::conv::get_test_convolution_fwd_instance<2>(conv_ptrs);
+    conv::ConvFwdOpInstance<float, float, float> conv_instance(params);
+    auto reference_conv_fwd_fun = std::bind(
+        conv::run_reference_convolution_forward<2, float, float, float>, params, _1, _2, _3);
+    OpInstanceRunEngine<float, float, float> run_engine(conv_instance, reference_conv_fwd_fun);
+    run_engine.SetAtol(1e-5);
+    run_engine.SetRtol(1e-4);
+    EXPECT_TRUE(run_engine.Test(conv_ptrs));
 }
-bool test_conv2d_nhwc_f16_instances()
+TEST(Conv2DFwdNHWC, Bf16Instances)
 {
-    return test_conv2d_nhwc_instances<ck::half_t>(
+    EXPECT_TRUE(test_conv2d_nhwc_instances<ck::bhalf_t>(
-        ck::utils::conv::ConvolutionFwdInstances<ck::half_t, ck::half_t, ck::half_t>::Get<2>());
+        ck::utils::conv::ConvolutionFwdInstances<ck::bhalf_t, ck::bhalf_t, ck::bhalf_t>::Get<2>()));
 }
-bool test_conv2d_nhwc_f32_instances()
+TEST(Conv2DFwdNHWC, F16Instances)
 {
-    return test_conv2d_nhwc_instances<float>(
+    EXPECT_TRUE(test_conv2d_nhwc_instances<ck::half_t>(
-        ck::utils::conv::ConvolutionFwdInstances<float, float, float>::Get<2>());
+        ck::utils::conv::ConvolutionFwdInstances<ck::half_t, ck::half_t, ck::half_t>::Get<2>()));
 }
-bool test_conv2d_nhwc_int8_instances()
+TEST(Conv2DFwdNHWC, F32Instances) // single float-instance test; the identically-bodied "BF32Instances" duplicate is dropped
 {
-    return test_conv2d_nhwc_instances<int8_t>(
+    EXPECT_TRUE(test_conv2d_nhwc_instances<float>(
-        ck::utils::conv::ConvolutionFwdInstances<int8_t, int8_t, int8_t>::Get<2>());
+        ck::utils::conv::ConvolutionFwdInstances<float, float, float>::Get<2>()));
 }
-} // anonymous namespace
-int main()
+TEST(Conv2DFwdNHWC, Int8Instances)
 {
-    bool res{true};
+    EXPECT_TRUE(test_conv2d_nhwc_instances<int8_t>(
-    res = test_conv2d_nhwc();
+        ck::utils::conv::ConvolutionFwdInstances<int8_t, int8_t, int8_t>::Get<2>()));
-    std::cout << "test_conv2d_nhwc ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;
-    res = test_conv2d_nhwc_bf16_instances();
-    std::cout << "\ntest_conv2d_nhwc_bf16_instances ..... " << (res ? "SUCCESS" : "FAILURE")
-              << std::endl;
-    res = test_conv2d_nhwc_f16_instances();
-    std::cout << "\ntest_conv2d_nhwc_f16_instances ....." << (res ? "SUCCESS" : "FAILURE")
-              << std::endl;
-    res = test_conv2d_nhwc_f32_instances();
-    std::cout << "\ntest_conv2d_nhwc_f32_instances ..... " << (res ? "SUCCESS" : "FAILURE")
-              << std::endl;
-    res = test_conv2d_nhwc_int8_instances();
-    std::cout << "\ntest_conv2d_nhwc_int8_instances ..... " << (res ? "SUCCESS" : "FAILURE")
-              << std::endl;
-    return res ? 0 : 1;
 }
--- a/test/convnd_fwd/conv3d_fwd.cpp
+++ b/test/convnd_fwd/conv3d_fwd.cpp
@@ -3,6 +3,7 @@
 #include <stdexcept>
 #include <tuple>
 #include <vector>
+#include "gtest/gtest.h"
 #include "data_type.hpp"
 #include "element_wise_operation.hpp"
@@ -11,7 +12,34 @@
 namespace {
-bool test_conv3d_ndhwc()
+template <typename T> // T is passed for all three data-type template arguments used below
+bool test_conv3d_ndhwc_instances(const std::vector<test::conv::DeviceConvFwdNoOpPtr>& conv_ptrs) // returns the result of run_engine.Test over conv_ptrs
+{
+    using namespace std::placeholders; // _1, _2, _3 for the std::bind call below
+    using namespace ck::utils;
+    namespace ctl = ck::tensor_layout::convolution;
+    conv::ConvParams params; // members not assigned here keep their default-constructed values
+    params.N                      = 64;
+    params.num_dim_spatial        = 3;
+    params.filter_spatial_lengths = std::vector<ck::index_t>{3, 3, 2};
+    params.input_spatial_lengths  = std::vector<ck::index_t>{32, 32, 2};
+    params.conv_filter_strides    = std::vector<ck::index_t>{2, 2, 2};
+    params.conv_filter_dilations  = std::vector<ck::index_t>{1, 1, 1};
+    params.input_left_pads        = std::vector<ck::index_t>{1, 1, 1};
+    params.input_right_pads       = std::vector<ck::index_t>{1, 1, 1};
+    conv::ConvFwdOpInstance<T, T, T, ctl::NDHWC, ctl::KZYXC, ctl::NDHWK> conv_instance(params);
+    auto reference_conv_fwd_fun = // params is bound as the first argument; the remaining three stay open as placeholders
+        std::bind(conv::run_reference_convolution_forward<3, T, T, T>, params, _1, _2, _3);
+    OpInstanceRunEngine<T, T, T> run_engine(conv_instance, reference_conv_fwd_fun); // no SetAtol/SetRtol here — presumably the engine's default tolerances apply; confirm
+    return run_engine.Test(conv_ptrs);
+}
+} // anonymous namespace
+TEST(Conv3DFwdNDHWC, TestConv3D)
 {
    using namespace std::placeholders;
    using namespace ck::utils;
@@ -39,10 +67,10 @@ bool test_conv3d_ndhwc()
    OpInstanceRunEngine<float, float, float> run_engine(conv_instance, reference_conv_fwd_fun);
    run_engine.SetAtol(1e-5);
    run_engine.SetRtol(1e-4);
-    return run_engine.Test(conv_ptrs);
+    EXPECT_TRUE(run_engine.Test(conv_ptrs));
 }
-bool test_conv3d_ndhwc_2gb_input()
+TEST(Conv3DFwdNDHWC, InputOver2GB)
 {
    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
    using namespace ck::utils;
@@ -79,10 +107,10 @@ bool test_conv3d_ndhwc_2gb_input()
                                                     PassThrough{},
                                                     PassThrough{},
                                                     PassThrough{});
-    return !(conv_ptrs.back()->IsSupportedArgument(arg.get()));
+    EXPECT_FALSE(conv_ptrs.back()->IsSupportedArgument(arg.get()));
 }
-bool test_conv3d_ndhwc_2gb_filters()
+TEST(Conv3DFwdNDHWC, FiltersOver2GB)
 {
    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
    using namespace ck::utils;
@@ -119,10 +147,10 @@ bool test_conv3d_ndhwc_2gb_filters()
                                                     PassThrough{},
                                                     PassThrough{},
                                                     PassThrough{});
-    return !(conv_ptrs.back()->IsSupportedArgument(arg.get()));
+    EXPECT_FALSE(conv_ptrs.back()->IsSupportedArgument(arg.get()));
 }
-bool test_conv3d_ndhwc_2gb_output()
+TEST(Conv3DFwdNDHWC, OutputOver2GB)
 {
    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
    using namespace ck::utils;
@@ -158,88 +186,29 @@ bool test_conv3d_ndhwc_2gb_output()
                                                     PassThrough{},
                                                     PassThrough{},
                                                     PassThrough{});
-    return !(conv_ptrs.back()->IsSupportedArgument(arg.get()));
+    EXPECT_FALSE(conv_ptrs.back()->IsSupportedArgument(arg.get()));
-}
-template <typename T>
-bool test_conv3d_ndhwc_instances(const std::vector<test::conv::DeviceConvFwdNoOpPtr>& conv_ptrs)
-{
-    using namespace std::placeholders;
-    using namespace ck::utils;
-    namespace ctl = ck::tensor_layout::convolution;
-    conv::ConvParams params;
-    params.N                      = 64;
-    params.num_dim_spatial        = 3;
-    params.filter_spatial_lengths = std::vector<ck::index_t>{3, 3, 2};
-    params.input_spatial_lengths  = std::vector<ck::index_t>{32, 32, 2};
-    params.conv_filter_strides    = std::vector<ck::index_t>{2, 2, 2};
-    params.conv_filter_dilations  = std::vector<ck::index_t>{1, 1, 1};
-    params.input_left_pads        = std::vector<ck::index_t>{1, 1, 1};
-    params.input_right_pads       = std::vector<ck::index_t>{1, 1, 1};
-    conv::ConvFwdOpInstance<T, T, T, ctl::NDHWC, ctl::KZYXC, ctl::NDHWK> conv_instance(params);
-    auto reference_conv_fwd_fun =
-        std::bind(conv::run_reference_convolution_forward<3, T, T, T>, params, _1, _2, _3);
-    OpInstanceRunEngine<T, T, T> run_engine(conv_instance, reference_conv_fwd_fun);
-    return run_engine.Test(conv_ptrs);
 }
-bool test_conv3d_ndhwc_bf16_instances()
+TEST(Conv3DFwdNDHWC, Bf16Instances)
 {
-    return test_conv3d_ndhwc_instances<ck::bhalf_t>(
+    EXPECT_TRUE(test_conv3d_ndhwc_instances<ck::bhalf_t>(
-        ck::utils::conv::ConvolutionFwdInstances<ck::bhalf_t, ck::bhalf_t, ck::bhalf_t>::Get<3>());
+        ck::utils::conv::ConvolutionFwdInstances<ck::bhalf_t, ck::bhalf_t, ck::bhalf_t>::Get<3>()));
 }
-bool test_conv3d_ndhwc_f16_instances()
+TEST(Conv3DFwdNDHWC, F16Instances)
 {
-    return test_conv3d_ndhwc_instances<ck::half_t>(
+    EXPECT_TRUE(test_conv3d_ndhwc_instances<ck::half_t>(
-        ck::utils::conv::ConvolutionFwdInstances<ck::half_t, ck::half_t, ck::half_t>::Get<3>());
+        ck::utils::conv::ConvolutionFwdInstances<ck::half_t, ck::half_t, ck::half_t>::Get<3>()));
 }
-bool test_conv3d_ndhwc_f32_instances()
+TEST(Conv3DFwdNDHWC, F32Instances)
 {
-    return test_conv3d_ndhwc_instances<float>(
+    EXPECT_TRUE(test_conv3d_ndhwc_instances<float>(
-        ck::utils::conv::ConvolutionFwdInstances<float, float, float>::Get<3>());
+        ck::utils::conv::ConvolutionFwdInstances<float, float, float>::Get<3>()));
 }
-bool test_conv3d_ndhwc_int8_instances()
+TEST(Conv3DFwdNDHWC, Int8Instances)
-{
-    return test_conv3d_ndhwc_instances<int8_t>(
-        ck::utils::conv::ConvolutionFwdInstances<int8_t, int8_t, int8_t>::Get<3>());
-}
-} // anonymous namespace
-int main()
 {
-    bool res{true};
+    EXPECT_TRUE(test_conv3d_ndhwc_instances<int8_t>(
-    res = test_conv3d_ndhwc();
+        ck::utils::conv::ConvolutionFwdInstances<int8_t, int8_t, int8_t>::Get<3>()));
-    std::cout << "test_conv3d_ndhwc ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;
-    res = test_conv3d_ndhwc_2gb_input();
-    std::cout << "\ntest_conv3d_ndhwc_2gb_input ..... " << (res ? "SUCCESS" : "FAILURE")
-              << std::endl;
-    res = test_conv3d_ndhwc_2gb_filters();
-    std::cout << "\ntest_conv3d_ndhwc_2gb_filters ..... " << (res ? "SUCCESS" : "FAILURE")
-              << std::endl;
-    res = test_conv3d_ndhwc_2gb_output();
-    std::cout << "\ntest_conv3d_ndhwc_2gb_output ..... " << (res ? "SUCCESS" : "FAILURE")
-              << std::endl;
-    res = test_conv3d_ndhwc_bf16_instances();
-    std::cout << "\ntest_conv3d_ndhwc_bf16_instances ..... " << (res ? "SUCCESS" : "FAILURE")
-              << std::endl;
-    res = test_conv3d_ndhwc_f16_instances();
-    std::cout << "\ntest_conv3d_ndhwc_f16_instances ..... " << (res ? "SUCCESS" : "FAILURE")
-              << std::endl;
-    res = test_conv3d_ndhwc_f32_instances();
-    std::cout << "\ntest_conv3d_ndhwc_f32_instances ..... " << (res ? "SUCCESS" : "FAILURE")
-              << std::endl;
-    res = test_conv3d_ndhwc_int8_instances();
-    std::cout << "\ntest_conv3d_ndhwc_int8_instances ..... " << (res ? "SUCCESS" : "FAILURE")
-              << std::endl;
-    return res ? 0 : 1;
 }
--- a/test/gemm/gemm_util.hpp
+++ b/test/gemm/gemm_util.hpp
 #ifndef GEMM_UTILS_HPP
 #define GEMM_UTILS_HPP
 #include "check_err.hpp"
 #include "config.hpp"
 #include "device.hpp"
 #include "host_tensor.hpp"
 #include "host_tensor_generator.hpp"
 #include "reference_gemm.hpp"
 #include "tensor_layout.hpp"
 namespace ck {
 namespace gemm_util {
 // Bundle of GEMM problem parameters shared by the test helpers in this header:
 // the M/N/K extents, one stride per matrix (A, B, C) and the alpha/beta scalars.
 // A default-constructed instance has every extent and stride set to 1024,
 // alpha set to 1 and beta set to 0.
 struct GemmParams
 {
    // Deliberately user-provided (empty body, not `= default`): the defaults now
    // live in the member initializers below, while the type keeps the same kind
    // of default constructor it had before.
    GemmParams() {}

    // Problem extents.
    ck::index_t M = 1024;
    ck::index_t N = 1024;
    ck::index_t K = 1024;

    // Strides of the A, B and C matrices.
    ck::index_t StrideA = 1024;
    ck::index_t StrideB = 1024;
    ck::index_t StrideC = 1024;

    // Scaling factors; RunDeviceGEMM in this header does not read them.
    float alpha = 1;
    float beta  = 0;
 };
 // Executes a host-side reference GEMM.
 //
 // GemmInstance is value-initialized, asked for an invoker, and asked for an
 // argument built from the three tensors and the three element-wise operations;
 // the invoker is then run on that argument. C is handed to MakeArgument as a
 // non-const reference, so it is the tensor the reference op is expected to fill.
 //
 // GemmInstance must provide MakeInvoker() and
 // MakeArgument(A, B, C, a_op, b_op, c_op); the invoker must provide Run(argument).
 template <typename GemmInstance,
           typename ADataType,
           typename BDataType,
           typename CDataType,
           typename AElementwiseOperation,
           typename BElementwiseOperation,
           typename CElementwiseOperation>
 void RunHostGEMM(const Tensor<ADataType>& A,
                  const Tensor<BDataType>& B,
                  Tensor<CDataType>& C,
                  AElementwiseOperation a_element_op,
                  BElementwiseOperation b_element_op,
                  CElementwiseOperation c_element_op)
 {
    GemmInstance reference_op{};

    // Invoker first, argument second - same order as the calls were made before.
    auto host_invoker = reference_op.MakeInvoker();
    auto host_argument =
        reference_op.MakeArgument(A, B, C, a_element_op, b_element_op, c_element_op);

    host_invoker.Run(host_argument);
 }
 // Runs one device GEMM instance on host tensors A and B and stores the result in C.
 //
 // Steps, in order:
 //   1. create three DeviceMem buffers sized sizeof(element) * GetElementSpace()
 //      of the matching tensor descriptor;
 //   2. pass the host data of A and B to ToDevice() on their buffers;
 //   3. obtain the invoker and the argument object from gemmPtr, forwarding
 //      M/N/K and the three strides from params plus the element-wise operations;
 //   4. reject the problem if gemmPtr->IsSupportedArgument() returns false;
 //   5. run the invoker and call FromDevice() on the C buffer with C.mData.
 //
 // Throws std::runtime_error when the instance reports the argument as unsupported.
 template <typename DeviceGemmPtr_,
           typename ADataType,
           typename BDataType,
           typename CDataType,
           typename AElementwiseOperation,
           typename BElementwiseOperation,
           typename CElementwiseOperation>
 void RunDeviceGEMM(DeviceGemmPtr_& gemmPtr,
                    const ck::gemm_util::GemmParams& params,
                    const Tensor<ADataType>& A,
                    const Tensor<BDataType>& B,
                    Tensor<CDataType>& C,
                    AElementwiseOperation a_element_op,
                    BElementwiseOperation b_element_op,
                    CElementwiseOperation c_element_op)
 {
    // One device buffer per matrix, sized in bytes from the host descriptor.
    DeviceMem a_dev_mem(sizeof(ADataType) * A.mDesc.GetElementSpace());
    DeviceMem b_dev_mem(sizeof(BDataType) * B.mDesc.GetElementSpace());
    DeviceMem c_dev_mem(sizeof(CDataType) * C.mDesc.GetElementSpace());

    // Only the inputs are transferred before the run.
    a_dev_mem.ToDevice(A.mData.data());
    b_dev_mem.ToDevice(B.mData.data());

    auto device_invoker = gemmPtr->MakeInvokerPointer();

    // Typed views of the raw device buffers handed to the argument factory.
    ADataType* const a_dev_ptr = static_cast<ADataType*>(a_dev_mem.GetDeviceBuffer());
    BDataType* const b_dev_ptr = static_cast<BDataType*>(b_dev_mem.GetDeviceBuffer());
    CDataType* const c_dev_ptr = static_cast<CDataType*>(c_dev_mem.GetDeviceBuffer());

    auto device_argument = gemmPtr->MakeArgumentPointer(a_dev_ptr,
                                                        b_dev_ptr,
                                                        c_dev_ptr,
                                                        params.M,
                                                        params.N,
                                                        params.K,
                                                        params.StrideA,
                                                        params.StrideB,
                                                        params.StrideC,
                                                        a_element_op,
                                                        b_element_op,
                                                        c_element_op);

    // Bail out before launching anything the instance says it cannot handle.
    const bool is_supported = gemmPtr->IsSupportedArgument(device_argument.get());
    if(!is_supported)
    {
        throw std::runtime_error(
            "wrong! device_gemm with the specified compilation parameters does "
            "not support this GEMM problem");
    }

    device_invoker->Run(device_argument.get());

    // Bring the computed C back into the caller's host tensor.
    c_dev_mem.FromDevice(C.mData.data());
 }
 template <typename DeviceGemmPtr_,
          typename ADataType,
          typename BDataType,
          typename CDataType,
          typename ALayout,
          typename BLayout,
          typename CLayout,
          typename AElementwiseOperation,
          typename BElementwiseOperation,
          typename CElementwiseOperation>
 struct TestGemm
 {
    auto PrepareGemmTensor(const ck::gemm_util::GemmParams& params)
    {
        auto f_host_tensor_descriptor =
            [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
                if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
                {
                    return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
                                                std::vector<std::size_t>({stride, 1}));
                }
                else
                {
                    return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
                                                std::vector<std::size_t>({1, stride}));
                }
            };
        Tensor<ADataType> a_m_k(
            f_host_tensor_descriptor(params.M, params.K, params.StrideA, ALayout{}));
        Tensor<BDataType> b_k_n(
            f_host_tensor_descriptor(params.K, params.N, params.StrideB, BLayout{}));
        Tensor<CDataType> c_m_n_host_result(
            f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{}));
        Tensor<CDataType> c_m_n_device_result(
            f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{}));
-        auto f_generate_tensor_value = [](auto desc, auto type) {
+        auto f_generate_tensor_value = [](auto& tensor, auto type) {
            using dataType = decltype(type);
-            if(std::is_same<dataType, int8_t>::value)
+            tensor.GenerateTensorValue(GeneratorTensor_2<dataType>{-5, 5});
-            {
+        };
-                desc.GenerateTensorValue(GeneratorTensor_2<int8_t>{-5, 5});
-            }
+        f_generate_tensor_value(a_m_k, ADataType{});
-            else
+        f_generate_tensor_value(b_k_n, BDataType{});
-            {
-                desc.GenerateTensorValue(GeneratorTensor_3<dataType>{-0.5, 0.5});
+        return std::make_tuple(a_m_k, b_k_n, c_m_n_host_result, c_m_n_device_result);
-            }
+    }
-        };
+    auto operator()(DeviceGemmPtr_& gemmPtr)
-        f_generate_tensor_value(a_m_k, ADataType{});
+    {
-        f_generate_tensor_value(b_k_n, BDataType{});
+        std::cout << "ALayout = " << ALayout{}.name << ", BLayout = " << BLayout{}.name
+                  << ", CLayout = " << CLayout{}.name << std::endl;
-        return std::make_tuple(a_m_k, b_k_n, c_m_n_host_result, c_m_n_device_result);
+        std::cout << gemmPtr->GetTypeString() << std::endl;
-    }
+        // Arrange
-    auto operator()(DeviceGemmPtr_& gemmPtr)
+        ck::gemm_util::GemmParams params;
-    {
+        params.M       = 1024;
-        std::cout << "ALayout = " << ALayout{}.name << ", BLayout = " << BLayout{}.name
+        params.N       = 1024;
-                  << ", CLayout = " << CLayout{}.name << std::endl;
+        params.K       = 1024;
-        std::cout << gemmPtr->GetTypeString() << std::endl;
+        params.StrideA = 1024;
+        params.StrideB = 1024;
-        // Arrange
+        params.StrideC = 1024;
-        ck::gemm_util::GemmParams params;
-        params.M       = 1024;
+        auto host_tensors = PrepareGemmTensor(params);
-        params.N       = 1024;
-        params.K       = 1024;
+        const Tensor<ADataType>& a  = std::get<0>(host_tensors);
-        params.StrideA = 1024;
+        const Tensor<BDataType>& b  = std::get<1>(host_tensors);
-        params.StrideB = 1024;
+        Tensor<CDataType>& c_host   = std::get<2>(host_tensors);
-        params.StrideC = 1024;
+        Tensor<CDataType>& c_device = std::get<3>(host_tensors);
-        auto host_tensors = PrepareGemmTensor(params);
+        auto a_element_op = AElementwiseOperation{};
+        auto b_element_op = BElementwiseOperation{};
-        const Tensor<ADataType>& a  = std::get<0>(host_tensors);
+        auto c_element_op = CElementwiseOperation{};
-        const Tensor<BDataType>& b  = std::get<1>(host_tensors);
-        Tensor<CDataType>& c_host   = std::get<2>(host_tensors);
+        using ReferenceGemmInstance =
-        Tensor<CDataType>& c_device = std::get<3>(host_tensors);
+            ck::tensor_operation::host::ReferenceGemm<ADataType,
+                                                      BDataType,
-        auto a_element_op = AElementwiseOperation{};
+                                                      CDataType,
-        auto b_element_op = BElementwiseOperation{};
+                                                      AElementwiseOperation,
-        auto c_element_op = CElementwiseOperation{};
+                                                      BElementwiseOperation,
+                                                      CElementwiseOperation>;
-        using ReferenceGemmInstance =
+        ck::gemm_util::RunHostGEMM<ReferenceGemmInstance>(
-            ck::tensor_operation::host::ReferenceGemm<ADataType,
+            a, b, c_host, a_element_op, b_element_op, c_element_op);
-                                                      BDataType,
-                                                      CDataType,
+        // Act
-                                                      AElementwiseOperation,
+        ck::gemm_util::RunDeviceGEMM(
-                                                      BElementwiseOperation,
+            gemmPtr, params, a, b, c_device, a_element_op, b_element_op, c_element_op);
-                                                      CElementwiseOperation>;
-        ck::gemm_util::RunHostGEMM<ReferenceGemmInstance>(
+        // Assert
-            a, b, c_host, a_element_op, b_element_op, c_element_op);
+        bool res = false;
+        if(std::is_same<CDataType, float>::value)
-        // Act
+        {
-        ck::gemm_util::RunDeviceGEMM(
+            res = ck::utils::check_err(c_device.mData, c_host.mData);
-            gemmPtr, params, a, b, c_device, a_element_op, b_element_op, c_element_op);
+            std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
+        }
-        // Assert
+        else if(std::is_same<CDataType, ck::half_t>::value)
-        bool res = false;
+        {
-        if(std::is_same<CDataType, float>::value)
+            res = ck::utils::check_err(c_device.mData, c_host.mData);
-        {
+            std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
-            res = ck::utils::check_err(c_device.mData, c_host.mData);
+        }
-            std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
+        else if(std::is_same<CDataType, int8_t>::value)
-        }
+        {
-        else if(std::is_same<CDataType, ck::half_t>::value)
+            res = ck::utils::check_err(c_device.mData, c_host.mData);
-        {
+            std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
-            res = ck::utils::check_err(c_device.mData, c_host.mData);
+        }
-            std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
-        }
+        return res;
-        else if(std::is_same<CDataType, int8_t>::value)
+    }
-        {
+};
-            res = ck::utils::check_err(c_device.mData, c_host.mData);
-            std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
+template <typename DeviceGemmPtr_,
-        }
+          typename ALayout,
+          typename BLayout,
-        return res;
+          typename CLayout,
-    }
+          typename AElementwiseOperation,
-};
+          typename BElementwiseOperation,
+          typename CElementwiseOperation>
-template <typename DeviceGemmPtr_,
+struct TestGemmBF16
-          typename ALayout,
+{
-          typename BLayout,
+    using BF16 = ck::bhalf_t;
-          typename CLayout,
-          typename AElementwiseOperation,
+    auto PrepareGemmTensorBF16(const ck::gemm_util::GemmParams& params)
-          typename BElementwiseOperation,
+    {
-          typename CElementwiseOperation>
+        auto f_host_tensor_descriptor =
-struct TestGemmBF16
+            [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
-{
+                if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
-    using BF16 = ck::bhalf_t;
+                {
+                    return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-    auto PrepareGemmTensorBF16(const ck::gemm_util::GemmParams& params)
+                                                std::vector<std::size_t>({stride, 1}));
-    {
+                }
-        auto f_host_tensor_descriptor =
+                else
-            [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+                {
-                if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
+                    return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                {
+                                                std::vector<std::size_t>({1, stride}));
-                    return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+                }
-                                                std::vector<std::size_t>({stride, 1}));
+            };
-                }
-                else
+        // use fp32 host kernel to verify bf16 device kernel
-                {
+        Tensor<BF16> a_m_k_bf16(
-                    return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+            f_host_tensor_descriptor(params.M, params.K, params.StrideA, ALayout{}));
-                                                std::vector<std::size_t>({1, stride}));
+        Tensor<BF16> b_k_n_bf16(
-                }
+            f_host_tensor_descriptor(params.K, params.N, params.StrideB, BLayout{}));
-            };
+        Tensor<BF16> c_m_n_device_bf16(
+            f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{}));
-        // use fp32 host kernel to verify bf16 device kernel
-        Tensor<BF16> a_m_k_bf16(
+        Tensor<float> a_m_k_fp32(
            f_host_tensor_descriptor(params.M, params.K, params.StrideA, ALayout{}));
-        Tensor<BF16> b_k_n_bf16(
+        Tensor<float> b_k_n_fp32(
            f_host_tensor_descriptor(params.K, params.N, params.StrideB, BLayout{}));
-        Tensor<BF16> c_m_n_device_bf16(
+        Tensor<float> c_m_n_host_fp32(
            f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{}));
+        Tensor<float> c_m_n_device_fp32(
-        Tensor<float> a_m_k_fp32(
+            f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{}));
-            f_host_tensor_descriptor(params.M, params.K, params.StrideA, ALayout{}));
-        Tensor<float> b_k_n_fp32(
+        a_m_k_bf16.GenerateTensorValue(GeneratorTensor_3<BF16>{-0.5, 0.5});
-            f_host_tensor_descriptor(params.K, params.N, params.StrideB, BLayout{}));
+        b_k_n_bf16.GenerateTensorValue(GeneratorTensor_3<BF16>{-0.5, 0.5});
-        Tensor<float> c_m_n_host_fp32(
-            f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{}));
+        bf16_to_f32_(a_m_k_bf16, a_m_k_fp32);
-        Tensor<float> c_m_n_device_fp32(
+        bf16_to_f32_(b_k_n_bf16, b_k_n_fp32);
-            f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{}));
+        return std::make_tuple(a_m_k_bf16,
-        a_m_k_bf16.GenerateTensorValue(GeneratorTensor_3<BF16>{-0.5, 0.5});
+                               b_k_n_bf16,
-        b_k_n_bf16.GenerateTensorValue(GeneratorTensor_3<BF16>{-0.5, 0.5});
+                               c_m_n_device_bf16,
+                               a_m_k_fp32,
-        bf16_to_f32_(a_m_k_bf16, a_m_k_fp32);
+                               b_k_n_fp32,
-        bf16_to_f32_(b_k_n_bf16, b_k_n_fp32);
+                               c_m_n_host_fp32,
+                               c_m_n_device_fp32);
-        return std::make_tuple(a_m_k_bf16,
+    }
-                               b_k_n_bf16,
-                               c_m_n_device_bf16,
+    auto operator()(DeviceGemmPtr_& gemmPtr)
-                               a_m_k_fp32,
+    {
-                               b_k_n_fp32,
+        // Arrange
-                               c_m_n_host_fp32,
+        ck::gemm_util::GemmParams params;
-                               c_m_n_device_fp32);
+        params.M       = 1024;
-    }
+        params.N       = 1024;
+        params.K       = 1024;
-    auto operator()(DeviceGemmPtr_& gemmPtr)
+        params.StrideA = 1024;
-    {
+        params.StrideB = 1024;
-        // Arrange
+        params.StrideC = 1024;
-        ck::gemm_util::GemmParams params;
-        params.M       = 1024;
+        auto host_tensors            = PrepareGemmTensorBF16(params);
-        params.N       = 1024;
+        const Tensor<BF16>& a_bf16   = std::get<0>(host_tensors);
-        params.K       = 1024;
+        const Tensor<BF16>& b_bf16   = std::get<1>(host_tensors);
-        params.StrideA = 1024;
+        Tensor<BF16>& c_device_bf16  = std::get<2>(host_tensors);
-        params.StrideB = 1024;
+        Tensor<float>& a_fp32        = std::get<3>(host_tensors);
-        params.StrideC = 1024;
+        Tensor<float>& b_fp32        = std::get<4>(host_tensors);
+        Tensor<float>& c_host_fp32   = std::get<5>(host_tensors);
-        auto host_tensors            = PrepareGemmTensorBF16(params);
+        Tensor<float>& c_device_fp32 = std::get<6>(host_tensors);
-        const Tensor<BF16>& a_bf16   = std::get<0>(host_tensors);
-        const Tensor<BF16>& b_bf16   = std::get<1>(host_tensors);
+        auto a_element_op = AElementwiseOperation{};
-        Tensor<BF16>& c_device_bf16  = std::get<2>(host_tensors);
+        auto b_element_op = BElementwiseOperation{};
-        Tensor<float>& a_fp32        = std::get<3>(host_tensors);
+        auto c_element_op = CElementwiseOperation{};
-        Tensor<float>& b_fp32        = std::get<4>(host_tensors);
-        Tensor<float>& c_host_fp32   = std::get<5>(host_tensors);
+        // use fp32 host kernel to verify bf16 device kernel
-        Tensor<float>& c_device_fp32 = std::get<6>(host_tensors);
+        using ReferenceGemmInstance =
+            ck::tensor_operation::host::ReferenceGemm<float,
-        auto a_element_op = AElementwiseOperation{};
+                                                      float,
-        auto b_element_op = BElementwiseOperation{};
+                                                      float,
-        auto c_element_op = CElementwiseOperation{};
+                                                      AElementwiseOperation,
+                                                      BElementwiseOperation,
-        // use fp32 host kernel to verify bf16 device kernel
+                                                      CElementwiseOperation>;
-        using ReferenceGemmInstance =
+        ck::gemm_util::RunHostGEMM<ReferenceGemmInstance>(
-            ck::tensor_operation::host::ReferenceGemm<float,
+            a_fp32, b_fp32, c_host_fp32, a_element_op, b_element_op, c_element_op);
-                                                      float,
-                                                      float,
+        // Act
-                                                      AElementwiseOperation,
+        ck::gemm_util::RunDeviceGEMM(gemmPtr,
-                                                      BElementwiseOperation,
+                                     params,
-                                                      CElementwiseOperation>;
+                                     a_bf16,
-        ck::gemm_util::RunHostGEMM<ReferenceGemmInstance>(
+                                     b_bf16,
-            a_fp32, b_fp32, c_host_fp32, a_element_op, b_element_op, c_element_op);
+                                     c_device_bf16,
+                                     a_element_op,
-        // Act
+                                     b_element_op,
-        ck::gemm_util::RunDeviceGEMM(gemmPtr,
+                                     c_element_op);
-                                     params,
-                                     a_bf16,
+        bf16_to_f32_(c_device_bf16, c_device_fp32);
-                                     b_bf16,
-                                     c_device_bf16,
+        // Assert
-                                     a_element_op,
+        bool res = ck::utils::check_err(
-                                     b_element_op,
+            c_device_fp32.mData, c_host_fp32.mData, "Error: incorrect results!", 1e-2f, 1e-3f);
-                                     c_element_op);
+        std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
-        bf16_to_f32_(c_device_bf16, c_device_fp32);
+        return res;
+    };
-        // Assert
+};
-        bool res = ck::utils::check_err(
-            c_device_fp32.mData, c_host_fp32.mData, "Error: incorrect results!", 1e-2f, 1e-3f);
+} // namespace gemm_util
-        std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
+} // namespace ck
+#endif
-        return res;
-    };
-};
-} // namespace gemm_util
-} // namespace ck
-#endif
--- a/test/reference_conv_fwd/CMakeLists.txt
+++ b/test/reference_conv_fwd/CMakeLists.txt
-add_test_executable(test_reference_conv_fwd reference_conv_fwd.cpp)
+add_gtest_executable(test_reference_conv_fwd reference_conv_fwd.cpp)
 target_link_libraries(test_reference_conv_fwd PRIVATE host_tensor conv_fwd_util)
--- a/test/reference_conv_fwd/reference_conv_fwd.cpp
+++ b/test/reference_conv_fwd/reference_conv_fwd.cpp
@@ -4,6 +4,7 @@
 #include <numeric>
 #include <type_traits>
 #include <vector>
+#include "gtest/gtest.h"
 #include "check_err.hpp"
 #include "config.hpp"
@@ -82,13 +83,13 @@ run_reference_convolution_forward(const ck::utils::conv::ConvParams& params,
                                              OutElementOp{});
    ref_invoker.Run(ref_argument);
-    // std::cout <<"output: " << host_output.mDesc << std::endl << host_output.mData << std::endl;
    return host_output;
 }
-bool test_conv2d_nhwc()
+} // anonymous namespace
+TEST(ReferenceConvolutionFWD, Conv2DNHWC)
 {
-    bool res{true};
    ck::utils::conv::ConvParams params;
    params.N                      = 1;
    params.K                      = 1;
@@ -118,11 +119,14 @@ bool test_conv2d_nhwc()
                                472.5,
                                490.5,
                                508.5};
-    res = res && ck::utils::check_err(out_tensor.mDesc.GetLengths(),
+    EXPECT_TRUE(ck::utils::check_err(
-                                      ref_dims,
+        out_tensor.mDesc.GetLengths(), ref_dims, "Error: wrong output tensor dimensions!"));
-                                      "Error: wrong output tensor dimensions!");
+    EXPECT_TRUE(ck::utils::check_err(out_tensor.mData, ref_data, "Error: incorrect results!"));
-    res = res && ck::utils::check_err(out_tensor.mData, ref_data, "Error: incorrect results!");
+}
+TEST(ReferenceConvolutionFWD, Conv2DNHWCStridesDilationsPadding)
+{
+    ck::utils::conv::ConvParams params;
    params.N                      = 1;
    params.K                      = 2;
    params.C                      = 2;
@@ -133,25 +137,21 @@ bool test_conv2d_nhwc()
    params.input_left_pads        = std::vector<ck::index_t>{1, 1};
    params.input_right_pads       = std::vector<ck::index_t>{1, 1};
-    out_tensor = run_reference_convolution_forward<2>(params);
+    auto out_tensor                   = run_reference_convolution_forward<2>(params);
-    ref_dims   = std::vector<std::size_t>{1, 2, 5, 5};
+    std::vector<std::size_t> ref_dims = std::vector<std::size_t>{1, 2, 5, 5};
-    ref_data   = std::vector<float>{
+    std::vector<float> ref_data{
        210.,  210.,  327.,   327.,   351.,   351.,   375.,   375.,   399.,   399.,
        459.,  459.,  706.5,  706.5,  742.5,  742.5,  778.5,  778.5,  814.5,  814.5,
        747.,  747.,  1138.5, 1138.5, 1174.5, 1174.5, 1210.5, 1210.5, 1246.5, 1246.5,
        1035., 1035., 1570.5, 1570.5, 1606.5, 1606.5, 1642.5, 1642.5, 1678.5, 1678.5,
        1323., 1323., 2002.5, 2002.5, 2038.5, 2038.5, 2074.5, 2074.5, 2110.5, 2110.5};
-    res = res && ck::utils::check_err(out_tensor.mDesc.GetLengths(),
+    EXPECT_TRUE(ck::utils::check_err(
-                                      ref_dims,
+        out_tensor.mDesc.GetLengths(), ref_dims, "Error: wrong output tensor dimensions!"));
-                                      "Error: wrong output tensor dimensions!");
+    EXPECT_TRUE(ck::utils::check_err(out_tensor.mData, ref_data, "Error: incorrect results!"));
-    res = res && ck::utils::check_err(out_tensor.mData, ref_data, "Error: incorrect results!");
-    return res;
 }
-bool test_conv1d_nwc()
+TEST(ReferenceConvolutionFWD, Conv1DNWC)
 {
-    bool res{true};
    ck::utils::conv::ConvParams params;
    params.num_dim_spatial        = 1;
    params.N                      = 1;
@@ -174,11 +174,14 @@ bool test_conv1d_nwc()
                                          ck::tensor_layout::convolution::NWK>(params);
    std::vector<std::size_t> ref_dims{1, 1, 4};
    std::vector<float> ref_data{7.5, 13.5, 19.5, 25.5};
-    res = res && ck::utils::check_err(out_tensor.mDesc.GetLengths(),
+    EXPECT_TRUE(ck::utils::check_err(
-                                      ref_dims,
+        out_tensor.mDesc.GetLengths(), ref_dims, "Error: wrong output tensor dimensions!"));
-                                      "Error: wrong output tensor dimensions!");
+    EXPECT_TRUE(ck::utils::check_err(out_tensor.mData, ref_data, "Error: incorrect results!"));
-    res = res && ck::utils::check_err(out_tensor.mData, ref_data, "Error: incorrect results!");
+}
+TEST(ReferenceConvolutionFWD, Conv1DNWCStridesDilationsPadding)
+{
+    ck::utils::conv::ConvParams params;
    params.num_dim_spatial        = 1;
    params.N                      = 1;
    params.K                      = 2;
@@ -190,20 +193,24 @@ bool test_conv1d_nwc()
    params.input_left_pads        = std::vector<ck::index_t>{1};
    params.input_right_pads       = std::vector<ck::index_t>{1};
-    out_tensor = run_reference_convolution_forward<1,
+    auto out_tensor =
-                                                   float,
+        run_reference_convolution_forward<1,
-                                                   float,
+                                          float,
-                                                   float,
+                                          float,
-                                                   ck::tensor_layout::convolution::NWC,
+                                          float,
-                                                   ck::tensor_layout::convolution::KXC,
+                                          ck::tensor_layout::convolution::NWC,
-                                                   ck::tensor_layout::convolution::NWK>(params);
+                                          ck::tensor_layout::convolution::KXC,
-    ref_dims   = std::vector<std::size_t>{1, 2, 5};
+                                          ck::tensor_layout::convolution::NWK>(params);
-    ref_data   = std::vector<float>{9., 9., 19.5, 19.5, 31.5, 31.5, 43.5, 43.5, 55.5, 55.5};
+    std::vector<std::size_t> ref_dims{1, 2, 5};
-    res        = res && ck::utils::check_err(out_tensor.mDesc.GetLengths(),
+    std::vector<float> ref_data{9., 9., 19.5, 19.5, 31.5, 31.5, 43.5, 43.5, 55.5, 55.5};
-                                      ref_dims,
+    EXPECT_TRUE(ck::utils::check_err(
-                                      "Error: wrong output tensor dimensions!");
+        out_tensor.mDesc.GetLengths(), ref_dims, "Error: wrong output tensor dimensions!"));
-    res = res && ck::utils::check_err(out_tensor.mData, ref_data, "Error: incorrect results!");
+    EXPECT_TRUE(ck::utils::check_err(out_tensor.mData, ref_data, "Error: incorrect results!"));
+}
+TEST(ReferenceConvolutionFWD, Conv1DNWCSameOutputSize)
+{
+    ck::utils::conv::ConvParams params;
    params.num_dim_spatial        = 1;
    params.N                      = 2;
    params.K                      = 16;
@@ -224,8 +231,8 @@ bool test_conv1d_nwc()
                                                         ck::tensor_layout::convolution::NWK>(
        params, ck::utils::FillMonotonicSeq<float>{0.f, 0.1f});
-    ref_dims = std::vector<std::size_t>{2, 16, 16};
+    std::vector<std::size_t> ref_dims{2, 16, 16};
-    ref_data = std::vector<float>{
+    std::vector<float> ref_data{
        1.4,       1.4,       1.4,       1.4,       1.4,       1.4,       1.4,       1.4,
        1.4,       1.4,       1.4,       1.4,       1.4,       1.4,       1.4,       1.4,
        3.3,       3.3,       3.3,       3.3,       3.3,       3.3,       3.3,       3.3,
@@ -290,17 +297,13 @@ bool test_conv1d_nwc()
        72.9,      72.9,      72.9,      72.9,      72.9,      72.9,      72.9,      72.9,
        49.4,      49.4,      49.4,      49.4,      49.4,      49.4,      49.4,      49.4,
        49.4,      49.4,      49.4,      49.4,      49.4,      49.4,      49.4,      49.4};
-    res = res && ck::utils::check_err(out_tensor2.mDesc.GetLengths(),
+    EXPECT_TRUE(ck::utils::check_err(
-                                      ref_dims,
+        out_tensor2.mDesc.GetLengths(), ref_dims, "Error: wrong output tensor dimensions!"));
-                                      "Error: wrong output tensor dimensions!");
+    EXPECT_TRUE(ck::utils::check_err(out_tensor2.mData, ref_data, "Error: incorrect results!"));
-    res = res && ck::utils::check_err(out_tensor2.mData, ref_data, "Error: incorrect results!");
-    return res;
 }
-bool test_conv3d_ncdhw()
+TEST(ReferenceConvolutionFWD, Conv3DNCDHW)
 {
-    bool res{true};
    ck::utils::conv::ConvParams params;
    params.num_dim_spatial        = 3;
    params.N                      = 1;
@@ -331,12 +334,17 @@ bool test_conv3d_ncdhw()
        634.5,     637.2,     639.9,     642.60004, 650.7,     653.4,     656.10004, 658.8,
        699.3,     702.,      704.7,     707.4,     715.5,     718.2,     720.9,     723.60004,
        731.7,     734.4001,  737.10004, 739.8,     747.9001,  750.60004, 753.3,     756.};
-    res = res && ck::utils::check_err(out_tensor.mDesc.GetLengths(),
+    EXPECT_TRUE(ck::utils::check_err(out_tensor.mDesc.GetLengths(),
-                                      ref_dims,
+                                     ref_dims,
-                                      "Error [case 1]: wrong output tensor dimensions!");
+                                     "Error [case 1]: wrong output tensor dimensions!"));
-    res = res &&
+    EXPECT_TRUE(
-          ck::utils::check_err(out_tensor.mData, ref_data, "Error [case 1]: incorrect results!");
+        ck::utils::check_err(out_tensor.mData, ref_data, "Error [case 1]: incorrect results!"));
+}
+TEST(ReferenceConvolutionFWD, Conv3DNCDHWStridesDilations)
+{
+    ck::utils::conv::ConvParams params;
+    params.num_dim_spatial        = 3;
    params.N                      = 1;
    params.K                      = 2;
    params.C                      = 2;
@@ -347,16 +355,16 @@ bool test_conv3d_ncdhw()
    params.input_left_pads        = std::vector<ck::index_t>{0, 0, 0};
    params.input_right_pads       = std::vector<ck::index_t>{0, 0, 0};
-    out_tensor = run_reference_convolution_forward<3,
+    auto out_tensor = run_reference_convolution_forward<3,
-                                                   float,
+                                                        float,
-                                                   float,
+                                                        float,
-                                                   float,
+                                                        float,
-                                                   ck::tensor_layout::convolution::NCDHW,
+                                                        ck::tensor_layout::convolution::NCDHW,
-                                                   ck::tensor_layout::convolution::KCZYX,
+                                                        ck::tensor_layout::convolution::KCZYX,
-                                                   ck::tensor_layout::convolution::NKDHW>(
+                                                        ck::tensor_layout::convolution::NKDHW>(
        params, ck::utils::FillMonotonicSeq<float>{0.f, 0.1f});
-    ref_dims = std::vector<std::size_t>{1, 2, 4, 4, 4};
+    std::vector<std::size_t> ref_dims{1, 2, 4, 4, 4};
-    ref_data = std::vector<float>{
+    std::vector<float> ref_data{
        2756.7002, 2764.7998, 2772.9001, 2781.,     2853.9001, 2862.,     2870.1,    2878.2002,
        2951.1,    2959.2002, 2967.2998, 2975.4001, 3048.2998, 3056.4001, 3064.5,    3072.6,
        3923.1,    3931.2,    3939.2998, 3947.4,    4020.2998, 4028.4001, 4036.5002, 4044.5999,
@@ -373,26 +381,9 @@ bool test_conv3d_ncdhw()
        5283.9004, 5292.,     5300.0996, 5308.2,    5381.0996, 5389.2,    5397.3,    5405.4004,
        6255.9004, 6264.0005, 6272.1,    6280.2,    6353.1,    6361.2,    6369.301,  6377.4,
        6450.301,  6458.4,    6466.5,    6474.6,    6547.5,    6555.6,    6563.699,  6571.801};
-    res = res && ck::utils::check_err(out_tensor.mDesc.GetLengths(),
+    EXPECT_TRUE(ck::utils::check_err(out_tensor.mDesc.GetLengths(),
-                                      ref_dims,
+                                     ref_dims,
-                                      "Error [case 2]: wrong output tensor dimensions!");
+                                     "Error [case 2]: wrong output tensor dimensions!"));
-    res =
+    EXPECT_TRUE(ck::utils::check_err(
-        res && ck::utils::check_err(
+        out_tensor.mData, ref_data, "Error [case 2]: incorrect results!", 1e-4f, 1e-6f));
-                   out_tensor.mData, ref_data, "Error [case 2]: incorrect results!", 1e-4f, 1e-6f);
-    return res;
-}
-} // anonymous namespace
-int main(void)
-{
-    bool res{true};
-    res = test_conv2d_nhwc();
-    std::cout << "test_conv2d_nhwc ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;
-    res = test_conv1d_nwc();
-    std::cout << "TestConv1DNHWC ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;
-    res = test_conv3d_ncdhw();
-    std::cout << "test_conv3d_ncdhw ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;
-    return res ? 0 : 1;
 }