"...resnet50_migraphx.git" did not exist on "0ffa7cb7da5290d038921dc70efcfac027de0a6d"
Commit a76eac28 authored by rocking

[What] Remove the tuple type from the base class

[Why] External APIs depend on the base class. If the base class carried the concrete tuple type, we would need a separate class for every type.
parent f9d22b02
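
In short, the base-class interface drops the `DPtrsGlobal` template parameter and takes the reduction-output pointers through a type-erased `void* p_dxs`; each derived device operation casts that pointer back to its concrete tuple type, so a single base class (and a single external API signature) serves every tuple instantiation. A minimal sketch of the pattern, using illustrative class names rather than the actual CK declarations:

```cpp
#include <memory>

// Hypothetical stand-in for the opaque argument object a device op returns.
struct BaseArgument
{
    virtual ~BaseArgument() = default;
};

// Type-erased base interface: no template dependency on the tuple of
// reduction-output pointers, so one base class covers all instantiations.
struct DeviceGemmReduceBase
{
    virtual ~DeviceGemmReduceBase() = default;

    // p_dxs points at a caller-owned tuple of reduction-output pointers.
    virtual std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_a,
                                                              const void* p_b,
                                                              void* p_c,
                                                              void* p_dxs) = 0;
};

// The derived operation still knows the concrete tuple type and casts back,
// mirroring the `dxs_tuple = *(static_cast<DPtrsGlobal*>(p_dxs))` lines below.
template <typename DPtrsGlobal>
struct DeviceGemmReduceImpl : DeviceGemmReduceBase
{
    struct Argument : BaseArgument
    {
        Argument(const void*, const void*, void*, DPtrsGlobal) {}
    };

    std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_a,
                                                      const void* p_b,
                                                      void* p_c,
                                                      void* p_dxs) override
    {
        // Recover the strongly typed tuple from the erased pointer.
        DPtrsGlobal dxs_tuple = *(static_cast<DPtrsGlobal*>(p_dxs));
        return std::make_unique<Argument>(p_a, p_b, p_c, dxs_tuple);
    }
};
```
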
@@ -163,8 +163,7 @@ template <typename ALayout,
index_t CReduceThreadVgpr2GlobalCopySrcDstScalarPerVector_MPerBlock,
LoopScheduler LoopSched = make_default_loop_scheduler()>
struct DeviceBatchedGemmReduce_Xdl_CShuffle
: public DeviceGemmReduce<DPtrsGlobal,
AElementwiseOperation,
: public DeviceGemmReduce<AElementwiseOperation,
BElementwiseOperation,
CElementwiseOperation,
DxsInElementwiseOperation,
@@ -861,7 +860,7 @@ struct DeviceBatchedGemmReduce_Xdl_CShuffle
MakeArgumentPointer(const void* p_a,
const void* p_b,
void* p_c,
DPtrsGlobal p_dxs,
void* p_dxs,
index_t MRaw,
index_t NRaw,
index_t KRaw,
@@ -875,10 +874,11 @@ struct DeviceBatchedGemmReduce_Xdl_CShuffle
DxsReduceAccElementwiseOperation dxs_out_element_op,
index_t BatchCount) override
{
DPtrsGlobal dxs_tuple = *(static_cast<DPtrsGlobal*>(p_dxs));
return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
static_cast<const BDataType*>(p_b),
static_cast<CDataType*>(p_c),
p_dxs,
dxs_tuple,
MRaw,
NRaw,
KRaw,
......
@@ -71,8 +71,7 @@ template <typename ALayout,
index_t CReduceThreadVgpr2GlobalCopySrcDstScalarPerVector_MPerBlock,
LoopScheduler LoopSched = make_default_loop_scheduler()>
struct DeviceGemmBiasAddReduce_Xdl_CShuffle
: public DeviceGemmBiasAddReduce<DPtrsGlobal,
AElementwiseOperation,
: public DeviceGemmBiasAddReduce<AElementwiseOperation,
BElementwiseOperation,
CElementwiseOperation,
C1ElementwiseOperation,
@@ -744,7 +743,7 @@ struct DeviceGemmBiasAddReduce_Xdl_CShuffle
void* p_c,
const void* p_c0,
const void* p_c1,
DPtrsGlobal p_dxs,
void* p_dxs,
index_t MRaw,
index_t NRaw,
index_t KRaw,
@@ -760,12 +759,13 @@ struct DeviceGemmBiasAddReduce_Xdl_CShuffle
DxsReduceAccElementwiseOperation dxs_out_element_op,
index_t /* KBatch */ = 1) override
{
DPtrsGlobal dxs_tuple = *(static_cast<DPtrsGlobal*>(p_dxs));
return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
static_cast<const BDataType*>(p_b),
static_cast<CDataType*>(p_c),
static_cast<const C0DataType*>(p_c0),
static_cast<const C1DataType*>(p_c1),
p_dxs,
dxs_tuple,
MRaw,
NRaw,
KRaw,
......
@@ -6,8 +6,7 @@ namespace ck {
namespace tensor_operation {
namespace device {
template <typename DPtrsGlobal,
typename AElementwiseOperation,
template <typename AElementwiseOperation,
typename BElementwiseOperation,
typename CElementwiseOperation,
typename DxsInElementwiseOperation,
@@ -18,7 +17,7 @@ struct DeviceGemmReduce : public BaseOperator
MakeArgumentPointer(const void* p_a,
const void* p_b,
void* p_c,
DPtrsGlobal p_dxs,
void* p_dxs,
ck::index_t M,
ck::index_t N,
ck::index_t K,
@@ -35,21 +34,18 @@ struct DeviceGemmReduce : public BaseOperator
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};
template <typename DPtrsGlobal,
typename AElementwiseOperation,
template <typename AElementwiseOperation,
typename BElementwiseOperation,
typename CElementwiseOperation,
typename DxsInElementwiseOperation,
typename DxsReduceAccElementwiseOperation>
using DeviceGemmReducePtr = std::unique_ptr<DeviceGemmReduce<DPtrsGlobal,
AElementwiseOperation,
using DeviceGemmReducePtr = std::unique_ptr<DeviceGemmReduce<AElementwiseOperation,
BElementwiseOperation,
CElementwiseOperation,
DxsInElementwiseOperation,
DxsReduceAccElementwiseOperation>>;
template <typename DPtrsGlobal,
typename AElementwiseOperation,
template <typename AElementwiseOperation,
typename BElementwiseOperation,
typename CElementwiseOperation,
typename C1ElementwiseOperation,
@@ -63,7 +59,7 @@ struct DeviceGemmBiasAddReduce : public BaseOperator
void* p_c,
const void* p_c0,
const void* p_c1,
DPtrsGlobal p_dxs,
void* p_dxs,
ck::index_t M,
ck::index_t N,
ck::index_t K,
@@ -82,16 +78,14 @@ struct DeviceGemmBiasAddReduce : public BaseOperator
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};
template <typename DPtrsGlobal,
typename AElementwiseOperation,
template <typename AElementwiseOperation,
typename BElementwiseOperation,
typename CElementwiseOperation,
typename C1ElementwiseOperation,
typename DxsInElementwiseOperation,
typename DxsReduceAccElementwiseOperation>
using DeviceGemmBiasAddReducePtr =
std::unique_ptr<DeviceGemmBiasAddReduce<DPtrsGlobal,
AElementwiseOperation,
std::unique_ptr<DeviceGemmBiasAddReduce<AElementwiseOperation,
BElementwiseOperation,
CElementwiseOperation,
C1ElementwiseOperation,
......
@@ -68,8 +68,7 @@ template <typename ALayout,
index_t CReduceThreadLds2VGprCopySrcDstScalarPerVector_NPerBlock,
index_t CReduceThreadVgpr2GlobalCopySrcDstScalarPerVector_MPerBlock,
LoopScheduler LoopSched = make_default_loop_scheduler()>
struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<DPtrsGlobal,
AElementwiseOperation,
struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwiseOperation,
BElementwiseOperation,
CElementwiseOperation,
DxsInElementwiseOperation,
@@ -695,7 +694,7 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<DPtrsGlobal,
MakeArgumentPointer(const void* p_a,
const void* p_b,
void* p_c,
DPtrsGlobal p_dxs,
void* p_dxs,
index_t MRaw,
index_t NRaw,
index_t KRaw,
@@ -709,10 +708,11 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<DPtrsGlobal,
DxsReduceAccElementwiseOperation dxs_out_element_op,
index_t /* KBatch */ = 1) override
{
DPtrsGlobal dxs_tuple = *(static_cast<DPtrsGlobal*>(p_dxs));
return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
static_cast<const BDataType*>(p_b),
static_cast<CDataType*>(p_c),
p_dxs,
dxs_tuple,
MRaw,
NRaw,
KRaw,
......
@@ -62,12 +62,9 @@ using device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gkn_gmn_in
>;
void add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gkn_gmn_instances(
std::vector<DeviceGemmReducePtr<DPtrsGlobal,
PassThrough,
PassThrough,
PassThrough,
DInElementOps,
DOutElementOps>>& instances)
std::vector<
DeviceGemmReducePtr<PassThrough, PassThrough, PassThrough, DInElementOps, DOutElementOps>>&
instances)
{
add_device_operation_instances(
instances,
......
@@ -62,12 +62,9 @@ using device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_in
>;
void add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_instances(
std::vector<DeviceGemmReducePtr<DPtrsGlobal,
PassThrough,
PassThrough,
PassThrough,
DInElementOps,
DOutElementOps>>& instances)
std::vector<
DeviceGemmReducePtr<PassThrough, PassThrough, PassThrough, DInElementOps, DOutElementOps>>&
instances)
{
add_device_operation_instances(
instances,
......
@@ -62,12 +62,9 @@ using device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn_in
>;
void add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn_instances(
std::vector<DeviceGemmReducePtr<DPtrsGlobal,
PassThrough,
PassThrough,
PassThrough,
DInElementOps,
DOutElementOps>>& instances)
std::vector<
DeviceGemmReducePtr<PassThrough, PassThrough, PassThrough, DInElementOps, DOutElementOps>>&
instances)
{
add_device_operation_instances(
instances,
......
@@ -59,12 +59,9 @@ using device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gnk_gmn_in
>;
void add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gnk_gmn_instances(
std::vector<DeviceGemmReducePtr<DPtrsGlobal,
PassThrough,
PassThrough,
PassThrough,
DInElementOps,
DOutElementOps>>& instances)
std::vector<
DeviceGemmReducePtr<PassThrough, PassThrough, PassThrough, DInElementOps, DOutElementOps>>&
instances)
{
add_device_operation_instances(
instances,
......
@@ -63,8 +63,7 @@ using device_gemm_bias_add_reduce_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_km_kn
>;
void add_device_gemm_bias_add_reduce_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_km_kn_mn_instances(
std::vector<DeviceGemmBiasAddReducePtr<DPtrsGlobal,
PassThrough,
std::vector<DeviceGemmBiasAddReducePtr<PassThrough,
PassThrough,
PassThrough,
PassThrough,
......
@@ -63,8 +63,7 @@ using device_gemm_bias_add_reduce_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_km_nk
>;
void add_device_gemm_bias_add_reduce_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_km_nk_mn_instances(
std::vector<DeviceGemmBiasAddReducePtr<DPtrsGlobal,
PassThrough,
std::vector<DeviceGemmBiasAddReducePtr<PassThrough,
PassThrough,
PassThrough,
PassThrough,
......
@@ -63,8 +63,7 @@ using device_gemm_bias_add_reduce_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_mk_kn
>;
void add_device_gemm_bias_add_reduce_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_mk_kn_mn_instances(
std::vector<DeviceGemmBiasAddReducePtr<DPtrsGlobal,
PassThrough,
std::vector<DeviceGemmBiasAddReducePtr<PassThrough,
PassThrough,
PassThrough,
PassThrough,
......
@@ -60,8 +60,7 @@ using device_gemm_bias_add_reduce_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_mk_nk
>;
void add_device_gemm_bias_add_reduce_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_mk_nk_mn_instances(
std::vector<DeviceGemmBiasAddReducePtr<DPtrsGlobal,
PassThrough,
std::vector<DeviceGemmBiasAddReducePtr<PassThrough,
PassThrough,
PassThrough,
PassThrough,
......
@@ -62,12 +62,9 @@ using device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instances = s
>;
void add_device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instances(
std::vector<DeviceGemmReducePtr<DPtrsGlobal,
PassThrough,
PassThrough,
PassThrough,
DInElementOps,
DOutElementOps>>& instances)
std::vector<
DeviceGemmReducePtr<PassThrough, PassThrough, PassThrough, DInElementOps, DOutElementOps>>&
instances)
{
add_device_operation_instances(
instances, device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instances{});
......
@@ -62,12 +62,9 @@ using device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instances = s
>;
void add_device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instances(
std::vector<DeviceGemmReducePtr<DPtrsGlobal,
PassThrough,
PassThrough,
PassThrough,
DInElementOps,
DOutElementOps>>& instances)
std::vector<
DeviceGemmReducePtr<PassThrough, PassThrough, PassThrough, DInElementOps, DOutElementOps>>&
instances)
{
add_device_operation_instances(
instances, device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instances{});
......
@@ -62,12 +62,9 @@ using device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instances = s
>;
void add_device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instances(
std::vector<DeviceGemmReducePtr<DPtrsGlobal,
PassThrough,
PassThrough,
PassThrough,
DInElementOps,
DOutElementOps>>& instances)
std::vector<
DeviceGemmReducePtr<PassThrough, PassThrough, PassThrough, DInElementOps, DOutElementOps>>&
instances)
{
add_device_operation_instances(
instances, device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instances{});
......
@@ -59,12 +59,9 @@ using device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instances = s
>;
void add_device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instances(
std::vector<DeviceGemmReducePtr<DPtrsGlobal,
PassThrough,
PassThrough,
PassThrough,
DInElementOps,
DOutElementOps>>& instances)
std::vector<
DeviceGemmReducePtr<PassThrough, PassThrough, PassThrough, DInElementOps, DOutElementOps>>&
instances)
{
add_device_operation_instances(
instances, device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instances{});
......
@@ -26,7 +26,6 @@ using DInElementOps = ck::Tuple<Identity, Square>;
using DOutElementOps = ck::Tuple<Identity, Identity>;
using DeviceGemmReduceNoOpPtr = ck::tensor_operation::device::DeviceGemmReducePtr<
DPtrsGlobal,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
@@ -260,7 +259,7 @@ bool profile_batched_gemm_reduce_impl(int do_verification,
gemm_ptr->MakeArgumentPointer(static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
dxs_global,
&dxs_global,
M,
N,
K,
......
@@ -27,7 +26,6 @@ using DInElementOps = ck::Tuple<Identity, Square>;
using DOutElementOps = ck::Tuple<Div, Div>;
using DeviceGemmBiasAddReduceNoOpPtr = ck::tensor_operation::device::DeviceGemmBiasAddReducePtr<
DPtrsGlobal,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
@@ -296,7 +295,7 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
static_cast<C0DataType*>(bias_device_buf.GetDeviceBuffer()),
static_cast<C1DataType*>(c1_device_buf.GetDeviceBuffer()),
dxs_global,
&dxs_global,
M,
N,
K,
......
@@ -27,7 +26,6 @@ using DInElementOps = ck::Tuple<Identity, Square>;
using DOutElementOps = ck::Tuple<Div, Div>;
using DeviceGemmReduceNoOpPtr = ck::tensor_operation::device::DeviceGemmReducePtr<
DPtrsGlobal,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
@@ -261,7 +260,7 @@ bool profile_gemm_reduce_impl(int do_verification,
gemm_ptr->MakeArgumentPointer(static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
dxs_global,
&dxs_global,
M,
N,
K,
......
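
On the caller side (the profiler hunks above), the tuple of reduction-output device pointers is built exactly as before, but its address is now passed through the `void*` parameter, hence the change from `dxs_global` to `&dxs_global`. A small usage sketch, reusing the illustrative `DeviceGemmReduceImpl` from the sketch after the commit message and standing in `std::tuple` for `ck::Tuple`:

```cpp
#include <tuple>

int main()
{
    // Illustrative tuple of two reduction-output pointers (e.g. buffers for
    // two reductions such as a sum and a sum of squares); device allocation
    // is omitted here.
    using DPtrsGlobal = std::tuple<float*, float*>;
    float* p_d0       = nullptr;
    float* p_d1       = nullptr;
    DPtrsGlobal dxs_global{p_d0, p_d1};

    DeviceGemmReduceImpl<DPtrsGlobal> gemm_reduce;

    // The type-erased interface takes void*, so the caller passes the
    // tuple's address; the derived op dereferences it back to DPtrsGlobal.
    auto argument = gemm_reduce.MakeArgumentPointer(/*p_a=*/nullptr,
                                                    /*p_b=*/nullptr,
                                                    /*p_c=*/nullptr,
                                                    /*p_dxs=*/&dxs_global);
    (void)argument;
    return 0;
}
```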