1. Rename AccDataType to ComputeDataType in normalization

2. Rename AccElementwiseOperation to YElementwiseOperation in normalization

1. Rename AccDataType to ComputeDataType in normalization
2. Rename AccElementwiseOperation to YElementwiseOperation in normalization
9d2280d6 · rocking · 1a38e362 · 9d2280d6 · 9d2280d6 · 9d2280d6
Commit 9d2280d6 authored Feb 10, 2023 by rocking
13 changed files
--- a/client_example/05_layernorm/layernorm2d.cpp
+++ b/client_example/05_layernorm/layernorm2d.cpp
@@ -12,12 +12,12 @@

 #include "ck/library/tensor_operation_instance/gpu/normalization.hpp"

-using XDataType     = ck::half_t;
-using GammaDataType = ck::half_t;
-using BetaDataType  = ck::half_t;
-using YDataType     = ck::half_t;
-using AccDataType   = float;
-using PassThrough   = ck::tensor_operation::element_wise::PassThrough;
+using XDataType       = ck::half_t;
+using GammaDataType   = ck::half_t;
+using BetaDataType    = ck::half_t;
+using YDataType       = ck::half_t;
+using ComputeDataType = float;
+using PassThrough     = ck::tensor_operation::element_wise::PassThrough;

 constexpr int Rank         = 2;
 constexpr int NumReduceDim = 1;
@@ -54,7 +54,7 @@ int main(int argc, char* argv[])
    using DeviceOp = ck::tensor_operation::device::DeviceNormalization<XDataType,
                                                                       GammaDataType,
                                                                       BetaDataType,
-                                                                       AccDataType,
+                                                                       ComputeDataType,
                                                                       YDataType,
                                                                       PassThrough,
                                                                       Rank,

--- a/example/27_layernorm/layernorm_blockwise.cpp
+++ b/example/27_layernorm/layernorm_blockwise.cpp
@@ -20,12 +20,12 @@
 #include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp"

-using XDataType     = ck::half_t;
-using GammaDataType = ck::half_t;
-using BetaDataType  = ck::half_t;
-using YDataType     = ck::half_t;
-using AccDataType   = float;
-using PassThrough   = ck::tensor_operation::element_wise::PassThrough;
+using XDataType       = ck::half_t;
+using GammaDataType   = ck::half_t;
+using BetaDataType    = ck::half_t;
+using YDataType       = ck::half_t;
+using ConputeDataType = float;
+using PassThrough     = ck::tensor_operation::element_wise::PassThrough;

 constexpr int Rank         = 2;
 constexpr int NumReduceDim = 1;
@@ -34,7 +34,7 @@ using DeviceInstance =
    ck::tensor_operation::device::DeviceNormalizationImpl<XDataType,
                                                          GammaDataType,
                                                          BetaDataType,
-                                                          AccDataType,
+                                                          ConputeDataType,
                                                          YDataType,
                                                          PassThrough,
                                                          Rank,
@@ -121,7 +121,7 @@ int main()
                                                                                 GammaDataType,
                                                                                 BetaDataType,
                                                                                 YDataType,
-                                                                                 AccDataType,
+                                                                                 ConputeDataType,
                                                                                 PassThrough,
                                                                                 Rank,
                                                                                 NumReduceDim>;

--- a/example/42_groupnorm/groupnorm_sigmoid_fp16.cpp
+++ b/example/42_groupnorm/groupnorm_sigmoid_fp16.cpp
@@ -23,11 +23,11 @@
 constexpr int Rank         = 5;
 constexpr int NumReduceDim = 3;

-using XDataType     = ck::half_t;
-using GammaDataType = ck::half_t;
-using BetaDataType  = ck::half_t;
-using YDataType     = ck::half_t;
-using AccDataType   = float;
+using XDataType       = ck::half_t;
+using GammaDataType   = ck::half_t;
+using BetaDataType    = ck::half_t;
+using YDataType       = ck::half_t;
+using ConputeDataType = float;

 struct YElementOp
 {
@@ -50,7 +50,7 @@ using DeviceInstance =
    ck::tensor_operation::device::DeviceNormalizationImpl<XDataType,
                                                          GammaDataType,
                                                          BetaDataType,
-                                                          AccDataType,
+                                                          ConputeDataType,
                                                          YDataType,
                                                          YElementOp,
                                                          Rank,
@@ -157,7 +157,7 @@ int main(int argc, char* argv[])
                                                                                 GammaDataType,
                                                                                 BetaDataType,
                                                                                 YDataType,
-                                                                                 AccDataType,
+                                                                                 ConputeDataType,
                                                                                 YElementOp>;

        ReferenceInstance ref;

--- a/include/ck/tensor_operation/gpu/device/device_normalization.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_normalization.hpp
@@ -14,9 +14,9 @@ namespace device {
 template <typename XDataType,
          typename GammaDataType,
          typename BetaDataType,
-          typename AccDataType,
+          typename ConputeDataType,
          typename YDataType,
-          typename AccElementwiseOperation,
+          typename YElementwiseOperation,
          index_t Rank,
          index_t NumReduceDim>
 struct DeviceNormalization : public BaseOperator
@@ -35,7 +35,7 @@ struct DeviceNormalization : public BaseOperator
                        void* p_y,
                        void* p_savedMean,
                        void* p_savedInvVar,
-                        AccElementwiseOperation acc_elementwise_op) = 0;
+                        YElementwiseOperation y_elementwise_op) = 0;

    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
 };
@@ -43,17 +43,17 @@ struct DeviceNormalization : public BaseOperator
 template <typename XDataType,
          typename GammaDataType,
          typename BetaDataType,
-          typename AccDataType,
+          typename ConputeDataType,
          typename YDataType,
-          typename AccElementwiseOperation,
+          typename YElementwiseOperation,
          index_t Rank,
          index_t NumReduceDim>
 using DeviceNormalizationPtr = std::unique_ptr<DeviceNormalization<XDataType,
                                                                   GammaDataType,
                                                                   BetaDataType,
-                                                                   AccDataType,
+                                                                   ConputeDataType,
                                                                   YDataType,
-                                                                   AccElementwiseOperation,
+                                                                   YElementwiseOperation,
                                                                   Rank,
                                                                   NumReduceDim>>;


--- a/include/ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp
@@ -21,20 +21,20 @@ template <typename GridwiseReduction,
          typename GammaDataType,
          typename BetaDataType,
          typename YDataType,
-          typename AccDataType,
-          typename AccElementwiseOperation,
+          typename ConputeDataType,
+          typename YElementwiseOperation,
          typename GridDesc_M_K>
 __global__ void kernel_normalization(const GridDesc_M_K x_grid_desc_m_k,
                                     const GridDesc_M_K gamma_grid_desc_m_k,
                                     const GridDesc_M_K beta_grid_desc_m_k,
                                     const GridDesc_M_K y_grid_desc_m_k,
                                     index_t num_k_block_tile_iteration,
-                                     AccDataType epsilon,
+                                     ConputeDataType epsilon,
                                     const XDataType* const __restrict__ p_x_global,
                                     const GammaDataType* const __restrict__ p_gamma_global,
                                     const BetaDataType* const __restrict__ p_beta_global,
                                     YDataType* const __restrict__ p_y_global,
-                                     const AccElementwiseOperation acc_elementwise_op)
+                                     const YElementwiseOperation y_elementwise_op)
 {
    GridwiseReduction::Run(x_grid_desc_m_k,
                           gamma_grid_desc_m_k,
@@ -46,7 +46,7 @@ __global__ void kernel_normalization(const GridDesc_M_K x_grid_desc_m_k,
                           p_gamma_global,
                           p_beta_global,
                           p_y_global,
-                           acc_elementwise_op);
+                           y_elementwise_op);
 };
 } // namespace ck

@@ -58,9 +58,9 @@ namespace device {
 template <typename XDataType,
          typename GammaDataType,
          typename BetaDataType,
-          typename AccDataType,
+          typename ConputeDataType,
          typename YDataType,
-          typename AccElementwiseOperation,
+          typename YElementwiseOperation,
          index_t Rank,
          index_t NumReduceDim,
          index_t BlockSize,
@@ -78,9 +78,9 @@ template <typename XDataType,
 struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
                                                            GammaDataType,
                                                            BetaDataType,
-                                                            AccDataType,
+                                                            ConputeDataType,
                                                            YDataType,
-                                                            AccElementwiseOperation,
+                                                            YElementwiseOperation,
                                                            Rank,
                                                            NumReduceDim>
 {
@@ -172,8 +172,8 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
                                                      GammaDataType,
                                                      BetaDataType,
                                                      YDataType,
-                                                      AccDataType,
-                                                      AccElementwiseOperation,
+                                                      ConputeDataType,
+                                                      YElementwiseOperation,
                                                      GridDesc_M_K,
                                                      BlockSize,
                                                      MThreadClusterSize,
@@ -194,8 +194,8 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
                                                      GammaDataType,
                                                      BetaDataType,
                                                      YDataType,
-                                                      AccDataType,
-                                                      AccElementwiseOperation,
+                                                      ConputeDataType,
+                                                      YElementwiseOperation,
                                                      GridDesc_M_K,
                                                      BlockSize,
                                                      MThreadClusterSize,
@@ -220,7 +220,7 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
                 const std::vector<index_t> betaStrides,
                 const std::vector<index_t> yStrides,
                 const std::vector<index_t> reduceDims,
-                 AccElementwiseOperation acc_elementwise_op,
+                 YElementwiseOperation y_elementwise_op,
                 double epsilon,
                 const XDataType* p_x,
                 const GammaDataType* p_gamma,
@@ -230,9 +230,9 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
              p_gamma_(p_gamma),
              p_beta_(p_beta),
              p_y_(p_y),
-              acc_elementwise_op_(acc_elementwise_op)
+              y_elementwise_op_(y_elementwise_op)
        {
-            epsilon_ = static_cast<AccDataType>(epsilon);
+            epsilon_ = static_cast<ConputeDataType>(epsilon);

            Lengths_      = shuffle_tensor_dimensions<Rank, NumReduceDim>(lengths, reduceDims);
            xStrides_     = shuffle_tensor_dimensions<Rank, NumReduceDim>(xStrides, reduceDims);
@@ -265,7 +265,7 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
                x_grid_desc_m_k_.GetLength(Number<1>{}) <= KThreadClusterSize * KThreadSliceSize;
        }

-        AccDataType epsilon_;
+        ConputeDataType epsilon_;

        const XDataType* p_x_;
        const GammaDataType* p_gamma_;
@@ -278,7 +278,7 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
        std::vector<index_t> betaStrides_;
        std::vector<index_t> yStrides_;

-        AccElementwiseOperation acc_elementwise_op_;
+        YElementwiseOperation y_elementwise_op_;

        int blkGroupSize_;
        int numBlockTileIteration_;
@@ -301,16 +301,16 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
                                                                GammaDataType,
                                                                BetaDataType,
                                                                YDataType,
-                                                                AccDataType,
-                                                                AccElementwiseOperation,
+                                                                ConputeDataType,
+                                                                YElementwiseOperation,
                                                                GridDesc_M_K>
                                         : kernel_normalization<GridwiseReduceLayernormGeneric,
                                                                XDataType,
                                                                GammaDataType,
                                                                BetaDataType,
                                                                YDataType,
-                                                                AccDataType,
-                                                                AccElementwiseOperation,
+                                                                ConputeDataType,
+                                                                YElementwiseOperation,
                                                                GridDesc_M_K>;

            float avg_time = 0;
@@ -329,7 +329,7 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
                                               arg.p_gamma_,
                                               arg.p_beta_,
                                               arg.p_y_,
-                                               arg.acc_elementwise_op_);
+                                               arg.y_elementwise_op_);

            return (avg_time);
        };
@@ -429,7 +429,7 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
                        void* p_y,
                        void* p_saveMean,
                        void* p_saveInvVar,
-                        AccElementwiseOperation acc_elementwise_op) override
+                        YElementwiseOperation y_elementwise_op) override
    {
        // TODO
        // Optional cache of the intermediate results (mean and InvVariance) during the
@@ -443,7 +443,7 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
                                          betaStrides,
                                          yStrides,
                                          reduceDims,
-                                          acc_elementwise_op,
+                                          y_elementwise_op,
                                          epsilon,
                                          static_cast<const XDataType*>(p_x),
                                          static_cast<const GammaDataType*>(p_gamma),

--- a/include/ck/tensor_operation/gpu/grid/gridwise_normalization_welford_variance.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_normalization_welford_variance.hpp
@@ -16,8 +16,8 @@ template <typename XDataType,
          typename GammaDataType,
          typename BetaDataType,
          typename YDataType,
-          typename AccDataType,
-          typename AccElementwiseOperation,
+          typename ComputeDataType,
+          typename YElementwiseOperation,
          typename GridDesc_M_K,
          index_t BlockSize,
          index_t MThreadClusterSize,
@@ -70,9 +70,9 @@ struct GridwiseNormalizationWelfordVariance_mk_to_mk
        decltype(make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{})));

    using ThreadwiseWelford =
-        ThreadwiseWelford<AccDataType, ThreadReduceSrcDesc_M_K, ThreadReduceDstDesc_M>;
+        ThreadwiseWelford<ComputeDataType, ThreadReduceSrcDesc_M_K, ThreadReduceDstDesc_M>;

-    using BlockwiseWelford = BlockwiseWelford<AccDataType,
+    using BlockwiseWelford = BlockwiseWelford<ComputeDataType,
                                              BlockSize,
                                              ThreadClusterLengths_M_K,
                                              ThreadClusterArrangeOrder>;
@@ -115,12 +115,12 @@ struct GridwiseNormalizationWelfordVariance_mk_to_mk
                               const GridDesc_M_K& beta_grid_desc_m_k,
                               const GridDesc_M_K& y_grid_desc_m_k,
                               index_t num_k_block_tile_iteration,
-                               AccDataType epsilon,
+                               ComputeDataType epsilon,
                               const XDataType* const __restrict__ p_x_global,
                               const GammaDataType* const __restrict__ p_gamma_global,
                               const BetaDataType* const __restrict__ p_beta_global,
                               YDataType* const __restrict__ p_y_global,
-                               const AccElementwiseOperation acc_elementwise_op)
+                               const YElementwiseOperation y_elementwise_op)
    {
        if constexpr(SweepOnce)
        {
@@ -133,7 +133,7 @@ struct GridwiseNormalizationWelfordVariance_mk_to_mk
        auto x_thread_buf = generate_tuple(
            [&](auto) {
                return StaticBuffer<AddressSpaceEnum::Vgpr,
-                                    AccDataType,
+                                    ComputeDataType,
                                    MThreadSliceSize * XSrcVectorSize,
                                    true>{};
            },
@@ -142,7 +142,7 @@ struct GridwiseNormalizationWelfordVariance_mk_to_mk
        auto gamma_thread_buf = generate_tuple(
            [&](auto) {
                return StaticBuffer<AddressSpaceEnum::Vgpr,
-                                    AccDataType,
+                                    ComputeDataType,
                                    MThreadSliceSize * GammaSrcVectorSize,
                                    true>{};
            },
@@ -151,7 +151,7 @@ struct GridwiseNormalizationWelfordVariance_mk_to_mk
        auto beta_thread_buf = generate_tuple(
            [&](auto) {
                return StaticBuffer<AddressSpaceEnum::Vgpr,
-                                    AccDataType,
+                                    ComputeDataType,
                                    MThreadSliceSize * BetaSrcVectorSize,
                                    true>{};
            },
@@ -160,14 +160,16 @@ struct GridwiseNormalizationWelfordVariance_mk_to_mk
        auto y_thread_buf = generate_tuple(
            [&](auto) {
                return StaticBuffer<AddressSpaceEnum::Vgpr,
-                                    AccDataType,
+                                    ComputeDataType,
                                    MThreadSliceSize * YDstVectorSize,
                                    true>{};
            },
            Number<ThreadBufferNumber>{});

-        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true> mean_thread_buf;
-        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true> var_thread_buf;
+        StaticBuffer<AddressSpaceEnum::Vgpr, ComputeDataType, MThreadSliceSize, true>
+            mean_thread_buf;
+        StaticBuffer<AddressSpaceEnum::Vgpr, ComputeDataType, MThreadSliceSize, true>
+            var_thread_buf;

        const index_t thread_local_id = get_thread_local_1d_id();
        const index_t block_global_id = get_block_1d_id();
@@ -179,7 +181,7 @@ struct GridwiseNormalizationWelfordVariance_mk_to_mk
        const auto thread_k_cluster_id = thread_cluster_idx[I1];

        auto threadwise_x_load = ThreadwiseTensorSliceTransfer_v2<XDataType,
-                                                                  AccDataType,
+                                                                  ComputeDataType,
                                                                  GridDesc_M_K,
                                                                  decltype(thread_buffer_desc_m_k),
                                                                  ThreadBufferLengths_M_K,
@@ -195,7 +197,7 @@ struct GridwiseNormalizationWelfordVariance_mk_to_mk

        auto threadwise_gamma_load =
            ThreadwiseTensorSliceTransfer_v2<GammaDataType,
-                                             AccDataType,
+                                             ComputeDataType,
                                             GridDesc_M_K,
                                             decltype(thread_buffer_desc_m_k),
                                             ThreadBufferLengths_M_K,
@@ -211,7 +213,7 @@ struct GridwiseNormalizationWelfordVariance_mk_to_mk

        auto threadwise_beta_load =
            ThreadwiseTensorSliceTransfer_v2<BetaDataType,
-                                             AccDataType,
+                                             ComputeDataType,
                                             GridDesc_M_K,
                                             decltype(thread_buffer_desc_m_k),
                                             ThreadBufferLengths_M_K,
@@ -226,11 +228,11 @@ struct GridwiseNormalizationWelfordVariance_mk_to_mk
                                 thread_k_cluster_id * BetaSrcVectorSize));

        auto threadwise_y_store =
-            ThreadwiseTensorSliceTransfer_v1r3<AccDataType,
+            ThreadwiseTensorSliceTransfer_v1r3<ComputeDataType,
                                               YDataType,
                                               decltype(thread_buffer_desc_m_k),
                                               GridDesc_M_K,
-                                               AccElementwiseOperation,
+                                               YElementwiseOperation,
                                               ThreadBufferLengths_M_K,
                                               ThreadBufferDimAccessOrder,
                                               YDstVectorDim,
@@ -242,7 +244,7 @@ struct GridwiseNormalizationWelfordVariance_mk_to_mk
                make_multi_index(block_global_id * M_BlockTileSize +
                                     thread_m_cluster_id * MThreadSliceSize,
                                 thread_k_cluster_id * YDstVectorSize),
-                acc_elementwise_op);
+                y_elementwise_op);

        constexpr auto thread_copy_fwd_step_m_k = make_multi_index(0, K_BlockTileStepSize);
        constexpr auto thread_copy_bwd_step_m_k =
@@ -261,8 +263,8 @@ struct GridwiseNormalizationWelfordVariance_mk_to_mk
        threadwise_welford.max_count_ = GetKPerThread(x_grid_desc_m_k, thread_k_cluster_id);

        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
-            mean_thread_buf(I) = type_convert<AccDataType>(0.0f);
-            var_thread_buf(I)  = type_convert<AccDataType>(0.0f);
+            mean_thread_buf(I) = type_convert<ComputeDataType>(0.0f);
+            var_thread_buf(I)  = type_convert<ComputeDataType>(0.0f);
        });

        // Separate sweep once and sweep twice pipeline

--- a/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f16_instance.cpp
@@ -21,7 +21,7 @@ template <typename OutElementwise, index_t Rank, index_t Reduce>
 // clang-format off
 using device_normalization_f16_instances =
    std::tuple <
-        // XDataType, GammaDataType, BetaDataType, AccDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorDim, GammaSrcVectorSize, BetaSrcVectorDim, BetaSrcVectorSize, YDstVectorSize>
+        // XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType, YElementwiseOperation, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorDim, GammaSrcVectorSize, BetaSrcVectorDim, BetaSrcVectorSize, YDstVectorSize>
        DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 128, 1, 128, 1, 1, 1, 1, 1, 1, 1, 1, 1>,   // irregular size
        DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 1, 1, 1, 1, 1, 1, 1, 1>,   // irregular size
        DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 512, 1, 512, 1, 1, 1, 1, 1, 1, 1, 1, 1>,   // irregular size

--- a/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f32_instance.cpp
@@ -19,7 +19,7 @@ using Pass = ck::tensor_operation::element_wise::PassThrough;
 template <typename OutElementwise, index_t Rank, index_t Reduce>
 using device_layernorm_f32_instances = std::tuple<
    // clang-format off
-        // XDataType, GammaDataType, BetaDataType, AccDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorSize, BetaSrcVectorSize, YDstVectorSize>
+        // XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType, YElementwiseOperation, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorDim, GammaSrcVectorSize, BetaSrcVectorDim, BetaSrcVectorSize, YDstVectorSize>
        DeviceNormalizationImpl<F32, F32, F32, F32, F32, OutElementwise, Rank, Reduce, 128, 1, 128, 1, 1, 1, 1, 1, 1, 1, 1, 1>,   // irregular size
        DeviceNormalizationImpl<F32, F32, F32, F32, F32, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 1, 1, 1, 1, 1, 1, 1, 1>,   // irregular size
        DeviceNormalizationImpl<F32, F32, F32, F32, F32, OutElementwise, Rank, Reduce, 512, 1, 512, 1, 1, 1, 1, 1, 1, 1, 1, 1>,   // irregular size

--- a/profiler/include/profiler/profile_layernorm_impl.hpp
+++ b/profiler/include/profiler/profile_layernorm_impl.hpp
@@ -19,7 +19,7 @@ namespace profiler {
 template <typename XDataType,
          typename GammaDataType,
          typename BetaDataType,
-          typename AccDataType,
+          typename ComputeDataType,
          typename YDataType,
          index_t Rank>
 bool profile_layernorm_impl(int do_verification,
@@ -86,7 +86,7 @@ bool profile_layernorm_impl(int do_verification,
    using DeviceOp = ck::tensor_operation::device::DeviceNormalization<XDataType,
                                                                       GammaDataType,
                                                                       BetaDataType,
-                                                                       AccDataType,
+                                                                       ComputeDataType,
                                                                       YDataType,
                                                                       PassThrough,
                                                                       Rank,
@@ -109,7 +109,7 @@ bool profile_layernorm_impl(int do_verification,
                                                                                 GammaDataType,
                                                                                 BetaDataType,
                                                                                 YDataType,
-                                                                                 AccDataType,
+                                                                                 ComputeDataType,
                                                                                 PassThrough,
                                                                                 Rank,
                                                                                 NumReduceDim>;

--- a/test/normalization/test_groupnorm_fp16.cpp
+++ b/test/normalization/test_groupnorm_fp16.cpp
@@ -12,11 +12,11 @@ template <typename Tuple>
 class TestGroupnorm : public ::testing::Test
 {
    protected:
-    using XDataType     = std::tuple_element_t<0, Tuple>;
-    using GammaDataType = std::tuple_element_t<1, Tuple>;
-    using BetaDataType  = std::tuple_element_t<2, Tuple>;
-    using AccDataType   = std::tuple_element_t<3, Tuple>;
-    using YDataType     = std::tuple_element_t<4, Tuple>;
+    using XDataType       = std::tuple_element_t<0, Tuple>;
+    using GammaDataType   = std::tuple_element_t<1, Tuple>;
+    using BetaDataType    = std::tuple_element_t<2, Tuple>;
+    using ComputeDataType = std::tuple_element_t<3, Tuple>;
+    using YDataType       = std::tuple_element_t<4, Tuple>;

    void Run()
    {
@@ -36,7 +36,7 @@ class TestGroupnorm : public ::testing::Test
                ck::profiler::profile_groupnorm_impl<XDataType,
                                                     GammaDataType,
                                                     BetaDataType,
-                                                     AccDataType,
+                                                     ComputeDataType,
                                                     YDataType>(true, 2, false, false, length);
            EXPECT_TRUE(success);
        }
@@ -44,7 +44,7 @@ class TestGroupnorm : public ::testing::Test
 };

 using KernelTypes = ::testing::Types<
-    // XDataType, GammaDataType, BetaDataType, AccDataType, YDataType>
+    // XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType>
    std::tuple<F16, F16, F16, F32, F16>>;

 TYPED_TEST_SUITE(TestGroupnorm, KernelTypes);

--- a/test/normalization/test_groupnorm_fp32.cpp
+++ b/test/normalization/test_groupnorm_fp32.cpp
@@ -12,11 +12,11 @@ template <typename Tuple>
 class TestGroupnorm : public ::testing::Test
 {
    protected:
-    using XDataType     = std::tuple_element_t<0, Tuple>;
-    using GammaDataType = std::tuple_element_t<1, Tuple>;
-    using BetaDataType  = std::tuple_element_t<2, Tuple>;
-    using AccDataType   = std::tuple_element_t<3, Tuple>;
-    using YDataType     = std::tuple_element_t<4, Tuple>;
+    using XDataType       = std::tuple_element_t<0, Tuple>;
+    using GammaDataType   = std::tuple_element_t<1, Tuple>;
+    using BetaDataType    = std::tuple_element_t<2, Tuple>;
+    using ComputeDataType = std::tuple_element_t<3, Tuple>;
+    using YDataType       = std::tuple_element_t<4, Tuple>;

    void Run()
    {
@@ -34,7 +34,7 @@ class TestGroupnorm : public ::testing::Test
                ck::profiler::profile_groupnorm_impl<XDataType,
                                                     GammaDataType,
                                                     BetaDataType,
-                                                     AccDataType,
+                                                     ComputeDataType,
                                                     YDataType>(true, 2, false, false, length);
            EXPECT_TRUE(success);
        }
@@ -42,7 +42,7 @@ class TestGroupnorm : public ::testing::Test
 };

 using KernelTypes = ::testing::Types<
-    // XDataType, GammaDataType, BetaDataType, AccDataType, YDataType>
+    // XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType>
    std::tuple<F32, F32, F32, F32, F32>>;

 TYPED_TEST_SUITE(TestGroupnorm, KernelTypes);

--- a/test/normalization/test_layernorm2d_fp16.cpp
+++ b/test/normalization/test_layernorm2d_fp16.cpp
@@ -12,11 +12,11 @@ template <typename Tuple>
 class TestLayernorm2d : public ::testing::Test
 {
    protected:
-    using XDataType     = std::tuple_element_t<0, Tuple>;
-    using GammaDataType = std::tuple_element_t<1, Tuple>;
-    using BetaDataType  = std::tuple_element_t<2, Tuple>;
-    using AccDataType   = std::tuple_element_t<3, Tuple>;
-    using YDataType     = std::tuple_element_t<4, Tuple>;
+    using XDataType       = std::tuple_element_t<0, Tuple>;
+    using GammaDataType   = std::tuple_element_t<1, Tuple>;
+    using BetaDataType    = std::tuple_element_t<2, Tuple>;
+    using ComputeDataType = std::tuple_element_t<3, Tuple>;
+    using YDataType       = std::tuple_element_t<4, Tuple>;

    void Run()
    {
@@ -29,7 +29,7 @@ class TestLayernorm2d : public ::testing::Test
            bool success = ck::profiler::profile_layernorm_impl<XDataType,
                                                                GammaDataType,
                                                                BetaDataType,
-                                                                AccDataType,
+                                                                ComputeDataType,
                                                                YDataType,
                                                                2>(true, 2, false, false, length);
            EXPECT_TRUE(success);
@@ -38,7 +38,7 @@ class TestLayernorm2d : public ::testing::Test
 };

 using KernelTypes = ::testing::Types<
-    // XDataType, GammaDataType, BetaDataType, AccDataType, YDataType>
+    // XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType>
    std::tuple<F16, F16, F16, F32, F16>>;

 TYPED_TEST_SUITE(TestLayernorm2d, KernelTypes);

--- a/test/normalization/test_layernorm2d_fp32.cpp
+++ b/test/normalization/test_layernorm2d_fp32.cpp
@@ -12,11 +12,11 @@ template <typename Tuple>
 class TestLayernorm2d : public ::testing::Test
 {
    protected:
-    using XDataType     = std::tuple_element_t<0, Tuple>;
-    using GammaDataType = std::tuple_element_t<1, Tuple>;
-    using BetaDataType  = std::tuple_element_t<2, Tuple>;
-    using AccDataType   = std::tuple_element_t<3, Tuple>;
-    using YDataType     = std::tuple_element_t<4, Tuple>;
+    using XDataType       = std::tuple_element_t<0, Tuple>;
+    using GammaDataType   = std::tuple_element_t<1, Tuple>;
+    using BetaDataType    = std::tuple_element_t<2, Tuple>;
+    using ComputeDataType = std::tuple_element_t<3, Tuple>;
+    using YDataType       = std::tuple_element_t<4, Tuple>;

    void Run()
    {
@@ -29,7 +29,7 @@ class TestLayernorm2d : public ::testing::Test
            bool success = ck::profiler::profile_layernorm_impl<XDataType,
                                                                GammaDataType,
                                                                BetaDataType,
-                                                                AccDataType,
+                                                                ComputeDataType,
                                                                YDataType,
                                                                2>(true, 2, false, false, length);
            EXPECT_TRUE(success);
@@ -38,7 +38,7 @@ class TestLayernorm2d : public ::testing::Test
 };

 using KernelTypes = ::testing::Types<
-    // XDataType, GammaDataType, BetaDataType, AccDataType, YDataType>
+    // XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType>
    std::tuple<F32, F32, F32, F32, F32>>;

 TYPED_TEST_SUITE(TestLayernorm2d, KernelTypes);