Unverified Commit 29dcb956 authored by Illia Silin, committed by GitHub

Merge pull request #33 from ROCm/lwpck-1292

Merge from the public repo.
parents 29deceb6 cbcc844e
@@ -11,7 +11,7 @@ namespace instance {
using Swish = ck::tensor_operation::element_wise::Swish;
void add_device_normalization_fwd_rank_5_3_swish_f16_instances(
-std::vector<std::unique_ptr<DeviceNormalizationFwd<F16, F16, F16, F16, F32, Swish, 5, 3>>>&
+std::vector<std::unique_ptr<DeviceNormalizationFwd<F16, F16, F16, F16, F16, Swish, 5, 3>>>&
instances)
{
add_device_operation_instances(instances,
......
@@ -11,7 +11,7 @@ namespace instance {
using Pass = ck::tensor_operation::element_wise::PassThrough;
void add_device_normalization_fwd_rank_2_1_f16_instances(
-std::vector<std::unique_ptr<DeviceNormalizationFwd<F16, F16, F16, F16, F32, Pass, 2, 1>>>&
+std::vector<std::unique_ptr<DeviceNormalizationFwd<F16, F16, F16, F16, F16, Pass, 2, 1>>>&
instances)
{
add_device_operation_instances(instances,
......
@@ -11,7 +11,7 @@ namespace instance {
using Pass = ck::tensor_operation::element_wise::PassThrough;
void add_device_normalization_fwd_rank_4_3_f16_instances(
-std::vector<std::unique_ptr<DeviceNormalizationFwd<F16, F16, F16, F16, F32, Pass, 4, 3>>>&
+std::vector<std::unique_ptr<DeviceNormalizationFwd<F16, F16, F16, F16, F16, Pass, 4, 3>>>&
instances)
{
add_device_operation_instances(instances,
......
@@ -23,24 +23,24 @@ using device_normalization_f16_instances =
// clang-format off
std::tuple <
// XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType, SaveMeanInvStdDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorDim, GammaSrcVectorSize, BetaSrcVectorDim, BetaSrcVectorSize, YDstVectorSize, SaveMeanInvStdScalarPerVector>
-DeviceNormalizationFwdImpl<F16, F16, F16, F32, F16, F32, OutElementwise, Rank, Reduce, 128, 1, 128, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1>, // irregular size
-DeviceNormalizationFwdImpl<F16, F16, F16, F32, F16, F32, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1>, // irregular size
-DeviceNormalizationFwdImpl<F16, F16, F16, F32, F16, F32, OutElementwise, Rank, Reduce, 512, 1, 512, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1>, // irregular size
-DeviceNormalizationFwdImpl<F16, F16, F16, F32, F16, F32, OutElementwise, Rank, Reduce, 1024, 1, 1024, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1>, // irregular size
-DeviceNormalizationFwdImpl<F16, F16, F16, F32, F16, F32, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 2, 1, 2, 1, 2, 1, 2, 2, 1>, // irregular size
-DeviceNormalizationFwdImpl<F16, F16, F16, F32, F16, F32, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 4, 1, 4, 1, 4, 1, 4, 4, 1>, // irregular size
-DeviceNormalizationFwdImpl<F16, F16, F16, F32, F16, F32, OutElementwise, Rank, Reduce, 64, 1, 64, 1, 8, 1, 8, 1, 8, 1, 8, 8, 1>,
-DeviceNormalizationFwdImpl<F16, F16, F16, F32, F16, F32, OutElementwise, Rank, Reduce, 128, 1, 128, 1, 8, 1, 8, 1, 8, 1, 8, 8, 1>,
-DeviceNormalizationFwdImpl<F16, F16, F16, F32, F16, F32, OutElementwise, Rank, Reduce, 128, 1, 128, 1, 16, 1, 8, 1, 8, 1, 8, 8, 1>,
-DeviceNormalizationFwdImpl<F16, F16, F16, F32, F16, F32, OutElementwise, Rank, Reduce, 128, 1, 128, 1, 32, 1, 8, 1, 8, 1, 8, 8, 1>,
-DeviceNormalizationFwdImpl<F16, F16, F16, F32, F16, F32, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 8, 1, 8, 1, 8, 1, 8, 8, 1>,
-DeviceNormalizationFwdImpl<F16, F16, F16, F32, F16, F32, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 16, 1, 8, 1, 8, 1, 8, 8, 1>,
-DeviceNormalizationFwdImpl<F16, F16, F16, F32, F16, F32, OutElementwise, Rank, Reduce, 256, 1, 256, 2, 16, 1, 8, 1, 8, 1, 8, 8, 2>,
-DeviceNormalizationFwdImpl<F16, F16, F16, F32, F16, F32, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 32, 1, 8, 1, 8, 1, 8, 8, 1>,
-DeviceNormalizationFwdImpl<F16, F16, F16, F32, F16, F32, OutElementwise, Rank, Reduce, 512, 1, 512, 1, 8, 1, 8, 1, 8, 1, 8, 8, 1>,
-DeviceNormalizationFwdImpl<F16, F16, F16, F32, F16, F32, OutElementwise, Rank, Reduce, 512, 1, 512, 1, 16, 1, 8, 1, 8, 1, 8, 8, 1>,
-DeviceNormalizationFwdImpl<F16, F16, F16, F32, F16, F32, OutElementwise, Rank, Reduce, 1024, 1, 1024, 1, 8, 1, 8, 1, 8, 1, 8, 8, 1>,
-DeviceNormalizationFwdImpl<F16, F16, F16, F32, F16, F32, OutElementwise, Rank, Reduce, 1024, 1, 1024, 1, 16, 1, 8, 1, 8, 1, 8, 8, 1>
+DeviceNormalizationFwdImpl<F16, F16, F16, F32, F16, F16, OutElementwise, Rank, Reduce, 128, 1, 128, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1>, // irregular size
+DeviceNormalizationFwdImpl<F16, F16, F16, F32, F16, F16, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1>, // irregular size
+DeviceNormalizationFwdImpl<F16, F16, F16, F32, F16, F16, OutElementwise, Rank, Reduce, 512, 1, 512, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1>, // irregular size
+DeviceNormalizationFwdImpl<F16, F16, F16, F32, F16, F16, OutElementwise, Rank, Reduce, 1024, 1, 1024, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1>, // irregular size
+DeviceNormalizationFwdImpl<F16, F16, F16, F32, F16, F16, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 2, 1, 2, 1, 2, 1, 2, 2, 1>, // irregular size
+DeviceNormalizationFwdImpl<F16, F16, F16, F32, F16, F16, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 4, 1, 4, 1, 4, 1, 4, 4, 1>, // irregular size
+DeviceNormalizationFwdImpl<F16, F16, F16, F32, F16, F16, OutElementwise, Rank, Reduce, 64, 1, 64, 1, 8, 1, 8, 1, 8, 1, 8, 8, 1>,
+DeviceNormalizationFwdImpl<F16, F16, F16, F32, F16, F16, OutElementwise, Rank, Reduce, 128, 1, 128, 1, 8, 1, 8, 1, 8, 1, 8, 8, 1>,
+DeviceNormalizationFwdImpl<F16, F16, F16, F32, F16, F16, OutElementwise, Rank, Reduce, 128, 1, 128, 1, 16, 1, 8, 1, 8, 1, 8, 8, 1>,
+DeviceNormalizationFwdImpl<F16, F16, F16, F32, F16, F16, OutElementwise, Rank, Reduce, 128, 1, 128, 1, 32, 1, 8, 1, 8, 1, 8, 8, 1>,
+DeviceNormalizationFwdImpl<F16, F16, F16, F32, F16, F16, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 8, 1, 8, 1, 8, 1, 8, 8, 1>,
+DeviceNormalizationFwdImpl<F16, F16, F16, F32, F16, F16, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 16, 1, 8, 1, 8, 1, 8, 8, 1>,
+DeviceNormalizationFwdImpl<F16, F16, F16, F32, F16, F16, OutElementwise, Rank, Reduce, 256, 1, 256, 2, 16, 1, 8, 1, 8, 1, 8, 8, 2>,
+DeviceNormalizationFwdImpl<F16, F16, F16, F32, F16, F16, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 32, 1, 8, 1, 8, 1, 8, 8, 1>,
+DeviceNormalizationFwdImpl<F16, F16, F16, F32, F16, F16, OutElementwise, Rank, Reduce, 512, 1, 512, 1, 8, 1, 8, 1, 8, 1, 8, 8, 1>,
+DeviceNormalizationFwdImpl<F16, F16, F16, F32, F16, F16, OutElementwise, Rank, Reduce, 512, 1, 512, 1, 16, 1, 8, 1, 8, 1, 8, 8, 1>,
+DeviceNormalizationFwdImpl<F16, F16, F16, F32, F16, F16, OutElementwise, Rank, Reduce, 1024, 1, 1024, 1, 8, 1, 8, 1, 8, 1, 8, 8, 1>,
+DeviceNormalizationFwdImpl<F16, F16, F16, F32, F16, F16, OutElementwise, Rank, Reduce, 1024, 1, 1024, 1, 16, 1, 8, 1, 8, 1, 8, 8, 1>
// clang-format on
>;
@@ -49,31 +49,31 @@ using device_normalization_splitk_f16_instances =
// clang-format off
std::tuple <
// XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType, SaveMeanInvStdDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorDim, GammaSrcVectorSize, BetaSrcVectorDim, BetaSrcVectorSize, YDstVectorSize, SaveMeanInvStdScalarPerVector>
-DeviceNormalizationFwdSplitKImpl<F16, F16, F16, F32, F16, F32, OutElementwise, Rank, Reduce, 128, 1, 128, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1>, // irregular size
-DeviceNormalizationFwdSplitKImpl<F16, F16, F16, F32, F16, F32, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1>, // irregular size
-DeviceNormalizationFwdSplitKImpl<F16, F16, F16, F32, F16, F32, OutElementwise, Rank, Reduce, 512, 1, 512, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1>, // irregular size
-DeviceNormalizationFwdSplitKImpl<F16, F16, F16, F32, F16, F32, OutElementwise, Rank, Reduce, 1024, 1, 1024, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1>, // irregular size
-DeviceNormalizationFwdSplitKImpl<F16, F16, F16, F32, F16, F32, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 2, 1, 2, 1, 2, 1, 2, 2, 1>, // irregular size
-DeviceNormalizationFwdSplitKImpl<F16, F16, F16, F32, F16, F32, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 4, 1, 4, 1, 4, 1, 4, 4, 1>, // irregular size
-DeviceNormalizationFwdSplitKImpl<F16, F16, F16, F32, F16, F32, OutElementwise, Rank, Reduce, 64, 1, 64, 1, 8, 1, 8, 1, 8, 1, 8, 8, 1>,
-DeviceNormalizationFwdSplitKImpl<F16, F16, F16, F32, F16, F32, OutElementwise, Rank, Reduce, 128, 1, 128, 1, 8, 1, 8, 1, 8, 1, 8, 8, 1>,
-DeviceNormalizationFwdSplitKImpl<F16, F16, F16, F32, F16, F32, OutElementwise, Rank, Reduce, 128, 1, 128, 1, 16, 1, 8, 1, 8, 1, 8, 8, 1>,
-DeviceNormalizationFwdSplitKImpl<F16, F16, F16, F32, F16, F32, OutElementwise, Rank, Reduce, 128, 1, 128, 1, 32, 1, 8, 1, 8, 1, 8, 8, 1>,
-DeviceNormalizationFwdSplitKImpl<F16, F16, F16, F32, F16, F32, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 8, 1, 8, 1, 8, 1, 8, 8, 1>,
-DeviceNormalizationFwdSplitKImpl<F16, F16, F16, F32, F16, F32, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 16, 1, 8, 1, 8, 1, 8, 8, 1>,
-DeviceNormalizationFwdSplitKImpl<F16, F16, F16, F32, F16, F32, OutElementwise, Rank, Reduce, 256, 1, 256, 2, 16, 1, 8, 1, 8, 1, 8, 8, 2>,
-DeviceNormalizationFwdSplitKImpl<F16, F16, F16, F32, F16, F32, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 32, 1, 8, 1, 8, 1, 8, 8, 1>,
-DeviceNormalizationFwdSplitKImpl<F16, F16, F16, F32, F16, F32, OutElementwise, Rank, Reduce, 512, 1, 512, 1, 8, 1, 8, 1, 8, 1, 8, 8, 1>,
-DeviceNormalizationFwdSplitKImpl<F16, F16, F16, F32, F16, F32, OutElementwise, Rank, Reduce, 512, 1, 512, 1, 16, 1, 8, 1, 8, 1, 8, 8, 1>,
-DeviceNormalizationFwdSplitKImpl<F16, F16, F16, F32, F16, F32, OutElementwise, Rank, Reduce, 1024, 1, 1024, 1, 8, 1, 8, 1, 8, 1, 8, 8, 1>,
-DeviceNormalizationFwdSplitKImpl<F16, F16, F16, F32, F16, F32, OutElementwise, Rank, Reduce, 1024, 1, 1024, 1, 16, 1, 8, 1, 8, 1, 8, 8, 1>
+DeviceNormalizationFwdSplitKImpl<F16, F16, F16, F32, F16, F16, OutElementwise, Rank, Reduce, 128, 1, 128, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1>, // irregular size
+DeviceNormalizationFwdSplitKImpl<F16, F16, F16, F32, F16, F16, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1>, // irregular size
+DeviceNormalizationFwdSplitKImpl<F16, F16, F16, F32, F16, F16, OutElementwise, Rank, Reduce, 512, 1, 512, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1>, // irregular size
+DeviceNormalizationFwdSplitKImpl<F16, F16, F16, F32, F16, F16, OutElementwise, Rank, Reduce, 1024, 1, 1024, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1>, // irregular size
+DeviceNormalizationFwdSplitKImpl<F16, F16, F16, F32, F16, F16, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 2, 1, 2, 1, 2, 1, 2, 2, 1>, // irregular size
+DeviceNormalizationFwdSplitKImpl<F16, F16, F16, F32, F16, F16, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 4, 1, 4, 1, 4, 1, 4, 4, 1>, // irregular size
+DeviceNormalizationFwdSplitKImpl<F16, F16, F16, F32, F16, F16, OutElementwise, Rank, Reduce, 64, 1, 64, 1, 8, 1, 8, 1, 8, 1, 8, 8, 1>,
+DeviceNormalizationFwdSplitKImpl<F16, F16, F16, F32, F16, F16, OutElementwise, Rank, Reduce, 128, 1, 128, 1, 8, 1, 8, 1, 8, 1, 8, 8, 1>,
+DeviceNormalizationFwdSplitKImpl<F16, F16, F16, F32, F16, F16, OutElementwise, Rank, Reduce, 128, 1, 128, 1, 16, 1, 8, 1, 8, 1, 8, 8, 1>,
+DeviceNormalizationFwdSplitKImpl<F16, F16, F16, F32, F16, F16, OutElementwise, Rank, Reduce, 128, 1, 128, 1, 32, 1, 8, 1, 8, 1, 8, 8, 1>,
+DeviceNormalizationFwdSplitKImpl<F16, F16, F16, F32, F16, F16, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 8, 1, 8, 1, 8, 1, 8, 8, 1>,
+DeviceNormalizationFwdSplitKImpl<F16, F16, F16, F32, F16, F16, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 16, 1, 8, 1, 8, 1, 8, 8, 1>,
+DeviceNormalizationFwdSplitKImpl<F16, F16, F16, F32, F16, F16, OutElementwise, Rank, Reduce, 256, 1, 256, 2, 16, 1, 8, 1, 8, 1, 8, 8, 2>,
+DeviceNormalizationFwdSplitKImpl<F16, F16, F16, F32, F16, F16, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 32, 1, 8, 1, 8, 1, 8, 8, 1>,
+DeviceNormalizationFwdSplitKImpl<F16, F16, F16, F32, F16, F16, OutElementwise, Rank, Reduce, 512, 1, 512, 1, 8, 1, 8, 1, 8, 1, 8, 8, 1>,
+DeviceNormalizationFwdSplitKImpl<F16, F16, F16, F32, F16, F16, OutElementwise, Rank, Reduce, 512, 1, 512, 1, 16, 1, 8, 1, 8, 1, 8, 8, 1>,
+DeviceNormalizationFwdSplitKImpl<F16, F16, F16, F32, F16, F16, OutElementwise, Rank, Reduce, 1024, 1, 1024, 1, 8, 1, 8, 1, 8, 1, 8, 8, 1>,
+DeviceNormalizationFwdSplitKImpl<F16, F16, F16, F32, F16, F16, OutElementwise, Rank, Reduce, 1024, 1, 1024, 1, 16, 1, 8, 1, 8, 1, 8, 8, 1>
// clang-format on
>;
template <typename OutElementwise, index_t Rank, index_t Reduce>
using device_normalization_f16_generic_instance = std::tuple<
// clang-format off
-DeviceNormalizationFwdImpl<F16, F16, F16, F32, F16, F32, OutElementwise, Rank, Reduce, 64, 1, 64, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1>
+DeviceNormalizationFwdImpl<F16, F16, F16, F32, F16, F16, OutElementwise, Rank, Reduce, 64, 1, 64, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1>
// clang-format on
>;
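A minimal caller sketch (assumed usage, not part of this commit) of what the SaveMeanInvStdDataType switch above means for consumers: the instance vector's template argument moves from F32 to F16 to match, following the add_device_normalization_fwd_* declarations earlier in this diff.

// Sketch only: the vector type must now name F16 for the saved mean/inv-std.
std::vector<std::unique_ptr<DeviceNormalizationFwd<F16, F16, F16, F16, F16, Pass, 2, 1>>>
    instances;
add_device_normalization_fwd_rank_2_1_f16_instances(instances); // fills F16-stat instances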
......
add_instance_library(device_permute_scale_instance
device_permute_scale_instances.cpp)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_scale_impl.hpp"
#include "ck/utility/data_type.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
using F16 = ck::half_t;
using F32 = float;
using Pass = ck::tensor_operation::element_wise::PassThrough;
using UnaryOp = ck::tensor_operation::element_wise::UnarySquare;
using Scale = ck::tensor_operation::element_wise::Scale;
// clang-format off
using device_permute_scale_f16_instances =
std::tuple <
DeviceElementwiseImpl<ck::Tuple<F16>, ck::Tuple<F16>, Pass, UnaryOp, Scale, 4, 1, ck::Sequence<1>, ck::Sequence<1>>,
DeviceElementwiseImpl<ck::Tuple<F16>, ck::Tuple<F16>, Pass, UnaryOp, Scale, 4, 8, ck::Sequence<1>, ck::Sequence<1>>,
DeviceElementwiseImpl<ck::Tuple<F16>, ck::Tuple<F16>, Pass, UnaryOp, Scale, 4, 4, ck::Sequence<1>, ck::Sequence<1>>,
DeviceElementwiseImpl<ck::Tuple<F16>, ck::Tuple<F16>, Pass, UnaryOp, Scale, 4, 2, ck::Sequence<1>, ck::Sequence<1>>
>;
using device_permute_scale_f32_instances = std::tuple<
DeviceElementwiseImpl<ck::Tuple<F32>, ck::Tuple<F32>, Pass, UnaryOp, Scale, 4, 1, ck::Sequence<1>, ck::Sequence<1>>,
DeviceElementwiseImpl<ck::Tuple<F32>, ck::Tuple<F32>, Pass, UnaryOp, Scale, 4, 8, ck::Sequence<1>, ck::Sequence<1>>,
DeviceElementwiseImpl<ck::Tuple<F32>, ck::Tuple<F32>, Pass, UnaryOp, Scale, 4, 4, ck::Sequence<1>, ck::Sequence<1>>,
DeviceElementwiseImpl<ck::Tuple<F32>, ck::Tuple<F32>, Pass, UnaryOp, Scale, 4, 2, ck::Sequence<1>, ck::Sequence<1>>
>;
// clang-format on
void add_device_permute_scale_f16_instances(
std::vector<std::unique_ptr<
DeviceElementwise<ck::Tuple<F16>, ck::Tuple<F16>, Pass, UnaryOp, Scale, 4>>>& instances)
{
add_device_operation_instances(instances, device_permute_scale_f16_instances{});
}
void add_device_permute_scale_f32_instances(
std::vector<std::unique_ptr<
DeviceElementwise<ck::Tuple<F32>, ck::Tuple<F32>, Pass, UnaryOp, Scale, 4>>>& instances)
{
add_device_operation_instances(instances, device_permute_scale_f32_instances{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
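A hedged usage sketch for this new instance file (the caller below is illustrative, not from this commit):

// Collect the FP16 permute-scale instances declared above.
std::vector<std::unique_ptr<
    DeviceElementwise<ck::Tuple<F16>, ck::Tuple<F16>, Pass, UnaryOp, Scale, 4>>>
    instances;
add_device_permute_scale_f16_instances(instances);
// Each entry can then be filtered with IsSupportedArgument() and launched via MakeInvokerPointer().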
@@ -22,13 +22,13 @@ using S = ck::Sequence<Is...>;
using NHWGC = ck::tensor_layout::convolution::NHWGC;
using GKYXC = ck::tensor_layout::convolution::GKYXC;
using NHWGK = ck::tensor_layout::convolution::NHWGK;
-using GK = ck::tensor_layout::convolution::G_K;
+using G_K = ck::tensor_layout::convolution::G_K;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using Relu = ck::tensor_operation::element_wise::Relu;
using TanH = ck::tensor_operation::element_wise::TanH;
-using GK_Tuple = ck::Tuple<GK>;
-using GK_GK_Tuple = ck::Tuple<GK, GK>;
+using GK_Tuple = ck::Tuple<G_K>;
+using GK_GK_Tuple = ck::Tuple<G_K, G_K>;
using I32_Tuple = ck::Tuple<int32_t>;
using F32_Tuple = ck::Tuple<float>;
using I32_F32_Tuple = ck::Tuple<int32_t, float>;
......
-set(DEVICE_SOFTMAX_INSTANCES)
-list(APPEND DEVICE_SOFTMAX_INSTANCES
+add_instance_library(device_softmax_instance
device_softmax_f16_f16_instance_rank3_reduce1.cpp
device_softmax_f16_f16_instance_rank3_reduce2.cpp
device_softmax_f16_f16_instance_rank3_reduce3.cpp
@@ -14,4 +13,3 @@ list(APPEND DEVICE_SOFTMAX_INSTANCES
device_softmax_f32_f32_instance_rank4_reduce2.cpp
device_softmax_f32_f32_instance_rank4_reduce3.cpp
device_softmax_f32_f32_instance_rank4_reduce4.cpp)
-add_instance_library(device_softmax_instance ${DEVICE_SOFTMAX_INSTANCES})
@@ -19,22 +19,14 @@ void add_device_transpose_f16_instances(
std::vector<std::unique_ptr<DeviceElementwise<ck::Tuple<F16>, ck::Tuple<F16>, PassThrough, 5>>>&
instances)
{
-#ifdef CK_ENABLE_FP16
add_device_operation_instances(instances, device_transpose_f16_instances{});
-#else
-ignore = instances;
-#endif
}
void add_device_transpose_f32_instances(
std::vector<std::unique_ptr<DeviceElementwise<ck::Tuple<F32>, ck::Tuple<F32>, PassThrough, 5>>>&
instances)
{
-#ifdef CK_ENABLE_FP32
add_device_operation_instances(instances, device_transpose_f32_instances{});
-#else
-ignore = instances;
-#endif
}
} // namespace instance
......
## utility
-set(UTILITY_SOURCE
+add_library(utility STATIC
device_memory.cpp
host_tensor.cpp
convolution_parameter.cpp
)
-add_library(utility STATIC ${UTILITY_SOURCE})
add_library(composable_kernel::utility ALIAS utility)
set_target_properties(utility PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_options(utility PRIVATE ${CMAKE_COMPILER_WARNINGS})
target_include_directories(utility PUBLIC
"$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck>"
"$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/utility>"
)
if(WIN32)
target_compile_definitions(utility PUBLIC NOMINMAX)
endif()
rocm_install(
TARGETS utility
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iomanip>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/gemm_add.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
namespace ck {
namespace profiler {
template <typename ADataType,
typename BDataType,
typename AccDataType,
typename D0DataType,
typename EDataType,
typename ALayout,
typename BLayout,
typename D0Layout,
typename ELayout>
bool profile_gemm_add_impl(int do_verification,
int init_method,
bool /*do_log*/,
bool time_kernel,
int M,
int N,
int K,
int StrideA,
int StrideB,
int StrideD0,
int StrideE)
{
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
using namespace ck::literals;
if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
Tensor<D0DataType> d0_m_n(f_host_tensor_descriptor(M, N, StrideD0, D0Layout{}));
Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "d0_m_n: " << d0_m_n.mDesc << std::endl;
std::cout << "e_m_n: " << e_m_n_device_result.mDesc << std::endl;
switch(init_method)
{
case 0: break;
case 1:
a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
d0_m_n.GenerateTensorValue(GeneratorTensor_2<D0DataType>{-5, 5});
break;
default:
a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
d0_m_n.GenerateTensorValue(GeneratorTensor_3<D0DataType>{0.0, 1.0});
}
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using Add = ck::tensor_operation::element_wise::Add;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CDEElementOp = Add;
const auto a_element_op = AElementOp{};
const auto b_element_op = BElementOp{};
const auto cde_element_op = CDEElementOp{};
using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleD<
ALayout,
BLayout,
ck::Tuple<D0Layout>,
ELayout,
ADataType,
BDataType,
ck::Tuple<D0DataType>,
EDataType,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::Add>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
// run reference
if(do_verification)
{
Tensor<AccDataType> c_m_n({M, N});
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
BDataType,
AccDataType,
AccDataType,
AElementOp,
BElementOp,
PassThrough>;
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument =
ref_gemm.MakeArgument(a_m_k, b_k_n, c_m_n, a_element_op, b_element_op, PassThrough{});
ref_invoker.Run(ref_argument);
for(int m = 0; m < M; ++m)
{
for(int n = 0; n < N; ++n)
{
cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n), d0_m_n(m, n));
}
}
}
DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem d0_m_n_device_buf(sizeof(D0DataType) * d0_m_n.mDesc.GetElementSpaceSize());
DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
a_device_buf.ToDevice(a_m_k.mData.data());
b_device_buf.ToDevice(b_k_n.mData.data());
d0_m_n_device_buf.ToDevice(d0_m_n.mData.data());
std::string best_op_name;
float best_ave_time = 0;
float best_tflops = 0;
float best_gb_per_sec = 0;
bool pass = true;
// profile device operation instances
for(auto& op_ptr : op_ptrs)
{
auto argument_ptr = op_ptr->MakeArgumentPointer(
a_device_buf.GetDeviceBuffer(),
b_device_buf.GetDeviceBuffer(),
std::array<const void*, 1>{d0_m_n_device_buf.GetDeviceBuffer()},
e_device_buf.GetDeviceBuffer(),
M,
N,
K,
StrideA,
StrideB,
std::array<ck::index_t, 1>{StrideD0},
StrideE,
a_element_op,
b_element_op,
cde_element_op);
auto invoker_ptr = op_ptr->MakeInvokerPointer();
std::string op_name = op_ptr->GetTypeString();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
// re-init E to zero before profiling a kernel
e_device_buf.SetZero();
float ave_time =
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_btype =
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
<< gb_per_sec << " GB/s, " << op_name << std::endl;
if(tflops > best_tflops)
{
best_op_name = op_name;
best_tflops = tflops;
best_ave_time = ave_time;
best_gb_per_sec = gb_per_sec;
}
if(do_verification)
{
e_device_buf.FromDevice(e_m_n_device_result.mData.data());
pass = pass && ck::utils::check_err(e_m_n_device_result, e_m_n_host_result);
}
}
else
{
std::cout << op_name << " does not support this problem" << std::endl;
}
}
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
<< best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
return pass;
}
} // namespace profiler
} // namespace ck
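An illustrative invocation of the new profiler entry point (data types, layouts, and problem sizes below are assumptions for the sketch, not values from this PR):

// Profile FP16 GEMM+Add on a 1024^3 problem with row-major operands and FP32 accumulation.
using Row = ck::tensor_layout::gemm::RowMajor;
bool ok = ck::profiler::profile_gemm_add_impl<ck::half_t, // ADataType
                                              ck::half_t, // BDataType
                                              float,      // AccDataType
                                              ck::half_t, // D0DataType
                                              ck::half_t, // EDataType
                                              Row, Row, Row, Row>(
    /*do_verification=*/1, /*init_method=*/1, /*do_log=*/false, /*time_kernel=*/true,
    /*M=*/1024, /*N=*/1024, /*K=*/1024,
    /*StrideA=*/1024, /*StrideB=*/1024, /*StrideD0=*/1024, /*StrideE=*/1024);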
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iomanip>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/gemm_add_relu.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
namespace ck {
namespace profiler {
template <typename ADataType,
typename BDataType,
typename AccDataType,
typename D0DataType,
typename EDataType,
typename ALayout,
typename BLayout,
typename D0Layout,
typename ELayout>
bool profile_gemm_add_relu_impl(int do_verification,
int init_method,
bool /*do_log*/,
bool time_kernel,
int M,
int N,
int K,
int StrideA,
int StrideB,
int StrideD0,
int StrideE)
{
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
using namespace ck::literals;
if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
Tensor<D0DataType> d0_m_n(f_host_tensor_descriptor(M, N, StrideD0, D0Layout{}));
Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "d0_m_n: " << d0_m_n.mDesc << std::endl;
std::cout << "e_m_n: " << e_m_n_device_result.mDesc << std::endl;
switch(init_method)
{
case 0: break;
case 1:
a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
d0_m_n.GenerateTensorValue(GeneratorTensor_2<D0DataType>{-5, 5});
break;
default:
a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
d0_m_n.GenerateTensorValue(GeneratorTensor_3<D0DataType>{0.0, 1.0});
}
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using AddRelu = ck::tensor_operation::element_wise::AddRelu;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CDEElementOp = AddRelu;
const auto a_element_op = AElementOp{};
const auto b_element_op = BElementOp{};
const auto cde_element_op = CDEElementOp{};
using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleD<
ALayout,
BLayout,
ck::Tuple<D0Layout>,
ELayout,
ADataType,
BDataType,
ck::Tuple<D0DataType>,
EDataType,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::AddRelu>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
// run reference
if(do_verification)
{
Tensor<AccDataType> c_m_n({M, N});
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
BDataType,
AccDataType,
AccDataType,
AElementOp,
BElementOp,
PassThrough>;
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument =
ref_gemm.MakeArgument(a_m_k, b_k_n, c_m_n, a_element_op, b_element_op, PassThrough{});
ref_invoker.Run(ref_argument);
for(int m = 0; m < M; ++m)
{
for(int n = 0; n < N; ++n)
{
cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n), d0_m_n(m, n));
}
}
}
DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem d0_m_n_device_buf(sizeof(D0DataType) * d0_m_n.mDesc.GetElementSpaceSize());
DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
a_device_buf.ToDevice(a_m_k.mData.data());
b_device_buf.ToDevice(b_k_n.mData.data());
d0_m_n_device_buf.ToDevice(d0_m_n.mData.data());
std::string best_op_name;
float best_ave_time = 0;
float best_tflops = 0;
float best_gb_per_sec = 0;
bool pass = true;
// profile device operation instances
for(auto& op_ptr : op_ptrs)
{
auto argument_ptr = op_ptr->MakeArgumentPointer(
a_device_buf.GetDeviceBuffer(),
b_device_buf.GetDeviceBuffer(),
std::array<const void*, 1>{d0_m_n_device_buf.GetDeviceBuffer()},
e_device_buf.GetDeviceBuffer(),
M,
N,
K,
StrideA,
StrideB,
std::array<ck::index_t, 1>{StrideD0},
StrideE,
a_element_op,
b_element_op,
cde_element_op);
auto invoker_ptr = op_ptr->MakeInvokerPointer();
std::string op_name = op_ptr->GetTypeString();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
// re-init E to zero before profiling a kernel
e_device_buf.SetZero();
float ave_time =
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_btype =
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
<< gb_per_sec << " GB/s, " << op_name << std::endl;
if(tflops > best_tflops)
{
best_op_name = op_name;
best_tflops = tflops;
best_ave_time = ave_time;
best_gb_per_sec = gb_per_sec;
}
if(do_verification)
{
e_device_buf.FromDevice(e_m_n_device_result.mData.data());
pass = pass && ck::utils::check_err(e_m_n_device_result, e_m_n_host_result);
}
}
else
{
std::cout << op_name << " does not support this problem" << std::endl;
}
}
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
<< best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
return pass;
}
} // namespace profiler
} // namespace ck
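For orientation, a scalar model of the CDE element-wise op this profiler exercises (the helper below is an illustrative sketch, not CK API): AddRelu combines the GEMM accumulator with D0 and clamps at zero.

#include <algorithm>
// Hypothetical per-element model of AddRelu: e = relu(c + d0).
inline float add_relu_model(float c, float d0) { return std::max(c + d0, 0.0f); }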
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iomanip>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/gemm_add_silu.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
namespace ck {
namespace profiler {
template <typename ADataType,
typename BDataType,
typename AccDataType,
typename D0DataType,
typename EDataType,
typename ALayout,
typename BLayout,
typename D0Layout,
typename ELayout>
bool profile_gemm_add_silu_impl(int do_verification,
int init_method,
bool /*do_log*/,
bool time_kernel,
int M,
int N,
int K,
int StrideA,
int StrideB,
int StrideD0,
int StrideE)
{
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
using namespace ck::literals;
if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
Tensor<D0DataType> d0_m_n(f_host_tensor_descriptor(M, N, StrideD0, D0Layout{}));
Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "d0_m_n: " << d0_m_n.mDesc << std::endl;
std::cout << "e_m_n: " << e_m_n_device_result.mDesc << std::endl;
switch(init_method)
{
case 0: break;
case 1:
a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
d0_m_n.GenerateTensorValue(GeneratorTensor_2<D0DataType>{-5, 5});
break;
default:
a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
d0_m_n.GenerateTensorValue(GeneratorTensor_3<D0DataType>{0.0, 1.0});
}
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using AddSilu = ck::tensor_operation::element_wise::AddSilu;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CDEElementOp = AddSilu;
const auto a_element_op = AElementOp{};
const auto b_element_op = BElementOp{};
const auto cde_element_op = CDEElementOp{};
using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleD<
ALayout,
BLayout,
ck::Tuple<D0Layout>,
ELayout,
ADataType,
BDataType,
ck::Tuple<D0DataType>,
EDataType,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::AddSilu>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
// run reference
if(do_verification)
{
Tensor<AccDataType> c_m_n({M, N});
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
BDataType,
AccDataType,
AccDataType,
AElementOp,
BElementOp,
PassThrough>;
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument =
ref_gemm.MakeArgument(a_m_k, b_k_n, c_m_n, a_element_op, b_element_op, PassThrough{});
ref_invoker.Run(ref_argument);
for(int m = 0; m < M; ++m)
{
for(int n = 0; n < N; ++n)
{
cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n), d0_m_n(m, n));
}
}
}
DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem d0_m_n_device_buf(sizeof(D0DataType) * d0_m_n.mDesc.GetElementSpaceSize());
DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
a_device_buf.ToDevice(a_m_k.mData.data());
b_device_buf.ToDevice(b_k_n.mData.data());
d0_m_n_device_buf.ToDevice(d0_m_n.mData.data());
std::string best_op_name;
float best_ave_time = 0;
float best_tflops = 0;
float best_gb_per_sec = 0;
bool pass = true;
// profile device operation instances
for(auto& op_ptr : op_ptrs)
{
auto argument_ptr = op_ptr->MakeArgumentPointer(
a_device_buf.GetDeviceBuffer(),
b_device_buf.GetDeviceBuffer(),
std::array<const void*, 1>{d0_m_n_device_buf.GetDeviceBuffer()},
e_device_buf.GetDeviceBuffer(),
M,
N,
K,
StrideA,
StrideB,
std::array<ck::index_t, 1>{StrideD0},
StrideE,
a_element_op,
b_element_op,
cde_element_op);
auto invoker_ptr = op_ptr->MakeInvokerPointer();
std::string op_name = op_ptr->GetTypeString();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
// re-init E to zero before profiling a kernel
e_device_buf.SetZero();
float ave_time =
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_btype =
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
<< gb_per_sec << " GB/s, " << op_name << std::endl;
if(tflops > best_tflops)
{
best_op_name = op_name;
best_tflops = tflops;
best_ave_time = ave_time;
best_gb_per_sec = gb_per_sec;
}
if(do_verification)
{
e_device_buf.FromDevice(e_m_n_device_result.mData.data());
pass = pass && ck::utils::check_err(e_m_n_device_result, e_m_n_host_result);
}
}
else
{
std::cout << op_name << " does not support this problem" << std::endl;
}
}
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
<< best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
return pass;
}
} // namespace profiler
} // namespace ck
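Likewise, a scalar model of AddSilu as used above (illustrative helper, not CK API), where silu(x) = x * sigmoid(x):

#include <cmath>
// Hypothetical per-element model of AddSilu: e = silu(c + d0).
inline float add_silu_model(float c, float d0)
{
    const float x = c + d0;
    return x / (1.0f + std::exp(-x)); // equals x * sigmoid(x)
}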
@@ -42,7 +42,9 @@ int profile_gemm_impl(int do_verification,
int K,
int StrideA,
int StrideB,
-int StrideC)
+int StrideC,
+int n_warmup,
+int n_iter)
{
bool pass = true;
@@ -165,8 +167,8 @@ int profile_gemm_impl(int do_verification,
std::string op_name = op_ptr->GetTypeString();
-float avg_time =
-invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel, 0, 10, 50});
+float avg_time = invoker_ptr->Run(
+argument_ptr.get(), StreamConfig{nullptr, time_kernel, 0, n_warmup, n_iter});
std::size_t flop = std::size_t(2) * M * N * K;
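The StreamConfig fields used here follow the order seen in the calls above: stream, time-kernel flag, log level, warmup iterations, measured iterations. A standalone sketch with illustrative values:

// Warm up 10 times, then time 50 runs and return the average in ms.
StreamConfig cfg{nullptr, /*time_kernel=*/true, /*log_level=*/0, /*n_warmup=*/10, /*n_iter=*/50};
float avg_ms = invoker_ptr->Run(argument_ptr.get(), cfg);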
@@ -296,7 +298,7 @@ int profile_gemm_impl(int do_verification,
}
}
-return pass ? 0 : 1;
+return pass;
}
} // namespace profiler
......
@@ -42,7 +42,9 @@ bool profile_gemm_splitk_impl(int do_verification,
int StrideA,
int StrideB,
int StrideC,
-int KBatch)
+int KBatch,
+int n_warmup,
+int n_iter)
{
bool pass = true;
@@ -143,7 +145,7 @@ bool profile_gemm_splitk_impl(int do_verification,
// profile device GEMM instances
for(auto& op_ptr : op_ptrs)
{
-std::vector<int> kbatch_list = {1, 2, 4, 8, 12, 16, 20, 32, 36, 40, 64, 96, 128};
+std::vector<int> kbatch_list = {1, 2, 4, 8, 12, 16, 19, 20, 32, 38};
if(KBatch > 0)
{
@@ -177,7 +179,8 @@ bool profile_gemm_splitk_impl(int do_verification,
// re-init C to zero before profiling next kernel
c_device_buf.SetZero();
-invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+invoker_ptr->Run(argument_ptr.get(),
+StreamConfig{nullptr, false, 0, n_warmup, n_iter});
if(do_verification)
{
@@ -200,8 +203,8 @@ bool profile_gemm_splitk_impl(int do_verification,
std::string op_name = op_ptr->GetTypeString();
-float ave_time =
-invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+float ave_time = invoker_ptr->Run(
+argument_ptr.get(), StreamConfig{nullptr, time_kernel, 0, n_warmup, n_iter});
std::size_t flop = std::size_t(2) * M * N * K;
......
@@ -42,7 +42,9 @@ bool profile_grouped_gemm_impl(int do_verification,
const std::vector<int>& StrideAs,
const std::vector<int>& StrideBs,
const std::vector<int>& StrideCs,
-int kbatch = 1)
+int kbatch = 1,
+int n_warmup = 1,
+int n_iter = 10)
{
bool pass = true;
@@ -261,7 +263,8 @@ bool profile_grouped_gemm_impl(int do_verification,
for(std::size_t i = 0; i < gemm_descs.size(); i++)
c_device_buf[i]->SetZero();
-invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+invoker_ptr->Run(argument_ptr.get(),
+StreamConfig{nullptr, false, 0, n_warmup, n_iter});
if(do_verification)
{
@@ -307,8 +310,8 @@ bool profile_grouped_gemm_impl(int do_verification,
pass = pass && instance_pass;
}
-float ave_time =
-invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+float ave_time = invoker_ptr->Run(
+argument_ptr.get(), StreamConfig{nullptr, time_kernel, 0, n_warmup, n_iter});
if(time_kernel)
{
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iomanip>
#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/groupnorm_bwd_data.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_groupnorm_bwd.hpp"
namespace ck {
namespace profiler {
template <typename DYDataType,
typename XDataType,
typename GammaDataType,
typename MeanInvStdDataType,
typename ComputeDataType,
typename DXDataType>
bool profile_groupnorm_bwd_data_impl(int do_verification,
int init_method,
bool do_log,
bool time_kernel,
std::vector<index_t> length)
{
// DGamma and DBeta are not used by this profiler; they exist only for the reference class
using DGammaDataType = DXDataType;
using DBetaDataType = DXDataType;
if(length.size() != 5)
return false;
index_t N = length[0];
index_t G = length[3];
index_t C = length[4];
std::vector<index_t> reduce_dim = {1, 2, 4};
std::vector<index_t> gammaLength = {G, C};
Tensor<DYDataType> dy(length);
Tensor<XDataType> x(length);
Tensor<GammaDataType> gamma({G, C});
Tensor<MeanInvStdDataType> mean({N, G});
Tensor<MeanInvStdDataType> inv_std({N, G});
Tensor<DXDataType> dx(length);
Tensor<DXDataType> host_dx(length);
Tensor<DGammaDataType> host_dgamma({G, C});
Tensor<DBetaDataType> host_dbeta({G, C});
std::vector<index_t> strideDy =
std::vector<ck::index_t>{dy.mDesc.GetStrides().begin(), dy.mDesc.GetStrides().end()};
std::vector<index_t> strideX = strideDy;
std::vector<index_t> strideDx = strideDy;
std::vector<index_t> strideGamma = {0, 0, 0, C, 1};
std::vector<index_t> strideMeanInvStd = {G, 0, 0, 1, 0};
switch(init_method)
{
case 0:
dy.GenerateTensorValue(GeneratorTensor_1<DYDataType>{});
x.GenerateTensorValue(GeneratorTensor_1<XDataType>{});
gamma.GenerateTensorValue(GeneratorTensor_1<GammaDataType>{});
mean.GenerateTensorValue(GeneratorTensor_1<MeanInvStdDataType>{});
inv_std.GenerateTensorValue(GeneratorTensor_1<MeanInvStdDataType>{});
dx.GenerateTensorValue(GeneratorTensor_1<DXDataType>{});
break;
case 1:
dy.GenerateTensorValue(GeneratorTensor_2<DYDataType>{-5, 5});
x.GenerateTensorValue(GeneratorTensor_2<XDataType>{-5, 5});
gamma.GenerateTensorValue(GeneratorTensor_2<GammaDataType>{-5, 5});
mean.GenerateTensorValue(GeneratorTensor_2<MeanInvStdDataType>{-5, 5});
inv_std.GenerateTensorValue(GeneratorTensor_2<MeanInvStdDataType>{-5, 5});
dx.GenerateTensorValue(GeneratorTensor_2<DXDataType>{-5, 5});
break;
default:
dy.GenerateTensorValue(GeneratorTensor_3<DYDataType>{0, 1});
x.GenerateTensorValue(GeneratorTensor_3<XDataType>{0, 1});
gamma.GenerateTensorValue(GeneratorTensor_3<GammaDataType>{-0.5, 0.5});
mean.GenerateTensorValue(GeneratorTensor_3<MeanInvStdDataType>{-0.5, 0.5});
inv_std.GenerateTensorValue(GeneratorTensor_3<MeanInvStdDataType>{-0.5, 0.5});
dx.GenerateTensorValue(GeneratorTensor_3<DXDataType>{-0.5, 0.5});
}
DeviceMem dy_dev(sizeof(DYDataType) * dy.mDesc.GetElementSpaceSize());
DeviceMem x_dev(sizeof(XDataType) * x.mDesc.GetElementSpaceSize());
DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpaceSize());
DeviceMem mean_dev(sizeof(MeanInvStdDataType) * mean.mDesc.GetElementSpaceSize());
DeviceMem inv_std_dev(sizeof(MeanInvStdDataType) * inv_std.mDesc.GetElementSpaceSize());
DeviceMem dx_dev(sizeof(DXDataType) * dx.mDesc.GetElementSpaceSize());
dy_dev.ToDevice(dy.mData.data());
x_dev.ToDevice(x.mData.data());
gamma_dev.ToDevice(gamma.mData.data());
mean_dev.ToDevice(mean.mData.data());
inv_std_dev.ToDevice(inv_std.mData.data());
// add device normalization instances
using DeviceOp = ck::tensor_operation::device::DeviceNormalizationBwdData<DYDataType,
XDataType,
GammaDataType,
MeanInvStdDataType,
DXDataType,
5,
3>;
// get device op instances
const auto instance_ptrs =
ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << instance_ptrs.size() << " instances" << std::endl;
std::string best_instance_name;
float best_avg_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0;
if(do_verification)
{
using ReferenceInstance =
ck::tensor_operation::host::ReferenceGroupnormBwd<DYDataType,
XDataType,
GammaDataType,
MeanInvStdDataType,
DGammaDataType,
DBetaDataType,
DXDataType,
ComputeDataType>;
ReferenceInstance ref;
auto ref_argument =
ref.MakeArgument(dy, x, gamma, mean, inv_std, host_dgamma, host_dbeta, host_dx, length);
auto ref_invoker = ref.MakeInvoker();
ref_invoker.Run(ref_argument);
}
int num_kernel = 0;
for(auto& inst_ptr : instance_ptrs)
{
auto argument_ptr = inst_ptr->MakeArgumentPointer(length,
strideDy,
strideX,
strideGamma,
strideMeanInvStd,
strideMeanInvStd,
strideDx,
reduce_dim,
dy_dev.GetDeviceBuffer(),
x_dev.GetDeviceBuffer(),
gamma_dev.GetDeviceBuffer(),
mean_dev.GetDeviceBuffer(),
inv_std_dev.GetDeviceBuffer(),
dx_dev.GetDeviceBuffer());
if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
{
++num_kernel;
}
else
{
if(time_kernel)
{
std::cout << inst_ptr->GetTypeString() << " skipped due to unsupported argument: ";
LogRange(std::cout << "input lengths = ", length, ", ") << std::endl;
}
continue;
}
size_t workspace_sz = inst_ptr->GetWorkSpaceSize(argument_ptr.get());
DeviceMem workspace_dev(workspace_sz);
inst_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer());
auto invoker_ptr = inst_ptr->MakeInvokerPointer();
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t num_bytes = dy.mDesc.GetElementSize() * sizeof(DYDataType) +
x.mDesc.GetElementSize() * sizeof(XDataType) +
gamma.mDesc.GetElementSize() * sizeof(GammaDataType) +
mean.mDesc.GetElementSize() * sizeof(MeanInvStdDataType) +
inv_std.mDesc.GetElementSize() * sizeof(MeanInvStdDataType) +
dx.mDesc.GetElementSize() * sizeof(DXDataType);
float gb_per_sec = num_bytes / 1.E6 / avg_time;
if(time_kernel)
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, "
<< inst_ptr->GetTypeString() << std::endl;
if(avg_time < best_avg_time)
{
best_instance_name = inst_ptr->GetTypeString();
best_avg_time = avg_time;
best_gb_per_sec = gb_per_sec;
}
if(do_verification)
{
dx_dev.FromDevice(dx.mData.data());
bool pass = ck::utils::check_err(
dx.mData, host_dx.mData, "Error: Incorrect results", 1e-3, 1e-3);
if(do_log)
{
LogRangeAsType<float>(std::cout << "dy : ", dy.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "host_dx : ", host_dx.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "dx : ", dx.mData, ",") << std::endl;
}
if(!pass)
{
std::cout << inst_ptr->GetTypeString() << " failed verification: ";
LogRange(std::cout << "lengths = [", length, ", ") << "]." << std::endl;
return false;
}
else
{
if(time_kernel)
std::cout << "pass" << std::endl;
}
}
}
if(time_kernel)
{
LogRange(std::cout << "length = ", length, ",") << ", ";
LogRange(std::cout << "reduce dims ", reduce_dim, ",") << std::endl;
std::cout << "best perf = " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s,"
<< best_instance_name << std::endl;
}
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
return true;
}
} // namespace profiler
} // namespace ck
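An illustrative call to this profiler (type and shape choices are assumptions): lengths are NHWGC, i.e. {N, H, W, G, C}, with statistics reduced over {H, W, C} as set above.

// FP16 tensors with FP32 mean/inv-std and compute; verify against the CPU reference.
bool ok = ck::profiler::profile_groupnorm_bwd_data_impl<ck::half_t, // DYDataType
                                                        ck::half_t, // XDataType
                                                        ck::half_t, // GammaDataType
                                                        float,      // MeanInvStdDataType
                                                        float,      // ComputeDataType
                                                        ck::half_t> // DXDataType
    (/*do_verification=*/1, /*init_method=*/1, /*do_log=*/false, /*time_kernel=*/true,
     {2, 16, 16, 4, 8});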
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iomanip>
#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/groupnorm_bwd_gamma_beta.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_groupnorm_bwd.hpp"
namespace ck {
namespace profiler {
template <typename DYDataType,
typename XDataType,
typename MeanInvStdDataType,
typename ComputeDataType,
typename DGammaDataType,
typename DBetaDataType>
bool profile_groupnorm_bwd_gamma_beta_impl(int do_verification,
int init_method,
bool do_log,
bool time_kernel,
std::vector<index_t> length)
{
// GammaDataType and DXDataType are not used by this profiler; they exist only for the reference class
using GammaDataType = DYDataType;
using DXDataType = DYDataType;
if(length.size() != 5)
return false;
index_t N = length[0];
index_t G = length[3];
index_t C = length[4];
std::vector<index_t> reduce_dim = {0, 1, 2};
std::vector<index_t> gamma_beta_length = {G, C};
Tensor<DYDataType> dy(length);
Tensor<XDataType> x(length);
Tensor<GammaDataType> gamma(gamma_beta_length); // dummy tensor, for reference
Tensor<MeanInvStdDataType> mean({N, G});
Tensor<MeanInvStdDataType> inv_std({N, G});
Tensor<DGammaDataType> dgamma(gamma_beta_length);
Tensor<DBetaDataType> dbeta(gamma_beta_length);
Tensor<DXDataType> host_dx(length); // dummy tensor, for reference
Tensor<DGammaDataType> host_dgamma(gamma_beta_length);
Tensor<DBetaDataType> host_dbeta(gamma_beta_length);
std::vector<index_t> strideDy =
std::vector<ck::index_t>{dy.mDesc.GetStrides().begin(), dy.mDesc.GetStrides().end()};
std::vector<index_t> strideX =
std::vector<ck::index_t>{x.mDesc.GetStrides().begin(), x.mDesc.GetStrides().end()};
std::vector<index_t> strideDGamma{dgamma.mDesc.GetStrides().begin(),
dgamma.mDesc.GetStrides().end()};
std::vector<index_t> strideDBeta{dbeta.mDesc.GetStrides().begin(),
dbeta.mDesc.GetStrides().end()};
std::vector<index_t> strideMeanInvStd = {G, 0, 0, 1, 0};
switch(init_method)
{
case 0:
dy.GenerateTensorValue(GeneratorTensor_1<DYDataType>{});
x.GenerateTensorValue(GeneratorTensor_1<XDataType>{});
mean.GenerateTensorValue(GeneratorTensor_1<MeanInvStdDataType>{});
inv_std.GenerateTensorValue(GeneratorTensor_1<MeanInvStdDataType>{});
dgamma.GenerateTensorValue(GeneratorTensor_1<DGammaDataType>{});
dbeta.GenerateTensorValue(GeneratorTensor_1<DBetaDataType>{});
break;
case 1:
dy.GenerateTensorValue(GeneratorTensor_2<DYDataType>{-5, 5});
x.GenerateTensorValue(GeneratorTensor_2<XDataType>{-5, 5});
mean.GenerateTensorValue(GeneratorTensor_2<MeanInvStdDataType>{-5, 5});
inv_std.GenerateTensorValue(GeneratorTensor_2<MeanInvStdDataType>{0, 5});
dgamma.GenerateTensorValue(GeneratorTensor_2<DGammaDataType>{-5, 5});
dbeta.GenerateTensorValue(GeneratorTensor_2<DBetaDataType>{-5, 5});
break;
default:
dy.GenerateTensorValue(GeneratorTensor_3<DYDataType>{0, 1});
x.GenerateTensorValue(GeneratorTensor_3<XDataType>{0, 1});
mean.GenerateTensorValue(GeneratorTensor_3<MeanInvStdDataType>{-0.5, 0.5});
inv_std.GenerateTensorValue(GeneratorTensor_3<MeanInvStdDataType>{0, 0.5});
dgamma.GenerateTensorValue(GeneratorTensor_3<DGammaDataType>{-0.5, 0.5});
dbeta.GenerateTensorValue(GeneratorTensor_3<DBetaDataType>{-0.5, 0.5});
}
DeviceMem dy_dev(sizeof(DYDataType) * dy.mDesc.GetElementSpaceSize());
DeviceMem x_dev(sizeof(XDataType) * x.mDesc.GetElementSpaceSize());
DeviceMem mean_dev(sizeof(MeanInvStdDataType) * mean.mDesc.GetElementSpaceSize());
DeviceMem inv_std_dev(sizeof(MeanInvStdDataType) * inv_std.mDesc.GetElementSpaceSize());
DeviceMem dgamma_dev(sizeof(DGammaDataType) * dgamma.mDesc.GetElementSpaceSize());
DeviceMem dbeta_dev(sizeof(DBetaDataType) * dbeta.mDesc.GetElementSpaceSize());
dy_dev.ToDevice(dy.mData.data());
x_dev.ToDevice(x.mData.data());
mean_dev.ToDevice(mean.mData.data());
inv_std_dev.ToDevice(inv_std.mData.data());
// add device normalization instances
using DeviceOp =
ck::tensor_operation::device::DeviceNormalizationBwdGammaBeta<DYDataType,
XDataType,
MeanInvStdDataType,
DGammaDataType,
DBetaDataType,
5,
3>;
// get device op instances
const auto instance_ptrs =
ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << instance_ptrs.size() << " instances" << std::endl;
std::string best_instance_name;
float best_avg_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0;
if(do_verification)
{
using ReferenceInstance =
ck::tensor_operation::host::ReferenceGroupnormBwd<DYDataType,
XDataType,
GammaDataType,
MeanInvStdDataType,
DGammaDataType,
DBetaDataType,
DXDataType,
ComputeDataType>;
ReferenceInstance ref;
auto ref_argument =
ref.MakeArgument(dy, x, gamma, mean, inv_std, host_dgamma, host_dbeta, host_dx, length);
auto ref_invoker = ref.MakeInvoker();
ref_invoker.Run(ref_argument);
}
std::size_t num_bytes = dy.mDesc.GetElementSize() * sizeof(DYDataType) +
x.mDesc.GetElementSize() * sizeof(XDataType) +
mean.mDesc.GetElementSize() * sizeof(MeanInvStdDataType) +
inv_std.mDesc.GetElementSize() * sizeof(MeanInvStdDataType) +
dgamma.mDesc.GetElementSize() * sizeof(DGammaDataType) +
dbeta.mDesc.GetElementSize() * sizeof(DBetaDataType);
int num_kernel = 0;
for(auto& inst_ptr : instance_ptrs)
{
auto argument_ptr = inst_ptr->MakeArgumentPointer(length,
strideDy,
strideX,
strideMeanInvStd,
strideMeanInvStd,
gamma_beta_length,
strideDGamma,
strideDBeta,
reduce_dim,
dy_dev.GetDeviceBuffer(),
x_dev.GetDeviceBuffer(),
mean_dev.GetDeviceBuffer(),
inv_std_dev.GetDeviceBuffer(),
dgamma_dev.GetDeviceBuffer(),
dbeta_dev.GetDeviceBuffer());
if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
{
++num_kernel;
}
else
{
if(time_kernel)
{
std::cout << inst_ptr->GetTypeString() << " skipped due to unsupported argument: ";
LogRange(std::cout << "input lengths = ", length, ", ") << std::endl;
}
continue;
}
size_t workspace_sz = inst_ptr->GetWorkSpaceSize(argument_ptr.get());
DeviceMem workspace_dev(workspace_sz);
inst_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer());
auto invoker_ptr = inst_ptr->MakeInvokerPointer();
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
float gb_per_sec = num_bytes / 1.E6 / avg_time;
if(time_kernel)
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, "
<< inst_ptr->GetTypeString() << std::endl;
if(avg_time < best_avg_time)
{
best_instance_name = inst_ptr->GetTypeString();
best_avg_time = avg_time;
best_gb_per_sec = gb_per_sec;
}
if(do_verification)
{
dgamma_dev.FromDevice(dgamma.mData.data());
dbeta_dev.FromDevice(dbeta.mData.data());
bool pass =
ck::utils::check_err(dgamma, host_dgamma, "Error: Incorrect dgamma", 1e-3, 1e-3);
pass &= ck::utils::check_err(dbeta, host_dbeta, "Error: Incorrect dbeta", 1e-3, 1e-3);
if(do_log)
{
LogRangeAsType<float>(std::cout << "dy : ", dy.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "host_dgamma : ", host_dgamma.mData, ",")
<< std::endl;
LogRangeAsType<float>(std::cout << "dgamma : ", dgamma.mData, ",") << std::endl;
}
if(!pass)
{
std::cout << inst_ptr->GetTypeString() << " failed verification: ";
LogRange(std::cout << "lengths = [", length, ", ") << "]." << std::endl;
return false;
}
else
{
if(time_kernel)
std::cout << "pass" << std::endl;
}
}
}
if(time_kernel)
{
LogRange(std::cout << "length = ", length, ",") << ", ";
LogRange(std::cout << "reduce dims ", reduce_dim, ",") << std::endl;
std::cout << "best perf = " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s,"
<< best_instance_name << std::endl;
}
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
return true;
}
} // namespace profiler
} // namespace ck
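For orientation, the reference checked above computes, per (g, c), dgamma = sum(dy * (x - mean) * inv_std) and dbeta = sum(dy), with both sums taken over the reduce dims {0, 1, 2} (N, H, W) set earlier; this is the standard groupnorm backward identity for the affine parameters.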
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iomanip>
#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/layernorm_bwd_data.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_layernorm_bwd.hpp"
namespace ck {
namespace profiler {
template <typename DYDataType,
typename XDataType,
typename GammaDataType,
typename MeanInvStdDataType,
typename ComputeDataType,
typename DXDataType,
index_t Rank>
bool profile_layernorm_bwd_data_impl(int do_verification,
int init_method,
bool do_log,
bool time_kernel,
std::vector<index_t> length)
{
// DGamma and DBeta are not used by this profiler; they exist only for the reference class
using DGammaDataType = DXDataType;
using DBetaDataType = DXDataType;
if(length.size() != Rank || Rank < 2)
return false;
// Assume normalize dimension except for batch (first) dimension
std::vector<index_t> reduce_length{length.begin() + 1, length.end()};
std::vector<index_t> reduce_dim;
for(int i = 1; i < Rank; ++i)
reduce_dim.push_back(i);
Tensor<DYDataType> dy(length);
Tensor<XDataType> x(length);
Tensor<GammaDataType> gamma(reduce_length);
Tensor<MeanInvStdDataType> mean({length[0]});
Tensor<MeanInvStdDataType> inv_std({length[0]});
Tensor<DXDataType> dx(length);
Tensor<DXDataType> host_dx(length);
Tensor<DGammaDataType> host_dgamma(reduce_length);
Tensor<DBetaDataType> host_dbeta(reduce_length);
std::vector<index_t> strideDy =
std::vector<ck::index_t>{dy.mDesc.GetStrides().begin(), dy.mDesc.GetStrides().end()};
std::vector<index_t> strideX = strideDy;
std::vector<index_t> strideDx = strideDy;
std::vector<index_t> strideGamma = strideDy;
strideGamma[0] = 0;
std::vector<index_t> strideMeanInvStd(Rank, 0);
strideMeanInvStd[0] = 1;
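// mean/inv_std hold one value per batch row and are broadcast over the
// normalized dimensions, so their strides are [1, 0, ..., 0]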
switch(init_method)
{
case 0:
dy.GenerateTensorValue(GeneratorTensor_1<DYDataType>{});
x.GenerateTensorValue(GeneratorTensor_1<XDataType>{});
gamma.GenerateTensorValue(GeneratorTensor_1<GammaDataType>{});
mean.GenerateTensorValue(GeneratorTensor_1<MeanInvStdDataType>{});
inv_std.GenerateTensorValue(GeneratorTensor_1<MeanInvStdDataType>{});
dx.GenerateTensorValue(GeneratorTensor_1<DXDataType>{});
break;
case 1:
dy.GenerateTensorValue(GeneratorTensor_2<DYDataType>{-5, 5});
x.GenerateTensorValue(GeneratorTensor_2<XDataType>{-5, 5});
gamma.GenerateTensorValue(GeneratorTensor_2<GammaDataType>{-5, 5});
mean.GenerateTensorValue(GeneratorTensor_2<MeanInvStdDataType>{-5, 5});
inv_std.GenerateTensorValue(GeneratorTensor_2<MeanInvStdDataType>{-5, 5});
dx.GenerateTensorValue(GeneratorTensor_2<DXDataType>{-5, 5});
break;
default:
dy.GenerateTensorValue(GeneratorTensor_3<DYDataType>{0, 1});
x.GenerateTensorValue(GeneratorTensor_3<XDataType>{0, 1});
gamma.GenerateTensorValue(GeneratorTensor_3<GammaDataType>{-0.5, 0.5});
mean.GenerateTensorValue(GeneratorTensor_3<MeanInvStdDataType>{-0.5, 0.5});
inv_std.GenerateTensorValue(GeneratorTensor_3<MeanInvStdDataType>{-0.5, 0.5});
dx.GenerateTensorValue(GeneratorTensor_3<DXDataType>{-0.5, 0.5});
}
DeviceMem dy_dev(sizeof(DYDataType) * dy.mDesc.GetElementSpaceSize());
DeviceMem x_dev(sizeof(XDataType) * x.mDesc.GetElementSpaceSize());
DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpaceSize());
DeviceMem mean_dev(sizeof(MeanInvStdDataType) * mean.mDesc.GetElementSpaceSize());
DeviceMem inv_std_dev(sizeof(MeanInvStdDataType) * inv_std.mDesc.GetElementSpaceSize());
DeviceMem dx_dev(sizeof(DXDataType) * dx.mDesc.GetElementSpaceSize());
dy_dev.ToDevice(dy.mData.data());
x_dev.ToDevice(x.mData.data());
gamma_dev.ToDevice(gamma.mData.data());
mean_dev.ToDevice(mean.mData.data());
inv_std_dev.ToDevice(inv_std.mData.data());
constexpr int NumReduceDim = Rank - 1;
// device operation type describing this layernorm bwd-data problem
using DeviceOp = ck::tensor_operation::device::DeviceNormalizationBwdData<DYDataType,
XDataType,
GammaDataType,
MeanInvStdDataType,
DXDataType,
Rank,
NumReduceDim>;
// get device op instances
const auto instance_ptrs =
ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << instance_ptrs.size() << " instances" << std::endl;
std::string best_instance_name;
float best_avg_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0;
if(do_verification)
{
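// For reference, the standard layernorm input gradient, with
// x_hat = (x - mean) * inv_std and mean_k(.) averaging over the
// normalized dimensions, is:
//   dx = inv_std * (gamma * dy - mean_k(gamma * dy) - x_hat * mean_k(gamma * dy * x_hat))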
using ReferenceInstance =
ck::tensor_operation::host::ReferenceLayernormBwd<DYDataType,
XDataType,
GammaDataType,
MeanInvStdDataType,
DGammaDataType,
DBetaDataType,
DXDataType,
ComputeDataType>;
ReferenceInstance ref;
auto ref_argument =
ref.MakeArgument(dy, x, gamma, mean, inv_std, host_dgamma, host_dbeta, host_dx, length);
auto ref_invoker = ref.MakeInvoker();
ref_invoker.Run(ref_argument);
}
int num_kernel = 0;
for(auto& inst_ptr : instance_ptrs)
{
auto argument_ptr = inst_ptr->MakeArgumentPointer(length,
strideDy,
strideX,
strideGamma,
strideMeanInvStd,
strideMeanInvStd,
strideDx,
reduce_dim,
dy_dev.GetDeviceBuffer(),
x_dev.GetDeviceBuffer(),
gamma_dev.GetDeviceBuffer(),
mean_dev.GetDeviceBuffer(),
inv_std_dev.GetDeviceBuffer(),
dx_dev.GetDeviceBuffer());
if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
{
++num_kernel;
}
else
{
if(time_kernel)
{
std::cout << inst_ptr->GetTypeString() << " skipped due to unsupported argument: ";
LogRange(std::cout << "input lengths = ", length, ", ") << std::endl;
}
continue;
}
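// some instances need scratch memory; query its size and attach a device
// buffer before launching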
size_t workspace_sz = inst_ptr->GetWorkSpaceSize(argument_ptr.get());
DeviceMem workspace_dev(workspace_sz);
inst_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer());
auto invoker_ptr = inst_ptr->MakeInvokerPointer();
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t num_bytes = dy.mDesc.GetElementSize() * sizeof(DYDataType) +
x.mDesc.GetElementSize() * sizeof(XDataType) +
gamma.mDesc.GetElementSize() * sizeof(GammaDataType) +
mean.mDesc.GetElementSize() * sizeof(MeanInvStdDataType) +
inv_std.mDesc.GetElementSize() * sizeof(MeanInvStdDataType) +
dx.mDesc.GetElementSize() * sizeof(DXDataType);
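// avg_time is in ms, so bytes / 1e6 / ms = 1e9 bytes/s = GB/s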
float gb_per_sec = num_bytes / 1.E6 / avg_time;
if(time_kernel)
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, "
<< inst_ptr->GetTypeString() << std::endl;
if(avg_time < best_avg_time)
{
best_instance_name = inst_ptr->GetTypeString();
best_avg_time = avg_time;
best_gb_per_sec = gb_per_sec;
}
if(do_verification)
{
dx_dev.FromDevice(dx.mData.data());
bool pass = ck::utils::check_err(
dx.mData, host_dx.mData, "Error: Incorrect results", 1e-3, 1e-3);
if(do_log)
{
LogRangeAsType<float>(std::cout << "dy : ", dy.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "host_dx : ", host_dx.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "dx : ", dx.mData, ",") << std::endl;
}
if(!pass)
{
std::cout << inst_ptr->GetTypeString() << " failed verification: ";
LogRange(std::cout << "lengths = [", length, ", ") << "]." << std::endl;
return false;
}
else
{
if(time_kernel)
std::cout << "pass" << std::endl;
}
}
}
if(time_kernel)
{
LogRange(std::cout << "length = ", length, ",") << ", ";
LogRange(std::cout << "reduce dims ", reduce_dim, ",") << std::endl;
std::cout << "best perf = " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s,"
<< best_instance_name << std::endl;
}
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
return true;
}
} // namespace profiler
} // namespace ck
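// Example usage (a sketch only; assumes fp16 tensors with fp32 statistics and
// compute, and lengths [M, K] = [1024, 4096] -- the profiler's CLI wiring
// lives elsewhere):
//   ck::profiler::profile_layernorm_bwd_data_impl<ck::half_t, // DYDataType
//                                                 ck::half_t, // XDataType
//                                                 ck::half_t, // GammaDataType
//                                                 float,      // MeanInvStdDataType
//                                                 float,      // ComputeDataType
//                                                 ck::half_t, // DXDataType
//                                                 2>          // Rank
//       (true, 1, false, true, {1024, 4096});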
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iomanip>
#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/layernorm_bwd_gamma_beta.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_layernorm_bwd.hpp"
namespace ck {
namespace profiler {
template <typename DYDataType,
typename XDataType,
typename MeanInvStdDataType,
typename ComputeDataType,
typename DGammaDataType,
typename DBetaDataType,
index_t Rank>
bool profile_layernorm_bwd_gamma_beta_impl(int do_verification,
int init_method,
bool do_log,
bool time_kernel,
std::vector<index_t> length)
{
// GammaDataType and DXDataType are not used by this profiler; they are only
// needed to instantiate the reference class
using GammaDataType = DYDataType;
using DXDataType = DYDataType;
if(length.size() != Rank || Rank < 2)
return false;
// dgamma/dbeta are reduced over the batch (first) dimension:
// Layernorm 2D, input = [M, K], reduce on M axis
// Layernorm 4D, input = [N, H, W, C], reduce on N axis
constexpr int NumReduceDim = Rank - 1;
std::vector<index_t> reduce_dim = {0};
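// e.g. Rank = 2, length = [M, K] -> reduce_dim = {0}, invariant_length = [K]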
std::vector<index_t> invariant_length{length.begin() + 1, length.end()};
Tensor<DYDataType> dy(length);
Tensor<XDataType> x(length);
Tensor<GammaDataType> gamma(invariant_length); // dummy tensor, for reference
Tensor<MeanInvStdDataType> mean({length[0]});
Tensor<MeanInvStdDataType> inv_std({length[0]});
Tensor<DGammaDataType> dgamma(invariant_length);
Tensor<DBetaDataType> dbeta(invariant_length);
Tensor<DXDataType> host_dx(length); // dummy tensor, for reference
Tensor<DGammaDataType> host_dgamma(invariant_length);
Tensor<DBetaDataType> host_dbeta(invariant_length);
std::vector<index_t> strideDy(dy.mDesc.GetStrides().begin(), dy.mDesc.GetStrides().end());
std::vector<index_t> strideX = strideDy;
std::vector<index_t> strideDGamma{dgamma.mDesc.GetStrides().begin(),
dgamma.mDesc.GetStrides().end()};
std::vector<index_t> strideDBeta{dbeta.mDesc.GetStrides().begin(),
dbeta.mDesc.GetStrides().end()};
std::vector<index_t> strideMeanInvStd(Rank, 0);
strideMeanInvStd[0] = 1;
switch(init_method)
{
case 0:
dy.GenerateTensorValue(GeneratorTensor_1<DYDataType>{});
x.GenerateTensorValue(GeneratorTensor_1<XDataType>{});
mean.GenerateTensorValue(GeneratorTensor_1<MeanInvStdDataType>{});
inv_std.GenerateTensorValue(GeneratorTensor_1<MeanInvStdDataType>{});
dgamma.GenerateTensorValue(GeneratorTensor_1<DGammaDataType>{});
dbeta.GenerateTensorValue(GeneratorTensor_1<DBetaDataType>{});
break;
case 1:
dy.GenerateTensorValue(GeneratorTensor_2<DYDataType>{-5, 5});
x.GenerateTensorValue(GeneratorTensor_2<XDataType>{-5, 5});
mean.GenerateTensorValue(GeneratorTensor_2<MeanInvStdDataType>{-5, 5});
inv_std.GenerateTensorValue(GeneratorTensor_2<MeanInvStdDataType>{0, 5});
dgamma.GenerateTensorValue(GeneratorTensor_2<DGammaDataType>{-5, 5});
dbeta.GenerateTensorValue(GeneratorTensor_2<DBetaDataType>{-5, 5});
break;
default:
dy.GenerateTensorValue(GeneratorTensor_3<DYDataType>{0, 1});
x.GenerateTensorValue(GeneratorTensor_3<XDataType>{0, 1});
mean.GenerateTensorValue(GeneratorTensor_3<MeanInvStdDataType>{-0.5, 0.5});
inv_std.GenerateTensorValue(GeneratorTensor_3<MeanInvStdDataType>{0, 0.5});
dgamma.GenerateTensorValue(GeneratorTensor_3<DGammaDataType>{-0.5, 0.5});
dbeta.GenerateTensorValue(GeneratorTensor_3<DBetaDataType>{-0.5, 0.5});
}
DeviceMem dy_dev(sizeof(DYDataType) * dy.mDesc.GetElementSpaceSize());
DeviceMem x_dev(sizeof(XDataType) * x.mDesc.GetElementSpaceSize());
DeviceMem mean_dev(sizeof(MeanInvStdDataType) * mean.mDesc.GetElementSpaceSize());
DeviceMem inv_std_dev(sizeof(MeanInvStdDataType) * inv_std.mDesc.GetElementSpaceSize());
DeviceMem dgamma_dev(sizeof(DGammaDataType) * dgamma.mDesc.GetElementSpaceSize());
DeviceMem dbeta_dev(sizeof(DBetaDataType) * dbeta.mDesc.GetElementSpaceSize());
dy_dev.ToDevice(dy.mData.data());
x_dev.ToDevice(x.mData.data());
mean_dev.ToDevice(mean.mData.data());
inv_std_dev.ToDevice(inv_std.mData.data());
// device operation type describing this layernorm bwd-gamma/beta problem
using DeviceOp =
ck::tensor_operation::device::DeviceNormalizationBwdGammaBeta<DYDataType,
XDataType,
MeanInvStdDataType,
DGammaDataType,
DBetaDataType,
Rank,
NumReduceDim>;
// get device op instances
const auto instance_ptrs =
ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << instance_ptrs.size() << " instances" << std::endl;
std::string best_instance_name;
float best_avg_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0;
if(do_verification)
{
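// For reference, the standard gamma/beta gradients, reducing over the batch
// index m for each invariant index k, are:
//   dgamma[k] = sum_m dy[m, k] * (x[m, k] - mean[m]) * inv_std[m]
//   dbeta[k]  = sum_m dy[m, k]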
using ReferenceInstance =
ck::tensor_operation::host::ReferenceLayernormBwd<DYDataType,
XDataType,
GammaDataType,
MeanInvStdDataType,
DGammaDataType,
DBetaDataType,
DXDataType,
ComputeDataType>;
ReferenceInstance ref;
auto ref_argument =
ref.MakeArgument(dy, x, gamma, mean, inv_std, host_dgamma, host_dbeta, host_dx, length);
auto ref_invoker = ref.MakeInvoker();
ref_invoker.Run(ref_argument);
}
std::size_t num_bytes = dy.mDesc.GetElementSize() * sizeof(DYDataType) +
x.mDesc.GetElementSize() * sizeof(XDataType) +
mean.mDesc.GetElementSize() * sizeof(MeanInvStdDataType) +
inv_std.mDesc.GetElementSize() * sizeof(MeanInvStdDataType) +
dgamma.mDesc.GetElementSize() * sizeof(DGammaDataType) +
dbeta.mDesc.GetElementSize() * sizeof(DBetaDataType);
int num_kernel = 0;
for(auto& inst_ptr : instance_ptrs)
{
auto argument_ptr = inst_ptr->MakeArgumentPointer(length,
strideDy,
strideX,
strideMeanInvStd,
strideMeanInvStd,
invariant_length,
strideDGamma,
strideDBeta,
reduce_dim,
dy_dev.GetDeviceBuffer(),
x_dev.GetDeviceBuffer(),
mean_dev.GetDeviceBuffer(),
inv_std_dev.GetDeviceBuffer(),
dgamma_dev.GetDeviceBuffer(),
dbeta_dev.GetDeviceBuffer());
if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
{
++num_kernel;
}
else
{
if(time_kernel)
{
std::cout << inst_ptr->GetTypeString() << " skipped due to unsupported argument: ";
LogRange(std::cout << "input lengths = ", length, ", ") << std::endl;
}
continue;
}
size_t workspace_sz = inst_ptr->GetWorkSpaceSize(argument_ptr.get());
DeviceMem workspace_dev(workspace_sz);
inst_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer());
auto invoker_ptr = inst_ptr->MakeInvokerPointer();
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
float gb_per_sec = num_bytes / 1.E6 / avg_time;
if(time_kernel)
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, "
<< inst_ptr->GetTypeString() << std::endl;
if(avg_time < best_avg_time)
{
best_instance_name = inst_ptr->GetTypeString();
best_avg_time = avg_time;
best_gb_per_sec = gb_per_sec;
}
if(do_verification)
{
dgamma_dev.FromDevice(dgamma.mData.data());
dbeta_dev.FromDevice(dbeta.mData.data());
bool pass =
ck::utils::check_err(dgamma, host_dgamma, "Error: Incorrect dgamma", 1e-3, 1e-3);
pass &= ck::utils::check_err(dbeta, host_dbeta, "Error: Incorrect dbeta", 1e-3, 1e-3);
if(do_log)
{
LogRangeAsType<float>(std::cout << "dy : ", dy.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "host_dgamma : ", host_dgamma.mData, ",")
<< std::endl;
LogRangeAsType<float>(std::cout << "dgamma : ", dgamma.mData, ",") << std::endl;
}
if(!pass)
{
std::cout << inst_ptr->GetTypeString() << " failed verification: ";
LogRange(std::cout << "lengths = [", length, ", ") << "]." << std::endl;
return false;
}
else
{
if(time_kernel)
std::cout << "pass" << std::endl;
}
}
}
if(time_kernel)
{
LogRange(std::cout << "length = ", length, ",") << ", ";
LogRange(std::cout << "reduce dims ", reduce_dim, ",") << std::endl;
std::cout << "best perf = " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s,"
<< best_instance_name << std::endl;
}
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
return true;
}
} // namespace profiler
} // namespace ck
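// Example usage (a sketch only; assumes fp16 gradients with fp32 statistics
// and compute, and lengths [M, K] = [1024, 4096]):
//   ck::profiler::profile_layernorm_bwd_gamma_beta_impl<ck::half_t, // DYDataType
//                                                       ck::half_t, // XDataType
//                                                       float,      // MeanInvStdDataType
//                                                       float,      // ComputeDataType
//                                                       ck::half_t, // DGammaDataType
//                                                       ck::half_t, // DBetaDataType
//                                                       2>          // Rank
//       (true, 1, false, true, {1024, 4096});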