Merge branch 'develop' into uif2-initial

c5138aa1 · Artur Wojcik · 7830272f · 82f3a835 · c5138aa1 · c5138aa1
Commit c5138aa1 authored Oct 19, 2023 by Artur Wojcik
20 changed files
--- a/example/62_conv_fwd_activ/convnd_fwd_xdl_relu_fp16.cpp
+++ b/example/62_conv_fwd_activ/convnd_fwd_xdl_relu_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "convnd_fwd_activ_common.hpp"
+
+using OutElementOp = ck::tensor_operation::element_wise::Relu; // output element-wise op of this example
+// Both aliases must be declared before the .inc include below: its code refers to them by name.
+using DeviceGroupedConvNDFwdActivInstance = DeviceGroupedConvNDFwdInstance<OutElementOp>;
+#include "run_convnd_fwd_activ_example.inc"
+// Exit status is the negated bool result: 0 when run_convnd_fwd_example returns true, 1 otherwise.
+int main(int argc, char* argv[]) { return !run_convnd_fwd_example(argc, argv); }
--- a/example/62_conv_fwd_activ/convnd_fwd_xdl_sigmoid_fp16.cpp
+++ b/example/62_conv_fwd_activ/convnd_fwd_xdl_sigmoid_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "convnd_fwd_activ_common.hpp"
+
+using OutElementOp = ck::tensor_operation::element_wise::Sigmoid; // output element-wise op of this example
+// Both aliases must be declared before the .inc include below: its code refers to them by name.
+using DeviceGroupedConvNDFwdActivInstance = DeviceGroupedConvNDFwdInstance<OutElementOp>;
+#include "run_convnd_fwd_activ_example.inc"
+// Exit status is the negated bool result: 0 when run_convnd_fwd_example returns true, 1 otherwise.
+int main(int argc, char* argv[]) { return !run_convnd_fwd_example(argc, argv); }
--- a/example/62_conv_fwd_activ/convnd_fwd_xdl_softrelu_fp16.cpp
+++ b/example/62_conv_fwd_activ/convnd_fwd_xdl_softrelu_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "convnd_fwd_activ_common.hpp"
+
+using OutElementOp = ck::tensor_operation::element_wise::SoftRelu; // output element-wise op of this example
+// Both aliases must be declared before the .inc include below: its code refers to them by name.
+using DeviceGroupedConvNDFwdActivInstance = DeviceGroupedConvNDFwdInstance<OutElementOp>;
+#include "run_convnd_fwd_activ_example.inc"
+// Exit status is the negated bool result: 0 when run_convnd_fwd_example returns true, 1 otherwise.
+int main(int argc, char* argv[]) { return !run_convnd_fwd_example(argc, argv); }
--- a/example/62_conv_fwd_activ/convnd_fwd_xdl_tanh_fp16.cpp
+++ b/example/62_conv_fwd_activ/convnd_fwd_xdl_tanh_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "convnd_fwd_activ_common.hpp"
+
+using OutElementOp = ck::tensor_operation::element_wise::TanH; // output element-wise op of this example
+// Both aliases must be declared before the .inc include below: its code refers to them by name.
+using DeviceGroupedConvNDFwdActivInstance = DeviceGroupedConvNDFwdInstance<OutElementOp>;
+#include "run_convnd_fwd_activ_example.inc"
+// Exit status is the negated bool result: 0 when run_convnd_fwd_example returns true, 1 otherwise.
+int main(int argc, char* argv[]) { return !run_convnd_fwd_example(argc, argv); }
--- a/example/62_conv_fwd_activ/run_convnd_fwd_activ_example.inc
+++ b/example/62_conv_fwd_activ/run_convnd_fwd_activ_example.inc
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+// Writes the command-line usage text to stdout: the three leading positional arguments,
+// followed by the convolution-parameter help text provided by ck::utils::conv.
+void print_helper_msg()
+{
+    std::cout << "arg1: verification (0=no, 1=yes)\n";
+    std::cout << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n";
+    std::cout << "arg3: time kernel (0=no, 1=yes)\n";
+
+    // The parser help is fetched only after the fixed lines have been written, and the
+    // stream is flushed once at the very end.
+    std::cout << ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl;
+}
+
+// Parses the command line, builds the convolution problem description and runs the grouped
+// forward-convolution example with the output element-wise operation OutElementOp.
+//
+// Accepted command lines:
+//   (no arguments)            keep the built-in defaults
+//   arg1 arg2 arg3            override verification / initialization / kernel timing
+//   arg1 arg2 arg3 arg4 ...   additionally describe the convolution: arg4 is the number of
+//                             spatial dimensions, the remainder is read by parse_conv_param
+//
+// Returns the result of run_grouped_conv_fwd, or false when the argument list is incomplete
+// or the resulting problem does not have 3 spatial dimensions.
+bool run_convnd_fwd_example(int argc, char* argv[])
+{
+    print_helper_msg();
+
+    bool do_verification = true;
+    // Use floats for SoftRelu by default to avoid overflow after e^x.
+    int init_method =
+        std::is_same_v<OutElementOp, ck::tensor_operation::element_wise::SoftRelu> ? 2 : 1;
+    bool time_kernel = false;
+
+    // Following shapes are selected to avoid overflow. Expect inf in case of
+    // size increase for some elementwise ops.
+    ck::utils::conv::ConvParam conv_param{
+        3, 1, 16, 128, 8, {3, 3, 3}, {17, 17, 17}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}};
+
+    if(argc > 1)
+    {
+        // arg1..arg3 are read unconditionally below, so all three must be present. Without
+        // this check a command line with only one or two arguments would hand a null (or
+        // out-of-range) argv entry to std::stoi.
+        if(argc < 4)
+        {
+            std::cerr << "error: expected no arguments, or at least arg1, arg2 and arg3; got "
+                      << (argc - 1) << " argument(s)" << std::endl;
+            return false;
+        }
+
+        do_verification = std::stoi(argv[1]) != 0;
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]) != 0;
+
+        if(argc > 4)
+        {
+            // arg4 is the number of spatial dimensions; parse_conv_param is told to continue
+            // from index 5 of argv (presumably where the convolution description starts).
+            const ck::index_t num_dim_spatial = std::stoi(argv[4]);
+
+            conv_param = ck::utils::conv::parse_conv_param(num_dim_spatial, 5, argv);
+        }
+    }
+
+    // Only the 3D case is dispatched by this example; say so instead of failing silently.
+    if(conv_param.num_dim_spatial_ != 3)
+    {
+        std::cerr << "error: this example only runs problems with 3 spatial dimensions, got "
+                  << conv_param.num_dim_spatial_ << std::endl;
+        return false;
+    }
+
+    const auto in_element_op  = InElementOp{};
+    const auto wei_element_op = WeiElementOp{};
+    const auto out_element_op = OutElementOp{};
+
+    // Packed host tensor descriptors for input, weight and output, derived from conv_param.
+    const auto in_g_n_c_wis_desc =
+        ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
+
+    const auto wei_g_k_c_xs_desc =
+        ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(
+            conv_param);
+
+    const auto out_g_n_k_wos_desc =
+        ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(
+            conv_param);
+
+    return run_grouped_conv_fwd<NDimSpatial,
+                                InDataType,
+                                WeiDataType,
+                                OutDataType,
+                                InElementOp,
+                                WeiElementOp,
+                                OutElementOp,
+                                DeviceGroupedConvNDFwdActivInstance>(do_verification,
+                                                                     init_method,
+                                                                     time_kernel,
+                                                                     conv_param,
+                                                                     in_g_n_c_wis_desc,
+                                                                     wei_g_k_c_xs_desc,
+                                                                     out_g_n_k_wos_desc,
+                                                                     in_element_op,
+                                                                     wei_element_op,
+                                                                     out_element_op);
+}
--- a/example/CMakeLists.txt
+++ b/example/CMakeLists.txt
@@ -62,6 +62,12 @@ function(add_example_executable EXAMPLE_NAME FILE_NAME)
    set(result ${result} PARENT_SCOPE)
 endfunction()

+# Makes the target EXAMPLE_NAME depend on the target FILE_NAME.
+# The dependency is added only when `result` equals 0. `result` is not set inside this
+# function, so the value visible from the calling scope is used (add_example_executable
+# publishes it there with PARENT_SCOPE).
+function(add_example_dependencies EXAMPLE_NAME FILE_NAME)
+    if(result EQUAL 0)
+        add_dependencies(${EXAMPLE_NAME} ${FILE_NAME})
+    endif()
+endfunction()
+
 function(add_example_executable_no_testing EXAMPLE_NAME FILE_NAME)
    message("adding example ${EXAMPLE_NAME}")
    set(result 1)

--- a/include/ck/tensor_operation/gpu/device/device_contraction_multiple_abd.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_contraction_multiple_abd.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <array>
+
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+// Abstract device-op interface for a GEMM-style contraction with several A, B and D tensors:
+//   input : A0[M0, M1, ... K0, K1, ...], ...   (AsDataType::Size() tensors)
+//   input : B0[N0, N1, ... K0, K1, ...], ...   (BsDataType::Size() tensors)
+//   input : D0[M0, M1, ... N0, N1, ...], D1[M0, M1, ... N0, N1, ...], ...   (DsDataType::Size() tensors)
+//   output : E[M0, M1, ... N0, N1, ...]
+//   C = a_op(A) * b_op(B)
+//   E = cde_op(C, D0, D1, ...)
+// Assume:
+//   D0, D1, ... and E have the same layout
+template <index_t NumDimM,
+          index_t NumDimN,
+          index_t NumDimK,
+          typename AsDataType,
+          typename BsDataType,
+          typename DsDataType,
+          typename EDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CDEElementwiseOperation>
+struct DeviceContractionMultipleABD : public BaseOperator
+{
+    static constexpr index_t NumATensor = AsDataType::Size(); // size of the per-A arrays below
+    static constexpr index_t NumBTensor = BsDataType::Size(); // size of the per-B arrays below
+    static constexpr index_t NumDTensor = DsDataType::Size(); // size of the per-D arrays below
+    // Pure virtual factory returning an owned BaseArgument; implemented by derived device ops.
+    virtual std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(std::array<const void*, NumATensor> p_as, // one untyped pointer per A tensor
+                        std::array<const void*, NumBTensor> p_bs, // one untyped pointer per B tensor
+                        std::array<const void*, NumDTensor> p_ds, // one untyped pointer per D tensor
+                        void* p_e, // untyped, writable pointer (named for the output E)
+                        const std::array<std::vector<index_t>, NumATensor>& a_ms_ks_lengths,
+                        const std::array<std::vector<index_t>, NumATensor>& a_ms_ks_strides,
+                        const std::array<std::vector<index_t>, NumBTensor>& b_ns_ks_lengths,
+                        const std::array<std::vector<index_t>, NumBTensor>& b_ns_ks_strides,
+                        const std::array<std::vector<index_t>, NumDTensor>& d_ms_ns_lengths,
+                        const std::array<std::vector<index_t>, NumDTensor>& d_ms_ns_strides,
+                        const std::vector<index_t>& e_ms_ns_length, // single vector: there is one E
+                        const std::vector<index_t>& e_ms_ns_stride, // single vector: there is one E
+                        AElementwiseOperation a_element_op,
+                        BElementwiseOperation b_element_op,
+                        CDEElementwiseOperation cde_element_op) = 0;
+    // Pure virtual factory returning an owned BaseInvoker; implemented by derived device ops.
+    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
+};
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/device_normalization.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_normalization.hpp
@@ -14,8 +14,8 @@ namespace device {
 template <typename XDataType,
          typename GammaDataType,
          typename BetaDataType,
-          typename ComputeDataType,
          typename YDataType,
+          typename SaveMeanInvStdDataType,
          typename YElementwiseOperation,
          index_t Rank,
          index_t NumReduceDim>
@@ -27,6 +27,8 @@ struct DeviceNormalization : public BaseOperator
                        const std::vector<index_t> gammaStrides,
                        const std::vector<index_t> betaStrides,
                        const std::vector<index_t> yStrides,
+                        const std::vector<index_t> saveMeanStrides,
+                        const std::vector<index_t> saveInvStdStrides,
                        const std::vector<index_t> reduceDims,
                        double epsilon,
                        const void* p_x,
@@ -43,16 +45,16 @@ struct DeviceNormalization : public BaseOperator
 template <typename XDataType,
          typename GammaDataType,
          typename BetaDataType,
-          typename ComputeDataType,
          typename YDataType,
+          typename SaveMeanInvStdDataType,
          typename YElementwiseOperation,
          index_t Rank,
          index_t NumReduceDim>
 using DeviceNormalizationPtr = std::unique_ptr<DeviceNormalization<XDataType,
                                                                   GammaDataType,
                                                                   BetaDataType,
-                                                                   ComputeDataType,
                                                                   YDataType,
+                                                                   SaveMeanInvStdDataType,
                                                                   YElementwiseOperation,
                                                                   Rank,
                                                                   NumReduceDim>>;

--- a/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_abd_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_abd_xdl_cshuffle.hpp
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp
@@ -127,7 +127,50 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
        PipelineVer,
        ComputeType>;

-    using Argument              = typename GridwiseGemm::Argument;
+    // Host-side argument of this device op: everything GridwiseGemm::Argument holds, plus
+    // the three element-wise operations (A, B and C), which are stored by value.
+    struct Argument : public GridwiseGemm::Argument
+    {
+        // The first fourteen parameters are forwarded unchanged, in order, to the
+        // GridwiseGemm::Argument constructor; the last three initialise the members below.
+        Argument(const ADataType* a_grid_ptr,
+                 const BDataType* b_grid_ptr,
+                 CDataType* c_grid_ptr,
+                 index_t m,
+                 index_t n,
+                 index_t k,
+                 index_t stride_a,
+                 index_t stride_b,
+                 index_t stride_c,
+                 index_t m_padded,
+                 index_t n_padded,
+                 index_t k_padded,
+                 index_t k0,
+                 index_t num_k_batch,
+                 AElementwiseOperation a_op,
+                 BElementwiseOperation b_op,
+                 CElementwiseOperation c_op)
+            : GridwiseGemm::Argument(a_grid_ptr,
+                                     b_grid_ptr,
+                                     c_grid_ptr,
+                                     m,
+                                     n,
+                                     k,
+                                     stride_a,
+                                     stride_b,
+                                     stride_c,
+                                     m_padded,
+                                     n_padded,
+                                     k_padded,
+                                     k0,
+                                     num_k_batch),
+              a_element_op(a_op),
+              b_element_op(b_op),
+              c_element_op(c_op)
+        {
+        }
+
+        // Element-wise operations kept alongside the gridwise argument.
+        AElementwiseOperation a_element_op;
+        BElementwiseOperation b_element_op;
+        CElementwiseOperation c_element_op;
+    };
+
    using DefaultBlock2CTileMap = typename GridwiseGemm::DefaultBlock2CTileMap;

    // Invoker
@@ -168,8 +211,17 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
                                                     karg.M * karg.N * sizeof(CDataType),
                                                     stream_config.stream_id_));

-                ave_time = launch_and_time_kernel(
-                    stream_config, kernel, dim3(gdx, gdy, gdz), dim3(BlockSize), 0, karg, b2c_map);
+                ave_time =
+                    launch_and_time_kernel(stream_config,
+                                           kernel,
+                                           dim3(gdx, gdy, gdz),
+                                           dim3(BlockSize),
+                                           0,
+                                           static_cast<typename GridwiseGemm::Argument>(karg),
+                                           b2c_map,
+                                           karg.a_element_op,
+                                           karg.b_element_op,
+                                           karg.c_element_op);
            };

            if(has_main_k0_block_loop)
@@ -180,7 +232,10 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
                        kernel_gemm_xdlops_v2r4r2_simplified<GridwiseGemm,
                                                             true,
                                                             InMemoryDataOperationEnum::Set,
-                                                             DefaultBlock2CTileMap>;
+                                                             DefaultBlock2CTileMap,
+                                                             AElementwiseOperation,
+                                                             BElementwiseOperation,
+                                                             CElementwiseOperation>;

                    Run(kernel);
                }
@@ -190,7 +245,10 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
                        kernel_gemm_xdlops_v2r4r2_simplified<GridwiseGemm,
                                                             true,
                                                             InMemoryDataOperationEnum::AtomicAdd,
-                                                             DefaultBlock2CTileMap>;
+                                                             DefaultBlock2CTileMap,
+                                                             AElementwiseOperation,
+                                                             BElementwiseOperation,
+                                                             CElementwiseOperation>;

                    Run(kernel);
                }
@@ -203,7 +261,10 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
                        kernel_gemm_xdlops_v2r4r2_simplified<GridwiseGemm,
                                                             false,
                                                             InMemoryDataOperationEnum::Set,
-                                                             DefaultBlock2CTileMap>;
+                                                             DefaultBlock2CTileMap,
+                                                             AElementwiseOperation,
+                                                             BElementwiseOperation,
+                                                             CElementwiseOperation>;

                    Run(kernel);
                }
@@ -213,7 +274,10 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
                        kernel_gemm_xdlops_v2r4r2_simplified<GridwiseGemm,
                                                             false,
                                                             InMemoryDataOperationEnum::AtomicAdd,
-                                                             DefaultBlock2CTileMap>;
+                                                             DefaultBlock2CTileMap,
+                                                             AElementwiseOperation,
+                                                             BElementwiseOperation,
+                                                             CElementwiseOperation>;

                    Run(kernel);
                }
@@ -261,12 +325,12 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
                             index_t StrideA,
                             index_t StrideB,
                             index_t StrideC,
-                             AElementwiseOperation,
-                             BElementwiseOperation,
-                             CElementwiseOperation,
+                             AElementwiseOperation a_element_op,
+                             BElementwiseOperation b_element_op,
+                             CElementwiseOperation c_element_op,
                             index_t KBatch)
    {
-        return Argument{p_a,
+        return Argument(p_a,
                        p_b,
                        p_c,
                        M,
@@ -279,7 +343,10 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
                        GridwiseGemm::CalculateNPadded(N),
                        GridwiseGemm::CalculateKPadded(K, KBatch),
                        GridwiseGemm::CalculateK0(K, KBatch),
-                        KBatch};
+                        KBatch,
+                        a_element_op,
+                        b_element_op,
+                        c_element_op);
    }

    static auto MakeInvoker() { return Invoker{}; }
@@ -294,9 +361,9 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
                                                      index_t StrideA,
                                                      index_t StrideB,
                                                      index_t StrideC,
-                                                      AElementwiseOperation,
-                                                      BElementwiseOperation,
-                                                      CElementwiseOperation,
+                                                      AElementwiseOperation a_element_op,
+                                                      BElementwiseOperation b_element_op,
+                                                      CElementwiseOperation c_element_op,
                                                      ck::index_t KBatch = 1) override
    {
        return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
@@ -312,7 +379,10 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
                                          GridwiseGemm::CalculateNPadded(N),
                                          GridwiseGemm::CalculateKPadded(K, KBatch),
                                          GridwiseGemm::CalculateK0(K, KBatch),
-                                          KBatch);
+                                          KBatch,
+                                          a_element_op,
+                                          b_element_op,
+                                          c_element_op);
    }

    // polymorphic

--- a/include/ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp
@@ -28,6 +28,7 @@ template <typename XDataType,
          typename BetaDataType,
          typename ComputeDataType,
          typename YDataType,
+          typename SaveMeanInvStdDataType,
          typename YElementwiseOperation,
          index_t Rank,
          index_t NumReduceDim,
@@ -43,12 +44,13 @@ template <typename XDataType,
          index_t BetaSrcVectorDim,
          index_t BetaSrcVectorSize,
          index_t YDstVectorSize,
+          index_t SaveMeanInvStdDstVectorSize,
          bool UseWelford = true>
 struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
                                                            GammaDataType,
                                                            BetaDataType,
-                                                            ComputeDataType,
                                                            YDataType,
+                                                            SaveMeanInvStdDataType,
                                                            YElementwiseOperation,
                                                            Rank,
                                                            NumReduceDim>
@@ -64,18 +66,24 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
         (BetaSrcVectorDim == 1 && KThreadSliceSize % BetaSrcVectorSize == 0)),
        "Invalid thread slice sizes and/or beta vector sizes configuration, please check!");

+    static_assert(MThreadSliceSize % SaveMeanInvStdDstVectorSize == 0,
+                  "Invalid thread slice sizes and/or save mean and inverse std vector sizes "
+                  "configuration, please check!");
+
    using PassThrough = tensor_operation::element_wise::PassThrough;

+    static constexpr index_t NumInvariantDim = Rank - NumReduceDim;
    static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
    static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;

+    static constexpr bool reduceAllDim = (NumInvariantDim == 0);
+    static_assert(!reduceAllDim); // TODO
+
    static auto MakeSrc2dDescriptor(const std::vector<index_t>& inLengths,
                                    const std::vector<index_t>& inStrides,
                                    int numBlockTileIteration)
    {
-        constexpr index_t NumInvariantDim  = Rank - NumReduceDim;
        static constexpr index_t numSrcDim = Rank;
-        static constexpr bool reduceAllDim = (NumInvariantDim == 0);

        const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number<numSrcDim>{});
        const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number<numSrcDim>{});
@@ -133,7 +141,37 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
        return (in_grid_desc_m_k_padded);
    };

+    // Builds the one-dimensional (M) grid descriptor used for the saved mean / inverse-std
+    // tensors. Entries [0, NumInvariantDim) of `lengths` and `strides` are merged into a single
+    // dimension, which is then right-padded up to a multiple of M_BlockTileSize.
+    static auto MakeSaveMeanInvStdDescriptor_M(const std::vector<index_t>& lengths,
+                                               const std::vector<index_t>& strides)
+    {
+        using InvariantDims = typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type;
+
+        const auto invariant_lengths =
+            make_tuple_from_array_and_index_seq(lengths, InvariantDims{});
+        const auto invariant_strides =
+            make_tuple_from_array_and_index_seq(strides, InvariantDims{});
+
+        const auto naive_desc =
+            make_naive_tensor_descriptor(invariant_lengths, invariant_strides);
+
+        // Merge every invariant dimension into dimension 0.
+        const auto merged_desc_m =
+            transform_tensor_descriptor(naive_desc,
+                                        make_tuple(make_merge_transform(invariant_lengths)),
+                                        make_tuple(InvariantDims{}),
+                                        make_tuple(Sequence<0>{}));
+
+        // Padding needed to reach the next multiple of the M block tile size.
+        const auto m_raw = merged_desc_m.GetLength(Number<0>{});
+        const auto m_pad = math::integer_least_multiple(m_raw, M_BlockTileSize) - m_raw;
+
+        return transform_tensor_descriptor(merged_desc_m,
+                                           make_tuple(make_right_pad_transform(m_raw, m_pad)),
+                                           make_tuple(Sequence<0>{}),
+                                           make_tuple(Sequence<0>{}));
+    }
+
    using GridDesc_M_K = decltype(MakeSrc2dDescriptor({1}, {1}, 1));
+    using GridDesc_M   = decltype(MakeSaveMeanInvStdDescriptor_M({1}, {1}));

    struct Argument : public BaseArgument
    {
@@ -142,17 +180,23 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
                 const std::vector<index_t> gammaStrides,
                 const std::vector<index_t> betaStrides,
                 const std::vector<index_t> yStrides,
+                 const std::vector<index_t> saveMeanStrides,
+                 const std::vector<index_t> saveInvStdStrides,
                 const std::vector<index_t> reduceDims,
                 YElementwiseOperation y_elementwise_op,
                 double epsilon,
                 const XDataType* p_x,
                 const GammaDataType* p_gamma,
                 const BetaDataType* p_beta,
-                 YDataType* p_y)
+                 YDataType* p_y,
+                 SaveMeanInvStdDataType* p_saveMean,
+                 SaveMeanInvStdDataType* p_saveInvStd)
            : p_x_(p_x),
              p_gamma_(p_gamma),
              p_beta_(p_beta),
              p_y_(p_y),
+              p_saveMean_(p_saveMean),
+              p_saveInvStd_(p_saveInvStd),
              y_elementwise_op_(y_elementwise_op)
        {
            epsilon_ = static_cast<ComputeDataType>(epsilon);
@@ -162,16 +206,14 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
            yStrides_     = shuffle_tensor_dimensions<Rank, NumReduceDim>(yStrides, reduceDims);
            gammaStrides_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(gammaStrides, reduceDims);
            betaStrides_  = shuffle_tensor_dimensions<Rank, NumReduceDim>(betaStrides, reduceDims);
+            saveMeanStrides_   = saveMeanStrides;
+            saveInvStdStrides_ = saveInvStdStrides;

-            long_index_t invariant_length;
-            long_index_t reduce_length;
-
-            std::tie(invariant_length, reduce_length) =
-                get_2d_lengths<Rank, NumReduceDim>(Lengths_);
+            std::tie(MRaw_, KRaw_) = get_2d_lengths<Rank, NumReduceDim>(Lengths_);

-            numBlockTileIteration_ = math::integer_divide_ceil(reduce_length, K_BlockTileSize);
+            numBlockTileIteration_ = math::integer_divide_ceil(KRaw_, K_BlockTileSize);

-            gridSize_ = math::integer_divide_ceil(invariant_length, M_BlockTileSize);
+            gridSize_ = math::integer_divide_ceil(MRaw_, M_BlockTileSize);

            x_grid_desc_m_k_ = MakeSrc2dDescriptor(Lengths_, xStrides_, numBlockTileIteration_);
            gamma_grid_desc_m_k_ =
@@ -179,9 +221,16 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
            beta_grid_desc_m_k_ =
                MakeSrc2dDescriptor(Lengths_, betaStrides_, numBlockTileIteration_);
            y_grid_desc_m_k_ = MakeSrc2dDescriptor(Lengths_, yStrides_, numBlockTileIteration_);
+            save_mean_grid_desc_m_    = MakeSaveMeanInvStdDescriptor_M(Lengths_, saveMeanStrides);
+            save_inv_std_grid_desc_m_ = MakeSaveMeanInvStdDescriptor_M(Lengths_, saveInvStdStrides);

            isSweeponce_ =
                x_grid_desc_m_k_.GetLength(Number<1>{}) <= KThreadClusterSize * KThreadSliceSize;
+
+            if constexpr(NumInvariantDim == 0)
+                invariant_lowest_length_ = 1;
+            else
+                invariant_lowest_length_ = Lengths_[NumInvariantDim - 1];
        }

        ComputeDataType epsilon_;
@@ -190,12 +239,16 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
        const GammaDataType* p_gamma_;
        const BetaDataType* p_beta_;
        YDataType* p_y_;
+        SaveMeanInvStdDataType* p_saveMean_;
+        SaveMeanInvStdDataType* p_saveInvStd_;

        std::vector<index_t> Lengths_;
        std::vector<index_t> xStrides_;
        std::vector<index_t> gammaStrides_;
        std::vector<index_t> betaStrides_;
        std::vector<index_t> yStrides_;
+        std::vector<index_t> saveMeanStrides_;
+        std::vector<index_t> saveInvStdStrides_;

        YElementwiseOperation y_elementwise_op_;

@@ -206,7 +259,14 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
        GridDesc_M_K gamma_grid_desc_m_k_;
        GridDesc_M_K beta_grid_desc_m_k_;
        GridDesc_M_K y_grid_desc_m_k_;
+        GridDesc_M save_mean_grid_desc_m_;
+        GridDesc_M save_inv_std_grid_desc_m_;
        bool isSweeponce_;
+
+        index_t MRaw_; // invariant length
+        index_t KRaw_; // reduce length
+
+        index_t invariant_lowest_length_;
    };

    struct Invoker : public BaseInvoker
@@ -217,9 +277,11 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
                                                           GammaDataType,
                                                           BetaDataType,
                                                           YDataType,
+                                                           SaveMeanInvStdDataType,
                                                           ComputeDataType,
                                                           YElementwiseOperation,
                                                           GridDesc_M_K,
+                                                           GridDesc_M,
                                                           BlockSize,
                                                           MThreadClusterSize,
                                                           KThreadClusterSize,
@@ -233,6 +295,7 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
                                                           BetaSrcVectorSize,
                                                           XYSrcVectorDim,
                                                           YDstVectorSize,
+                                                           SaveMeanInvStdDstVectorSize,
                                                           UseWelford>(arg.isSweeponce_);

            float avg_time = 0;
@@ -245,12 +308,16 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
                                               arg.gamma_grid_desc_m_k_,
                                               arg.beta_grid_desc_m_k_,
                                               arg.y_grid_desc_m_k_,
+                                               arg.save_mean_grid_desc_m_,
+                                               arg.save_inv_std_grid_desc_m_,
                                               arg.numBlockTileIteration_,
                                               arg.epsilon_,
                                               arg.p_x_,
                                               arg.p_gamma_,
                                               arg.p_beta_,
                                               arg.p_y_,
+                                               arg.p_saveMean_,
+                                               arg.p_saveInvStd_,
                                               arg.y_elementwise_op_);

            return (avg_time);
@@ -267,8 +334,6 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
    {
        const Argument* p_arg_ = dynamic_cast<const Argument*>(p_arg);

-        constexpr index_t NumInvariantDim = Rank - NumReduceDim;
-
        if constexpr(XYSrcVectorDim == 0)
        {
            if constexpr(NumInvariantDim == 0)
@@ -277,13 +342,15 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
            }
            else
            {
+                printf("!!!! %d\n", p_arg_->invariant_lowest_length_);
+
                if(p_arg_->xStrides_[NumInvariantDim - 1] != 1)
                    return false;

-                if(p_arg_->invariant_lowest_length % XSrcVectorSize != 0)
+                if(p_arg_->invariant_lowest_length_ % XSrcVectorSize != 0)
                    return false;

-                if(p_arg_->invariant_lowest_length % YDstVectorSize != 0)
+                if(p_arg_->invariant_lowest_length_ % YDstVectorSize != 0)
                    return false;
            };
        }
@@ -325,7 +392,7 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
            if(p_arg_->betaStrides_[NumInvariantDim - 1] != 1)
                return (false);

-            if(p_arg_->invariant_lowest_length % BetaSrcVectorSize != 0)
+            if(p_arg_->invariant_lowest_length_ % BetaSrcVectorSize != 0)
                return (false);
        }
        else // if fastest dim is reduced
@@ -337,6 +404,9 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
                return (false);
        }

+        if(p_arg_->invariant_lowest_length_ % SaveMeanInvStdDstVectorSize != 0)
+            return false;
+
        return true;
    };

@@ -346,6 +416,8 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
                        const std::vector<index_t> gammaStrides,
                        const std::vector<index_t> betaStrides,
                        const std::vector<index_t> yStrides,
+                        const std::vector<index_t> saveMeanStrides,
+                        const std::vector<index_t> saveInvStdStrides,
                        const std::vector<index_t> reduceDims,
                        double epsilon,
                        const void* p_x,
@@ -353,27 +425,30 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
                        const void* p_beta,
                        void* p_y,
                        void* p_saveMean,
-                        void* p_saveInvVar,
+                        void* p_saveInvStd,
                        YElementwiseOperation y_elementwise_op) override
    {
-        // TODO
-        // Optional cache of the intermediate results (mean and InvVariance) during the
-        // forward pass could speedup in the backward
-        ignore = p_saveMean;
-        ignore = p_saveInvVar;
+        if(lengths.size() != Rank || xStrides.size() != Rank || gammaStrides.size() != Rank ||
+           betaStrides.size() != Rank || yStrides.size() != Rank ||
+           saveMeanStrides.size() != NumInvariantDim || saveInvStdStrides.size() != NumInvariantDim)
+            throw std::runtime_error("dimension is incorrect");

        return std::make_unique<Argument>(lengths,
                                          xStrides,
                                          gammaStrides,
                                          betaStrides,
                                          yStrides,
+                                          saveMeanStrides,
+                                          saveInvStdStrides,
                                          reduceDims,
                                          y_elementwise_op,
                                          epsilon,
                                          static_cast<const XDataType*>(p_x),
                                          static_cast<const GammaDataType*>(p_gamma),
                                          static_cast<const BetaDataType*>(p_beta),
-                                          static_cast<YDataType*>(p_y));
+                                          static_cast<YDataType*>(p_y),
+                                          static_cast<SaveMeanInvStdDataType*>(p_saveMean),
+                                          static_cast<SaveMeanInvStdDataType*>(p_saveInvStd));
    };

    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override

--- a/include/ck/tensor_operation/gpu/device/impl/device_normalization_splitk_impl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_normalization_splitk_impl.hpp
--- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
@@ -118,7 +118,6 @@ struct PassThrough
    }
 #endif

-#if defined CK_ENABLE_FP8
    template <>
    __host__ __device__ void operator()<f8_t, f8_t>(f8_t& y, const f8_t& x) const
    {
@@ -148,9 +147,7 @@ struct PassThrough
    {
        y = type_convert<f8_t>(x);
    }
-#endif

-#if defined CK_ENABLE_BF8
    template <>
    __host__ __device__ void operator()<bf8_t, bf8_t>(bf8_t& y, const bf8_t& x) const
    {
@@ -180,7 +177,6 @@ struct PassThrough
    {
        y = ck::type_convert<bf8_t>(x);
    }
-#endif
 };

 struct UnaryConvert
@@ -209,7 +205,6 @@ struct ConvertBF16RTN
    }
 };

-#if defined CK_ENABLE_FP8
 struct ConvertF8SR
 {
    // convert to fp8 using stochastic rounding (SR)
@@ -226,7 +221,6 @@ struct ConvertF8SR
        y = f8_convert_sr<Y>(x);
    }
 };
-#endif

 struct Scale
 {
@@ -453,10 +447,11 @@ struct Sigmoid
    __host__ __device__ void operator()(T& y, const T& x) const
    {
        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
-                          is_same<T, ck::half_t>::value,
+                          is_same<T, ck::half_t>::value || is_same<T, int8_t>::value ||
+                          is_same<T, int32_t>::value,
                      "Data type is not supported by this operation!");
-
-        y = 1 / (ck::type_convert<T>(1) + exp(-x));
+        constexpr T one = type_convert<T>(1);
+        y               = one / (one + ck::math::exp(-x));
    };
 };

@@ -466,7 +461,8 @@ struct TanH
    __host__ __device__ void operator()(T& y, const T& x) const
    {
        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
-                          is_same<T, ck::half_t>::value,
+                          is_same<T, ck::half_t>::value || is_same<T, int8_t>::value ||
+                          is_same<T, int32_t>::value,
                      "Data type is not supported by this operation!");

        y = ck::math::tanh(x);
@@ -492,7 +488,101 @@ struct Swish
        y        = type_convert<Y>(x / (1.f + ck::math::exp(bx)));
    };

-    float beta_ = 1.0f;
+    const float beta_;
+};
+
+struct SoftRelu // Element-wise functor: y = log(1 + exp(alpha * x)) / alpha.
+{
+    SoftRelu(float alpha = 1.f) : alpha_(alpha){}; // alpha defaults to 1 and is stored as float
+
+    template <typename T>
+    __host__ __device__ void operator()(T& y, const T& x) const // reads x, writes the result to y
+    {
+        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
+                          is_same<T, half_t>::value || is_same<T, int32_t>::value ||
+                          is_same<T, int8_t>::value,
+                      "Data type is not supported by this operation!"); // T is limited to the five types listed above
+        T casted_alpha  = type_convert<T>(alpha_); // alpha converted to the operand type T
+        constexpr T one = type_convert<T>(1);
+        y               = ck::math::log(one + ck::math::exp(x * casted_alpha)) / casted_alpha; // NOTE(review): casted_alpha is the divisor; for integer T a fractional alpha may convert to 0 -- confirm type_convert behavior
+    }
+    const float alpha_; // scaling factor applied to x and divided out of the result
+};
+
+struct Power // Element-wise functor: y = (alpha + beta * x) ^ gamma.
+{
+    Power(float alpha = 0.f, float beta = 1.f, float gamma = 2.f) // defaults reduce the op to y = x ^ 2
+        : alpha_(alpha), beta_(beta), gamma_(gamma){};
+
+    template <typename T>
+    __host__ __device__ void operator()(T& y, const T& x) const // reads x, writes the result to y
+    {
+        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
+                          is_same<T, half_t>::value || is_same<T, int32_t>::value ||
+                          is_same<T, int8_t>::value,
+                      "Data type is not supported by this operation!"); // T is limited to the five types listed above
+        T casted_alpha     = type_convert<T>(alpha_); // all three coefficients are converted to T before use
+        T casted_beta      = type_convert<T>(beta_);
+        T casted_gamma     = type_convert<T>(gamma_);
+        T shifted_scaled_x = casted_alpha + casted_beta * x; // base of the power: shift by alpha, scale by beta
+        y                  = ck::math::pow(shifted_scaled_x, casted_gamma);
+    }
+    const float alpha_; // additive shift
+    const float beta_;  // multiplicative scale on x
+    const float gamma_; // exponent
+};
+
+struct ClippedRelu // Element-wise functor: y = min(beta, max(alpha, x)).
+{
+    ClippedRelu(float alpha = 0.f, float beta = 1.f) : alpha_(alpha), beta_(beta){}; // default bounds are alpha = 0 and beta = 1
+
+    template <typename T>
+    __host__ __device__ void operator()(T& y, const T& x) const // reads x, writes the result to y
+    {
+        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
+                          is_same<T, half_t>::value || is_same<T, int32_t>::value ||
+                          is_same<T, int8_t>::value,
+                      "Data type is not supported by this operation!"); // T is limited to the five types listed above
+        T casted_alpha = type_convert<T>(alpha_); // both bounds are converted to the operand type T
+        T casted_beta  = type_convert<T>(beta_);
+        y              = ck::math::min(casted_beta, ck::math::max(casted_alpha, x)); // max with alpha first, then min with beta
+    }
+    const float alpha_; // bound passed to max()
+    const float beta_;  // bound passed to min()
+};
+
+struct LeakyRelu // Element-wise functor: y = x when x >= 0, otherwise y = x * alpha.
+{
+    LeakyRelu(float alpha = 0.01f) : alpha_(alpha){}; // alpha defaults to 0.01
+
+    template <typename T>
+    __host__ __device__ void operator()(T& y, const T& x) const // reads x, writes the result to y
+    {
+        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
+                          is_same<T, half_t>::value || is_same<T, int32_t>::value ||
+                          is_same<T, int8_t>::value,
+                      "Data type is not supported by this operation!"); // T is limited to the five types listed above
+        T casted_alpha = type_convert<T>(alpha_); // NOTE(review): for integer T the default 0.01 presumably converts to 0 -- confirm type_convert behavior
+        y              = x >= 0 ? x : x * casted_alpha; // only negative inputs are scaled by alpha
+    }
+    const float alpha_; // multiplier applied to negative inputs
+};
+
+struct Elu // Element-wise functor: y = x when x > 0, otherwise y = alpha * expm1(x).
+{
+    Elu(float alpha = 1.f) : alpha_(alpha){}; // alpha defaults to 1
+
+    template <typename T>
+    __host__ __device__ void operator()(T& y, const T& x) const // reads x, writes the result to y
+    {
+        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
+                          is_same<T, half_t>::value || is_same<T, int32_t>::value ||
+                          is_same<T, int8_t>::value,
+                      "Data type is not supported by this operation!"); // T is limited to the five types listed above
+        T casted_alpha = type_convert<T>(alpha_); // alpha converted to the operand type T
+        y              = x > 0 ? x : casted_alpha * ck::math::expm1(x); // presumably expm1(x) = exp(x) - 1, as with std::expm1 -- confirm against ck::math
+    }
+    const float alpha_; // multiplier applied to expm1(x) when x <= 0
 };

 } // namespace element_wise

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
@@ -22,13 +22,19 @@ namespace ck {
 template <typename GridwiseGemm,
          bool HasMainKBlockLoop,
          InMemoryDataOperationEnum CGlobalMemoryDataOperation,
-          typename Block2CTileMap>
+          typename Block2CTileMap,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CElementwiseOperation>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
 #endif
        kernel_gemm_xdlops_v2r4r2_simplified(typename GridwiseGemm::Argument karg,
-                                             const Block2CTileMap& b2c_map)
+                                             const Block2CTileMap& b2c_map,
+                                             const AElementwiseOperation a_element_op,
+                                             const BElementwiseOperation b_element_op,
+                                             const CElementwiseOperation c_element_op)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
@@ -37,10 +43,13 @@ __global__ void
    __shared__ uint8_t p_shared[shared_size];

    GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation>(
-        karg, static_cast<void*>(p_shared), b2c_map);
+        karg, static_cast<void*>(p_shared), b2c_map, a_element_op, b_element_op, c_element_op);
 #else
    ignore = karg;
    ignore = b2c_map;
+    ignore = a_element_op;
+    ignore = b_element_op;
+    ignore = c_element_op;
 #endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
 }

@@ -577,7 +586,10 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
              typename Block2CTileMap>
    __device__ static void Run(const Argument& karg,
                               void* __restrict__ p_shared_block,
-                               const Block2CTileMap& block_2_ctile_map)
+                               const Block2CTileMap& block_2_ctile_map,
+                               const AElementwiseOperation a_element_op = AElementwiseOperation{},
+                               const BElementwiseOperation b_element_op = BElementwiseOperation{},
+                               const CElementwiseOperation c_element_op = CElementwiseOperation{})
    {
        const FloatA* p_a_grid           = karg.p_a_grid;
        const FloatB* p_b_grid           = karg.p_b_grid;
@@ -590,9 +602,6 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2

        const auto c_grid_desc_mblock_mperblock_nblock_nperblock =
            MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock(c_grid_desc_m_n);
-        const AElementwiseOperation a_element_op = AElementwiseOperation{};
-        const BElementwiseOperation b_element_op = BElementwiseOperation{};
-        const CElementwiseOperation c_element_op = CElementwiseOperation{};

        const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_a_grid, a_b_k0_m_k1_grid_desc.GetElementSpaceSize());
@@ -761,8 +770,8 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2

        auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector<
            BlockSize,
-            ComputeType,
-            ComputeType,
+            ComputeType, // ComputeType A
+            ComputeType, // ComputeType B
            FloatAcc,
            decltype(a_k0_m_k1_block_desc),
            decltype(b_k0_n_k1_block_desc),

--- a/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_naive_variance.hpp
+++ b/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_naive_variance.hpp
@@ -18,9 +18,11 @@ template <typename XDataType,
          typename GammaDataType,
          typename BetaDataType,
          typename YDataType,
+          typename SaveMeanInvStdDataType,
          typename ComputeDataType,
          typename YElementwiseOperation,
          typename GridDesc_M_K,
+          typename GridDesc_M,
          index_t BlockSize,
          index_t MThreadClusterSize,
          index_t KThreadClusterSize,
@@ -34,6 +36,7 @@ template <typename XDataType,
          index_t BetaSrcVectorSize,
          index_t YDstVectorDim,
          index_t YDstVectorSize,
+          index_t SaveMeanInvStdDstVectorSize,
          bool SweepOnce>
 struct GridwiseNormalizationNaiveVariance_mk_to_mk
 {
@@ -45,6 +48,10 @@ struct GridwiseNormalizationNaiveVariance_mk_to_mk
                      (YDstVectorDim == 1 && KThreadSliceSize % YDstVectorSize == 0),
                  "Invalid thread slice sizes and/or vector sizes configuration, please check!");

+    static_assert(MThreadSliceSize % SaveMeanInvStdDstVectorSize == 0,
+                  "Invalid thread slice sizes and/or save mean and inverse std vector sizes "
+                  "configuration, please check!");
+
    static_assert(XSrcVectorSize == YDstVectorSize);
    static_assert(XSrcVectorSize == GammaSrcVectorSize);
    static_assert(XSrcVectorSize == BetaSrcVectorSize);
@@ -66,6 +73,10 @@ struct GridwiseNormalizationNaiveVariance_mk_to_mk
    static constexpr auto thread_buffer_desc_m_k = make_naive_tensor_descriptor_packed(
        make_tuple(Number<MThreadSliceSize>{}, Number<XSrcVectorSize>{}));

+    using ThreadBufferLengths_M = Sequence<MThreadSliceSize>;
+    static constexpr auto thread_buffer_desc_m =
+        make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{}));
+
    using ThreadReduceSrcDesc_M_K = decltype(make_naive_tensor_descriptor_packed(
        make_tuple(Number<MThreadSliceSize>{}, Number<XSrcVectorSize>{})));
    using ThreadReduceDstDesc_M =
@@ -84,6 +95,8 @@ struct GridwiseNormalizationNaiveVariance_mk_to_mk
                                                    reduce::Add,
                                                    true>;

+    using PassThroughOp = tensor_operation::element_wise::PassThrough;
+
    static constexpr auto I0 = Number<0>{};
    static constexpr auto I1 = Number<1>{};
    static constexpr auto I2 = Number<2>{};
@@ -98,12 +111,16 @@ struct GridwiseNormalizationNaiveVariance_mk_to_mk
                               const GridDesc_M_K& gamma_grid_desc_m_k,
                               const GridDesc_M_K& beta_grid_desc_m_k,
                               const GridDesc_M_K& y_grid_desc_m_k,
+                               const GridDesc_M& save_mean_grid_desc_m,
+                               const GridDesc_M& save_inv_std_grid_desc_m,
                               index_t num_k_block_tile_iteration,
                               ComputeDataType epsilon,
                               const XDataType* const __restrict__ p_x_global,
                               const GammaDataType* const __restrict__ p_gamma_global,
                               const BetaDataType* const __restrict__ p_beta_global,
                               YDataType* const __restrict__ p_y_global,
+                               SaveMeanInvStdDataType* const __restrict__ p_save_mean_global,
+                               SaveMeanInvStdDataType* const __restrict__ p_save_inv_std_global,
                               const YElementwiseOperation y_elementwise_op)
    {
        // LDS
@@ -115,6 +132,12 @@ struct GridwiseNormalizationNaiveVariance_mk_to_mk
        auto y_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_y_global, y_grid_desc_m_k.GetElementSpaceSize());

+        auto save_mean_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_save_mean_global, save_mean_grid_desc_m.GetElementSpaceSize());
+
+        auto save_inv_std_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_save_inv_std_global, save_inv_std_grid_desc_m.GetElementSpaceSize());
+
        auto x_thread_buf = generate_tuple(
            [&](auto) {
                return StaticBuffer<AddressSpaceEnum::Vgpr,
@@ -152,6 +175,8 @@ struct GridwiseNormalizationNaiveVariance_mk_to_mk
            mean_square_thread_buf;
        StaticBuffer<AddressSpaceEnum::Vgpr, ComputeDataType, MThreadSliceSize, true>&
            var_thread_buf = mean_square_thread_buf;
+        StaticBuffer<AddressSpaceEnum::Vgpr, ComputeDataType, MThreadSliceSize, true>&
+            inv_std_thread_buf = mean_square_thread_buf;

        const index_t thread_local_id = get_thread_local_1d_id();
        const index_t block_global_id = get_block_1d_id();
@@ -228,6 +253,42 @@ struct GridwiseNormalizationNaiveVariance_mk_to_mk
                                 thread_k_cluster_id * YDstVectorSize),
                y_elementwise_op);

+        auto threadwise_mean_store =
+            ThreadwiseTensorSliceTransfer_v1r3<ComputeDataType,
+                                               SaveMeanInvStdDataType,
+                                               decltype(thread_buffer_desc_m),
+                                               GridDesc_M,
+                                               PassThroughOp,
+                                               ThreadBufferLengths_M,
+                                               Sequence<0>,                 // DimAccessOrder
+                                               0,                           // SrcVectorDim
+                                               SaveMeanInvStdDstVectorSize, // ScalarPerVector
+                                               InMemoryDataOperationEnum::Set,
+                                               1,
+                                               true>(
+                save_mean_grid_desc_m,
+                make_multi_index(block_global_id * M_BlockTileSize +
+                                 thread_m_cluster_id * MThreadSliceSize),
+                PassThroughOp{});
+
+        auto threadwise_inv_std_store =
+            ThreadwiseTensorSliceTransfer_v1r3<ComputeDataType,
+                                               SaveMeanInvStdDataType,
+                                               decltype(thread_buffer_desc_m),
+                                               GridDesc_M,
+                                               PassThroughOp,
+                                               ThreadBufferLengths_M,
+                                               Sequence<0>,                 // DimAccessOrder
+                                               0,                           // SrcVectorDim
+                                               SaveMeanInvStdDstVectorSize, // ScalarPerVector
+                                               InMemoryDataOperationEnum::Set,
+                                               1,
+                                               true>(
+                save_inv_std_grid_desc_m,
+                make_multi_index(block_global_id * M_BlockTileSize +
+                                 thread_m_cluster_id * MThreadSliceSize),
+                PassThroughOp{});
+
        constexpr auto thread_copy_fwd_step_m_k = make_multi_index(0, K_BlockTileStepSize);
        constexpr auto thread_copy_bwd_step_m_k =
            make_multi_index(0, SweepOnce ? 0 : -K_BlockTileSize);
@@ -243,7 +304,8 @@ struct GridwiseNormalizationNaiveVariance_mk_to_mk

        // E(x), E[x^2], var(x)
        // FIXME: Should not hack the transform from deviceOP
-        int reduce_length = x_grid_desc_m_k.GetTransforms()[I2].GetUpperLengths()[I0];
+        ComputeDataType reduce_length = type_convert<ComputeDataType>(
+            x_grid_desc_m_k.GetTransforms()[I2].GetUpperLengths()[I0]);

        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
            mean_thread_buf(I)        = reduce::Add::template GetIdentityValue<ComputeDataType>();
@@ -302,10 +364,34 @@ struct GridwiseNormalizationNaiveVariance_mk_to_mk
                // var(x) = E[x^2] - E[x]^2
                var_thread_buf(I) =
                    mean_square_thread_buf(I) - (mean_thread_buf(I) * mean_thread_buf(I));
+
+                inv_std_thread_buf(I) = type_convert<ComputeDataType>(1.0f) /
+                                        ck::math::sqrt(var_thread_buf(I) + epsilon);
            });

+            // save mean and inverse std for backward (optional)
+            if(thread_k_cluster_id == 0)
+            {
+                if(p_save_mean_global != nullptr)
+                {
+                    threadwise_mean_store.Run(thread_buffer_desc_m,
+                                              make_tuple(I0),
+                                              mean_thread_buf,
+                                              save_mean_grid_desc_m,
+                                              save_mean_global_val_buf);
+                }
+                if(p_save_inv_std_global != nullptr)
+                {
+                    threadwise_inv_std_store.Run(thread_buffer_desc_m,
+                                                 make_tuple(I0),
+                                                 inv_std_thread_buf,
+                                                 save_inv_std_grid_desc_m,
+                                                 save_inv_std_global_val_buf);
+                }
+            }
+
+            // normalization
            static_for<0, MThreadSliceSize, 1>{}([&](auto iM) {
-                auto divisor = 1 / ck::math::sqrt(var_thread_buf(iM) + epsilon);
                static_for<0, ThreadBufferNumber, 1>{}([&](auto iK0) {
                    static_for<0, XSrcVectorSize, 1>{}([&](auto iK1) {
                        constexpr auto offset_m_k =
@@ -314,7 +400,7 @@ struct GridwiseNormalizationNaiveVariance_mk_to_mk
                        // normalize
                        y_thread_buf(iK0)(Number<offset_m_k>{}) =
                            (x_thread_buf(iK0)(Number<offset_m_k>{}) - mean_thread_buf(iM)) *
-                            divisor;
+                            inv_std_thread_buf(iM);

                        // gamma & beta
                        y_thread_buf(iK0)(Number<offset_m_k>{}) =
@@ -404,8 +490,30 @@ struct GridwiseNormalizationNaiveVariance_mk_to_mk
                // var(x) = E[x^2] - E[x]^2
                var_thread_buf(I) =
                    mean_square_thread_buf(I) - (mean_thread_buf(I) * mean_thread_buf(I));
+
+                inv_std_thread_buf(I) = 1 / ck::math::sqrt(var_thread_buf(I) + epsilon);
            });

+            if(thread_k_cluster_id == 0)
+            {
+                if(p_save_mean_global != nullptr)
+                {
+                    threadwise_mean_store.Run(thread_buffer_desc_m,
+                                              make_tuple(I0),
+                                              mean_thread_buf,
+                                              save_mean_grid_desc_m,
+                                              save_mean_global_val_buf);
+                }
+                if(p_save_inv_std_global != nullptr)
+                {
+                    threadwise_inv_std_store.Run(thread_buffer_desc_m,
+                                                 make_tuple(I0),
+                                                 inv_std_thread_buf,
+                                                 save_inv_std_grid_desc_m,
+                                                 save_inv_std_global_val_buf);
+                }
+            }
+
            auto thread_copy_tail_m_k =
                (num_k_block_tile_iteration - 1) * ThreadBufferNumber * thread_copy_fwd_step_m_k;

@@ -437,7 +545,6 @@ struct GridwiseNormalizationNaiveVariance_mk_to_mk
                });

                static_for<0, MThreadSliceSize, 1>{}([&](auto iM) {
-                    auto divisor = 1 / ck::math::sqrt(var_thread_buf(iM) + epsilon);
                    static_for<0, ThreadBufferNumber, 1>{}([&](auto iK0) {
                        static_for<0, XSrcVectorSize, 1>{}([&](auto iK1) {
                            constexpr auto offset_m_k =
@@ -446,7 +553,7 @@ struct GridwiseNormalizationNaiveVariance_mk_to_mk
                            // normalize
                            y_thread_buf(iK0)(Number<offset_m_k>{}) =
                                (x_thread_buf(iK0)(Number<offset_m_k>{}) - mean_thread_buf(iM)) *
-                                divisor;
+                                inv_std_thread_buf(iM);

                            // gamma
                            y_thread_buf(iK0)(Number<offset_m_k>{}) =

--- a/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_selector.hpp
+++ b/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_selector.hpp
@@ -12,31 +12,42 @@ template <typename GridwiseReduction,
          typename GammaDataType,
          typename BetaDataType,
          typename YDataType,
+          typename SaveMeanInvStdDataType,
          typename ComputeDataType,
          typename YElementwiseOperation,
-          typename GridDesc_M_K>
-__global__ void kernel_normalization(const GridDesc_M_K x_grid_desc_m_k,
+          typename GridDesc_M_K,
+          typename GridDesc_M>
+__global__ void
+kernel_normalization(const GridDesc_M_K x_grid_desc_m_k,
                     const GridDesc_M_K gamma_grid_desc_m_k,
                     const GridDesc_M_K beta_grid_desc_m_k,
                     const GridDesc_M_K y_grid_desc_m_k,
+                     const GridDesc_M save_mean_grid_desc_m,
+                     const GridDesc_M save_inv_std_grid_desc_m,
                     index_t num_k_block_tile_iteration,
                     ComputeDataType epsilon,
                     const XDataType* const __restrict__ p_x_global,
                     const GammaDataType* const __restrict__ p_gamma_global,
                     const BetaDataType* const __restrict__ p_beta_global,
                     YDataType* const __restrict__ p_y_global,
+                     SaveMeanInvStdDataType* const __restrict__ p_save_mean_global,
+                     SaveMeanInvStdDataType* const __restrict__ p_save_inv_std_global,
                     const YElementwiseOperation y_elementwise_op)
 {
    GridwiseReduction::Run(x_grid_desc_m_k,
                           gamma_grid_desc_m_k,
                           beta_grid_desc_m_k,
                           y_grid_desc_m_k,
+                           save_mean_grid_desc_m,
+                           save_inv_std_grid_desc_m,
                           num_k_block_tile_iteration,
                           epsilon,
                           p_x_global,
                           p_gamma_global,
                           p_beta_global,
                           p_y_global,
+                           p_save_mean_global,
+                           p_save_inv_std_global,
                           y_elementwise_op);
 };

@@ -44,9 +55,11 @@ template <typename XDataType,
          typename GammaDataType,
          typename BetaDataType,
          typename YDataType,
+          typename SaveMeanInvStdDataType,
          typename ComputeDataType,
          typename YElementwiseOperation,
          typename GridDesc_M_K,
+          typename GridDesc_M,
          index_t BlockSize,
          index_t MThreadClusterSize,
          index_t KThreadClusterSize,
@@ -60,6 +73,7 @@ template <typename XDataType,
          index_t BetaSrcVectorSize,
          index_t YDstVectorDim,
          index_t YDstVectorSize,
+          index_t SaveMeanInvStdDstVectorSize,
          bool UseWelford>
 auto NormalizationKernelSelector(bool isSweepOnce)
 {
@@ -68,9 +82,11 @@ auto NormalizationKernelSelector(bool isSweepOnce)
                                                    GammaDataType,
                                                    BetaDataType,
                                                    YDataType,
+                                                    SaveMeanInvStdDataType,
                                                    ComputeDataType,
                                                    YElementwiseOperation,
                                                    GridDesc_M_K,
+                                                    GridDesc_M,
                                                    BlockSize,
                                                    MThreadClusterSize,
                                                    KThreadClusterSize,
@@ -84,15 +100,18 @@ auto NormalizationKernelSelector(bool isSweepOnce)
                                                    BetaSrcVectorSize,
                                                    YDstVectorDim,
                                                    YDstVectorSize,
+                                                    SaveMeanInvStdDstVectorSize,
                                                    false>;
    using GridwiseNormalizationSweepOnceNaive =
        GridwiseNormalizationNaiveVariance_mk_to_mk<XDataType,
                                                    GammaDataType,
                                                    BetaDataType,
                                                    YDataType,
+                                                    SaveMeanInvStdDataType,
                                                    ComputeDataType,
                                                    YElementwiseOperation,
                                                    GridDesc_M_K,
+                                                    GridDesc_M,
                                                    BlockSize,
                                                    MThreadClusterSize,
                                                    KThreadClusterSize,
@@ -106,15 +125,18 @@ auto NormalizationKernelSelector(bool isSweepOnce)
                                                    BetaSrcVectorSize,
                                                    YDstVectorDim,
                                                    YDstVectorSize,
+                                                    SaveMeanInvStdDstVectorSize,
                                                    true>;
    using GridwiseNormalizationGenericWelford =
        GridwiseNormalizationWelfordVariance_mk_to_mk<XDataType,
                                                      GammaDataType,
                                                      BetaDataType,
                                                      YDataType,
+                                                      SaveMeanInvStdDataType,
                                                      ComputeDataType,
                                                      YElementwiseOperation,
                                                      GridDesc_M_K,
+                                                      GridDesc_M,
                                                      BlockSize,
                                                      MThreadClusterSize,
                                                      KThreadClusterSize,
@@ -128,15 +150,18 @@ auto NormalizationKernelSelector(bool isSweepOnce)
                                                      BetaSrcVectorSize,
                                                      YDstVectorDim,
                                                      YDstVectorSize,
+                                                      SaveMeanInvStdDstVectorSize,
                                                      false>;
    using GridwiseNormalizationSweepOnceWelford =
        GridwiseNormalizationWelfordVariance_mk_to_mk<XDataType,
                                                      GammaDataType,
                                                      BetaDataType,
                                                      YDataType,
+                                                      SaveMeanInvStdDataType,
                                                      ComputeDataType,
                                                      YElementwiseOperation,
                                                      GridDesc_M_K,
+                                                      GridDesc_M,
                                                      BlockSize,
                                                      MThreadClusterSize,
                                                      KThreadClusterSize,
@@ -150,6 +175,7 @@ auto NormalizationKernelSelector(bool isSweepOnce)
                                                      BetaSrcVectorSize,
                                                      YDstVectorDim,
                                                      YDstVectorSize,
+                                                      SaveMeanInvStdDstVectorSize,
                                                      true>;

    if constexpr(UseWelford)
@@ -159,17 +185,21 @@ auto NormalizationKernelSelector(bool isSweepOnce)
                                                  GammaDataType,
                                                  BetaDataType,
                                                  YDataType,
+                                                  SaveMeanInvStdDataType,
                                                  ComputeDataType,
                                                  YElementwiseOperation,
-                                                  GridDesc_M_K>
+                                                  GridDesc_M_K,
+                                                  GridDesc_M>
                           : kernel_normalization<GridwiseNormalizationGenericWelford,
                                                  XDataType,
                                                  GammaDataType,
                                                  BetaDataType,
                                                  YDataType,
+                                                  SaveMeanInvStdDataType,
                                                  ComputeDataType,
                                                  YElementwiseOperation,
-                                                  GridDesc_M_K>;
+                                                  GridDesc_M_K,
+                                                  GridDesc_M>;
    }
    else
    {
@@ -178,17 +208,21 @@ auto NormalizationKernelSelector(bool isSweepOnce)
                                                  GammaDataType,
                                                  BetaDataType,
                                                  YDataType,
+                                                  SaveMeanInvStdDataType,
                                                  ComputeDataType,
                                                  YElementwiseOperation,
-                                                  GridDesc_M_K>
+                                                  GridDesc_M_K,
+                                                  GridDesc_M>
                           : kernel_normalization<GridwiseNormalizationGenericNaive,
                                                  XDataType,
                                                  GammaDataType,
                                                  BetaDataType,
                                                  YDataType,
+                                                  SaveMeanInvStdDataType,
                                                  ComputeDataType,
                                                  YElementwiseOperation,
-                                                  GridDesc_M_K>;
+                                                  GridDesc_M_K,
+                                                  GridDesc_M>;
    }
 }


--- a/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_splitk_2nd.hpp
+++ b/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_splitk_2nd.hpp
@@ -17,11 +17,13 @@ template <typename MeanVarDataType,
          typename GammaDataType,
          typename BetaDataType,
          typename YDataType,
+          typename SaveMeanInvStdDataType,
          typename ComputeDataType,
          typename YElementwiseOperation,
          typename MeanVarGridDesc_M_KBlock,
          typename CountGridDesc_M_KBlock,
          typename XYGammaBetaGridDesc_M_K,
+          typename SaveMeanInvStdGridDesc_M,
          index_t BlockSize,
          index_t MThreadClusterSize,
          index_t KThreadClusterSize,
@@ -34,7 +36,8 @@ template <typename MeanVarDataType,
          index_t BetaSrcVectorDim,
          index_t BetaSrcVectorSize,
          index_t YDstVectorDim,
-          index_t YDstVectorSize>
+          index_t YDstVectorSize,
+          index_t SaveMeanInvStdDstVectorSize>
 struct GridwiseNormalizationSplitK2nd
 {
    static_assert((XSrcVectorDim == 0 && MThreadSliceSize % XSrcVectorSize == 0) ||
@@ -45,6 +48,10 @@ struct GridwiseNormalizationSplitK2nd
                      (YDstVectorDim == 1 && KThreadSliceSize % YDstVectorSize == 0),
                  "Invalid thread slice sizes and/or vector sizes configuration, please check!");

+    static_assert(MThreadSliceSize % SaveMeanInvStdDstVectorSize == 0,
+                  "Invalid thread slice sizes and/or save mean and inverse std vector sizes "
+                  "configuration, please check!");
+
    static_assert(XSrcVectorSize == YDstVectorSize);
    static_assert(XSrcVectorSize == GammaSrcVectorSize);
    static_assert(XSrcVectorSize == BetaSrcVectorSize);
@@ -69,6 +76,10 @@ struct GridwiseNormalizationSplitK2nd
    static constexpr auto thread_buffer_desc_m_k = make_naive_tensor_descriptor_packed(
        make_tuple(Number<MThreadSliceSize>{}, Number<XSrcVectorSize>{}));

+    using ThreadBufferLengths_M = Sequence<MThreadSliceSize>;
+    static constexpr auto thread_buffer_desc_m =
+        make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{}));
+
    using ThreadBufferLengths_M_1 = Sequence<MThreadSliceSize, 1>;
    static constexpr auto thread_buffer_desc_m_1 =
        make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{}, I1));
@@ -99,6 +110,8 @@ struct GridwiseNormalizationSplitK2nd
                               const XYGammaBetaGridDesc_M_K& gamma_grid_desc_m_k,
                               const XYGammaBetaGridDesc_M_K& beta_grid_desc_m_k,
                               const XYGammaBetaGridDesc_M_K& y_grid_desc_m_k,
+                               const SaveMeanInvStdGridDesc_M& save_mean_grid_desc_m,
+                               const SaveMeanInvStdGridDesc_M& save_inv_std_grid_desc_m,
                               index_t num_k_mean_var_count_iteration,
                               index_t num_k_block_tile_iteration,
                               index_t k_grid_size,
@@ -110,6 +123,8 @@ struct GridwiseNormalizationSplitK2nd
                               const GammaDataType* const __restrict__ p_gamma_global,
                               const BetaDataType* const __restrict__ p_beta_global,
                               YDataType* const __restrict__ p_y_global,
+                               SaveMeanInvStdDataType* const __restrict__ p_save_mean_global,
+                               SaveMeanInvStdDataType* const __restrict__ p_save_inv_std_global,
                               const YElementwiseOperation y_elementwise_op)
    {
        // Thread/Block id
@@ -145,6 +160,12 @@ struct GridwiseNormalizationSplitK2nd
        auto y_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_y_global, y_grid_desc_m_k.GetElementSpaceSize());

+        auto save_mean_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_save_mean_global, save_mean_grid_desc_m.GetElementSpaceSize());
+
+        auto save_inv_std_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_save_inv_std_global, save_inv_std_grid_desc_m.GetElementSpaceSize());
+
        // VGPR
        StaticBuffer<AddressSpaceEnum::Vgpr, ComputeDataType, MThreadSliceSize, true>
            in_mean_thread_buf;
@@ -158,6 +179,7 @@ struct GridwiseNormalizationSplitK2nd
            var_thread_buf;
        StaticBuffer<AddressSpaceEnum::Vgpr, int32_t, MThreadSliceSize, true>
            welford_count_thread_buf;
+        auto& inv_std_thread_buf = var_thread_buf;

        auto x_thread_buf = generate_tuple(
            [&](auto) {
@@ -283,6 +305,42 @@ struct GridwiseNormalizationSplitK2nd
                                     thread_k_cluster_id * YDstVectorSize),
                y_elementwise_op);

+        auto threadwise_mean_store =
+            ThreadwiseTensorSliceTransfer_v1r3<ComputeDataType,
+                                               SaveMeanInvStdDataType,
+                                               decltype(thread_buffer_desc_m),
+                                               SaveMeanInvStdGridDesc_M,
+                                               PassThroughOp,
+                                               ThreadBufferLengths_M,
+                                               Sequence<0>,                 // DimAccessOrder
+                                               0,                           // SrcVectorDim
+                                               SaveMeanInvStdDstVectorSize, // ScalarPerVector
+                                               InMemoryDataOperationEnum::Set,
+                                               1,
+                                               true>(
+                save_mean_grid_desc_m,
+                make_multi_index(block_m_cluster_id * M_BlockTileSize +
+                                 thread_m_cluster_id * MThreadSliceSize),
+                PassThroughOp{});
+
+        auto threadwise_inv_std_store =
+            ThreadwiseTensorSliceTransfer_v1r3<ComputeDataType,
+                                               SaveMeanInvStdDataType,
+                                               decltype(thread_buffer_desc_m),
+                                               SaveMeanInvStdGridDesc_M,
+                                               PassThroughOp,
+                                               ThreadBufferLengths_M,
+                                               Sequence<0>,                 // DimAccessOrder
+                                               0,                           // SrcVectorDim
+                                               SaveMeanInvStdDstVectorSize, // ScalarPerVector
+                                               InMemoryDataOperationEnum::Set,
+                                               1,
+                                               true>(
+                save_inv_std_grid_desc_m,
+                make_multi_index(block_m_cluster_id * M_BlockTileSize +
+                                 thread_m_cluster_id * MThreadSliceSize),
+                PassThroughOp{});
+
        // step1: Merge mean and variance
        constexpr auto mean_var_count_thread_copy_step_I0_k =
            make_multi_index(I0, KThreadClusterSize);
@@ -332,9 +390,33 @@ struct GridwiseNormalizationSplitK2nd

            BlockwiseWelford::Run(
                mean_thread_buf(I), var_thread_buf(I), welford_count_thread_buf(I));
+
+            inv_std_thread_buf(I) =
+                type_convert<ComputeDataType>(1.0f) / ck::math::sqrt(var_thread_buf(I) + epsilon);
        });

-        // step2: normalization
+        // step2: save mean and inverse std for backward (optional)
+        if(block_k_cluster_id == 0 && thread_k_cluster_id == 0)
+        {
+            if(p_save_mean_global != nullptr)
+            {
+                threadwise_mean_store.Run(thread_buffer_desc_m,
+                                          make_tuple(I0),
+                                          mean_thread_buf,
+                                          save_mean_grid_desc_m,
+                                          save_mean_global_val_buf);
+            }
+            if(p_save_inv_std_global != nullptr)
+            {
+                threadwise_inv_std_store.Run(thread_buffer_desc_m,
+                                             make_tuple(I0),
+                                             inv_std_thread_buf,
+                                             save_inv_std_grid_desc_m,
+                                             save_inv_std_global_val_buf);
+            }
+        }
+
+        // step3: normalization
        constexpr auto thread_copy_fwd_step_m_k = make_multi_index(0, K_BlockTileStepSize);

        for(index_t k = 0; k < num_k_block_tile_iteration; ++k)
@@ -360,7 +442,6 @@ struct GridwiseNormalizationSplitK2nd
            });

            static_for<0, MThreadSliceSize, 1>{}([&](auto iM) {
-                auto divisor = 1 / ck::math::sqrt(var_thread_buf(iM) + epsilon);
                static_for<0, ThreadBufferNumber, 1>{}([&](auto iK0) {
                    static_for<0, XSrcVectorSize, 1>{}([&](auto iK1) {
                        constexpr auto offset_m_k =
@@ -369,7 +450,7 @@ struct GridwiseNormalizationSplitK2nd
                        // normalize
                        y_thread_buf(iK0)(Number<offset_m_k>{}) =
                            (x_thread_buf(iK0)(Number<offset_m_k>{}) - mean_thread_buf(iM)) *
-                            divisor;
+                            inv_std_thread_buf(iM);

                        // gamma
                        y_thread_buf(iK0)(Number<offset_m_k>{}) =

--- a/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_welford_variance.hpp
+++ b/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_welford_variance.hpp
--- a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
@@ -462,7 +462,6 @@ struct mfma_type<MfmaInstr::mfma_f64_16x16x4f64>
    }
 };

-#if defined CK_ENABLE_FP8
 template <>
 struct mfma_type<MfmaInstr::mfma_f32_32x32x16f8f8>
 {
@@ -506,9 +505,7 @@ struct mfma_type<MfmaInstr::mfma_f32_16x16x32f8f8>
        intrin_mfma_f32_16x16x32f8f8<MPerXdlops, NPerXdlops>::Run(a, b, reg_c);
    }
 };
-#endif

-#if defined CK_ENABLE_BF8
 template <>
 struct mfma_type<MfmaInstr::mfma_f32_32x32x16bf8bf8>
 {
@@ -552,9 +549,7 @@ struct mfma_type<MfmaInstr::mfma_f32_16x16x32bf8bf8>
        intrin_mfma_f32_16x16x32bf8bf8<MPerXdlops, NPerXdlops>::Run(a, b, reg_c);
    }
 };
-#endif

-#if defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8
 template <>
 struct mfma_type<MfmaInstr::mfma_f32_32x32x16f8bf8>
 {
@@ -598,9 +593,7 @@ struct mfma_type<MfmaInstr::mfma_f32_16x16x32f8bf8>
        intrin_mfma_f32_16x16x32f8bf8<MPerXdlops, NPerXdlops>::Run(a, b, reg_c);
    }
 };
-#endif

-#if defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8
 template <>
 struct mfma_type<MfmaInstr::mfma_f32_32x32x16bf8f8>
 {
@@ -644,7 +637,6 @@ struct mfma_type<MfmaInstr::mfma_f32_16x16x32bf8f8>
        intrin_mfma_f32_16x16x32bf8f8<MPerXdlops, NPerXdlops>::Run(a, b, reg_c);
    }
 };
-#endif

 template <typename base_type,
          index_t MPerXdlops,
@@ -792,7 +784,6 @@ struct MfmaSelector
    }
 #endif

-#if defined CK_ENABLE_FP8
    template <>
    static constexpr auto GetMfma<f8_t, 32, 32>()
    {
@@ -804,9 +795,7 @@ struct MfmaSelector
    {
        return MfmaInstr::mfma_f32_16x16x32f8f8;
    }
-#endif

-#if defined CK_ENABLE_BF8
    template <>
    static constexpr auto GetMfma<bf8_t, 32, 32>()
    {
@@ -818,9 +807,7 @@ struct MfmaSelector
    {
        return MfmaInstr::mfma_f32_16x16x32bf8bf8;
    }
-#endif

-#if defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8
    template <>
    static constexpr auto GetMfma<f8_t, 32, 32, bf8_t>()
    {
@@ -832,9 +819,7 @@ struct MfmaSelector
    {
        return MfmaInstr::mfma_f32_16x16x32f8bf8;
    }
-#endif

-#if defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8
    template <>
    static constexpr auto GetMfma<bf8_t, 32, 32, f8_t>()
    {
@@ -846,7 +831,6 @@ struct MfmaSelector
    {
        return MfmaInstr::mfma_f32_16x16x32bf8f8;
    }
-#endif

    static constexpr auto selected_mfma =
        mfma_type<GetMfma<base_type, MPerXdlops, NPerXdlops, additional_type>()>{};
@@ -1051,18 +1035,10 @@ struct XdlopsGemm
        static_assert(
            is_same<base_type, double>::value || is_same<base_type, float>::value ||
                is_same<base_type, half_t>::value || is_same<base_type, bhalf_t>::value ||
-                is_same<base_type, int8_t>::value
-#if defined CK_ENABLE_FP8
-                || is_same<base_type, f8_t>::value
-#endif
-#if defined CK_ENABLE_BF8
-                || is_same<base_type, bf8_t>::value
-#endif
-#if defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8
-                || (is_same<base_type, f8_t>::value && is_same<additional_type, bf8_t>::value) ||
-                (is_same<base_type, bf8_t>::value && is_same<additional_type, f8_t>::value)
-#endif
-                ,
+                is_same<base_type, int8_t>::value || is_same<base_type, f8_t>::value ||
+                is_same<base_type, bf8_t>::value ||
+                (is_same<base_type, f8_t>::value && is_same<additional_type, bf8_t>::value) ||
+                (is_same<base_type, bf8_t>::value && is_same<additional_type, f8_t>::value),
            "base base_type must be double, float, half, bfloat16, int8_t, f8_t or bf8_t!");

        static_for<0, KPack / mfma_instr.k_per_blk, 1>{}([&](auto k) {

--- a/include/ck/utility/amd_xdlops.hpp
+++ b/include/ck/utility/amd_xdlops.hpp
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

-#ifndef CK_AMD_XDLOPS_HPP
-#define CK_AMD_XDLOPS_HPP
-
-#include "data_type.hpp"
+#pragma once

 namespace ck {

@@ -355,7 +352,6 @@ struct intrin_mfma_f64_16x16x4f64<16, 16>
    }
 };

-#if defined CK_ENABLE_FP8
 template <index_t MPerWave, index_t NPerWave>
 struct intrin_mfma_f32_32x32x16f8f8;

@@ -418,9 +414,7 @@ struct intrin_mfma_f32_16x16x32f8f8<16, 16>
 #endif
    }
 };
-#endif

-#if defined CK_ENABLE_BF8
 template <index_t MPerWave, index_t NPerWave>
 struct intrin_mfma_f32_32x32x16bf8bf8;

@@ -483,9 +477,7 @@ struct intrin_mfma_f32_16x16x32bf8bf8<16, 16>
 #endif
    }
 };
-#endif

-#if defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8
 template <index_t MPerWave, index_t NPerWave>
 struct intrin_mfma_f32_32x32x16f8bf8;

@@ -548,9 +540,7 @@ struct intrin_mfma_f32_16x16x32f8bf8<16, 16>
 #endif
    }
 };
-#endif

-#if defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8
 template <index_t MPerWave, index_t NPerWave>
 struct intrin_mfma_f32_32x32x16bf8f8;

@@ -613,6 +603,5 @@ struct intrin_mfma_f32_16x16x32bf8f8<16, 16>
 #endif
    }
 };
-#endif
+
 } // namespace ck
-#endif