Merge develop

ab663329 · aska-0096 · 4fec5ad3 · 8a4253ba · ab663329 · ab663329
Commit ab663329 authored Nov 07, 2022 by aska-0096
20 changed files
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl.hpp
@@ -150,7 +150,10 @@ template <typename ADataType,
          ck::index_t BBlockTransferDstScalarPerVector_K1,
          bool BBlockLdsAddExtraN,
          ck::index_t CThreadTransferSrcDstVectorDim,
-          ck::index_t CThreadTransferDstScalarPerVector>
+          ck::index_t CThreadTransferDstScalarPerVector,
+          ck::index_t NumGemmKPrefetchStage = 1,
+          ck::LoopScheduler LoopSched       = make_default_loop_scheduler(),
+          ck::PipelineVersion PipelineVer   = ck::PipelineVersion::v1>
 struct DeviceBatchedGemmXdl : public DeviceBatchedGemm<ALayout,
                                                       BLayout,
                                                       CLayout,
@@ -323,7 +326,10 @@ struct DeviceBatchedGemmXdl : public DeviceBatchedGemm<ALayout,
                                                BBlockLdsAddExtraN,
                                                Sequence<2, 3, 0, 1, 7, 5, 4, 6>,
                                                CThreadTransferSrcDstVectorDim,
-                                                CThreadTransferDstScalarPerVector>;
+                                                CThreadTransferDstScalarPerVector,
+                                                NumGemmKPrefetchStage,
+                                                LoopSched,
+                                                PipelineVer>;
    using CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2 =
        decltype(GridwiseGemm::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(CGridDesc_M_N{}));
@@ -622,6 +628,12 @@ struct DeviceBatchedGemmXdl : public DeviceBatchedGemm<ALayout,
    {
        auto str = std::stringstream();
+        std::map<LoopScheduler, std::string> LoopSchedToString{
+            {LoopScheduler::Default, "Default"}, {LoopScheduler::Interwave, "Interwave"}};
+        std::map<PipelineVersion, std::string> PipelineVersionToString{{PipelineVersion::v1, "v1"},
+                                                                       {PipelineVersion::v2, "v2"}};
        // clang-format off
        str << "DeviceBatchedGemmXdl"
            << "<"
@@ -629,7 +641,13 @@ struct DeviceBatchedGemmXdl : public DeviceBatchedGemm<ALayout,
            << MPerBlock << ", "
            << NPerBlock << ", "
            << K0PerBlock
-            << ">";
+            << ">"
+            << " NumGemmKPrefetchStage: "
+            << NumGemmKPrefetchStage << ", "
+            << "LoopScheduler: "
+            << LoopSchedToString[LoopSched] << ", "
+            << "PipelineVersion: "
+            << PipelineVersionToString[PipelineVer];
        // clang-format on
        return str.str();

--- a/include/ck/tensor_operation/gpu/device/impl/device_elementwise_normalization_impl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_elementwise_normalization_impl.hpp
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp
@@ -214,6 +214,7 @@ struct DeviceGemmDl : public DeviceGemm<ALayout,
                                     MPerBlock,
                                     NPerBlock,
                                     K0PerBlock,
+                                     K1,
                                     M1PerThread,
                                     N1PerThread,
                                     KPerThread,

--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp
@@ -141,7 +141,8 @@ template <typename ALayout,
          index_t CShuffleNXdlPerWavePerShuffle,
          typename CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
          index_t CDEBlockTransferScalarPerVector_NPerBlock,
-          LoopScheduler LoopSched = make_default_loop_scheduler()>
+          LoopScheduler LoopSched     = make_default_loop_scheduler(),
+          PipelineVersion PipelineVer = PipelineVersion::v1>
 struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
                                                                     BLayout,
                                                                     DsLayout,
@@ -282,7 +283,8 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
        CShuffleNXdlPerWavePerShuffle,
        CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
        CDEBlockTransferScalarPerVector_NPerBlock,
-        LoopSched>;
+        LoopSched,
+        PipelineVer>;
    // desc for blockwise copy
    using AGridDesc_AK0_M_AK1                          = remove_cvref_t<decltype(
@@ -664,6 +666,12 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
    {
        auto str = std::stringstream();
+        std::map<LoopScheduler, std::string> LoopSchedToString{
+            {LoopScheduler::Default, "Default"}, {LoopScheduler::Interwave, "Interwave"}};
+        std::map<PipelineVersion, std::string> PipelineVersionToString{{PipelineVersion::v1, "v1"},
+                                                                       {PipelineVersion::v2, "v2"}};
        // clang-format off
        str << "DeviceGemmMultipleD_Xdl_CShuffle"
            << "<"
@@ -674,7 +682,11 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
            << AK1 << ", "
            << BK1 << ", "
            << getGemmSpecializationString(GemmSpec)
-            << ">";
+            << ">"
+            << " LoopScheduler: "
+            << LoopSchedToString[LoopSched] << ", "
+            << "PipelineVersion: "
+            << PipelineVersionToString[PipelineVer];
        // clang-format on
        return str.str();

--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp
@@ -56,7 +56,9 @@ template <typename ADataType,
          bool BBlockLdsAddExtraN,
          ck::index_t CThreadTransferSrcDstVectorDim,
          ck::index_t CThreadTransferDstScalarPerVector,
-          ck::index_t NumPrefetch = 1>
+          ck::index_t NumPrefetch         = 1,
+          ck::LoopScheduler LoopSched     = make_default_loop_scheduler(),
+          ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
 struct DeviceGemmXdl : public DeviceGemm<ALayout,
                                         BLayout,
                                         CLayout,
@@ -230,7 +232,9 @@ struct DeviceGemmXdl : public DeviceGemm<ALayout,
        Sequence<0, 2, 4, 5, 6, 1, 3, 7>, // CThreadTransferSrcDstAccessOrder,
        CThreadTransferSrcDstVectorDim,
        CThreadTransferDstScalarPerVector,
-        NumPrefetch>;
+        NumPrefetch,
+        LoopSched,
+        PipelineVer>;
    // Argument
    struct Argument : public BaseArgument
@@ -523,6 +527,12 @@ struct DeviceGemmXdl : public DeviceGemm<ALayout,
    {
        auto str = std::stringstream();
+        std::map<LoopScheduler, std::string> LoopSchedToString{
+            {LoopScheduler::Default, "Default"}, {LoopScheduler::Interwave, "Interwave"}};
+        std::map<PipelineVersion, std::string> PipelineVersionToString{{PipelineVersion::v1, "v1"},
+                                                                       {PipelineVersion::v2, "v2"}};
        // clang-format off
        str << "DeviceGemmXdl"
            << "<"
@@ -535,7 +545,13 @@ struct DeviceGemmXdl : public DeviceGemm<ALayout,
            << NPerXDL << ", "
            << MXdlPerWave << ", "
            << NXdlPerWave
-            << ">";
+            << ">"
+            << " NumPrefetch: "
+            << NumPrefetch << ", "
+            << "LoopScheduler: "
+            << LoopSchedToString[LoopSched] << ", "
+            << "PipelineVersion: "
+            << PipelineVersionToString[PipelineVer];
        // clang-format on
        return str.str();

--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp
@@ -64,7 +64,8 @@ template <typename ALayout,
          index_t CShuffleNXdlPerWavePerShuffle,
          typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
          index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
-          LoopScheduler LoopSched = make_default_loop_scheduler()>
+          LoopScheduler LoopSched     = make_default_loop_scheduler(),
+          PipelineVersion PipelineVer = PipelineVersion::v1>
 struct DeviceGemm_Xdl_CShuffle : public DeviceGemm<ALayout,
                                                   BLayout,
                                                   CLayout,
@@ -393,7 +394,8 @@ struct DeviceGemm_Xdl_CShuffle : public DeviceGemm<ALayout,
        CShuffleNXdlPerWavePerShuffle,
        CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
        CShuffleBlockTransferScalarPerVector_NPerBlock,
-        LoopSched>;
+        LoopSched,
+        PipelineVer>;
    // Argument
    struct Argument : public BaseArgument
@@ -656,6 +658,12 @@ struct DeviceGemm_Xdl_CShuffle : public DeviceGemm<ALayout,
    {
        auto str = std::stringstream();
+        std::map<LoopScheduler, std::string> LoopSchedToString{
+            {LoopScheduler::Default, "Default"}, {LoopScheduler::Interwave, "Interwave"}};
+        std::map<PipelineVersion, std::string> PipelineVersionToString{{PipelineVersion::v1, "v1"},
+                                                                       {PipelineVersion::v2, "v2"}};
        // clang-format off
        str << "DeviceGemm_Xdl_CShuffle"
            << "<"
@@ -665,7 +673,11 @@ struct DeviceGemm_Xdl_CShuffle : public DeviceGemm<ALayout,
            << KPerBlock << ", "
            << AK1 << ", "
            << BK1
-            << ">";
+            << ">"
+            << " LoopScheduler: "
+            << LoopSchedToString[LoopSched] << ", "
+            << "PipelineVersion: "
+            << PipelineVersionToString[PipelineVer];;
        // clang-format on
        return str.str();

--- a/include/ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp
@@ -10,7 +10,7 @@
 #include "ck/tensor_operation/gpu/device/device_normalization.hpp"
 #include "ck/tensor_operation/gpu/device/device_reduce.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp"
-#include "ck/tensor_operation/gpu/grid/gridwise_layernorm_welford_variance.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_normalization_welford_variance.hpp"
 #include "ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp"
 #include "ck/host_utility/device_prop.hpp"
 #include "ck/host_utility/kernel_launch.hpp"
@@ -24,7 +24,7 @@ template <typename GridwiseReduction,
          typename AccDataType,
          typename AccElementwiseOperation,
          typename GridDesc_M_K>
-__global__ void kernel_layernorm(const GridDesc_M_K x_grid_desc_m_k,
+__global__ void kernel_normalization(const GridDesc_M_K x_grid_desc_m_k,
                                     const GridDesc_M_K gamma_grid_desc_m_k,
                                     const GridDesc_M_K beta_grid_desc_m_k,
                                     const GridDesc_M_K y_grid_desc_m_k,
@@ -54,7 +54,7 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
-// Y = LayerNorm(X, Beta, Gamma)
+// Y = Normalization(X, Beta, Gamma)
 template <typename XDataType,
          typename GammaDataType,
          typename BetaDataType,
@@ -168,7 +168,7 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
    using GridDesc_M_K = decltype(MakeSrc2dDescriptor({1}, {1}, 1, 1));
    using GridwiseReduceLayernormGeneric =
-        GridwiseLayernormWelfordVariance_mk_to_mk<XDataType,
+        GridwiseNormalizationWelfordVariance_mk_to_mk<XDataType,
                                                      GammaDataType,
                                                      BetaDataType,
                                                      YDataType,
@@ -189,8 +189,8 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
                                                      XYSrcVectorDim,
                                                      YDstVectorSize,
                                                      false>;
-    using GridwiseReduceLayernormSweepOnce =
+    using GridwiseNormalizationSweepOnce =
-        GridwiseLayernormWelfordVariance_mk_to_mk<XDataType,
+        GridwiseNormalizationWelfordVariance_mk_to_mk<XDataType,
                                                      GammaDataType,
                                                      BetaDataType,
                                                      YDataType,
@@ -295,7 +295,7 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
        {
            const auto kernel_main = arg.isSweeponce_
-                                         ? kernel_layernorm<GridwiseReduceLayernormSweepOnce,
+                                         ? kernel_normalization<GridwiseNormalizationSweepOnce,
                                                                XDataType,
                                                                GammaDataType,
                                                                BetaDataType,
@@ -303,7 +303,7 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
                                                                AccDataType,
                                                                AccElementwiseOperation,
                                                                GridDesc_M_K>
-                                         : kernel_layernorm<GridwiseReduceLayernormGeneric,
+                                         : kernel_normalization<GridwiseReduceLayernormGeneric,
                                                                XDataType,
                                                                GammaDataType,
                                                                BetaDataType,
@@ -426,8 +426,16 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
                        const void* p_gamma,
                        const void* p_beta,
                        void* p_y,
+                        void* p_saveMean,
+                        void* p_saveInvVar,
                        AccElementwiseOperation acc_elementwise_op) override
    {
+        // TODO
+        // Optional cache of the intermediate results (mean and InvVariance) during the
+        // forward pass could speedup in the backward
+        ignore = p_saveMean;
+        ignore = p_saveInvVar;
        return std::make_unique<Argument>(lengths,
                                          xStrides,
                                          gammaStrides,

--- a/include/ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp
@@ -226,6 +226,30 @@ struct DeviceReduceMultiBlock
              in_elementwise_op_{in_elementwise_op},
              acc_elementwise_op_{acc_elementwise_op}
        {
+            if(Rank != inLengths.size() || Rank != inStrides.size() ||
+               NumReduceDim != reduceDims.size())
+            {
+                throw std::runtime_error(
+                    "One of inLengths/inStrides/reduceDims has invalid size!"
+                    "\nExpected size inLengths: " +
+                    std::to_string(Rank) + ", inStrides: " + std::to_string(Rank) +
+                    ", reduceDims: " + std::to_string(NumReduceDim) +
+                    "\nBut have inLengths: " + std::to_string(inLengths.size()) +
+                    ", inStrides: " + std::to_string(inStrides.size()) +
+                    ", reduceDims: " + std::to_string(reduceDims.size()));
+            }
+            for(std::size_t i = 0; i < reduceDims.size(); ++i)
+            {
+                if(reduceDims[i] < 0 || reduceDims[i] >= Rank)
+                {
+                    throw std::runtime_error("Provided reduce dimension exceed input tensor Rank!"
+                                             "\nHave reduceDims[" +
+                                             std::to_string(i) +
+                                             "]: " + std::to_string(reduceDims[i]));
+                }
+            }
            inLengths_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(inLengths, reduceDims);
            inStrides_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(inStrides, reduceDims);

--- a/include/ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp
@@ -42,6 +42,7 @@ struct DeviceSoftmaxImpl : public DeviceSoftmax<InDataType,
 {
    static constexpr index_t kRank            = Rank;
    static constexpr index_t kNumReduceDim    = NumReduceDim;
+    static constexpr index_t kNumInvariantDim = Rank - NumReduceDim;
    virtual index_t GetRank() const override { return kRank; }
@@ -168,6 +169,30 @@ struct DeviceSoftmaxImpl : public DeviceSoftmax<InDataType,
              in_elementwise_op_{in_elementwise_op},
              acc_elementwise_op_{acc_elementwise_op}
        {
+            if(Rank != inLengths.size() || Rank != inStrides.size() ||
+               NumReduceDim != reduceDims.size())
+            {
+                throw std::runtime_error(
+                    "One of inLengths/inStrides/reduceDims has invalid size!"
+                    "\nExpected size inLengths: " +
+                    std::to_string(Rank) + ", inStrides: " + std::to_string(Rank) +
+                    ", reduceDims: " + std::to_string(NumReduceDim) +
+                    "\nBut have inLengths: " + std::to_string(inLengths.size()) +
+                    ", inStrides: " + std::to_string(inStrides.size()) +
+                    ", reduceDims: " + std::to_string(reduceDims.size()));
+            }
+            for(std::size_t i = 0; i < reduceDims.size(); ++i)
+            {
+                if(reduceDims[i] < 0 || reduceDims[i] >= Rank)
+                {
+                    throw std::runtime_error("Provided reduce dimension exceed input tensor Rank!"
+                                             "\nHave reduceDims[" +
+                                             std::to_string(i) +
+                                             "]: " + std::to_string(reduceDims[i]));
+                }
+            }
            inLengths_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(inLengths, reduceDims);
            inStrides_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(inStrides, reduceDims);
@@ -257,40 +282,78 @@ struct DeviceSoftmaxImpl : public DeviceSoftmax<InDataType,
        };
    };
-    bool IsSupportedArgument(const BaseArgument* p_arg) override
+    static bool IsSupportedArgument(const Argument& arg)
    {
-        const Argument* p_arg_ = dynamic_cast<const Argument*>(p_arg);
        if constexpr(InSrcVectorDim == 0)
        {
-            if constexpr(NumInvariantDim == 0)
+            if constexpr(kNumInvariantDim == 0)
            {
                return false;
            }
            else
            {
-                if(p_arg_->inStrides_[NumInvariantDim - 1] != 1)
+                if(arg.inStrides_[kNumInvariantDim - 1] != 1 && InSrcVectorSize != 1)
+                {
                    return false;
+                }
-                if(p_arg_->invariant_lowest_length_ % InSrcVectorSize != 0)
+                if(arg.invariant_lowest_length_ % InSrcVectorSize != 0)
+                {
                    return false;
-            };
+                }
+            }
        }
        else
        {
-            if(p_arg_->inStrides_[Rank - 1] != 1)
+            if(arg.inStrides_[Rank - 1] != 1 && InSrcVectorSize != 1)
+            {
                return false;
+            }
+            if(arg.inLengths_[Rank - 1] % InSrcVectorSize != 0)
+            {
+                return false;
+            }
+        }
-            if(p_arg_->inLengths_[Rank - 1] % InSrcVectorSize != 0)
+        // To improve
+        if(kNumInvariantDim > 0 && arg.invariant_lowest_length_ % OutDstVectorSize != 0)
+        {
            return false;
-        };
+        }
-        if(p_arg_->invariant_lowest_length_ % OutDstVectorSize != 0)
+        if(arg.inLengths_[Rank - 1] % OutDstVectorSize != 0)
+        {
            return false;
+        }
        return true;
    };
+    bool IsSupportedArgument(const BaseArgument* p_arg) override
+    {
+        return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
+    }
+    static auto MakeArgument(const std::vector<index_t> inLengths,
+                             const std::vector<index_t> inStrides,
+                             const std::vector<int> reduceDims,
+                             const AccDataType alpha,
+                             const AccDataType beta,
+                             const InDataType* in_dev,
+                             OutDataType* out_dev,
+                             InElementwiseOp in_elementwise_op,
+                             AccElementwiseOp acc_elementwise_op)
+    {
+        return Argument{inLengths,
+                        inStrides,
+                        reduceDims,
+                        alpha,
+                        beta,
+                        in_dev,
+                        out_dev,
+                        in_elementwise_op,
+                        acc_elementwise_op};
+    };
    //
    // @brief      Makes a pointer to Argument class.
    //
@@ -330,6 +393,8 @@ struct DeviceSoftmaxImpl : public DeviceSoftmax<InDataType,
                                          acc_elementwise_op);
    };
+    static auto MakeInvoker() { return Invoker{}; }
    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
    {
        return std::make_unique<Invoker>();
@@ -340,10 +405,13 @@ struct DeviceSoftmaxImpl : public DeviceSoftmax<InDataType,
        auto str = std::stringstream();
        // clang-format off
-        str << "DeviceReduceSoftmax<" << BlockSize << ",";
+        str << "DeviceReduceSoftmax<" 
-        str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ",";
+            << Rank << "," << NumReduceDim << "," << BlockSize << ","
-        str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ",";
+            << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ","
-        str << "InSrcVectorDim_" << InSrcVectorDim << "_InSrcVectorSize_" << InSrcVectorSize << "_OutDstVectorSize_" << OutDstVectorSize << ">";
+            << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ","
+            << "InSrcVectorDim_" << InSrcVectorDim 
+            << "_InSrcVectorSize_" << InSrcVectorSize 
+            << "_OutDstVectorSize_" << OutDstVectorSize << ">";
        // clang-format on
        return str.str();

--- a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
@@ -7,6 +7,7 @@
 #include "ck/utility/math_v2.hpp"
 #include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
 #include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/element/quantization_operation.hpp"
 namespace ck {
 namespace tensor_operation {

--- a/include/ck/tensor_operation/gpu/element/quantization_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/quantization_operation.hpp
+#pragma once
+#include "ck/utility/data_type.hpp"
+namespace ck {
+namespace tensor_operation {
+namespace element_wise {
+// For Activation function which is piecewise linear function, such as relu, leaky relu ...etc
+template <typename Activation>
+struct Activation_Mul_Clamp
+{
+    Activation_Mul_Clamp(float multiplier, Activation activationOp)
+        : multiplier_(multiplier), activationOp_(activationOp)
+    {
+    }
+    __host__ __device__ constexpr void operator()(int8_t& y, const int32_t& x) const
+    {
+        float x_fp32 = ck::type_convert<float>(x);
+        activationOp_(x_fp32, x_fp32);
+        float y_fp32 = math::clamp(multiplier_ * x_fp32, -128.f, 127.f);
+        y            = ck::type_convert<int8_t>(y_fp32);
+    }
+    __host__ __device__ constexpr void operator()(float& y, const int32_t& x) const
+    {
+        // We might type_convert to int8 after lambda in someplace
+        float x_fp32 = ck::type_convert<float>(x);
+        activationOp_(x_fp32, x_fp32);
+        y = math::clamp(multiplier_ * x_fp32, -128.f, 127.f);
+    }
+    float multiplier_;
+    Activation activationOp_;
+};
+// For Activation function which is piecewise linear function, such as relu, leaky relu ...etc
+template <typename Activation>
+struct Add_Activation_Mul_Clamp
+{
+    Add_Activation_Mul_Clamp(float multiplier, Activation activationOp)
+        : multiplier_(multiplier), activationOp_(activationOp)
+    {
+    }
+    __host__ __device__ constexpr void
+    operator()(int8_t& y, const int32_t& x1, const int32_t& x2) const
+    {
+        float y_fp32 = ck::type_convert<float>(x1 + x2);
+        activationOp_(y_fp32, y_fp32);
+        y_fp32 = math::clamp(multiplier_ * y_fp32, -128.f, 127.f);
+        y      = ck::type_convert<int8_t>(y_fp32);
+    }
+    float multiplier_;
+    Activation activationOp_;
+};
+// For Activation function which is non piecewise linear function, such as TanH, Sigmoid ...etc
+template <typename Activation>
+struct Add_Mul_Activation_Mul_Clamp
+{
+    Add_Mul_Activation_Mul_Clamp(float multiplier1, float multiplier2, Activation activationOp)
+        : multiplier1_(multiplier1), multiplier2_(multiplier2), activationOp_(activationOp)
+    {
+    }
+    __host__ __device__ constexpr void
+    operator()(int8_t& y, const int32_t& x1, const int32_t& x2) const
+    {
+        float y_fp32 = ck::type_convert<float>(x1 + x2);
+        y_fp32       = multiplier1_ * y_fp32;
+        activationOp_(y_fp32, y_fp32);
+        y_fp32 = math::clamp(multiplier2_ * y_fp32, -128.f, 127.f);
+        y      = ck::type_convert<int8_t>(y_fp32);
+    }
+    float multiplier1_;
+    float multiplier2_;
+    Activation activationOp_;
+};
+} // namespace element_wise
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
@@ -4,6 +4,7 @@
 #pragma once
 #include "ck/utility/data_type.hpp"
+#include "ck/utility/math.hpp"
 #include "ck/utility/math_v2.hpp"
 namespace ck {

--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_gemm_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_gemm_xdl_cshuffle_v1.hpp
@@ -8,7 +8,7 @@
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
 #include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp"
-#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp"
 #include "ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp"
 #include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp"
 #include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp"
@@ -74,7 +74,8 @@ template <typename FloatAB,
          index_t CShuffleNXdlPerWavePerShuffle,
          typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
          index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
-          LoopScheduler LoopSched>
+          LoopScheduler LoopSched,
+          PipelineVersion PipelineVer = PipelineVersion::v1>
 struct GridwiseBatchedGemmGemm_Xdl_CShuffle
 {
    static_assert(LoopSched == LoopScheduler::Default,
@@ -101,7 +102,8 @@ struct GridwiseBatchedGemmGemm_Xdl_CShuffle
    using ThisThreadBlock = ThisThreadBlock<BlockSize>;
-    using GridwiseGemmPipe = GridwiseGemmPipeline_v1<NumGemmKPrefetchStage>;
+    using GridwiseGemmPipe = remove_cvref_t<decltype(
+        GridwiseGemmPipeline_Selector<PipelineVer, NumGemmKPrefetchStage>())>;
    template <typename ABlockDesc_AK0_M_AK1>
    __host__ __device__ static constexpr auto
@@ -486,8 +488,9 @@ struct GridwiseBatchedGemmGemm_Xdl_CShuffle
        // gridwise GEMM pipeline
        // Only supports LoopScheduler::Default
-        const auto gridwise_gemm_pipeline =
+        const auto gridwise_gemm_pipeline = GridwiseGemmPipeline_Selector<PipelineVer,
-            GridwiseGemmPipeline_v1_Selector<NumGemmKPrefetchStage, LoopScheduler::Default>();
+                                                                          NumGemmKPrefetchStage,
+                                                                          LoopScheduler::Default>();
        const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane(
            (a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2)) /

--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp
--- a/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_layernorm_welford_variance.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_layernorm_welford_variance.hpp
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_bias_add_reduce_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_bias_add_reduce_xdl_cshuffle_v1.hpp
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dl_v1r3.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dl_v1r3.hpp
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp