Commit 6b9a4bd5 authored by Jun Liu

Merge branch 'amd-develop-staging-0423' into amd-master

parents 56de337f c5f1cdf7
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -22,10 +22,12 @@ namespace device {

 template <typename InDataTypeTuple,
           typename OutDataTypeTuple,
           typename ElementwiseOperation,
-          index_t NumDim,
-          index_t MPerThread,
-          typename InScalarPerVectorSeq,
-          typename OutScalarPerVectorSeq>
+          index_t NumDim,                 // The max dim of input tensors
+                                          // the tensors descs have to be aligned, such that
+                                          // the innermost dim is the contiguous one.
+          index_t MPerThread,             // How many elements per thread to read
+          typename InScalarPerVectorSeq,  // Scalar per vec for each Input
+          typename OutScalarPerVectorSeq> // Scalar per vec for each Output
 struct DeviceElementwiseImpl
     : public DeviceElementwise<InDataTypeTuple, OutDataTypeTuple, ElementwiseOperation, NumDim>
 {

@@ -242,13 +244,13 @@ struct DeviceElementwiseImpl

         static_for<0, NumInput, 1>{}([&](auto I) {
             if(!IsScalarPerVectorValid(
                    arg.lengths_, arg.inStridesArray_[I.value], InScalarPerVectorSeq::At(I)))
-                valid = false;
+                valid = valid && false;
         });

         static_for<0, NumOutput, 1>{}([&](auto I) {
             if(!IsScalarPerVectorValid(
                    arg.lengths_, arg.outStridesArray_[I.value], OutScalarPerVectorSeq::At(I)))
-                valid = false;
+                valid = valid && false;
         });

         return valid;
 ...
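For reference, a minimal sketch of how the newly documented template parameters of DeviceElementwiseImpl line up. The element types, rank, and vector widths below are illustrative values (not taken from this commit), and the relevant CK headers are assumed to be included:

    using F16   = ck::half_t;
    using AddOp = ck::tensor_operation::element_wise::Add;

    // Two f16 inputs, one f16 output, rank-4 descriptors, 8 elements per thread,
    // 8-wide vector access on each input and on the output.
    using DeviceAddF16 = ck::tensor_operation::device::DeviceElementwiseImpl<
        ck::Tuple<F16, F16>,   // InDataTypeTuple
        ck::Tuple<F16>,        // OutDataTypeTuple
        AddOp,                 // ElementwiseOperation
        4,                     // NumDim
        8,                     // MPerThread
        ck::Sequence<8, 8>,    // InScalarPerVectorSeq
        ck::Sequence<8>>;      // OutScalarPerVectorSeq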
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -254,13 +254,14 @@ template <index_t NDimSpatial,

           index_t CShuffleNXdlPerWavePerShuffle,
           typename CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
           index_t CDEBlockTransferScalarPerVector_NPerBlock,
-          typename ComputeDataType =
+          typename AComputeDataType =
              decltype(UnpackDataType<is_detected<is_tuple, ADataType>::value,
                                      Number<0>,
                                      ADataType>()), // ComputeType is InputType by default (first
                                                     // in tuple for MultiAB), unpack if tuple was
                                                     // passed
-          LoopScheduler LoopSched = make_default_loop_scheduler()>
+          typename BComputeDataType = AComputeDataType,
+          LoopScheduler LoopSched   = make_default_loop_scheduler()>
 struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
     : public DeviceGroupedConvFwdMultipleABD<NDimSpatial,
                                              ALayout,

@@ -274,7 +275,8 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle

                                              AElementwiseOperation,
                                              BElementwiseOperation,
                                              CDEElementwiseOperation,
-                                             ComputeDataType>
+                                             AComputeDataType,
+                                             BComputeDataType>
 {
     using DeviceOp = DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle;

@@ -386,7 +388,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle

     using GemmBDataType = std::conditional_t<!isMultiB && isMultiA, Tuple<BDataType>, BDataType>;
 #define GridwiseGemmTemplateParameters \
-    GemmADataType, GemmBDataType, ComputeDataType, AccDataType, CShuffleDataType, DsDataType, \
+    GemmADataType, GemmBDataType, AComputeDataType, AccDataType, CShuffleDataType, DsDataType, \
        EDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, \
        InMemoryDataOperationEnum::Set, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, \
        KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave, \

@@ -399,7 +401,8 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle

        BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, \
        CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, \
        CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, \
-       CDEBlockTransferScalarPerVector_NPerBlock, LoopSched
+       CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVersion::v1, \
+       BComputeDataType
 // Use appropriate gridwise gemm
 using GridwiseGemm =
     std::conditional_t<isMultiA || isMultiB,
 ...
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -75,13 +75,14 @@ template <index_t NDimSpatial,

           index_t CShuffleNXdlPerWavePerShuffle,
           typename CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
           index_t CDEBlockTransferScalarPerVector_NPerBlock,
-          typename ComputeDataType =
+          typename AComputeDataType =
              decltype(UnpackDataType<is_detected<is_tuple, ADataType>::value,
                                      Number<0>,
                                      ADataType>()), // ComputeType is InputType by default (first
                                                     // in tuple for MultiAB), unpack if tuple was
                                                     // passed
-          LoopScheduler LoopSched = make_default_loop_scheduler()>
+          typename BComputeDataType = AComputeDataType,
+          LoopScheduler LoopSched   = make_default_loop_scheduler()>
 using DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle = DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<
     NDimSpatial,
     ALayout,

@@ -128,7 +129,8 @@ using DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle = DeviceGroupedConvFwdMultipl

     CShuffleNXdlPerWavePerShuffle,
     CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
     CDEBlockTransferScalarPerVector_NPerBlock,
-    ComputeDataType,
+    AComputeDataType,
+    BComputeDataType,
     LoopSched>;

 } // namespace device
 ...
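The new BComputeDataType parameter is inserted ahead of LoopSched and defaults to AComputeDataType, so instantiations that previously supplied a single ComputeDataType keep their meaning. A standalone sketch of that defaulting pattern (ExampleConfig is a hypothetical type, not part of CK):

    #include "ck/utility/data_type.hpp" // ck::half_t, ck::bhalf_t

    // Hypothetical illustration of a defaulted trailing compute-type parameter.
    template <typename AComputeDataType,
              typename BComputeDataType = AComputeDataType> // B side defaults to the A-side type
    struct ExampleConfig
    {
        using ACompute = AComputeDataType;
        using BCompute = BComputeDataType;
    };

    using LegacyStyle = ExampleConfig<ck::half_t>;              // both compute types are half_t
    using MixedStyle  = ExampleConfig<ck::half_t, ck::bhalf_t>; // A computes in fp16, B in bf16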
@@ -23,6 +23,7 @@ namespace device {

 template <typename GridwiseGemm,
           typename GemmDesc,
           GemmSpecialization GemmSpec,
+          bool Zeroing,
           typename ALayout,
           typename BLayout,
           typename DsLayout,

@@ -106,33 +107,63 @@ __global__ void

         const auto block_2_etile_map =
             GroupedGemmBlock2ETileMap(local_b2e_tile_map, BlockStart, id_off);

-        auto barrier_count_finished =
-            barrier_count + group_id * barrier_size_grp + id_local % mn_blocks;
-        GridwiseGemm::template Run<HasMainKBlockLoop,
-                                   EGlobalMemoryDataOperation,
-                                   GemmSpec,
-                                   ALayout,
-                                   BLayout,
-                                   DsLayout,
-                                   ELayout>(gemm_desc_ptr[group_id].p_a_grid,
-                                            gemm_desc_ptr[group_id].p_b_grid,
-                                            p_ds_grid_,
-                                            gemm_desc_ptr[group_id].p_e_grid,
-                                            p_shared,
-                                            barrier_count_finished,
-                                            a_element_op,
-                                            b_element_op,
-                                            c_element_op,
-                                            M,
-                                            N,
-                                            K,
-                                            StrideA,
-                                            StrideB,
-                                            StrideDs,
-                                            StrideE,
-                                            KBatch,
-                                            block_2_etile_map);
+        if constexpr(Zeroing)
+        {
+            auto barrier_count_finished =
+                barrier_count + group_id * barrier_size_grp + id_local % mn_blocks;
+            GridwiseGemm::template RunWithZeroing<HasMainKBlockLoop,
+                                                  EGlobalMemoryDataOperation,
+                                                  GemmSpec,
+                                                  ALayout,
+                                                  BLayout,
+                                                  DsLayout,
+                                                  ELayout>(gemm_desc_ptr[group_id].p_a_grid,
+                                                           gemm_desc_ptr[group_id].p_b_grid,
+                                                           p_ds_grid_,
+                                                           gemm_desc_ptr[group_id].p_e_grid,
+                                                           p_shared,
+                                                           barrier_count_finished,
+                                                           a_element_op,
+                                                           b_element_op,
+                                                           c_element_op,
+                                                           M,
+                                                           N,
+                                                           K,
+                                                           StrideA,
+                                                           StrideB,
+                                                           StrideDs,
+                                                           StrideE,
+                                                           KBatch,
+                                                           block_2_etile_map);
+        }
+        else
+        {
+            GridwiseGemm::template Run<HasMainKBlockLoop,
+                                       EGlobalMemoryDataOperation,
+                                       GemmSpec,
+                                       ALayout,
+                                       BLayout,
+                                       DsLayout,
+                                       ELayout>(gemm_desc_ptr[group_id].p_a_grid,
+                                                gemm_desc_ptr[group_id].p_b_grid,
+                                                p_ds_grid_,
+                                                gemm_desc_ptr[group_id].p_e_grid,
+                                                p_shared,
+                                                nullptr,
+                                                a_element_op,
+                                                b_element_op,
+                                                c_element_op,
+                                                M,
+                                                N,
+                                                K,
+                                                StrideA,
+                                                StrideB,
+                                                StrideDs,
+                                                StrideE,
+                                                KBatch,
+                                                block_2_etile_map);
+        }

         id_off += grid_size_grp;
         id_local += grid_size_grp;
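In short, the new compile-time Zeroing flag routes a group either to RunWithZeroing, which consumes a per-group barrier counter from the workspace, or to the plain Run path, which is handed nullptr instead of a barrier pointer. A reduced sketch of that dispatch shape (run_group, Gemm, and Args are hypothetical stand-ins, not CK symbols):

    // Minimal sketch of a bool-template dispatch between two kernel paths.
    template <bool Zeroing, typename Gemm, typename Args>
    __device__ void run_group(const Args& args, uint32_t* barrier_count_finished)
    {
        if constexpr(Zeroing)
        {
            // split-K path: output zeroing is tracked via the workspace barrier counter
            Gemm::RunWithZeroing(args, barrier_count_finished);
        }
        else
        {
            // k_batch == 1 path: no workspace barrier is needed
            Gemm::Run(args, nullptr);
        }
    }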
@@ -193,8 +224,11 @@ template <typename ALayout,

           index_t CShuffleNXdlPerWavePerShuffle,
           typename CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
           index_t CDEBlockTransferScalarPerVector_NPerBlock,
-          typename ComputeType = ADataType,
-          LoopScheduler LoopSched = make_default_loop_scheduler()>
+          PipelineVersion PipelineVer = PipelineVersion::v1,
+          LoopScheduler LoopSched     = make_default_loop_scheduler(),
+          typename ComputeType        = ADataType,
+          typename ALDSType           = ComputeType,
+          typename BLDSType           = ComputeType>
 struct DeviceGroupedGemm_Xdl_Fixed_NK : public DeviceGroupedGemmFixedNK<ALayout,
                                                                         BLayout,
                                                                         DsLayout,

@@ -215,11 +249,15 @@ struct DeviceGroupedGemm_Xdl_Fixed_NK : public DeviceGroupedGemmFixedNK<ALayout,

     static constexpr auto I1 = Number<1>{};
     static constexpr auto I2 = Number<2>{};

+    using AComputeType = ComputeType;
+    using BComputeType = ComputeType;
+
     // GridwiseGemm
     using GridwiseGemm = GridwiseGemmMultipleD_xdl_splitk_cshuffle<
         ADataType, // TODO: distinguish A/B datatype
         BDataType,
-        ComputeType,
+        AComputeType,
+        BComputeType,
         AccDataType,
         CShuffleDataType,
         DsDataType,

@@ -258,7 +296,10 @@ struct DeviceGroupedGemm_Xdl_Fixed_NK : public DeviceGroupedGemmFixedNK<ALayout,

         CShuffleNXdlPerWavePerShuffle,
         CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
         CDEBlockTransferScalarPerVector_NPerBlock,
-        LoopSched>;
+        LoopSched,
+        PipelineVer,
+        ALDSType,
+        BLDSType>;

     template <typename UnderlyingBlockToCTileMap>
     struct OffsettedBlockToCTileMapMLoops
@@ -613,45 +654,85 @@ struct DeviceGroupedGemm_Xdl_Fixed_NK : public DeviceGroupedGemmFixedNK<ALayout,

         float ave_time = 0;

         auto launch_kernel = [&](auto has_main_k_block_loop_, auto e_global_memory_operation_) {
-            const auto kernel =
-                kernel_grouped_gemm_xdl_fixed_nk<GridwiseGemm,
-                                                 GroupedGemmKernelArgument<NumDTensor>,
-                                                 GemmSpec,
-                                                 ALayout,
-                                                 BLayout,
-                                                 DsLayout,
-                                                 ELayout,
-                                                 DsDataType,
-                                                 Block2ETileMap,
-                                                 GroupedGemmBlock2ETileMap,
-                                                 AElementwiseOperation,
-                                                 BElementwiseOperation,
-                                                 CDEElementwiseOperation,
-                                                 e_global_memory_operation_,
-                                                 has_main_k_block_loop_>;
-
-            return launch_and_time_kernel(
-                stream_config,
-                kernel,
-                dim3(arg.grid_size_),
-                dim3(BlockSize),
-                0,
-                cast_pointer_to_constant_address_space(arg.grouped_gemm_kernel_args_dev),
-                reinterpret_cast<uint32_t*>(arg.p_workspace_),
-                arg.barrier_size_grp_,
-                arg.gemm_desc_kernel_arg_.size(),
-                arg.grid_size_grp_,
-                arg.k_batch_,
-                arg.a_element_op_,
-                arg.b_element_op_,
-                arg.c_element_op_);
+            if(arg.k_batch_ == 1)
+            {
+                const auto kernel =
+                    kernel_grouped_gemm_xdl_fixed_nk<GridwiseGemm,
+                                                     GroupedGemmKernelArgument<NumDTensor>,
+                                                     GemmSpec,
+                                                     false,
+                                                     ALayout,
+                                                     BLayout,
+                                                     DsLayout,
+                                                     ELayout,
+                                                     DsDataType,
+                                                     Block2ETileMap,
+                                                     GroupedGemmBlock2ETileMap,
+                                                     AElementwiseOperation,
+                                                     BElementwiseOperation,
+                                                     CDEElementwiseOperation,
+                                                     e_global_memory_operation_,
+                                                     has_main_k_block_loop_>;
+
+                return launch_and_time_kernel(
+                    stream_config,
+                    kernel,
+                    dim3(arg.grid_size_),
+                    dim3(BlockSize),
+                    0,
+                    cast_pointer_to_constant_address_space(arg.grouped_gemm_kernel_args_dev),
+                    nullptr,
+                    arg.barrier_size_grp_,
+                    arg.gemm_desc_kernel_arg_.size(),
+                    arg.grid_size_grp_,
+                    arg.k_batch_,
+                    arg.a_element_op_,
+                    arg.b_element_op_,
+                    arg.c_element_op_);
+            }
+            else
+            {
+                const auto kernel =
+                    kernel_grouped_gemm_xdl_fixed_nk<GridwiseGemm,
+                                                     GroupedGemmKernelArgument<NumDTensor>,
+                                                     GemmSpec,
+                                                     true,
+                                                     ALayout,
+                                                     BLayout,
+                                                     DsLayout,
+                                                     ELayout,
+                                                     DsDataType,
+                                                     Block2ETileMap,
+                                                     GroupedGemmBlock2ETileMap,
+                                                     AElementwiseOperation,
+                                                     BElementwiseOperation,
+                                                     CDEElementwiseOperation,
+                                                     e_global_memory_operation_,
+                                                     has_main_k_block_loop_>;
+
+                return launch_and_time_kernel(
+                    stream_config,
+                    kernel,
+                    dim3(arg.grid_size_),
+                    dim3(BlockSize),
+                    0,
+                    cast_pointer_to_constant_address_space(arg.grouped_gemm_kernel_args_dev),
+                    reinterpret_cast<uint32_t*>(arg.p_workspace_),
+                    arg.barrier_size_grp_,
+                    arg.gemm_desc_kernel_arg_.size(),
+                    arg.grid_size_grp_,
+                    arg.k_batch_,
+                    arg.a_element_op_,
+                    arg.b_element_op_,
+                    arg.c_element_op_);
+            }
         };

         constexpr auto AtomicAdd = InMemoryDataOperationEnum::AtomicAdd;
         constexpr auto Set = InMemoryDataOperationEnum::Set;

-        // For bf16 datatype only kbatch = 1 scenario is supported. This condition is enforced
-        // in IsSupportedArgument function
+        // For bf16 datatype only kbatch = 1 scenario is supported. This condition is
+        // enforced in IsSupportedArgument function
         if constexpr(std::is_same<ADataType, ck::bhalf_t>::value)
         {
             if(has_main_k_block_loop)

@@ -719,12 +800,12 @@ struct DeviceGroupedGemm_Xdl_Fixed_NK : public DeviceGroupedGemmFixedNK<ALayout,

         bool supported = true;

-        // If we use padding we do not support vector loads for dimensions not divisible by vector
-        // load size.
+        // If we use padding we do not support vector loads for dimensions not divisible by
+        // vector load size.
         if constexpr(GemmSpec != GemmSpecialization::Default)
         {
-            // [A|B]BlockTransferSrcVectorDim value define dimension in the block {K0,M,K1} layout,
-            // thus we have to adapt it to the {M,K} or {N,K} layout.
+            // [A|B]BlockTransferSrcVectorDim value define dimension in the block {K0,M,K1}
+            // layout, thus we have to adapt it to the {M,K} or {N,K} layout.
             const auto a_raw_vector_dim = ABlockTransferSrcVectorDim != 1 ? 1 : 0;
             const auto b_raw_vector_dim = BBlockTransferSrcVectorDim != 1 ? 1 : 0;
 ...
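The comment above describes a divisibility rule: with a padded GemmSpecialization, the raw length along the vector-access dimension must be a multiple of the vector width, and the SrcVectorDim given in the {K0, M, K1} block layout first has to be remapped onto the raw {M, K} or {N, K} view. A hypothetical helper (not CK code) capturing just that rule:

    #include <cstdint>

    // Hypothetical helper: 'raw_vector_dim' is 0 for the M/N dimension and 1 for the K
    // dimension, after remapping from the {K0, M, K1} block layout.
    bool vector_access_ok(std::int64_t raw_len_mn,
                          std::int64_t raw_len_k,
                          int raw_vector_dim,
                          std::int64_t scalar_per_vector)
    {
        const std::int64_t len = (raw_vector_dim == 1) ? raw_len_k : raw_len_mn;
        return len % scalar_per_vector == 0; // padded dims that are not multiples are rejected
    }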
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -26,13 +26,19 @@ namespace device {

 template <typename GridwiseGemm,
           typename GemmDesc,
           bool HasMainKBlockLoop,
-          InMemoryDataOperationEnum CGlobalMemoryDataOperation>
+          InMemoryDataOperationEnum CGlobalMemoryDataOperation,
+          typename AElementwiseOperation = ck::tensor_operation::element_wise::PassThrough,
+          typename BElementwiseOperation = ck::tensor_operation::element_wise::PassThrough,
+          typename CElementwiseOperation = ck::tensor_operation::element_wise::PassThrough>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
     __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
 #endif
         kernel_grouped_gemm_xdl_splitk(const void CK_CONSTANT_ADDRESS_SPACE* gemm_descs_const,
-                                       const index_t group_count)
+                                       const index_t group_count,
+                                       const AElementwiseOperation a_element_op,
+                                       const BElementwiseOperation b_element_op,
+                                       const CElementwiseOperation c_element_op)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
     defined(__gfx94__))

@@ -64,10 +70,16 @@ __global__ void

         GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation>(
             gemm_desc_ptr[group_id].karg_,
             static_cast<void*>(p_shared),
-            gemm_desc_ptr[group_id].block_2_ctile_map_);
+            gemm_desc_ptr[group_id].block_2_ctile_map_,
+            a_element_op,
+            b_element_op,
+            c_element_op);
 #else
     ignore = gemm_descs_const;
     ignore = group_count;
+    ignore = a_element_op;
+    ignore = b_element_op;
+    ignore = c_element_op;
 #endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
 }

@@ -193,7 +205,7 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo

     static constexpr index_t B2E_M01 = 8;
     using GroupedGemmBlock2ETileMap  = OffsettedBlockToCTileMap<Block2ETileMapKSplit>;
     using KernelArgument             = typename GridwiseGemm::Argument;
+    using PassThrough                = ck::tensor_operation::element_wise::PassThrough;

     struct GemmTransKernelArg
     {
         KernelArgument karg_;

@@ -437,7 +449,10 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo

                 dim3(BlockSize),
                 0,
                 cast_pointer_to_constant_address_space(arg.p_workspace_),
-                arg.gemm_kernel_args_.size());
+                arg.gemm_kernel_args_.size(),
+                PassThrough{},
+                PassThrough{},
+                PassThrough{});
         };

         if(all_have_main_k0_block_loop)
 ...
@@ -92,6 +92,110 @@ struct Add

     };
 };

+struct Max
+{
+    template <typename Y, typename X0, typename X1>
+    __host__ __device__ void operator()(Y& y, const X0& x0, const X1& x1) const
+    {
+        const Y x0_converted = type_convert<Y>(x0);
+        const Y x1_converted = type_convert<Y>(x1);
+        y = ck::math::max(x0_converted, x1_converted);
+    }
+};
+
+struct Min
+{
+    template <typename Y, typename X0, typename X1>
+    __host__ __device__ void operator()(Y& y, const X0& x0, const X1& x1) const
+    {
+        const Y x0_converted = type_convert<Y>(x0);
+        const Y x1_converted = type_convert<Y>(x1);
+        y = ck::math::min(x0_converted, x1_converted);
+    }
+};
+
+struct Multiply
+{
+    template <typename Y, typename X0, typename X1>
+    __host__ __device__ constexpr void operator()(Y& y, const X0& x0, const X1& x1) const;
+
+    template <>
+    __host__ __device__ constexpr void
+    operator()<float>(float& y, const float& x0, const float& x1) const
+    {
+        y = x0 * x1;
+    };
+
+    template <>
+    __host__ __device__ constexpr void
+    operator()<double>(double& y, const double& x0, const double& x1) const
+    {
+        y = x0 * x1;
+    };
+
+    template <>
+    __host__ __device__ constexpr void
+    operator()<float>(float& y, const float& x0, const half_t& x1) const
+    {
+        y = x0 * type_convert<half_t>(x1);
+    };
+
+    template <>
+    __host__ __device__ constexpr void
+    operator()<half_t>(half_t& y, const float& x0, const float& x1) const
+    {
+        y = type_convert<half_t>(x0 * x1);
+    };
+
+    template <>
+    __host__ __device__ constexpr void
+    operator()<half_t>(half_t& y, const float& x0, const half_t& x1) const
+    {
+        y = type_convert<half_t>(x0) * x1;
+    };
+
+    template <>
+    __host__ __device__ constexpr void
+    operator()<half_t>(half_t& y, const half_t& x0, const half_t& x1) const
+    {
+        y = x0 * x1;
+    };
+
+    template <>
+    __host__ __device__ constexpr void
+    operator()<float>(float& y, const float& x0, const bhalf_t& x1) const
+    {
+        const float x1_tmp = ck::type_convert<float>(x1);
+        y = x0 * x1_tmp;
+    }
+
+    template <>
+    __host__ __device__ constexpr void
+    operator()<bhalf_t>(bhalf_t& y, const bhalf_t& x0, const bhalf_t& x1) const
+    {
+        const float x1_tmp = ck::type_convert<float>(x0);
+        const float x2_tmp = ck::type_convert<float>(x1);
+        const float y_tmp  = x1_tmp * x2_tmp;
+        y = ck::type_convert<bhalf_t>(y_tmp);
+    }
+
+    template <>
+    __host__ __device__ constexpr void
+    operator()<bhalf_t>(bhalf_t& y, const float& x0, const bhalf_t& x1) const
+    {
+        const float x2_tmp = ck::type_convert<float>(x1);
+        const float y_tmp  = x0 * x2_tmp;
+        y = ck::type_convert<bhalf_t>(y_tmp);
+    }
+
+    template <>
+    __host__ __device__ constexpr void
+    operator()<int8_t>(int8_t& y, const int8_t& x0, const int8_t& x1) const
+    {
+        y = x0 * x1;
+    };
+};
+
 struct ScaleAdd
 {
     __host__ __device__ ScaleAdd(float scale = 1.f) : scale_(scale) {}
 ...
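A short usage sketch for the new binary functors (illustrative host-side calls, assuming the element_wise headers are included): each takes the destination by reference followed by the two operands, and Multiply dispatches on the concrete type combination via its specializations.

    namespace ew = ck::tensor_operation::element_wise;

    inline float demo_binary_ops()
    {
        float y = 0.f;
        ew::Multiply{}(y, 3.0f, 4.0f); // float x float specialization: y == 12.f
        ew::Max{}(y, y, 20.0f);        // operands converted to Y, then ck::math::max: y == 20.f
        ew::Min{}(y, y, 5.0f);         // y == 5.f
        return y;
    }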
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck/utility/data_type.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

namespace ck {
namespace tensor_operation {
namespace element_wise {

// y = UnaryOp0(UnaryOp1(...(x)))
template <typename... UnaryOpsSet>
struct UnaryCombinedOp
{
    __host__ __device__ UnaryCombinedOp(UnaryOpsSet... unary_ops) : unary_ops_(unary_ops...) {}

    template <typename Y, typename X>
    __host__ __device__ void operator()(Y& y, const X& x) const
    {
        // Execute first unary op to copy data to y
        unary_ops_.At(Number<0>{})(y, x);
        static_for<1, Tuple<UnaryOpsSet...>::Size(), 1>{}([&](auto i) { unary_ops_.At(i)(y, y); });
    };

    Tuple<UnaryOpsSet...> unary_ops_;
};

// y = BinaryOp(UnaryOp0(x0), UnaryOp1(x1))
template <typename BinaryOp, typename UnaryOp0, typename UnaryOp1>
struct BinaryWithUnaryCombinedOp
{
    __host__ __device__ BinaryWithUnaryCombinedOp(BinaryOp binary_op,
                                                  UnaryOp0 unary_op0,
                                                  UnaryOp1 unary_op1)
        : binary_op_(binary_op), unary_op0_(unary_op0), unary_op1_(unary_op1)
    {
    }

    template <typename Y, typename X0, typename X1>
    __host__ __device__ void operator()(Y& y, const X0& x0, const X1& x1) const
    {
        Y unary_x0_tmp_result;
        Y unary_x1_tmp_result;

        unary_op0_(unary_x0_tmp_result, x0);
        unary_op1_(unary_x1_tmp_result, x1);
        binary_op_(y, unary_x0_tmp_result, unary_x1_tmp_result);
    };

    private:
    BinaryOp binary_op_;
    UnaryOp0 unary_op0_;
    UnaryOp1 unary_op1_;
};

// y = BinaryOp0(BinaryOp1(UnaryOp0(x0), UnaryOp1(x1)), UnaryOp2(x2))
template <typename BinaryOp0,
          typename BinaryOp1,
          typename UnaryOp0,
          typename UnaryOp1,
          typename UnaryOp2>
struct TrinaryWithUnaryCombinedOp
{
    __host__ __device__ TrinaryWithUnaryCombinedOp(BinaryOp0 binary_op0,
                                                   BinaryOp0 binary_op1,
                                                   UnaryOp0 unary_op0,
                                                   UnaryOp1 unary_op1,
                                                   UnaryOp2 unary_op2)
        : binary_op0_(binary_op0),
          binary_op1_(binary_op1),
          unary_op0_(unary_op0),
          unary_op1_(unary_op1),
          unary_op2_(unary_op2)
    {
    }

    template <typename Y, typename X0, typename X1, typename X2>
    __host__ __device__ void operator()(Y& y, const X0& x0, const X1& x1, const X2& x2) const
    {
        Y unary_x0_tmp_result;
        Y unary_x1_tmp_result;
        Y unary_x2_tmp_result;

        unary_op0_(unary_x0_tmp_result, x0);
        unary_op1_(unary_x1_tmp_result, x1);
        unary_op2_(unary_x2_tmp_result, x2);
        binary_op0_(unary_x0_tmp_result, unary_x0_tmp_result, unary_x1_tmp_result);
        binary_op1_(y, unary_x0_tmp_result, unary_x2_tmp_result);
    };

    private:
    BinaryOp0 binary_op0_{};
    BinaryOp1 binary_op1_{};
    UnaryOp0 unary_op0_{};
    UnaryOp1 unary_op1_{};
    UnaryOp2 unary_op2_{};
};

} // namespace element_wise
} // namespace tensor_operation
} // namespace ck
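As an illustration of how these combiners compose (hypothetical usage, not part of the commit; it relies only on the PassThrough and Multiply functors that appear elsewhere in this diff):

    namespace ew = ck::tensor_operation::element_wise;

    inline float demo_combined_ops()
    {
        // y = Multiply(PassThrough(x0), PassThrough(x1)) == x0 * x1
        ew::BinaryWithUnaryCombinedOp<ew::Multiply, ew::PassThrough, ew::PassThrough> mul_combined{
            ew::Multiply{}, ew::PassThrough{}, ew::PassThrough{}};

        float y = 0.f;
        mul_combined(y, 2.0f, 3.0f); // y == 6.f

        // The first unary op copies into y, later ops are then applied to y in place.
        ew::UnaryCombinedOp<ew::PassThrough, ew::PassThrough> chain{ew::PassThrough{},
                                                                    ew::PassThrough{}};
        chain(y, y); // y unchanged: PassThrough applied twice
        return y;
    }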
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -73,7 +73,7 @@ template <typename ADataType,

           index_t CDEShuffleBlockTransferScalarPerVector_NPerBlock,
           LoopScheduler LoopSched,
           PipelineVersion PipelineVer = PipelineVersion::v1,
-          typename BComputeDataType = AComputeDataType_>
+          typename BComputeDataType_ = AComputeDataType_>
 struct GridwiseGemmMultipleD_xdl_cshuffle
 {
     static constexpr index_t NumDTensor = DsDataType::Size();

@@ -103,8 +103,11 @@ struct GridwiseGemmMultipleD_xdl_cshuffle

 #if CK_WORKAROUND_DENORM_FIX
     using AComputeDataType =
         conditional_t<is_same_v<AComputeDataType_, ck::half_t>, ck::bhalf_t, AComputeDataType_>;
+    using BComputeDataType =
+        conditional_t<is_same_v<BComputeDataType_, ck::half_t>, ck::bhalf_t, BComputeDataType_>;
 #else
     using AComputeDataType = AComputeDataType_;
+    using BComputeDataType = BComputeDataType_;
 #endif

     __host__ __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1()
 ...
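Under CK_WORKAROUND_DENORM_FIX the compute type is remapped at compile time, fp16 to bf16 and everything else unchanged. A self-contained sketch of the same conditional_t pattern using standard type traits (DenormSafeCompute is a hypothetical alias, not a CK name):

    #include <type_traits>
    #include "ck/utility/data_type.hpp" // ck::half_t, ck::bhalf_t

    // Map half_t compute types to bhalf_t; leave all other compute types untouched.
    template <typename ComputeT>
    using DenormSafeCompute =
        std::conditional_t<std::is_same_v<ComputeT, ck::half_t>, ck::bhalf_t, ComputeT>;

    static_assert(std::is_same_v<DenormSafeCompute<ck::half_t>, ck::bhalf_t>, "fp16 remapped to bf16");
    static_assert(std::is_same_v<DenormSafeCompute<float>, float>, "other types pass through");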