merge develop

b5ada11b · Jing Zhang · cee92951 · b6eaf3eb · b5ada11b · b5ada11b
Commit b5ada11b authored Jun 01, 2022 by Jing Zhang
20 changed files
--- a/include/ck/tensor_operation/gpu/device/device_cgemm_4gemm_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_cgemm_4gemm_xdl_cshuffle.hpp
--- a/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
@@ -417,6 +417,8 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
    using BGridDesc_K0_N_K1 = remove_cvref_t<decltype(ABCGridDescs{}[I1])>;
    using CGridDesc_M_N     = remove_cvref_t<decltype(ABCGridDescs{}[I2])>;

+    using Block2CTileMap = BlockToCTileMap_M00_N0_M01<MPerBlock, NPerBlock, CGridDesc_M_N>;
+
    // GridwiseGemm
    using GridwiseGemm = GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1<
        BlockSize,
@@ -477,8 +479,6 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
                 std::vector<ck::index_t> conv_filter_dilations,
                 std::vector<ck::index_t> input_left_pads,
                 std::vector<ck::index_t> input_right_pads,
-                 ck::index_t M01,
-                 ck::index_t N01,
                 InElementwiseOperation in_element_op,
                 WeiElementwiseOperation wei_element_op,
                 OutElementwiseOperation out_element_op)
@@ -490,8 +490,6 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
              c_grid_desc_m_n_{},
              c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_{},
              block_2_ctile_map_{},
-              M01_{M01},
-              N01_{N01},
              in_element_op_{in_element_op},
              wei_element_op_{wei_element_op},
              out_element_op_{out_element_op},
@@ -520,10 +518,9 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W

            a_grid_desc_k0_m_k1_ = descs[I0];
            b_grid_desc_k0_n_k1_ = descs[I1];
-            block_2_ctile_map_ =
-                GridwiseGemm::MakeDefaultBlock2CTileMap(c_grid_desc_m_n_, M01, N01);
+            c_grid_desc_m_n_     = descs[I2];

-            c_grid_desc_m_n_ = descs[I2];
+            block_2_ctile_map_ = Block2CTileMap{c_grid_desc_m_n_};

            if(GridwiseGemm::CheckValidity(a_grid_desc_k0_m_k1_,
                                           b_grid_desc_k0_n_k1_,
@@ -546,9 +543,7 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
        typename GridwiseGemm::
            CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl
                c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_;
-        typename GridwiseGemm::DefaultBlock2CTileMap block_2_ctile_map_;
-        index_t M01_;
-        index_t N01_;
+        Block2CTileMap block_2_ctile_map_;
        InElementwiseOperation in_element_op_;
        WeiElementwiseOperation wei_element_op_;
        OutElementwiseOperation out_element_op_;
@@ -661,7 +656,7 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
                    InElementwiseOperation,
                    WeiElementwiseOperation,
                    OutElementwiseOperation,
-                    remove_reference_t<typename GridwiseGemm::DefaultBlock2CTileMap>,
+                    Block2CTileMap,
                    true>;

                ave_time = launch_and_time_kernel(
@@ -695,7 +690,7 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
                    InElementwiseOperation,
                    WeiElementwiseOperation,
                    OutElementwiseOperation,
-                    remove_reference_t<typename GridwiseGemm::DefaultBlock2CTileMap>,
+                    Block2CTileMap,
                    false>;

                ave_time = launch_and_time_kernel(
@@ -814,8 +809,6 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
                        conv_filter_dilations,
                        input_left_pads,
                        input_right_pads,
-                        1,
-                        1,
                        in_element_op,
                        wei_element_op,
                        out_element_op};
@@ -854,8 +847,6 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
                                          conv_filter_dilations,
                                          input_left_pads,
                                          input_right_pads,
-                                          1,
-                                          1,
                                          in_element_op,
                                          wei_element_op,
                                          out_element_op);

--- a/include/ck/tensor_operation/gpu/device/device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp
-#ifndef DEVICE_CONVND_FWD_XDL_NHWC_KYXC_NHWK_HPP
-#define DEVICE_CONVND_FWD_XDL_NHWC_KYXC_NHWK_HPP
+#pragma once

 #include <functional>
 #include <iostream>
@@ -8,6 +7,7 @@
 #include <sstream>

 #include "device.hpp"
+#include "device_prop.hpp"
 #include "device_base.hpp"
 #include "device_conv_fwd.hpp"
 #include "convolution_forward_specialization.hpp"
@@ -607,6 +607,8 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
    using BGridDesc_K0_N_K1 = remove_cvref_t<decltype(ABCGridDescs{}[I1])>;
    using CGridDesc_M_N     = remove_cvref_t<decltype(ABCGridDescs{}[I2])>;

+    using Block2CTileMap = BlockToCTileMap_M00_N0_M01<MPerBlock, NPerBlock, CGridDesc_M_N>;
+
    // GridwiseGemm
    using GridwiseGemm = GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3<
        BlockSize,
@@ -664,8 +666,6 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
                 std::vector<ck::index_t> conv_filter_dilations,
                 std::vector<ck::index_t> input_left_pads,
                 std::vector<ck::index_t> input_right_pads,
-                 ck::index_t M01,
-                 ck::index_t N01,
                 InElementwiseOperation in_element_op,
                 WeiElementwiseOperation wei_element_op,
                 OutElementwiseOperation out_element_op)
@@ -677,8 +677,6 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
              c_grid_desc_m_n_{},
              c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_{},
              block_2_ctile_map_{},
-              M01_{M01},
-              N01_{N01},
              in_element_op_{in_element_op},
              wei_element_op_{wei_element_op},
              out_element_op_{out_element_op},
@@ -705,8 +703,8 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
            a_grid_desc_k0_m_k1_ = descs[I0];
            b_grid_desc_k0_n_k1_ = descs[I1];
            c_grid_desc_m_n_     = descs[I2];
-            block_2_ctile_map_ =
-                GridwiseGemm::MakeDefaultBlock2CTileMap(c_grid_desc_m_n_, M01, N01);
+
+            block_2_ctile_map_ = Block2CTileMap{c_grid_desc_m_n_};

            if(GridwiseGemm::CheckValidity(a_grid_desc_k0_m_k1_,
                                           b_grid_desc_k0_n_k1_,
@@ -727,9 +725,7 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
        CGridDesc_M_N c_grid_desc_m_n_;
        typename GridwiseGemm::CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2
            c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_;
-        typename GridwiseGemm::DefaultBlock2CTileMap block_2_ctile_map_;
-        index_t M01_;
-        index_t N01_;
+        Block2CTileMap block_2_ctile_map_;
        InElementwiseOperation in_element_op_;
        WeiElementwiseOperation wei_element_op_;
        OutElementwiseOperation out_element_op_;
@@ -793,7 +789,7 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
                    InElementwiseOperation,
                    WeiElementwiseOperation,
                    OutElementwiseOperation,
-                    remove_reference_t<typename GridwiseGemm::DefaultBlock2CTileMap>,
+                    Block2CTileMap,
                    true>;

                ave_time = launch_and_time_kernel(stream_config,
@@ -824,7 +820,7 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
                    InElementwiseOperation,
                    WeiElementwiseOperation,
                    OutElementwiseOperation,
-                    remove_reference_t<typename GridwiseGemm::DefaultBlock2CTileMap>,
+                    Block2CTileMap,
                    false>;

                ave_time = launch_and_time_kernel(stream_config,
@@ -862,6 +858,27 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K

    static bool IsSupportedArgument(const Argument& arg)
    {
+        if(ck::get_device_name() == "gfx908")
+        {
+            if constexpr(!(is_same_v<AccDataType, float> || is_same_v<AccDataType, float> ||
+                           is_same_v<AccDataType, int32_t>))
+            {
+                return false;
+            }
+        }
+        else if(ck::get_device_name() == "gfx90a")
+        {
+            if constexpr(!(is_same_v<AccDataType, float> || is_same_v<AccDataType, float> ||
+                           is_same_v<AccDataType, int32_t> || is_same_v<AccDataType, double>))
+            {
+                return false;
+            }
+        }
+        else
+        {
+            return false;
+        }
+
        // Input tensors can't be bigger than 2GB each.
        constexpr ck::long_index_t GB2 = (ck::long_index_t{1} << 31);

@@ -955,8 +972,6 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
                        conv_filter_dilations,
                        input_left_pads,
                        input_right_pads,
-                        1,
-                        1,
                        in_element_op,
                        wei_element_op,
                        out_element_op};
@@ -995,8 +1010,6 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
                                          conv_filter_dilations,
                                          input_left_pads,
                                          input_right_pads,
-                                          1,
-                                          1,
                                          in_element_op,
                                          wei_element_op,
                                          out_element_op);
@@ -1012,8 +1025,7 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
        auto str = std::stringstream();

        // clang-format off
-        str << "DeviceConv" << std::to_string(NumDimSpatial)
-            << "DFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K"
+        str << "DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K"
            << "<"
            << BlockSize << ", "
            << MPerBlock << ", "
@@ -1030,4 +1042,3 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/include/ck/tensor_operation/gpu/device/device_gemm_dl.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_dl.hpp
@@ -4,6 +4,7 @@
 #include <sstream>

 #include "device.hpp"
+#include "device_prop.hpp"
 #include "device_base.hpp"
 #include "device_gemm.hpp"
 #include "common_header.hpp"
@@ -13,7 +14,6 @@
 #include "gemm_specialization.hpp"
 #include "element_wise_operation.hpp"
 #include "gridwise_gemm_dl_v1r3.hpp"
-#include "device_prop.hpp"

 namespace ck {
 namespace tensor_operation {
@@ -60,8 +60,8 @@ template <
    index_t CThreadTransferDstScalarPerVector,
    enable_if_t<
        is_same_v<AElementwiseOperation, ck::tensor_operation::element_wise::PassThrough> &&
-            is_same_v<AElementwiseOperation, ck::tensor_operation::element_wise::PassThrough> &&
-            is_same_v<AElementwiseOperation, ck::tensor_operation::element_wise::PassThrough>,
+            is_same_v<BElementwiseOperation, ck::tensor_operation::element_wise::PassThrough> &&
+            is_same_v<CElementwiseOperation, ck::tensor_operation::element_wise::PassThrough>,
        bool> = false>
 struct DeviceGemmDl
    : public DeviceGemm<AElementwiseOperation, BElementwiseOperation, CElementwiseOperation>

--- a/include/ck/tensor_operation/gpu/device/device_gemm_reduce.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_reduce.hpp
@@ -11,7 +11,7 @@ template <typename DPtrsGlobal,
          typename BElementwiseOperation,
          typename CElementwiseOperation,
          typename DxsInElementwiseOperation,
-          typename DxsOutElementwiseOperation>
+          typename DxsAccElementwiseOperation>
 struct DeviceGemmReduce : public BaseOperator
 {
    virtual std::unique_ptr<BaseArgument>
@@ -29,7 +29,7 @@ struct DeviceGemmReduce : public BaseOperator
                        BElementwiseOperation b_element_op,
                        CElementwiseOperation c_element_op,
                        DxsInElementwiseOperation dxs_in_element_op,
-                        DxsOutElementwiseOperation dxs_out_element_op,
+                        DxsAccElementwiseOperation dxs_out_element_op,
                        ck::index_t BatchCount = 1) = 0;

    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
@@ -40,13 +40,13 @@ template <typename DPtrsGlobal,
          typename BElementwiseOperation,
          typename CElementwiseOperation,
          typename DxsInElementwiseOperation,
-          typename DxsOutElementwiseOperation>
+          typename DxsAccElementwiseOperation>
 using DeviceGemmReducePtr = std::unique_ptr<DeviceGemmReduce<DPtrsGlobal,
                                                             AElementwiseOperation,
                                                             BElementwiseOperation,
                                                             CElementwiseOperation,
                                                             DxsInElementwiseOperation,
-                                                             DxsOutElementwiseOperation>>;
+                                                             DxsAccElementwiseOperation>>;

 } // namespace device
 } // namespace tensor_operation

--- a/include/ck/tensor_operation/gpu/device/device_gemm_reduce_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_reduce_xdl_cshuffle.hpp
@@ -32,7 +32,7 @@ template <typename ALayout,
          typename CElementwiseOperation,
          typename DxsReduceOperation,
          typename DxsInElementwiseOperation,
-          typename DxsOutElementwiseOperation,
+          typename DxsAccElementwiseOperation,
          typename DGlobalMemoryDataOperation,
          GemmSpecialization GemmSpec,
          index_t NumGemmKPrefetchStage,
@@ -73,7 +73,7 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<DPtrsGlobal,
                                                               BElementwiseOperation,
                                                               CElementwiseOperation,
                                                               DxsInElementwiseOperation,
-                                                               DxsOutElementwiseOperation>
+                                                               DxsAccElementwiseOperation>
 {
    using DeviceOp = DeviceGemmReduce_Xdl_CShuffle;

@@ -389,7 +389,7 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<DPtrsGlobal,
        CElementwiseOperation,
        DxsReduceOperation,
        DxsInElementwiseOperation,
-        DxsOutElementwiseOperation,
+        DxsAccElementwiseOperation,
        InMemoryDataOperationEnum::Set,
        DGlobalMemoryDataOperation,
        AGridDesc_AK0_M_AK1,
@@ -449,7 +449,7 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<DPtrsGlobal,
                 BElementwiseOperation b_element_op,
                 CElementwiseOperation c_element_op,
                 DxsInElementwiseOperation dxs_in_element_op,
-                 DxsOutElementwiseOperation dxs_out_element_op)
+                 DxsAccElementwiseOperation dxs_out_element_op)
            : p_a_grid_{p_a_grid},
              p_b_grid_{p_b_grid},
              p_c_grid_{p_c_grid},
@@ -498,7 +498,7 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<DPtrsGlobal,
        BElementwiseOperation b_element_op_;
        CElementwiseOperation c_element_op_;
        DxsInElementwiseOperation dxs_in_element_op_;
-        DxsOutElementwiseOperation dxs_out_element_op_;
+        DxsAccElementwiseOperation dxs_out_element_op_;
    };

    // Invoker
@@ -554,7 +554,7 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<DPtrsGlobal,
                    BElementwiseOperation,
                    CElementwiseOperation,
                    DxsInElementwiseOperation,
-                    DxsOutElementwiseOperation,
+                    DxsAccElementwiseOperation,
                    DeviceOp::AGridDesc_AK0_M_AK1,
                    DeviceOp::BGridDesc_BK0_N_BK1,
                    typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
@@ -594,7 +594,7 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<DPtrsGlobal,
                    BElementwiseOperation,
                    CElementwiseOperation,
                    DxsInElementwiseOperation,
-                    DxsOutElementwiseOperation,
+                    DxsAccElementwiseOperation,
                    DeviceOp::AGridDesc_AK0_M_AK1,
                    DeviceOp::BGridDesc_BK0_N_BK1,
                    typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
@@ -669,7 +669,7 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<DPtrsGlobal,
                             BElementwiseOperation b_element_op,
                             CElementwiseOperation c_element_op,
                             DxsInElementwiseOperation dxs_in_element_op,
-                             DxsOutElementwiseOperation dxs_out_element_op)
+                             DxsAccElementwiseOperation dxs_out_element_op)
    {
        return Argument{p_a,
                        p_b,
@@ -705,7 +705,7 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<DPtrsGlobal,
                                                      BElementwiseOperation b_element_op,
                                                      CElementwiseOperation c_element_op,
                                                      DxsInElementwiseOperation dxs_in_element_op,
-                                                      DxsOutElementwiseOperation dxs_out_element_op,
+                                                      DxsAccElementwiseOperation dxs_out_element_op,
                                                      index_t /* KBatch */ = 1) override
    {
        return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),

--- a/include/ck/tensor_operation/gpu/device/device_gemm_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_xdl.hpp
@@ -3,6 +3,7 @@
 #include <iostream>
 #include <sstream>
 #include "device.hpp"
+#include "device_prop.hpp"
 #include "device_base.hpp"
 #include "device_gemm.hpp"
 #include "common_header.hpp"
@@ -11,7 +12,6 @@
 #include "tensor_descriptor_helper.hpp"
 #include "gridwise_gemm_xdlops_v2r3.hpp"
 #include "gemm_specialization.hpp"
-#include "device_prop.hpp"

 namespace ck {
 namespace tensor_operation {
@@ -408,7 +408,23 @@ struct DeviceGemmXdl

    static bool IsSupportedArgument(const Argument& arg)
    {
-        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a"))
+        if(ck::get_device_name() == "gfx908")
+        {
+            if constexpr(!(is_same_v<AccDataType, float> || is_same_v<AccDataType, float> ||
+                           is_same_v<AccDataType, int32_t>))
+            {
+                return false;
+            }
+        }
+        else if(ck::get_device_name() == "gfx90a")
+        {
+            if constexpr(!(is_same_v<AccDataType, float> || is_same_v<AccDataType, float> ||
+                           is_same_v<AccDataType, int32_t> || is_same_v<AccDataType, double>))
+            {
+                return false;
+            }
+        }
+        else
        {
            return false;
        }

--- a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp
@@ -65,7 +65,7 @@ __global__ void
        c_element_op,
        gemm_desc_ptr[group_id].grouped_gemm_block_2_ctile_map_);
 #else
-    ignore = gemm_descs;
+    ignore = gemm_descs_const;
    ignore = group_count;
    ignore = a_element_op;
    ignore = b_element_op;
@@ -320,7 +320,6 @@ struct DeviceGroupedGemmXdl
            return block_2_ctile_map_.CheckValidity(c_grid_desc_m_n);
        }

-        private:
        typename GridwiseGemm::DefaultBlock2CTileMap block_2_ctile_map_;
        ck::index_t BlockStart_;
    };
@@ -394,9 +393,8 @@ struct DeviceGroupedGemmXdl
                    DeviceGroupedGemmXdl::MakeCGridDescriptor_M_N(M, N, StrideC);

                const index_t grid_size_grp =
-                    typename GroupedGemmBlock2CTileMap::UnderlyingBlock2CTileMap(
-                        c_grid_desc_m_n_, M01, N01)
-                        .CalculateGridSize(c_grid_desc_m_n_);
+                    GroupedGemmBlock2CTileMap(c_grid_desc_m_n_, M01, N01, 0)
+                        .block_2_ctile_map_.CalculateGridSize(c_grid_desc_m_n_);

                const index_t BlockStart = grid_size_;
                const index_t BlockEnd   = grid_size_ + grid_size_grp;

--- a/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2022 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
 #pragma once
 #include "data_type.hpp"

@@ -5,14 +30,22 @@ namespace ck {
 namespace tensor_operation {
 namespace binary_element_wise {

-struct Add
+template <typename Y, typename X1, typename X2>
+struct Add;
+
+template <>
+struct Add<double, double, double>
 {
    __host__ __device__ constexpr void
    operator()(double& dst, const double& src1, const double& src2) const
    {
        dst = src1 + src2;
    }
+};

+template <>
+struct Add<float, float, float>
+{
    __host__ __device__ constexpr void
    operator()(float& dst, const float& src1, const float& src2) const
    {
@@ -20,6 +53,75 @@ struct Add
    }
 };

+template <>
+struct Add<half_t, half_t, half_t>
+{
+    __host__ __device__ constexpr void
+    operator()(half_t& dst, const half_t& src1, const half_t& src2) const
+    {
+        dst = src1 + src2;
+    }
+};
+
+template <>
+struct Add<bhalf_t, bhalf_t, bhalf_t>
+{
+    __host__ __device__ constexpr void
+    operator()(bhalf_t& dst, const bhalf_t& src1, const bhalf_t& src2) const
+    {
+        const float x1 = ck::type_convert<float>(src1);
+        const float x2 = ck::type_convert<float>(src2);
+        const float y  = x1 + x2;
+        dst            = ck::type_convert<bhalf_t>(y);
+    }
+};
+
+template <typename Y, typename X1, typename X2>
+struct Substract;
+
+template <>
+struct Substract<double, double, double>
+{
+    __host__ __device__ constexpr void
+    operator()(double& dst, const double& src1, const double& src2) const
+    {
+        dst = src1 - src2;
+    }
+};
+
+template <>
+struct Substract<float, float, float>
+{
+    __host__ __device__ constexpr void
+    operator()(float& dst, const float& src1, const float& src2) const
+    {
+        dst = src1 - src2;
+    }
+};
+
+template <>
+struct Substract<half_t, half_t, half_t>
+{
+    __host__ __device__ constexpr void
+    operator()(half_t& dst, const half_t& src1, const half_t& src2) const
+    {
+        dst = src1 - src2;
+    }
+};
+
+template <>
+struct Substract<bhalf_t, bhalf_t, bhalf_t>
+{
+    __host__ __device__ constexpr void
+    operator()(bhalf_t& dst, const bhalf_t& src1, const bhalf_t& src2) const
+    {
+        const float x1 = ck::type_convert<float>(src1);
+        const float x2 = ck::type_convert<float>(src2);
+        const float y  = x1 - x2;
+        dst            = ck::type_convert<bhalf_t>(y);
+    }
+};
+
 } // namespace binary_element_wise
 } // namespace tensor_operation
 } // namespace ck
--- a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
@@ -143,6 +143,24 @@ struct AddHardswishAdd
    }
 };

+struct Normalize
+{
+    Normalize(float epsilon = 1e-4) : epsilon_(epsilon) {}
+
+    __host__ __device__ constexpr void operator()(float& y,
+                                                  const float& x,
+                                                  const float& mean,
+                                                  const float& mean_square,
+                                                  const float& gamma,
+                                                  const float& beta) const
+    {
+        float variance = mean_square - (mean * mean);
+        y              = ((x - mean) / sqrtf(variance + epsilon_)) * gamma + beta;
+    }
+
+    float epsilon_;
+};
+
 // Unary operators are usually called element-wisely before/after the reduction is executed on the
 // elements. They are needed for easy implementation of reduction types of AVG, NRM1, NRM2


--- a/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
+++ b/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
--- a/include/ck/tensor_operation/gpu/grid/gridwise_5ary_Elementwise_1d.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_5ary_Elementwise_1d.hpp
--- a/include/ck/tensor_operation/gpu/grid/gridwise_binary_elementwise_1d.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_binary_elementwise_1d.hpp
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp
@@ -21,7 +21,7 @@ template <typename GridwiseGemm,
          typename BElementwiseOperation,
          typename CElementwiseOperation,
          typename DxsInElementwiseOperation,
-          typename DxsOutElementwiseOperation,
+          typename DxsAccElementwiseOperation,
          typename AGridDesc_AK0_M_AK1,
          typename BGridDesc_BK0_N_BK1,
          typename CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
@@ -41,7 +41,7 @@ __global__ void
            const BElementwiseOperation b_element_op,
            const CElementwiseOperation c_element_op,
            const DxsInElementwiseOperation dxs_in_element_op,
-            const DxsOutElementwiseOperation dxs_out_element_op,
+            const DxsAccElementwiseOperation dxs_out_element_op,
            const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1,
            const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1,
            const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
@@ -96,7 +96,7 @@ template <typename FloatAB,
          typename CElementwiseOperation,
          typename DxsReduceOperation,
          typename DxsInElementwiseOperation,
-          typename DxsOutElementwiseOperation,
+          typename DxsAccElementwiseOperation,
          InMemoryDataOperationEnum CGlobalMemoryDataOperation,
          typename DGlobalMemoryDataOperation,
          typename AGridDesc_AK0_M_AK1,
@@ -306,7 +306,7 @@ struct GridwiseGemmReduce_k0mk1_k0nk1_mn_xdl_cshuffle_v1
    __host__ __device__ static constexpr auto
    MakeDefaultBlock2CTileMap(const CGridDesc_M_N& c_grid_desc_m_n)
    {
-        return BlockToCTileMap_M00_N00_M01_N01<MPerBlock, NPerBlock, CGridDesc_M_N>(
+        return BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock, CGridDesc_M_N>(
            c_grid_desc_m_n);
    }

@@ -329,7 +329,7 @@ struct GridwiseGemmReduce_k0mk1_k0nk1_mn_xdl_cshuffle_v1
                               const BElementwiseOperation& b_element_op,
                               const CElementwiseOperation& c_element_op,
                               const DxsInElementwiseOperation& dxs_in_element_op,
-                               const DxsOutElementwiseOperation& dxs_out_element_op,
+                               const DxsAccElementwiseOperation& dxs_out_element_op,
                               const AGridDesc_AK0_M_AK1& a_grid_desc_ak0_m_ak1,
                               const BGridDesc_BK0_N_BK1& b_grid_desc_bk0_n_bk1,
                               const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock&

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp
@@ -259,7 +259,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
    __host__ __device__ static constexpr auto
    MakeDefaultBlock2CTileMap(const CGridDesc_M_N& c_grid_desc_m_n)
    {
-        return BlockToCTileMap_M00_N00_M01_N01<MPerBlock, NPerBlock, CGridDesc_M_N>(
+        return BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock, CGridDesc_M_N>(
            c_grid_desc_m_n);
    }


--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp
@@ -288,11 +288,11 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
    }

    // return block_id to C matrix tile idx (m0, n0) mapping
-    __host__ __device__ static constexpr auto
-    MakeDefaultBlock2CTileMap(const CGridDesc_M_N& c_grid_desc_m_n, index_t M01, index_t N01)
+    __host__ __device__ static constexpr auto MakeDefaultBlock2CTileMap(
+        const CGridDesc_M_N& c_grid_desc_m_n, index_t /* M01 */, index_t /* N01 */)
    {
-        return BlockToCTileMap_M00_N00_M01_N01<MPerBlock, NPerBlock, CGridDesc_M_N>(
-            c_grid_desc_m_n, M01, N01);
+        return BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock, CGridDesc_M_N>(
+            c_grid_desc_m_n);
    }

    using CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2 =

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4.hpp
@@ -265,10 +265,10 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4

    // return block_id to C matrix tile idx (m0, n0) mapping
    __host__ __device__ static constexpr auto MakeCBlockClusterAdaptor(
-        const CMNGridDesc& c_m_n_grid_desc, index_t M01, index_t N01, index_t KBatch)
+        const CMNGridDesc& c_m_n_grid_desc, index_t /* M01 */, index_t /* N01 */, index_t KBatch)
    {
-        return BlockToCTileMap_KSplit_M00_N00_M01_N01<MPerBlock, NPerBlock, CMNGridDesc>(
-            c_m_n_grid_desc, M01, N01, KBatch);
+        return BlockToCTileMap_KSplit_M00_N0_M01Adapt<MPerBlock, NPerBlock, CMNGridDesc>(
+            c_m_n_grid_desc, 8, KBatch);
    }

    using CM0N0M1N1M2M3M4N2GridDesc = decltype(MakeCM0N0M1N1M2M3M4N2GridDescriptor(CMNGridDesc{}));

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
@@ -239,10 +239,10 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2

    // return block_id to C matrix tile idx (m0, n0) mapping
    __host__ __device__ static constexpr auto MakeCBlockClusterAdaptor(
-        const CMNGridDesc& c_m_n_grid_desc, index_t M01, index_t N01, index_t KBatch)
+        const CMNGridDesc& c_m_n_grid_desc, index_t /* M01 */, index_t /* N01 */, index_t KBatch)
    {
-        return BlockToCTileMap_KSplit_M00_N00_M01_N01<MPerBlock, NPerBlock, CMNGridDesc>(
-            c_m_n_grid_desc, M01, N01, KBatch);
+        return BlockToCTileMap_KSplit_M00_N0_M01Adapt<MPerBlock, NPerBlock, CMNGridDesc>(
+            c_m_n_grid_desc, 8, KBatch);
    }

    __host__ __device__ static constexpr auto

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r1.hpp
@@ -300,11 +300,11 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
    }

    // return block_id to C matrix tile idx (m0, n0) mapping
-    __host__ __device__ static constexpr auto
-    MakeDefaultBlock2CTileMap(const CGridDesc_M_N& c_grid_desc_m_n, index_t M01, index_t N01)
+    __host__ __device__ static constexpr auto MakeDefaultBlock2CTileMap(
+        const CGridDesc_M_N& c_grid_desc_m_n, index_t /* M01 */, index_t /* N01 */)
    {
-        return BlockToCTileMap_M00_N00_M01_N01<MPerBlock, NPerBlock, CGridDesc_M_N>(
-            c_grid_desc_m_n, M01, N01);
+        return BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock, CGridDesc_M_N>(
+            c_grid_desc_m_n);
    }
    using CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl =
        remove_cvref_t<decltype(
@@ -314,7 +314,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
    using DefaultBlock2CTileMap =
        remove_cvref_t<decltype(MakeDefaultBlock2CTileMap(CGridDesc_M_N{}, 1, 1))>;

-    template <bool HasMainK0BlockLoop, typename Block2CTileMap = DefaultBlock2CTileMap>
+    template <bool HasMainK0BlockLoop, typename Block2CTileMap>
    __device__ static void
    Run(const FloatAB* __restrict__ p_a_grid,
        const FloatAB* __restrict__ p_b_grid,

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r2.hpp
@@ -309,11 +309,11 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r2
    }

    // return block_id to C matrix tile idx (m0, n0) mapping
-    __host__ __device__ static constexpr auto
-    MakeDefaultBlock2CTileMap(const CGridDesc_M_N& c_grid_desc_m_n, index_t M01, index_t N01)
+    __host__ __device__ static constexpr auto MakeDefaultBlock2CTileMap(
+        const CGridDesc_M_N& c_grid_desc_m_n, index_t /* M01 */, index_t /* N01 */)
    {
-        return BlockToCTileMap_M00_N00_M01_N01<MPerBlock, NPerBlock, CGridDesc_M_N>(
-            c_grid_desc_m_n, M01, N01);
+        return BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock, CGridDesc_M_N>(
+            c_grid_desc_m_n);
    }

    using CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl =