Commit 09ec28be authored by Chao Liu

Merge remote-tracking branch 'origin/develop' into gelu

parents b9d3d277 85fc91c3
......@@ -417,6 +417,8 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
using BGridDesc_K0_N_K1 = remove_cvref_t<decltype(ABCGridDescs{}[I1])>;
using CGridDesc_M_N = remove_cvref_t<decltype(ABCGridDescs{}[I2])>;
using Block2CTileMap = BlockToCTileMap_M00_N0_M01<MPerBlock, NPerBlock, CGridDesc_M_N>;
// GridwiseGemm
using GridwiseGemm = GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1<
BlockSize,
......@@ -477,8 +479,6 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
std::vector<ck::index_t> conv_filter_dilations,
std::vector<ck::index_t> input_left_pads,
std::vector<ck::index_t> input_right_pads,
ck::index_t M01,
ck::index_t N01,
InElementwiseOperation in_element_op,
WeiElementwiseOperation wei_element_op,
OutElementwiseOperation out_element_op)
......@@ -490,8 +490,6 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
c_grid_desc_m_n_{},
c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_{},
block_2_ctile_map_{},
M01_{M01},
N01_{N01},
in_element_op_{in_element_op},
wei_element_op_{wei_element_op},
out_element_op_{out_element_op},
......@@ -520,11 +518,10 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
a_grid_desc_k0_m_k1_ = descs[I0];
b_grid_desc_k0_n_k1_ = descs[I1];
block_2_ctile_map_ =
GridwiseGemm::MakeDefaultBlock2CTileMap(c_grid_desc_m_n_, M01, N01);
c_grid_desc_m_n_ = descs[I2];
block_2_ctile_map_ = Block2CTileMap{c_grid_desc_m_n_};
if(GridwiseGemm::CheckValidity(a_grid_desc_k0_m_k1_,
b_grid_desc_k0_n_k1_,
c_grid_desc_m_n_,
......@@ -546,9 +543,7 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
typename GridwiseGemm::
CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl
c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_;
typename GridwiseGemm::DefaultBlock2CTileMap block_2_ctile_map_;
index_t M01_;
index_t N01_;
Block2CTileMap block_2_ctile_map_;
InElementwiseOperation in_element_op_;
WeiElementwiseOperation wei_element_op_;
OutElementwiseOperation out_element_op_;
......@@ -661,7 +656,7 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
InElementwiseOperation,
WeiElementwiseOperation,
OutElementwiseOperation,
remove_reference_t<typename GridwiseGemm::DefaultBlock2CTileMap>,
Block2CTileMap,
true>;
ave_time = launch_and_time_kernel(
......@@ -695,7 +690,7 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
InElementwiseOperation,
WeiElementwiseOperation,
OutElementwiseOperation,
remove_reference_t<typename GridwiseGemm::DefaultBlock2CTileMap>,
Block2CTileMap,
false>;
ave_time = launch_and_time_kernel(
......@@ -814,8 +809,6 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
conv_filter_dilations,
input_left_pads,
input_right_pads,
1,
1,
in_element_op,
wei_element_op,
out_element_op};
......@@ -854,8 +847,6 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
conv_filter_dilations,
input_left_pads,
input_right_pads,
1,
1,
in_element_op,
wei_element_op,
out_element_op);
......
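The recurring change in these device-conv files drops the explicit M01/N01 grid-mapping arguments; the block-to-C-tile mapping is now a type alias built directly from the C grid descriptor. A minimal sketch of the new pattern, using only names introduced in this diff (the single-argument constructor, with its internally defaulted M01, is assumed):

    // sketch, assuming BlockToCTileMap_M00_N0_M01 defaults its M01 tuning parameter
    using Block2CTileMap = BlockToCTileMap_M00_N0_M01<MPerBlock, NPerBlock, CGridDesc_M_N>;
    // per-argument construction, replacing GridwiseGemm::MakeDefaultBlock2CTileMap(c_grid_desc_m_n_, M01, N01)
    Block2CTileMap block_2_ctile_map_{c_grid_desc_m_n_};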
#ifndef DEVICE_CONVND_FWD_XDL_NHWC_KYXC_NHWK_HPP
#define DEVICE_CONVND_FWD_XDL_NHWC_KYXC_NHWK_HPP
#pragma once
#include <functional>
#include <iostream>
......@@ -8,6 +7,7 @@
#include <sstream>
#include "device.hpp"
#include "device_prop.hpp"
#include "device_base.hpp"
#include "device_conv_fwd.hpp"
#include "convolution_forward_specialization.hpp"
......@@ -607,6 +607,8 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
using BGridDesc_K0_N_K1 = remove_cvref_t<decltype(ABCGridDescs{}[I1])>;
using CGridDesc_M_N = remove_cvref_t<decltype(ABCGridDescs{}[I2])>;
using Block2CTileMap = BlockToCTileMap_M00_N0_M01<MPerBlock, NPerBlock, CGridDesc_M_N>;
// GridwiseGemm
using GridwiseGemm = GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3<
BlockSize,
......@@ -664,8 +666,6 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
std::vector<ck::index_t> conv_filter_dilations,
std::vector<ck::index_t> input_left_pads,
std::vector<ck::index_t> input_right_pads,
ck::index_t M01,
ck::index_t N01,
InElementwiseOperation in_element_op,
WeiElementwiseOperation wei_element_op,
OutElementwiseOperation out_element_op)
......@@ -677,8 +677,6 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
c_grid_desc_m_n_{},
c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_{},
block_2_ctile_map_{},
M01_{M01},
N01_{N01},
in_element_op_{in_element_op},
wei_element_op_{wei_element_op},
out_element_op_{out_element_op},
......@@ -705,8 +703,8 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
a_grid_desc_k0_m_k1_ = descs[I0];
b_grid_desc_k0_n_k1_ = descs[I1];
c_grid_desc_m_n_ = descs[I2];
block_2_ctile_map_ =
GridwiseGemm::MakeDefaultBlock2CTileMap(c_grid_desc_m_n_, M01, N01);
block_2_ctile_map_ = Block2CTileMap{c_grid_desc_m_n_};
if(GridwiseGemm::CheckValidity(a_grid_desc_k0_m_k1_,
b_grid_desc_k0_n_k1_,
......@@ -727,9 +725,7 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
CGridDesc_M_N c_grid_desc_m_n_;
typename GridwiseGemm::CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2
c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_;
typename GridwiseGemm::DefaultBlock2CTileMap block_2_ctile_map_;
index_t M01_;
index_t N01_;
Block2CTileMap block_2_ctile_map_;
InElementwiseOperation in_element_op_;
WeiElementwiseOperation wei_element_op_;
OutElementwiseOperation out_element_op_;
......@@ -793,7 +789,7 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
InElementwiseOperation,
WeiElementwiseOperation,
OutElementwiseOperation,
remove_reference_t<typename GridwiseGemm::DefaultBlock2CTileMap>,
Block2CTileMap,
true>;
ave_time = launch_and_time_kernel(stream_config,
......@@ -824,7 +820,7 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
InElementwiseOperation,
WeiElementwiseOperation,
OutElementwiseOperation,
remove_reference_t<typename GridwiseGemm::DefaultBlock2CTileMap>,
Block2CTileMap,
false>;
ave_time = launch_and_time_kernel(stream_config,
......@@ -862,6 +858,27 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
static bool IsSupportedArgument(const Argument& arg)
{
if(ck::get_device_name() == "gfx908")
{
if constexpr(!(is_same_v<AccDataType, float> || is_same_v<AccDataType, int32_t>))
{
return false;
}
}
else if(ck::get_device_name() == "gfx90a")
{
if constexpr(!(is_same_v<AccDataType, float> || is_same_v<AccDataType, int32_t> ||
is_same_v<AccDataType, double>))
{
return false;
}
}
else
{
return false;
}
// Input tensors can't be bigger than 2GB each.
constexpr ck::long_index_t GB2 = (ck::long_index_t{1} << 31);
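// (sketch) ck::long_index_t{1} << 31 is 2'147'483'648 bytes, i.e. 2 GiB; each tensor's
// element-space size in bytes is presumably compared against this bound below.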
......@@ -955,8 +972,6 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
conv_filter_dilations,
input_left_pads,
input_right_pads,
1,
1,
in_element_op,
wei_element_op,
out_element_op};
......@@ -995,8 +1010,6 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
conv_filter_dilations,
input_left_pads,
input_right_pads,
1,
1,
in_element_op,
wei_element_op,
out_element_op);
......@@ -1012,8 +1025,7 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
auto str = std::stringstream();
// clang-format off
str << "DeviceConv" << std::to_string(NumDimSpatial)
<< "DFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K"
str << "DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K"
<< "<"
<< BlockSize << ", "
<< MPerBlock << ", "
......@@ -1030,4 +1042,3 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
} // namespace device
} // namespace tensor_operation
} // namespace ck
#endif
......@@ -4,6 +4,7 @@
#include <sstream>
#include "device.hpp"
#include "device_prop.hpp"
#include "device_base.hpp"
#include "device_gemm.hpp"
#include "common_header.hpp"
......@@ -13,7 +14,6 @@
#include "gemm_specialization.hpp"
#include "element_wise_operation.hpp"
#include "gridwise_gemm_dl_v1r3.hpp"
#include "device_prop.hpp"
namespace ck {
namespace tensor_operation {
......
......@@ -11,7 +11,7 @@ template <typename DPtrsGlobal,
typename BElementwiseOperation,
typename CElementwiseOperation,
typename DxsInElementwiseOperation,
typename DxsOutElementwiseOperation>
typename DxsAccElementwiseOperation>
struct DeviceGemmReduce : public BaseOperator
{
virtual std::unique_ptr<BaseArgument>
......@@ -29,7 +29,7 @@ struct DeviceGemmReduce : public BaseOperator
BElementwiseOperation b_element_op,
CElementwiseOperation c_element_op,
DxsInElementwiseOperation dxs_in_element_op,
DxsOutElementwiseOperation dxs_out_element_op,
DxsAccElementwiseOperation dxs_out_element_op,
ck::index_t BatchCount = 1) = 0;
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
......@@ -40,13 +40,13 @@ template <typename DPtrsGlobal,
typename BElementwiseOperation,
typename CElementwiseOperation,
typename DxsInElementwiseOperation,
typename DxsOutElementwiseOperation>
typename DxsAccElementwiseOperation>
using DeviceGemmReducePtr = std::unique_ptr<DeviceGemmReduce<DPtrsGlobal,
AElementwiseOperation,
BElementwiseOperation,
CElementwiseOperation,
DxsInElementwiseOperation,
DxsOutElementwiseOperation>>;
DxsAccElementwiseOperation>>;
} // namespace device
} // namespace tensor_operation
......
......@@ -32,7 +32,7 @@ template <typename ALayout,
typename CElementwiseOperation,
typename DxsReduceOperation,
typename DxsInElementwiseOperation,
typename DxsOutElementwiseOperation,
typename DxsAccElementwiseOperation,
typename DGlobalMemoryDataOperation,
GemmSpecialization GemmSpec,
index_t NumGemmKPrefetchStage,
......@@ -73,7 +73,7 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<DPtrsGlobal,
BElementwiseOperation,
CElementwiseOperation,
DxsInElementwiseOperation,
DxsOutElementwiseOperation>
DxsAccElementwiseOperation>
{
using DeviceOp = DeviceGemmReduce_Xdl_CShuffle;
......@@ -389,7 +389,7 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<DPtrsGlobal,
CElementwiseOperation,
DxsReduceOperation,
DxsInElementwiseOperation,
DxsOutElementwiseOperation,
DxsAccElementwiseOperation,
InMemoryDataOperationEnum::Set,
DGlobalMemoryDataOperation,
AGridDesc_AK0_M_AK1,
......@@ -449,7 +449,7 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<DPtrsGlobal,
BElementwiseOperation b_element_op,
CElementwiseOperation c_element_op,
DxsInElementwiseOperation dxs_in_element_op,
DxsOutElementwiseOperation dxs_out_element_op)
DxsAccElementwiseOperation dxs_out_element_op)
: p_a_grid_{p_a_grid},
p_b_grid_{p_b_grid},
p_c_grid_{p_c_grid},
......@@ -498,7 +498,7 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<DPtrsGlobal,
BElementwiseOperation b_element_op_;
CElementwiseOperation c_element_op_;
DxsInElementwiseOperation dxs_in_element_op_;
DxsOutElementwiseOperation dxs_out_element_op_;
DxsAccElementwiseOperation dxs_out_element_op_;
};
// Invoker
......@@ -554,7 +554,7 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<DPtrsGlobal,
BElementwiseOperation,
CElementwiseOperation,
DxsInElementwiseOperation,
DxsOutElementwiseOperation,
DxsAccElementwiseOperation,
DeviceOp::AGridDesc_AK0_M_AK1,
DeviceOp::BGridDesc_BK0_N_BK1,
typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
......@@ -594,7 +594,7 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<DPtrsGlobal,
BElementwiseOperation,
CElementwiseOperation,
DxsInElementwiseOperation,
DxsOutElementwiseOperation,
DxsAccElementwiseOperation,
DeviceOp::AGridDesc_AK0_M_AK1,
DeviceOp::BGridDesc_BK0_N_BK1,
typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
......@@ -669,7 +669,7 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<DPtrsGlobal,
BElementwiseOperation b_element_op,
CElementwiseOperation c_element_op,
DxsInElementwiseOperation dxs_in_element_op,
DxsOutElementwiseOperation dxs_out_element_op)
DxsAccElementwiseOperation dxs_out_element_op)
{
return Argument{p_a,
p_b,
......@@ -705,7 +705,7 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<DPtrsGlobal,
BElementwiseOperation b_element_op,
CElementwiseOperation c_element_op,
DxsInElementwiseOperation dxs_in_element_op,
DxsOutElementwiseOperation dxs_out_element_op,
DxsAccElementwiseOperation dxs_out_element_op,
index_t /* KBatch */ = 1) override
{
return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
......
......@@ -3,6 +3,7 @@
#include <iostream>
#include <sstream>
#include "device.hpp"
#include "device_prop.hpp"
#include "device_base.hpp"
#include "device_gemm.hpp"
#include "common_header.hpp"
......@@ -11,7 +12,6 @@
#include "tensor_descriptor_helper.hpp"
#include "gridwise_gemm_xdlops_v2r3.hpp"
#include "gemm_specialization.hpp"
#include "device_prop.hpp"
namespace ck {
namespace tensor_operation {
......@@ -408,7 +408,23 @@ struct DeviceGemmXdl
static bool IsSupportedArgument(const Argument& arg)
{
if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a"))
if(ck::get_device_name() == "gfx908")
{
if constexpr(!(is_same_v<AccDataType, float> || is_same_v<AccDataType, int32_t>))
{
return false;
}
}
else if(ck::get_device_name() == "gfx90a")
{
if constexpr(!(is_same_v<AccDataType, float> || is_same_v<AccDataType, int32_t> ||
is_same_v<AccDataType, double>))
{
return false;
}
}
else
{
return false;
}
......
......@@ -169,6 +169,24 @@ struct AddHardswishAdd
}
};
struct Normalize
{
Normalize(float epsilon = 1e-4) : epsilon_(epsilon) {}
__host__ __device__ constexpr void operator()(float& y,
const float& x,
const float& mean,
const float& mean_square,
const float& gamma,
const float& beta) const
{
float variance = mean_square - (mean * mean);
y = ((x - mean) / sqrtf(variance + epsilon_)) * gamma + beta;
}
float epsilon_;
};
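The Normalize functor applies the usual variance-from-moments normalization, y = (x - mean) / sqrt(E[x^2] - E[x]^2 + eps) * gamma + beta; its six-operand signature (one output, five inputs) matches the new kernel_5ary_elementwise_1d added later in this commit, which presumably drives it. A small host-side sketch of the same arithmetic (standalone illustration, not part of the diff):

    #include <cmath>
    #include <cstdio>

    int main()
    {
        // one element with precomputed first and second moments (values are made up)
        float x = 2.0f, mean = 1.0f, mean_square = 2.5f, gamma = 1.0f, beta = 0.0f, epsilon = 1e-4f;
        float variance = mean_square - mean * mean;                          // 2.5 - 1.0 = 1.5
        float y = (x - mean) / std::sqrt(variance + epsilon) * gamma + beta; // ~0.8165
        std::printf("%f\n", y);
        return 0;
    }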
// Unary operators are usually applied element-wise before/after the reduction is executed on the
// elements. They are needed for easy implementation of reduction types such as AVG, NRM1, and NRM2.
......
#pragma once
#include "cluster_descriptor.hpp"
#include "data_type.hpp"
#include "element_wise_operation.hpp"
#include "threadwise_tensor_slice_transfer.hpp"
namespace ck {
template <typename Gridwise5AryEltwise,
typename ADataType,
typename BDataType,
typename CDataType,
typename DDataType,
typename EDataType,
typename FDataType,
typename AGridDesc_M,
typename BGridDesc_M,
typename CGridDesc_M,
typename DGridDesc_M,
typename EGridDesc_M,
typename FGridDesc_M,
typename ElementwiseFunctor>
__global__ void kernel_5ary_elementwise_1d(const ADataType* __restrict__ p_a_global,
const BDataType* __restrict__ p_b_global,
const CDataType* __restrict__ p_c_global,
const DDataType* __restrict__ p_d_global,
const EDataType* __restrict__ p_e_global,
FDataType* __restrict__ p_f_global,
const AGridDesc_M a_grid_desc_m,
const BGridDesc_M b_grid_desc_m,
const CGridDesc_M c_grid_desc_m,
const DGridDesc_M d_grid_desc_m,
const EGridDesc_M e_grid_desc_m,
const FGridDesc_M f_grid_desc_m,
const ElementwiseFunctor functor)
{
Gridwise5AryEltwise::Run(p_a_global,
p_b_global,
p_c_global,
p_d_global,
p_e_global,
p_f_global,
a_grid_desc_m,
b_grid_desc_m,
c_grid_desc_m,
d_grid_desc_m,
e_grid_desc_m,
f_grid_desc_m,
functor);
}
// TODO - implement n-ary Elementwise_1D, tuple of inputs and tuple of outputs
template <typename ADataType,
typename BDataType,
typename CDataType,
typename DDataType,
typename EDataType,
typename FDataType,
typename ComputeDataType,
typename AGridDesc_M,
typename BGridDesc_M,
typename CGridDesc_M,
typename DGridDesc_M,
typename EGridDesc_M,
typename FGridDesc_M,
typename ElementwiseFunctor,
index_t MPerThread,
index_t AScalarPerVector,
index_t BScalarPerVector,
index_t CScalarPerVector,
index_t DScalarPerVector,
index_t EScalarPerVector,
index_t FScalarPerVector>
struct Gridwise5AryElementwise_1D
{
static constexpr auto I0 = Number<0>{};
static constexpr auto thread_desc_m =
make_naive_tensor_descriptor_packed(make_tuple(Number<MPerThread>{}));
using PassThrough = tensor_operation::element_wise::PassThrough;
static __device__ auto CalculateElementwiseIndex()
{
const index_t global_thread_id = get_thread_global_1d_id();
return make_multi_index(global_thread_id * MPerThread);
}
__device__ static void Run(const ADataType* __restrict__ p_a_global,
const BDataType* __restrict__ p_b_global,
const CDataType* __restrict__ p_c_global,
const DDataType* __restrict__ p_d_global,
const EDataType* __restrict__ p_e_global,
FDataType* __restrict__ p_f_global,
const AGridDesc_M a_grid_desc_m,
const BGridDesc_M b_grid_desc_m,
const CGridDesc_M c_grid_desc_m,
const DGridDesc_M d_grid_desc_m,
const EGridDesc_M e_grid_desc_m,
const FGridDesc_M f_grid_desc_m,
const ElementwiseFunctor functor)
{
const auto a_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_a_global, a_grid_desc_m.GetElementSpaceSize());
const auto b_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_b_global, b_grid_desc_m.GetElementSpaceSize());
const auto c_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_c_global, c_grid_desc_m.GetElementSpaceSize());
const auto d_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_d_global, d_grid_desc_m.GetElementSpaceSize());
const auto e_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_e_global, e_grid_desc_m.GetElementSpaceSize());
auto f_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_f_global, f_grid_desc_m.GetElementSpaceSize());
StaticBuffer<AddressSpaceEnum::Vgpr, ComputeDataType, MPerThread, true> a_thread_buf;
StaticBuffer<AddressSpaceEnum::Vgpr, ComputeDataType, MPerThread, true> b_thread_buf;
StaticBuffer<AddressSpaceEnum::Vgpr, ComputeDataType, MPerThread, true> c_thread_buf;
StaticBuffer<AddressSpaceEnum::Vgpr, ComputeDataType, MPerThread, true> d_thread_buf;
StaticBuffer<AddressSpaceEnum::Vgpr, ComputeDataType, MPerThread, true> e_thread_buf;
StaticBuffer<AddressSpaceEnum::Vgpr, ComputeDataType, MPerThread, true> f_thread_buf;
const auto thread_store_global_offset = CalculateElementwiseIndex();
auto a_global_load =
ThreadwiseTensorSliceTransfer_v2<ADataType,
ComputeDataType,
AGridDesc_M,
decltype(thread_desc_m),
Sequence<MPerThread>, // SliceLengths
Sequence<0>, // DimAccessOrder
0, // SrcVectorDim
AScalarPerVector, // ScalarPerVector
1, // SrcScalarStrideInVector
false>{a_grid_desc_m, thread_store_global_offset};
auto b_global_load =
ThreadwiseTensorSliceTransfer_v2<BDataType,
ComputeDataType,
BGridDesc_M,
decltype(thread_desc_m),
Sequence<MPerThread>, // SliceLengths
Sequence<0>, // DimAccessOrder
0, // SrcVectorDim
BScalarPerVector, // ScalarPerVector
1, // SrcScalarStrideInVector
false>{b_grid_desc_m, thread_store_global_offset};
auto c_global_load =
ThreadwiseTensorSliceTransfer_v2<CDataType,
ComputeDataType,
CGridDesc_M,
decltype(thread_desc_m),
Sequence<MPerThread>, // SliceLengths
Sequence<0>, // DimAccessOrder
0, // SrcVectorDim
CScalarPerVector, // ScalarPerVector
1, // SrcScalarStrideInVector
false>{c_grid_desc_m, thread_store_global_offset};
auto d_global_load =
ThreadwiseTensorSliceTransfer_v2<DDataType,
ComputeDataType,
DGridDesc_M,
decltype(thread_desc_m),
Sequence<MPerThread>, // SliceLengths
Sequence<0>, // DimAccessOrder
0, // SrcVectorDim
DScalarPerVector, // ScalarPerVector
1, // SrcScalarStrideInVector
false>{d_grid_desc_m, thread_store_global_offset};
auto e_global_load =
ThreadwiseTensorSliceTransfer_v2<EDataType,
ComputeDataType,
EGridDesc_M,
decltype(thread_desc_m),
Sequence<MPerThread>, // SliceLengths
Sequence<0>, // DimAccessOrder
0, // SrcVectorDim
EScalarPerVector, // ScalarPerVector
1, // SrcScalarStrideInVector
false>{e_grid_desc_m, thread_store_global_offset};
auto f_global_write =
ThreadwiseTensorSliceTransfer_v1r3<ComputeDataType,
FDataType,
decltype(thread_desc_m),
FGridDesc_M,
PassThrough,
Sequence<MPerThread>, // SliceLengths
Sequence<0>, // DimAccessOrder
0, // DstVectorDim
FScalarPerVector, // ScalarPerVector
InMemoryDataOperationEnum::Set,
1, // DstScalarStrideInVector
false>{
f_grid_desc_m, thread_store_global_offset, PassThrough{}};
const index_t blockSize = get_block_size();
const index_t blockPerGrid = get_grid_size();
const auto M = c_grid_desc_m.GetLength(I0);
const index_t loop_step = blockPerGrid * blockSize * MPerThread;
const auto loop_step_index = make_multi_index(loop_step);
index_t num_iter = M / (loop_step);
do
{
// read and process MPerThread elements
a_global_load.Run(
a_grid_desc_m, a_global_buf, thread_desc_m, make_tuple(I0), a_thread_buf);
b_global_load.Run(
b_grid_desc_m, b_global_buf, thread_desc_m, make_tuple(I0), b_thread_buf);
c_global_load.Run(
c_grid_desc_m, c_global_buf, thread_desc_m, make_tuple(I0), c_thread_buf);
d_global_load.Run(
d_grid_desc_m, d_global_buf, thread_desc_m, make_tuple(I0), d_thread_buf);
e_global_load.Run(
e_grid_desc_m, e_global_buf, thread_desc_m, make_tuple(I0), e_thread_buf);
static_for<0, MPerThread, 1>{}([&](auto m) {
constexpr auto offset = thread_desc_m.CalculateOffset(make_tuple(m));
functor(f_thread_buf(Number<offset>{}),
a_thread_buf(Number<offset>{}),
b_thread_buf(Number<offset>{}),
c_thread_buf(Number<offset>{}),
d_thread_buf(Number<offset>{}),
e_thread_buf(Number<offset>{}));
});
f_global_write.Run(thread_desc_m,
make_tuple(I0), // SrcSliceOriginIdx
f_thread_buf,
f_grid_desc_m,
f_global_buf);
a_global_load.MoveSrcSliceWindow(a_grid_desc_m, loop_step_index);
b_global_load.MoveSrcSliceWindow(b_grid_desc_m, loop_step_index);
c_global_load.MoveSrcSliceWindow(c_grid_desc_m, loop_step_index);
d_global_load.MoveSrcSliceWindow(d_grid_desc_m, loop_step_index);
e_global_load.MoveSrcSliceWindow(e_grid_desc_m, loop_step_index);
f_global_write.MoveDstSliceWindow(f_grid_desc_m, loop_step_index);
} while(--num_iter);
}
};
} // namespace ck
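In both the 5-ary and the binary elementwise grids, each thread starts at get_thread_global_1d_id() * MPerThread and then advances by gridDim * blockDim * MPerThread per iteration (a grid-stride loop). A tiny standalone sketch of that index arithmetic; the concrete sizes are assumptions for illustration:

    #include <cstdio>

    int main()
    {
        const int blockSize = 256, blockPerGrid = 4, MPerThread = 8;
        const int M         = 16384;                                 // assumed to be a multiple of loop_step
        const int loop_step = blockPerGrid * blockSize * MPerThread; // 8192
        const int num_iter  = M / loop_step;                         // 2
        const int tid       = 0;                                     // thread 0 of block 0
        for(int it = 0; it < num_iter; ++it)
        {
            const int begin = tid * MPerThread + it * loop_step;
            std::printf("iter %d: elements [%d, %d)\n", it, begin, begin + MPerThread);
        }
        return 0;
    }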
......@@ -11,138 +11,140 @@ template <typename GridwiseBinEltwise,
typename ADataType,
typename BDataType,
typename CDataType,
typename GridDesc_M0,
typename AGridDesc_M,
typename BGridDesc_M,
typename CGridDesc_M,
typename ElementwiseFunctor>
__global__ void kernel_binary_elementwise_1d(const ADataType* __restrict__ p_a_global,
const BDataType* __restrict__ p_b_global,
CDataType* __restrict__ p_c_global,
const GridDesc_M0 a_grid_desc_m0,
const GridDesc_M0 b_grid_desc_m0,
const GridDesc_M0 c_grid_desc_m0,
const AGridDesc_M a_grid_desc_m,
const BGridDesc_M b_grid_desc_m,
const CGridDesc_M c_grid_desc_m,
const ElementwiseFunctor functor)
{
GridwiseBinEltwise::Run(p_a_global,
p_b_global,
p_c_global,
a_grid_desc_m0,
b_grid_desc_m0,
c_grid_desc_m0,
functor);
GridwiseBinEltwise::Run(
p_a_global, p_b_global, p_c_global, a_grid_desc_m, b_grid_desc_m, c_grid_desc_m, functor);
}
template <typename ADataType,
typename BDataType,
typename CDataType,
typename ComputeDataType,
typename GridDesc_M0,
typename AGridDesc_M,
typename BGridDesc_M,
typename CGridDesc_M,
typename ElementwiseFunctor,
index_t ScalarPerVector>
index_t MPerThread,
index_t AScalarPerVector,
index_t BScalarPerVector,
index_t CScalarPerVector>
struct GridwiseBinaryElementwise_1D
{
static constexpr auto I0 = Number<0>{};
static constexpr auto thread_desc_m0 =
make_naive_tensor_descriptor_packed(make_tuple(Number<ScalarPerVector>{}));
static constexpr auto thread_desc_m =
make_naive_tensor_descriptor_packed(make_tuple(Number<MPerThread>{}));
using PassThrough = tensor_operation::element_wise::PassThrough;
static __device__ auto CalculateElementwiseIndex()
{
const index_t global_thread_id = get_thread_global_1d_id();
return make_multi_index(global_thread_id * ScalarPerVector);
return make_multi_index(global_thread_id * MPerThread);
}
__device__ static void Run(const ADataType* __restrict__ p_a_global,
const BDataType* __restrict__ p_b_global,
CDataType* __restrict__ p_c_global,
const GridDesc_M0 a_grid_desc_m0,
const GridDesc_M0 b_grid_desc_m0,
const GridDesc_M0 c_grid_desc_m0,
const AGridDesc_M a_grid_desc_m,
const BGridDesc_M b_grid_desc_m,
const CGridDesc_M c_grid_desc_m,
const ElementwiseFunctor functor)
{
const auto a_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_a_global, a_grid_desc_m0.GetElementSpaceSize());
p_a_global, a_grid_desc_m.GetElementSpaceSize());
const auto b_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_b_global, b_grid_desc_m0.GetElementSpaceSize());
p_b_global, b_grid_desc_m.GetElementSpaceSize());
auto c_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_c_global, c_grid_desc_m0.GetElementSpaceSize());
p_c_global, c_grid_desc_m.GetElementSpaceSize());
StaticBuffer<AddressSpaceEnum::Vgpr, ComputeDataType, ScalarPerVector, true> a_thread_buf;
StaticBuffer<AddressSpaceEnum::Vgpr, ComputeDataType, ScalarPerVector, true> b_thread_buf;
StaticBuffer<AddressSpaceEnum::Vgpr, ComputeDataType, ScalarPerVector, true> c_thread_buf;
StaticBuffer<AddressSpaceEnum::Vgpr, ComputeDataType, MPerThread, true> a_thread_buf;
StaticBuffer<AddressSpaceEnum::Vgpr, ComputeDataType, MPerThread, true> b_thread_buf;
StaticBuffer<AddressSpaceEnum::Vgpr, ComputeDataType, MPerThread, true> c_thread_buf;
const auto thread_store_global_offset = CalculateElementwiseIndex();
auto a_global_load =
ThreadwiseTensorSliceTransfer_v2<ADataType,
ComputeDataType,
GridDesc_M0,
decltype(thread_desc_m0),
Sequence<ScalarPerVector>, // SliceLengths
AGridDesc_M,
decltype(thread_desc_m),
Sequence<MPerThread>, // SliceLengths
Sequence<0>, // DimAccessOrder
0, // SrcVectorDim
ScalarPerVector,
AScalarPerVector, // ScalarPerVector
1, // SrcScalarStrideInVector
false>{a_grid_desc_m0, thread_store_global_offset};
false>{a_grid_desc_m, thread_store_global_offset};
auto b_global_load =
ThreadwiseTensorSliceTransfer_v2<BDataType,
ComputeDataType,
GridDesc_M0,
decltype(thread_desc_m0),
Sequence<ScalarPerVector>, // SliceLengths
BGridDesc_M,
decltype(thread_desc_m),
Sequence<MPerThread>, // SliceLengths
Sequence<0>, // DimAccessOrder
0, // SrcVectorDim
ScalarPerVector,
BScalarPerVector, // ScalarPerVector
1, // SrcScalarStrideInVector
false>{b_grid_desc_m0, thread_store_global_offset};
false>{b_grid_desc_m, thread_store_global_offset};
auto c_global_write =
ThreadwiseTensorSliceTransfer_v1r3<ComputeDataType,
CDataType,
decltype(thread_desc_m0),
GridDesc_M0,
decltype(thread_desc_m),
CGridDesc_M,
PassThrough,
Sequence<ScalarPerVector>, // SliceLengths
Sequence<MPerThread>, // SliceLengths
Sequence<0>, // DimAccessOrder
0, // DstVectorDim
ScalarPerVector,
CScalarPerVector, // ScalarPerVector
InMemoryDataOperationEnum::Set,
1, // DstScalarStrideInVector
false>{
c_grid_desc_m0, thread_store_global_offset, PassThrough{}};
c_grid_desc_m, thread_store_global_offset, PassThrough{}};
const index_t blockSize = get_block_size();
const index_t blockPerGrid = get_grid_size();
const auto m0 = c_grid_desc_m0.GetLength(I0);
const index_t loop_step = blockPerGrid * blockSize * ScalarPerVector;
const auto M = c_grid_desc_m.GetLength(I0);
const index_t loop_step = blockPerGrid * blockSize * MPerThread;
const auto loop_step_index = make_multi_index(loop_step);
index_t num_iter = m0 / (loop_step);
index_t num_iter = M / (loop_step);
do
{
// read and process ScalarPerVector elements
// read and process MPerThread elements
a_global_load.Run(
a_grid_desc_m0, a_global_buf, thread_desc_m0, make_tuple(I0), a_thread_buf);
a_grid_desc_m, a_global_buf, thread_desc_m, make_tuple(I0), a_thread_buf);
b_global_load.Run(
b_grid_desc_m0, b_global_buf, thread_desc_m0, make_tuple(I0), b_thread_buf);
b_grid_desc_m, b_global_buf, thread_desc_m, make_tuple(I0), b_thread_buf);
static_for<0, ScalarPerVector, 1>{}([&](auto m) {
constexpr auto offset = thread_desc_m0.CalculateOffset(make_tuple(m));
static_for<0, MPerThread, 1>{}([&](auto m) {
constexpr auto offset = thread_desc_m.CalculateOffset(make_tuple(m));
functor(c_thread_buf(Number<offset>{}),
a_thread_buf(Number<offset>{}),
b_thread_buf(Number<offset>{}));
});
c_global_write.Run(thread_desc_m0,
c_global_write.Run(thread_desc_m,
make_tuple(I0), // SrcSliceOriginIdx
c_thread_buf,
c_grid_desc_m0,
c_grid_desc_m,
c_global_buf);
a_global_load.MoveSrcSliceWindow(a_grid_desc_m0, loop_step_index);
b_global_load.MoveSrcSliceWindow(b_grid_desc_m0, loop_step_index);
c_global_write.MoveDstSliceWindow(c_grid_desc_m0, loop_step_index);
a_global_load.MoveSrcSliceWindow(a_grid_desc_m, loop_step_index);
b_global_load.MoveSrcSliceWindow(b_grid_desc_m, loop_step_index);
c_global_write.MoveDstSliceWindow(c_grid_desc_m, loop_step_index);
} while(--num_iter);
}
};
......
......@@ -21,7 +21,7 @@ template <typename GridwiseGemm,
typename BElementwiseOperation,
typename CElementwiseOperation,
typename DxsInElementwiseOperation,
typename DxsOutElementwiseOperation,
typename DxsAccElementwiseOperation,
typename AGridDesc_AK0_M_AK1,
typename BGridDesc_BK0_N_BK1,
typename CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
......@@ -41,7 +41,7 @@ __global__ void
const BElementwiseOperation b_element_op,
const CElementwiseOperation c_element_op,
const DxsInElementwiseOperation dxs_in_element_op,
const DxsOutElementwiseOperation dxs_out_element_op,
const DxsAccElementwiseOperation dxs_out_element_op,
const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1,
const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1,
const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
......@@ -96,7 +96,7 @@ template <typename FloatAB,
typename CElementwiseOperation,
typename DxsReduceOperation,
typename DxsInElementwiseOperation,
typename DxsOutElementwiseOperation,
typename DxsAccElementwiseOperation,
InMemoryDataOperationEnum CGlobalMemoryDataOperation,
typename DGlobalMemoryDataOperation,
typename AGridDesc_AK0_M_AK1,
......@@ -329,7 +329,7 @@ struct GridwiseGemmReduce_k0mk1_k0nk1_mn_xdl_cshuffle_v1
const BElementwiseOperation& b_element_op,
const CElementwiseOperation& c_element_op,
const DxsInElementwiseOperation& dxs_in_element_op,
const DxsOutElementwiseOperation& dxs_out_element_op,
const DxsAccElementwiseOperation& dxs_out_element_op,
const AGridDesc_AK0_M_AK1& a_grid_desc_ak0_m_ak1,
const BGridDesc_BK0_N_BK1& b_grid_desc_bk0_n_bk1,
const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock&
......
......@@ -314,7 +314,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
using DefaultBlock2CTileMap =
remove_cvref_t<decltype(MakeDefaultBlock2CTileMap(CGridDesc_M_N{}, 1, 1))>;
template <bool HasMainK0BlockLoop, typename Block2CTileMap = DefaultBlock2CTileMap>
template <bool HasMainK0BlockLoop, typename Block2CTileMap>
__device__ static void
Run(const FloatAB* __restrict__ p_a_grid,
const FloatAB* __restrict__ p_b_grid,
......
......@@ -25,6 +25,7 @@ enum struct MfmaInstr
mfma_f32_16x16x8bf16,
mfma_i32_32x32x8i8,
mfma_i32_16x16x16i8,
mfma_f64_16x16x4f64
};
template <MfmaInstr instr>
......@@ -383,12 +384,40 @@ struct mfma_type<MfmaInstr::mfma_i32_16x16x16i8>
}
};
template <>
struct mfma_type<MfmaInstr::mfma_f64_16x16x4f64>
{
static constexpr index_t group_size = 1;
static constexpr index_t num_groups_per_blk = 4;
static constexpr index_t num_regs_per_blk = 4; // group_size * num_groups_per_blk;
static constexpr index_t num_threads_per_blk = 16;
static constexpr index_t wave_size = 64;
static constexpr index_t num_input_blks = 4; // wave_size / num_threads_per_blk;
static constexpr index_t num_output_blks = 1;
static constexpr index_t m_per_blk = 16;
static constexpr index_t n_per_blk = 16;
static constexpr index_t k_per_blk = 1;
static constexpr bool is_k_reduction = true;
template <index_t MPerXdlops, index_t NPerXdlops, class FloatA, class FloatB, class FloatC>
__device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const
{
intrin_mfma_f64_16x16x4f64<MPerXdlops, NPerXdlops>::Run(a, b, reg_c);
}
};
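The constants in this specialization follow directly from the 16x16x4 f64 MFMA shape; a couple of compile-time checks spelling out that arithmetic (illustration only, not part of the diff):

    // A 64-lane wavefront holds the whole 16x16 f64 accumulator tile: 256 / 64 = 4 VGPRs per lane.
    static_assert(16 * 16 / 64 == 4, "num_regs_per_blk");
    // With is_k_reduction, lanes split into 64 / 16 = 4 input blocks, one per k slice,
    // so one instruction consumes num_input_blks * k_per_blk = 4 * 1 elements along K.
    static_assert(64 / 16 == 4, "num_input_blks");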
template <typename base_type, index_t MPerXdlops, index_t NPerXdlops>
struct MfmaSelector
{
template <typename base_type_, index_t MPerXdlops_, index_t NPerXdlops_>
static constexpr auto GetMfma();
template <>
static constexpr auto GetMfma<double, 16, 16>()
{
return MfmaInstr::mfma_f64_16x16x4f64;
}
template <>
static constexpr auto GetMfma<float, 64, 64>()
{
......@@ -661,9 +690,10 @@ struct XdlopsGemm
template <class FloatA, class FloatB, class FloatC>
__device__ void Run(const FloatA& p_a_wave, const FloatB& p_b_wave, FloatC& p_c_thread) const
{
static_assert(is_same<base_type, float>::value || is_same<base_type, half_t>::value ||
is_same<base_type, bhalf_t>::value || is_same<base_type, int8_t>::value,
"base base_type must be float, half, bfloat16, and int8_t!");
static_assert(is_same<base_type, double>::value || is_same<base_type, float>::value ||
is_same<base_type, half_t>::value || is_same<base_type, bhalf_t>::value ||
is_same<base_type, int8_t>::value,
"base base_type must be double, float, half, bfloat16, and int8_t!");
static_for<0, KPack / mfma_instr.k_per_blk, 1>{}([&](auto k) {
mfma_instr.template run<MPerXdlops, NPerXdlops>(p_a_wave[k], p_b_wave[k], p_c_thread);
......
......@@ -294,5 +294,24 @@ struct intrin_mfma_i32_16x16x16i8<16, 16>
}
};
template <index_t MPerWave, index_t NPerWave>
struct intrin_mfma_f64_16x16x4f64;
template <>
struct intrin_mfma_f64_16x16x4f64<16, 16>
{
template <class FloatC>
__device__ static void Run(const double& reg_a, const double& reg_b, FloatC& reg_c)
{
#ifdef __gfx90a__
reg_c.template AsType<double4_t>()(Number<0>{}) = __builtin_amdgcn_mfma_f64_16x16x4f64(
reg_a, reg_b, reg_c.template AsType<double4_t>()[Number<0>{}], 0, 0, 0);
#else
ignore = reg_a;
ignore = reg_b;
ignore = reg_c;
#endif
}
};
} // namespace ck
#endif
......@@ -11,6 +11,7 @@ namespace host {
template <typename ADataType,
typename BDataType,
typename CDataType,
typename AccDataType,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename CElementwiseOperation>
......@@ -53,20 +54,20 @@ struct ReferenceGemm : public device::BaseOperator
auto f_mk_kn_mn = [&](auto m, auto n) {
const int K = arg.a_m_k_.mDesc.GetLengths()[1];
float v_acc = 0;
AccDataType v_acc = 0;
for(int k = 0; k < K; ++k)
{
float v_a;
float v_b;
AccDataType v_a;
AccDataType v_b;
arg.a_element_op_(v_a, static_cast<const float>(arg.a_m_k_(m, k)));
arg.b_element_op_(v_b, static_cast<const float>(arg.b_k_n_(k, n)));
arg.a_element_op_(v_a, static_cast<const AccDataType>(arg.a_m_k_(m, k)));
arg.b_element_op_(v_b, static_cast<const AccDataType>(arg.b_k_n_(k, n)));
v_acc += v_a * v_b;
}
float v_c;
AccDataType v_c;
arg.c_element_op_(v_c, v_acc);
......
......@@ -38,7 +38,7 @@ static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecializa
using device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gkn_gmn_instances =
std::tuple<
// clang-format off
//##################################| ALayout| BLayout| CLayout|AData| BData| CData| GemmAcc| CShuffle| ReduceAcc| DData| A| B| C| Dxs| DxsInEleOp| DxsOutEleOp| D| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| CReduce| CReduceThreadLds2VGprCopy| CReduceThreadVgpr2GlobalCopy|
//##################################| ALayout| BLayout| CLayout|AData| BData| CData| GemmAcc| CShuffle| ReduceAcc| DData| A| B| C| Dxs| DxsInEleOp| DxsAccEleOp| D| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| CReduce| CReduceThreadLds2VGprCopy| CReduceThreadVgpr2GlobalCopy|
//##################################| | | | Type| Type| Type| DataType| DataType| DataType| Type Tuple| Elementwise| Elementwise| Elementwise| Reduce| | | MemoryData|Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MPerBlock| ScalarPerVector| ThreadClusterLengths| SrcDstScalarPerVector| SrcDstScalarPerVector|
//##################################| | | | | | | | | | | Operation| Operation| Operation| Operation| | | Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NPerBlock| _NPerBlock| _MPerBlock_NPerBlock| _NPerBlock| _MPerBlock|
//##################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
......
......@@ -38,7 +38,7 @@ static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecializa
using device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_instances =
std::tuple<
// clang-format off
//##################################| ALayout| BLayout| CLayout|AData| BData| CData| GemmAcc| CShuffle| ReduceAcc| DData| A| B| C| Dxs| DxsInEleOp| DxsOutEleOp| D| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| CReduce| CReduceThreadLds2VGprCopy| CReduceThreadVgpr2GlobalCopy|
//##################################| ALayout| BLayout| CLayout|AData| BData| CData| GemmAcc| CShuffle| ReduceAcc| DData| A| B| C| Dxs| DxsInEleOp| DxsAccEleOp| D| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| CReduce| CReduceThreadLds2VGprCopy| CReduceThreadVgpr2GlobalCopy|
//##################################| | | | Type| Type| Type| DataType| DataType| DataType| Type Tuple| Elementwise| Elementwise| Elementwise| Reduce| | | MemoryData|Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MPerBlock| ScalarPerVector| ThreadClusterLengths| SrcDstScalarPerVector| SrcDstScalarPerVector|
//##################################| | | | | | | | | | | Operation| Operation| Operation| Operation| | | Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NPerBlock| _NPerBlock| _MPerBlock_NPerBlock| _NPerBlock| _MPerBlock|
//##################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
......
......@@ -38,7 +38,7 @@ static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecializa
using device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn_instances =
std::tuple<
// clang-format off
//##################################| ALayout| BLayout| CLayout| AData| BData| CData| GemmAcc| CShuffle| ReduceAcc| DData| A| B| C| Dxs| DxsInEleOp| DxsOutEleOp| D| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| CReduce| CReduceThreadLds2VGprCopy| CReduceThreadVgpr2GlobalCopy|
//##################################| ALayout| BLayout| CLayout| AData| BData| CData| GemmAcc| CShuffle| ReduceAcc| DData| A| B| C| Dxs| DxsInEleOp| DxsAccEleOp| D| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| CReduce| CReduceThreadLds2VGprCopy| CReduceThreadVgpr2GlobalCopy|
//##################################| | | | Type| Type| Type| DataType| DataType| DataType| Type Tuple| Elementwise| Elementwise| Elementwise| Reduce| | | MemoryData|Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MPerBlock| ScalarPerVector| ThreadClusterLengths| SrcDstScalarPerVector| SrcDstScalarPerVector|
//##################################| | | | | | | | | | | Operation| Operation| Operation| Operation| | | Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NPerBlock| _NPerBlock| _MPerBlock_NPerBlock| _NPerBlock| _MPerBlock|
//##################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
......
......@@ -38,7 +38,7 @@ static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecializa
using device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gnk_gmn_instances =
std::tuple<
// clang-format off
//##################################| ALayout| BLayout| CLayout|AData| BData| CData| GemmAcc| CShuffle| ReduceAcc| DData| A| B| C| Dxs| DxsInEleOp| DxsOutEleOp| D| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| CReduce| CReduceThreadLds2VGprCopy| CReduceThreadVgpr2GlobalCopy|
//##################################| ALayout| BLayout| CLayout|AData| BData| CData| GemmAcc| CShuffle| ReduceAcc| DData| A| B| C| Dxs| DxsInEleOp| DxsAccEleOp| D| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| CReduce| CReduceThreadLds2VGprCopy| CReduceThreadVgpr2GlobalCopy|
//##################################| | | | Type| Type| Type| DataType| DataType| DataType| Type Tuple| Elementwise| Elementwise| Elementwise| Reduce| | | MemoryData|Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MPerBlock| ScalarPerVector| ThreadClusterLengths| SrcDstScalarPerVector| SrcDstScalarPerVector|
//##################################| | | | | | | | | | | Operation| Operation| Operation| Operation| | | Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NPerBlock| _NPerBlock| _MPerBlock_NPerBlock| _NPerBlock| _MPerBlock|
//##################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
......
set(DEVICE_GEMM_INSTANCE_SOURCE
device_gemm_xdl_f64_f64_f64_mk_kn_mn_instance.cpp;
device_gemm_xdl_f64_f64_f64_mk_nk_mn_instance.cpp;
device_gemm_xdl_f64_f64_f64_km_kn_mn_instance.cpp;
device_gemm_xdl_f64_f64_f64_km_nk_mn_instance.cpp;
device_gemm_xdl_f32_f32_f32_mk_kn_mn_instance.cpp;
device_gemm_xdl_f32_f32_f32_mk_nk_mn_instance.cpp;
device_gemm_xdl_f32_f32_f32_km_kn_mn_instance.cpp;
......
#include <stdlib.h>
#include "config.hpp"
#include "device_gemm_xdl.hpp"
#include "element_wise_operation.hpp"
#include "device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace device_gemm_instance {
using F64 = double;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
// Compilation parameters for a[k, m] * b[k, n] = c[m, n]
using device_gemm_xdl_f64_f64_f64_km_kn_mn_instances =
std::tuple<
// clang-format off
//##########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
//##########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar|
//##########| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector|
//##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
DeviceGemmXdl< F64, F64, F64, F64, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 2, 16, 16, 4, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, 7, 1>,
DeviceGemmXdl< F64, F64, F64, F64, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 64, 4, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, 7, 1>,
DeviceGemmXdl< F64, F64, F64, F64, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 128, 4, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, true, 7, 1>,
DeviceGemmXdl< F64, F64, F64, F64, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 64, 4, 2, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 2, true, 7, 1>,
DeviceGemmXdl< F64, F64, F64, F64, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 64, 128, 4, 2, 16, 16, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 2, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, 7, 1>
// clang-format on
>;
void add_device_gemm_xdl_f64_f64_f64_km_kn_mn_instances(
std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
{
add_device_operation_instances(instances, device_gemm_xdl_f64_f64_f64_km_kn_mn_instances{});
}
} // namespace device_gemm_instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
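For context, a hedged sketch of how such an instance list is consumed on the host side; the vector type and the add_* call are taken from this file, while the surrounding function and its placement inside the same namespaces are assumptions:

    // assumes this is compiled inside namespace ck::tensor_operation::device::device_gemm_instance,
    // with the PassThrough and DeviceGemmPtr aliases from the file above visible
    void collect_f64_km_kn_mn_instances(
        std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
    {
        add_device_gemm_xdl_f64_f64_f64_km_kn_mn_instances(instances);
        // instances now holds one unique_ptr per DeviceGemmXdl specialization from the tuple above;
        // callers typically filter with IsSupportedArgument(...) before timing or running them.
    }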