Commit 0b11569f authored by Chao Liu

Merge remote-tracking branch 'origin/develop' into batched_gemm_c_permute

parents e8d3a0fb fa9a0a5c
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
 #include "ck/utility/common_header.hpp"
...
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #ifndef CK_BLOCKWISE_GEMM_DLOPS_V2R2_HPP
 #define CK_BLOCKWISE_GEMM_DLOPS_V2R2_HPP
...
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #ifndef CK_BLOCKWISE_GEMM_DLOPS_V3_HPP
 #define CK_BLOCKWISE_GEMM_DLOPS_V3_HPP
...
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
 #include "ck/utility/common_header.hpp"
...
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
 #include "ck/utility/common_header.hpp"
...
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
 #include "ck/tensor_description/cluster_descriptor.hpp"
...
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
 #include "ck/utility/common_header.hpp"
...
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
 #include "ck/utility/common_header.hpp"
...
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
 #include "ck/utility/common_header.hpp"
...
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
 #include "ck/utility/common_header.hpp"
...
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
 #include "ck/utility/common_header.hpp"
...
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #ifndef CONVOLUTION_BACKWARD_DATA_SPECIALIZATION
 #define CONVOLUTION_BACKWARD_DATA_SPECIALIZATION
...
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
 namespace ck {
...
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #ifndef CONVOLUTION_FORWARD_SPECIALIZATION
 #define CONVOLUTION_FORWARD_SPECIALIZATION
...
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
 #include <iostream>
@@ -7,7 +10,7 @@
 #include "ck/utility/common_header.hpp"
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
-#include "ck/tensor_operation/gpu/device/device_base.hpp"
+#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
 #include "ck/tensor_operation/gpu/grid/gridwise_5ary_Elementwise_1d.hpp"
 #include "ck/device_utility/device_prop.hpp"
 #include "ck/device_utility/kernel_launch.hpp"
@@ -32,7 +35,7 @@ template <typename ADataType,
           index_t DScalarPerVector,
           index_t EScalarPerVector,
           index_t FScalarPerVector>
-struct Device5AryElementwise : public BaseOperator
+struct Device5AryElementwise : public DeviceElementwise<5, 1, NDim, ElementwiseFunctor>
 {
     static constexpr auto I0 = Number<0>{};
@@ -265,12 +268,8 @@ struct Device5AryElementwise : public BaseOperator
         return true;
     };
-    static auto MakeArgument(const ADataType* p_a,
-                             const BDataType* p_b,
-                             const CDataType* p_c,
-                             const DDataType* p_d,
-                             const EDataType* p_e,
-                             FDataType* p_f,
+    static auto MakeArgument(std::array<const void*, 5> p_inputs,
+                             std::array<void*, 1> p_outputs,
                              std::vector<index_t> lengths,
                              std::vector<index_t> a_strides,
                              std::vector<index_t> b_strides,
@@ -280,12 +279,12 @@ struct Device5AryElementwise : public BaseOperator
                             std::vector<index_t> f_strides,
                             ElementwiseFunctor functor)
     {
-        return Argument{p_a,
-                        p_b,
-                        p_c,
-                        p_d,
-                        p_e,
-                        p_f,
+        return Argument{static_cast<const ADataType*>(p_inputs[0]),
+                        static_cast<const BDataType*>(p_inputs[1]),
+                        static_cast<const CDataType*>(p_inputs[2]),
+                        static_cast<const DDataType*>(p_inputs[3]),
+                        static_cast<const EDataType*>(p_inputs[4]),
+                        static_cast<FDataType*>(p_outputs[0]),
                         lengths,
                         a_strides,
                         b_strides,
@@ -296,40 +295,58 @@ struct Device5AryElementwise : public BaseOperator
                        functor};
     }
-    std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_a,
-                                                      const void* p_b,
-                                                      const void* p_c,
-                                                      const void* p_d,
-                                                      const void* p_e,
-                                                      void* p_f,
-                                                      std::vector<index_t> lengths,
-                                                      std::vector<index_t> a_strides,
-                                                      std::vector<index_t> b_strides,
-                                                      std::vector<index_t> c_strides,
-                                                      std::vector<index_t> d_strides,
-                                                      std::vector<index_t> e_strides,
-                                                      std::vector<index_t> f_strides,
-                                                      ElementwiseFunctor functor)
+    std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(std::array<const void*, 5> p_inputs,
+                        std::array<void*, 1> p_outputs,
+                        std::vector<index_t> lengths,
+                        std::vector<std::vector<index_t>> input_strides,
+                        std::vector<std::vector<index_t>> output_strides,
+                        ElementwiseFunctor functor) override
     {
-        return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
-                                          static_cast<const BDataType*>(p_b),
-                                          static_cast<const CDataType*>(p_c),
-                                          static_cast<const DDataType*>(p_d),
-                                          static_cast<const EDataType*>(p_e),
-                                          static_cast<FDataType*>(p_f),
+        return std::make_unique<Argument>(static_cast<const ADataType*>(p_inputs[0]),
+                                          static_cast<const BDataType*>(p_inputs[1]),
+                                          static_cast<const CDataType*>(p_inputs[2]),
+                                          static_cast<const DDataType*>(p_inputs[3]),
+                                          static_cast<const EDataType*>(p_inputs[4]),
+                                          static_cast<FDataType*>(p_outputs[0]),
                                           lengths,
-                                          a_strides,
-                                          b_strides,
-                                          c_strides,
-                                          d_strides,
-                                          e_strides,
-                                          f_strides,
+                                          input_strides[0],
+                                          input_strides[1],
+                                          input_strides[2],
+                                          input_strides[3],
+                                          input_strides[4],
+                                          output_strides[0],
                                           functor);
     }
     static auto MakeInvoker() { return Invoker{}; }
-    std::unique_ptr<BaseInvoker> MakeInvokerPointer() { return std::make_unique<Invoker>(); }
+    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
+    {
+        return std::make_unique<Invoker>();
+    }
+    // polymorphic
+    std::string GetTypeString() const override
+    {
+        auto str = std::stringstream();
+        // clang-format off
+        str << "Device5aryElementwise"
+            << "<"
+            << "NDim = " << NDim
+            << "MPerThread = " << MPerThread
+            << "AScalarPerVector = " << AScalarPerVector
+            << "BScalarPerVector = " << BScalarPerVector
+            << "CScalarPerVector = " << CScalarPerVector
+            << "DScalarPerVector = " << DScalarPerVector
+            << "EScalarPerVector = " << EScalarPerVector
+            << "FScalarPerVector = " << FScalarPerVector
+            << ">";
+        // clang-format on
+        return str.str();
+    }
 };
 } // namespace device
 } // namespace tensor_operation
...
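Both this file and the binary variant further down now derive from a common DeviceElementwise base. That header is not shown in this diff; the sketch below only reconstructs the interface implied by the overrides above, with guessed template-parameter names (NumInput, NumOutput), and should not be read as the actual device_elementwise.hpp.

// Reconstruction only, inferred from the overridden signatures above; the
// template-parameter names are assumptions, not taken from the real header.
#include <array>
#include <memory>
#include <vector>

#include "ck/tensor_operation/gpu/device/device_base.hpp"

namespace ck {
namespace tensor_operation {
namespace device {

template <index_t NumInput, index_t NumOutput, index_t NDim, typename ElementwiseFunctor>
struct DeviceElementwise : public BaseOperator
{
    // One type-erased entry point instead of one named pointer/stride pair per tensor.
    virtual std::unique_ptr<BaseArgument>
    MakeArgumentPointer(std::array<const void*, NumInput> p_inputs,
                        std::array<void*, NumOutput> p_outputs,
                        std::vector<index_t> lengths,
                        std::vector<std::vector<index_t>> input_strides,
                        std::vector<std::vector<index_t>> output_strides,
                        ElementwiseFunctor functor) = 0;

    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};

} // namespace device
} // namespace tensor_operation
} // namespace ck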
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
 #include <string>
...
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include <iostream>
+#include <vector>
+#include "device_base.hpp"
+namespace ck {
+namespace tensor_operation {
+namespace device {
+template <typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CElementwiseOperation>
+struct DeviceBatchedGemm : public BaseOperator
+{
+    virtual std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_a,
+                                                              const void* p_b,
+                                                              void* p_c,
+                                                              ck::index_t M,
+                                                              ck::index_t N,
+                                                              ck::index_t K,
+                                                              ck::index_t StrideA,
+                                                              ck::index_t StrideB,
+                                                              ck::index_t StrideC,
+                                                              AElementwiseOperation a_element_op,
+                                                              BElementwiseOperation b_element_op,
+                                                              CElementwiseOperation c_element_op,
+                                                              ck::index_t Batch) = 0;
+    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
+};
+template <typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CElementwiseOperation>
+using DeviceBatchedGemmPtr = std::unique_ptr<
+    DeviceBatchedGemm<AElementwiseOperation, BElementwiseOperation, CElementwiseOperation>>;
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
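For orientation, here is a minimal sketch of how a caller might drive this new type-erased base. It is an illustration only: the helper name run_batched_gemm, the templated element-op parameters, and the assumption that device buffers are already allocated are mine, not part of this commit.

// Sketch only: `op` is assumed to be a pointer to some concrete DeviceBatchedGemm
// specialization (e.g. the DeviceBatchedGemmXdl below); element-op objects are
// supplied by the caller and device buffers are allocated elsewhere.
#include <memory>
#include "ck/utility/common_header.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm.hpp"

template <typename BatchedGemmPtr, typename AOp, typename BOp, typename COp>
void run_batched_gemm(BatchedGemmPtr& op,
                      const void* p_a,
                      const void* p_b,
                      void* p_c,
                      ck::index_t M,
                      ck::index_t N,
                      ck::index_t K,
                      ck::index_t StrideA,
                      ck::index_t StrideB,
                      ck::index_t StrideC,
                      AOp a_op,
                      BOp b_op,
                      COp c_op,
                      ck::index_t Batch)
{
    // Everything, including the batch count, goes through the single virtual
    // MakeArgumentPointer declared by DeviceBatchedGemm above.
    auto argument = op->MakeArgumentPointer(
        p_a, p_b, p_c, M, N, K, StrideA, StrideB, StrideC, a_op, b_op, c_op, Batch);
    auto invoker = op->MakeInvokerPointer();

    if(op->IsSupportedArgument(argument.get()))
    {
        invoker->Run(argument.get()); // Run may also accept a stream/timing config; defaults used here
    }
}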
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
 #include <iostream>
@@ -7,7 +10,7 @@
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/device_gemm.hpp"
+#include "ck/tensor_operation/gpu/device/device_batched_gemm.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp"
 #include "ck/device_utility/device_prop.hpp"
@@ -149,7 +152,7 @@ template <typename ADataType,
           ck::index_t CThreadTransferSrcDstVectorDim,
           ck::index_t CThreadTransferDstScalarPerVector>
 struct DeviceBatchedGemmXdl
-    : public DeviceGemm<AElementwiseOperation, BElementwiseOperation, CElementwiseOperation>
+    : public DeviceBatchedGemm<AElementwiseOperation, BElementwiseOperation, CElementwiseOperation>
 {
     static constexpr auto I0 = Number<0>{};
     static constexpr auto I1 = Number<1>{};
@@ -336,11 +339,11 @@ struct DeviceBatchedGemmXdl
                  AElementwiseOperation a_element_op,
                  BElementwiseOperation b_element_op,
                  CElementwiseOperation c_element_op,
-                 index_t BatchCount)
+                 index_t Batch)
             : p_a_grid_{p_a_grid},
               p_b_grid_{p_b_grid},
               p_c_grid_{p_c_grid},
-              BatchCount_(BatchCount),
+              Batch_(Batch),
               a_grid_desc_k0_m_k1_{
                   DeviceBatchedGemmXdl::MakeAGridDescriptor_K0_M_K1(M, K, StrideA)},
               b_grid_desc_k0_n_k1_{
@@ -373,7 +376,7 @@ struct DeviceBatchedGemmXdl
         const ADataType* p_a_grid_;
         const BDataType* p_b_grid_;
         CDataType* p_c_grid_;
-        index_t BatchCount_;
+        index_t Batch_;
         AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1_;
         BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1_;
         CGridDesc_M_N c_grid_desc_m_n_;
@@ -417,7 +420,7 @@ struct DeviceBatchedGemmXdl
             }
             const index_t grid_size =
-                arg.block_2_ctile_map_.CalculateGridSize(arg.c_grid_desc_m_n_) * arg.BatchCount_;
+                arg.block_2_ctile_map_.CalculateGridSize(arg.c_grid_desc_m_n_) * arg.Batch_;
             const auto K =
                 arg.a_grid_desc_k0_m_k1_.GetLength(I0) * arg.a_grid_desc_k0_m_k1_.GetLength(I2);
@@ -448,7 +451,7 @@ struct DeviceBatchedGemmXdl
                     arg.p_a_grid_,
                     arg.p_b_grid_,
                     arg.p_c_grid_,
-                    arg.BatchCount_,
+                    arg.Batch_,
                     arg.a_grid_desc_k0_m_k1_,
                     arg.b_grid_desc_k0_n_k1_,
                     arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_,
@@ -482,7 +485,7 @@ struct DeviceBatchedGemmXdl
                     arg.p_a_grid_,
                     arg.p_b_grid_,
                     arg.p_c_grid_,
-                    arg.BatchCount_,
+                    arg.Batch_,
                     arg.a_grid_desc_k0_m_k1_,
                     arg.b_grid_desc_k0_n_k1_,
                     arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_,
@@ -536,7 +539,7 @@ struct DeviceBatchedGemmXdl
                             AElementwiseOperation a_element_op,
                             BElementwiseOperation b_element_op,
                             CElementwiseOperation c_element_op,
-                            index_t BatchCount)
+                            index_t Batch)
     {
         return Argument{p_a,
                         p_b,
@@ -552,7 +555,7 @@ struct DeviceBatchedGemmXdl
                         a_element_op,
                         b_element_op,
                         c_element_op,
-                        BatchCount};
+                        Batch};
     }
     static auto MakeInvoker() { return Invoker{}; }
@@ -570,7 +573,7 @@ struct DeviceBatchedGemmXdl
                                                       AElementwiseOperation a_element_op,
                                                       BElementwiseOperation b_element_op,
                                                       CElementwiseOperation c_element_op,
-                                                      index_t BatchCount) override
+                                                      index_t Batch) override
     {
         return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
                                           static_cast<const BDataType*>(p_b),
@@ -586,7 +589,7 @@ struct DeviceBatchedGemmXdl
                                           a_element_op,
                                           b_element_op,
                                           c_element_op,
-                                          BatchCount);
+                                          Batch);
     }
     // polymorphic
...
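Beyond switching the base class to DeviceBatchedGemm, the change in this file is the rename of BatchCount to Batch so the class matches the new interface; the launch math is unchanged: one 2D tile grid per GEMM, replicated Batch times. A small worked example of that scaling follows (the 256x128 tile shape is an assumed illustration, not something fixed by this diff):

// Illustrative arithmetic only; the tile shape is an assumed example configuration.
#include <cstdio>

int main()
{
    int M = 1024, N = 1024, Batch = 16;
    int MPerBlock = 256, NPerBlock = 128;

    // One GEMM of the batch covers the MxN output with ceil-divided tiles.
    int tiles_per_gemm =
        ((M + MPerBlock - 1) / MPerBlock) * ((N + NPerBlock - 1) / NPerBlock); // 4 * 8 = 32

    // The batched kernel launches that grid once per batch entry, mirroring
    // `grid_size = block_2_ctile_map_.CalculateGridSize(c_grid_desc_m_n_) * Batch_` above.
    int grid_size = tiles_per_gemm * Batch; // 512 workgroups

    std::printf("tiles per gemm = %d, grid size = %d\n", tiles_per_gemm, grid_size);
    return 0;
}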
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
 #include <iostream>
@@ -6,6 +9,7 @@
 #include "ck/device_utility/device_prop.hpp"
 #include "ck/device_utility/kernel_launch.hpp"
 #include "ck/tensor_operation/gpu/device/device_base.hpp"
+#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
 #include "ck/tensor_operation/gpu/grid/gridwise_binary_elementwise_1d.hpp"
 namespace ck {
@@ -22,7 +26,7 @@ template <typename ADataType,
           index_t AScalarPerVector,
           index_t BScalarPerVector,
           index_t CScalarPerVector>
-struct DeviceBinaryElementwise : public BaseOperator
+struct DeviceBinaryElementwise : public DeviceElementwise<2, 1, NDim, ElementwiseFunctor>
 {
     static constexpr auto I0 = Number<0>{};
@@ -195,27 +199,30 @@ struct DeviceBinaryElementwise : public BaseOperator
         return true;
     };
-    std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_a,
-                                                      const void* p_b,
-                                                      void* p_c,
-                                                      std::vector<index_t> lengths,
-                                                      std::vector<index_t> a_strides,
-                                                      std::vector<index_t> b_strides,
-                                                      std::vector<index_t> c_strides,
-                                                      ElementwiseFunctor functor)
+    virtual std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(std::array<const void*, 2> p_inputs,
+                        std::array<void*, 1> p_outputs,
+                        std::vector<index_t> lengths,
+                        std::vector<std::vector<index_t>> input_strides,
+                        std::vector<std::vector<index_t>> output_strides,
+                        ElementwiseFunctor functor) override
     {
-        return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
-                                          static_cast<const BDataType*>(p_b),
-                                          static_cast<CDataType*>(p_c),
+        return std::make_unique<Argument>(static_cast<const ADataType*>(p_inputs[0]),
+                                          static_cast<const BDataType*>(p_inputs[1]),
+                                          static_cast<CDataType*>(p_outputs[0]),
                                           lengths,
-                                          a_strides,
-                                          b_strides,
-                                          c_strides,
+                                          input_strides[0],
+                                          input_strides[1],
+                                          output_strides[0],
                                           functor);
     }
-    std::unique_ptr<BaseInvoker> MakeInvokerPointer() { return std::make_unique<Invoker>(); }
+    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
+    {
+        return std::make_unique<Invoker>();
+    }
+    // polymorphic
     std::string GetTypeString() const override
     {
         auto str = std::stringstream();
@@ -223,7 +230,11 @@ struct DeviceBinaryElementwise : public BaseOperator
         // clang-format off
         str << "DeviceBinaryElementwise"
             << "<"
+            << "NDim = " << NDim
             << "MPerThread = " << MPerThread
+            << "AScalarPerVector = " << AScalarPerVector
+            << "BScalarPerVector = " << BScalarPerVector
+            << "CScalarPerVector = " << CScalarPerVector
             << ">";
         // clang-format on
...
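To show how call sites change under the array-based interface, a minimal sketch of launching a binary elementwise op through the new MakeArgumentPointer follows. The helper name, template parameters, and buffer setup are illustrative placeholders; the Run call relies on whatever default stream configuration the invoker provides.

// Sketch under assumptions: `DeviceOp` stands for some concrete DeviceBinaryElementwise
// instantiation (data types, NDim, vector widths chosen elsewhere); p_a, p_b are device
// input buffers and p_c the output buffer, allocated by the caller.
#include <array>
#include <vector>
#include "ck/utility/common_header.hpp"

template <typename DeviceOp, typename Functor>
void launch_binary_elementwise(DeviceOp& op,
                               const void* p_a,
                               const void* p_b,
                               void* p_c,
                               std::vector<ck::index_t> lengths,
                               std::vector<ck::index_t> a_strides,
                               std::vector<ck::index_t> b_strides,
                               std::vector<ck::index_t> c_strides,
                               Functor functor)
{
    // Inputs/outputs are now grouped into arrays, and per-tensor strides into one
    // vector per side, instead of one named argument each.
    auto argument = op.MakeArgumentPointer({p_a, p_b},
                                           {p_c},
                                           lengths,
                                           {a_strides, b_strides},
                                           {c_strides},
                                           functor);
    auto invoker = op.MakeInvokerPointer();

    if(op.IsSupportedArgument(argument.get()))
    {
        invoker->Run(argument.get());
    }
}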