merge from the public repo

4a106f7d · illsilin · a73ab0d8 · 306fd506 · 4a106f7d · 4a106f7d
Commit 4a106f7d authored Nov 01, 2023 by illsilin
20 changed files
--- a/include/ck/tensor_operation/gpu/device/device_batchnorm_infer.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batchnorm_infer.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once


--- a/include/ck/tensor_operation/gpu/device/device_cgemm.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_cgemm.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once
 #include "device_base.hpp"

--- a/include/ck/tensor_operation/gpu/device/device_contraction_multiple_abd.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_contraction_multiple_abd.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <array>
+
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+// GEMM:
+//   input : A0[M0, M1, ... K0, K1, ...], ...
+//   input : B0[N0, N1, ... K0, K1, ...], ...
+//   input : D0[M0, M1, ... N0, N1, ...], D1[M0, M1, ... N0, N1, ...], ...
+//   output : E[M0, M1, ... N0, N1, ...]
+//   C = a_op(A) * b_op(B)
+//   E = cde_op(C, D0, D1, ...)
+// Assume:
+//   D0, D1, ... and E have the same layout
+template <index_t NumDimM,
+          index_t NumDimN,
+          index_t NumDimK,
+          typename AsDataType,
+          typename BsDataType,
+          typename DsDataType,
+          typename EDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CDEElementwiseOperation>
+struct DeviceContractionMultipleABD : public BaseOperator
+{
+    static constexpr index_t NumATensor = AsDataType::Size();
+    static constexpr index_t NumBTensor = BsDataType::Size();
+    static constexpr index_t NumDTensor = DsDataType::Size();
+
+    virtual std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(std::array<const void*, NumATensor> p_as,
+                        std::array<const void*, NumBTensor> p_bs,
+                        std::array<const void*, NumDTensor> p_ds,
+                        void* p_e,
+                        const std::array<std::vector<index_t>, NumATensor>& a_ms_ks_lengths,
+                        const std::array<std::vector<index_t>, NumATensor>& a_ms_ks_strides,
+                        const std::array<std::vector<index_t>, NumBTensor>& b_ns_ks_lengths,
+                        const std::array<std::vector<index_t>, NumBTensor>& b_ns_ks_strides,
+                        const std::array<std::vector<index_t>, NumDTensor>& d_ms_ns_lengths,
+                        const std::array<std::vector<index_t>, NumDTensor>& d_ms_ns_strides,
+                        const std::vector<index_t>& e_ms_ns_length,
+                        const std::vector<index_t>& e_ms_ns_stride,
+                        AElementwiseOperation a_element_op,
+                        BElementwiseOperation b_element_op,
+                        CDEElementwiseOperation cde_element_op) = 0;
+
+    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
+};
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once


--- a/include/ck/tensor_operation/gpu/device/device_conv_bwd_data.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_conv_bwd_data.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once


--- a/include/ck/tensor_operation/gpu/device/device_conv_fwd.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_conv_fwd.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once


--- a/include/ck/tensor_operation/gpu/device/device_conv_fwd_bias_activation.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_conv_fwd_bias_activation.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once


--- a/include/ck/tensor_operation/gpu/device/device_conv_fwd_bias_activation_add.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_conv_fwd_bias_activation_add.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once


--- a/include/ck/tensor_operation/gpu/device/device_conv_tensor_rearrange.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_conv_tensor_rearrange.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <array>
+
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+/**
+ * \brief Convolution Tensor Rearrange.
+ *
+ * This Device operator supports converting an image to
+ * the GEMM representation (Image to Column) and
+ * converting a GEMM form to the image (Column to Image).
+ * Supported layouts:
+ * [G, N, Di, Hi, Wi, C] <-> [G, N * Do * Ho * Wo, Z *  Y * X * C]
+ * [N, Di, Hi, Wi, G, C] <-> [N * Do * Ho * Wo, G, Z *  Y * X * C]
+ *
+ * \tparam NDimSpatial Number of spatial dimensions.
+ * \tparam ImageLayout Input Layout.
+ * \tparam InputDataType Input Data Type.
+ * \tparam OutputDataType Output Data Type.
+ * \tparam ConvTensorRearrangeOp Operation type: ImageToColumn, ColumnToImage.
+ */
+template <index_t NDimSpatial,
+          typename ImageLayout,
+          typename InputDataType,
+          typename OutputDataType,
+          typename ConvTensorRearrangeOp>
+struct DeviceConvTensorRearrange : public BaseOperator
+{
+
+    /**
+     * \brief Make argument pointer for image to column.
+     *
+     * \param p_in A pointer to the device memory of the input image.
+     * \param p_out A pointer to the device memory of the output.
+     * \param G Convolution number of groups.
+     * \param N Convolution batch size.
+     * \param C Convolution number of channels.
+     * \param input_spatial_lengths Input spatial lengths.
+     * \param filter_spatial_lengths Filter spatial lengths.
+     * \param output_spatial_lengths Output spatial lengths.
+     * \param image_g_n_c_wis_strides Image strides in order [G, N, C, D, H, W].
+     * \param gemm_g_m_k_strides Gemm form strides.
+     * \param conv_filter_strides Convolution filter strides.
+     * \param conv_filter_dilations Convolution filter dilations.
+     * \param input_left_pads Convolution left pads.
+     * \param input_right_pads Convolution right pads.
+     * \return Pointer to the argument.
+     */
+    virtual std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(const void* p_in,
+                        void* p_out,
+                        const ck::index_t G,
+                        const ck::index_t N,
+                        const ck::index_t C,
+                        const std::array<index_t, NDimSpatial>& input_spatial_lengths,
+                        const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
+                        const std::array<index_t, NDimSpatial>& output_spatial_lengths,
+                        const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides,
+                        const std::array<index_t, 3>& gemm_g_m_k_strides,
+                        const std::array<index_t, NDimSpatial>& conv_filter_strides,
+                        const std::array<index_t, NDimSpatial>& conv_filter_dilations,
+                        const std::array<index_t, NDimSpatial>& input_left_pads,
+                        const std::array<index_t, NDimSpatial>& input_right_pads) = 0;
+
+    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
+};
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/device_elementwise.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_elementwise.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once


--- a/include/ck/tensor_operation/gpu/device/device_elementwise_normalization.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_elementwise_normalization.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once


--- a/include/ck/tensor_operation/gpu/device/device_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once


--- a/include/ck/tensor_operation/gpu/device/device_gemm_bias_e_permute.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_bias_e_permute.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once


--- a/include/ck/tensor_operation/gpu/device/device_gemm_multiple_abd.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_multiple_abd.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <array>
+
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+// GEMM:
+//   input : A0[M, K], B0[K, N],
+//   input : D0[M, N], D1[M, N], ...
+//   output : E[M, N]
+//   C = a_op(A) * b_op(B)
+//   E = cde_op(C, D0, D1, ...)
+// Assume:
+//   D0, D1, ... and E have the same layout
+template <typename AsLayout,
+          typename BsLayout,
+          typename DsLayout,
+          typename ELayout,
+          typename AsDataType,
+          typename BsDataType,
+          typename DsDataType,
+          typename EDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CDEElementwiseOperation>
+struct DeviceGemmMultipleABD : public BaseOperator
+{
+    static constexpr index_t NumATensor = AsDataType::Size();
+    static constexpr index_t NumBTensor = BsDataType::Size();
+    static constexpr index_t NumDTensor = DsDataType::Size();
+
+    virtual std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(std::array<const void*, NumATensor> p_as,
+                        std::array<const void*, NumBTensor> p_bs,
+                        std::array<const void*, NumDTensor> p_ds,
+                        void* p_e,
+                        ck::index_t M,
+                        ck::index_t N,
+                        ck::index_t K,
+                        std::array<ck::index_t, NumATensor> StrideAs,
+                        std::array<ck::index_t, NumBTensor> StrideBs,
+                        std::array<ck::index_t, NumDTensor> StrideDs,
+                        ck::index_t StrideE,
+                        AElementwiseOperation a_element_op,
+                        BElementwiseOperation b_element_op,
+                        CDEElementwiseOperation cde_element_op) = 0;
+
+    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
+};
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once


--- a/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d_layernorm.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d_layernorm.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once


--- a/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d_multiple_r.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d_multiple_r.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once


--- a/include/ck/tensor_operation/gpu/device/device_gemm_reduce.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_reduce.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once


--- a/include/ck/tensor_operation/gpu/device/device_gemm_splitk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_splitk.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -20,7 +20,8 @@ template <typename ALayout,
          typename CDataType,
          typename AElementwiseOperation,
          typename BElementwiseOperation,
-          typename CElementwiseOperation>
+          typename CElementwiseOperation,
+          typename ComputeType = CDataType>
 struct DeviceGemmSplitK : public BaseOperator
 {
    virtual std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_a,
@@ -48,7 +49,8 @@ template <typename ALayout,
          typename CDataType,
          typename AElementwiseOperation,
          typename BElementwiseOperation,
-          typename CElementwiseOperation>
+          typename CElementwiseOperation,
+          typename ComputeType = CDataType>
 using DeviceGemmSplitKPtr = std::unique_ptr<DeviceGemmSplitK<ALayout,
                                                             BLayout,
                                                             CLayout,
@@ -57,7 +59,8 @@ using DeviceGemmSplitKPtr = std::unique_ptr<DeviceGemmSplitK<ALayout,
                                                             CDataType,
                                                             AElementwiseOperation,
                                                             BElementwiseOperation,
-                                                             CElementwiseOperation>>;
+                                                             CElementwiseOperation,
+                                                             ComputeType>>;

 } // namespace device
 } // namespace tensor_operation

--- a/include/ck/tensor_operation/gpu/device/device_gemm_streamk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_streamk.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+#include <vector>
+
+#include "device_base.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+template <typename ALayout,
+          typename BLayout,
+          typename CLayout,
+          typename ADataType,
+          typename BDataType,
+          typename CDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CElementwiseOperation>
+struct DeviceGemmStreamK : public BaseOperator
+{
+    virtual std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_a,
+                                                              const void* p_b,
+                                                              void* p_c,
+                                                              ck::index_t M,
+                                                              ck::index_t N,
+                                                              ck::index_t K,
+                                                              ck::index_t StrideA,
+                                                              ck::index_t StrideB,
+                                                              ck::index_t StrideC,
+                                                              AElementwiseOperation a_element_op,
+                                                              BElementwiseOperation b_element_op,
+                                                              CElementwiseOperation c_element_op,
+                                                              ck::index_t NumSKBlocks = 0) = 0;
+
+    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
+};
+
+template <typename ALayout,
+          typename BLayout,
+          typename CLayout,
+          typename ADataType,
+          typename BDataType,
+          typename CDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CElementwiseOperation>
+using DeviceGemmStreamKPtr = std::unique_ptr<DeviceGemmStreamK<ALayout,
+                                                               BLayout,
+                                                               CLayout,
+                                                               ADataType,
+                                                               BDataType,
+                                                               CDataType,
+                                                               AElementwiseOperation,
+                                                               BElementwiseOperation,
+                                                               CElementwiseOperation>>;
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck