Commit 7506342c authored by ltqin

Merge branch 'develop' into gridwise_gemm_double_buffer

parents 26b4fe97 acbd7bd7
-#ifndef CK_THREADWISE_TENSOR_SLICE_TRANSFER_V3R2_HPP
-#define CK_THREADWISE_TENSOR_SLICE_TRANSFER_V3R2_HPP
+#ifndef CK_THREADWISE_TENSOR_SLICE_TRANSFER_V3R1_HPP
+#define CK_THREADWISE_TENSOR_SLICE_TRANSFER_V3R1_HPP
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
@@ -47,6 +47,7 @@ struct lambda_scalar_per_access_for_src_and_dst
// 4. Use thread buffer
template <typename SliceLengths,
typename SrcElementwiseOperation,
+typename DstElementwiseOperation,
InMemoryDataOperationEnum_t DstInMemOp,
typename SrcData,
typename DstData,
@@ -66,7 +67,7 @@ template <typename SliceLengths,
bool DstResetCoordinateAfterRun> // control whether to move back dst coordinate after each
// RunWrite(), will be fused with MoveDstSliceWindow to
// save addr computation
-struct ThreadwiseTensorSliceTransfer_v3r2
+struct ThreadwiseTensorSliceTransfer_v3r1
{
static constexpr index_t nDim = SliceLengths::Size();
using Index = MultiIndex<nDim>;
@@ -77,15 +78,17 @@ struct ThreadwiseTensorSliceTransfer_v3r2
using SrcCoordStep = decltype(make_tensor_coordinate_step(SrcDesc{}, Index{}));
using DstCoordStep = decltype(make_tensor_coordinate_step(DstDesc{}, Index{}));
-__device__ constexpr ThreadwiseTensorSliceTransfer_v3r2(
+__device__ constexpr ThreadwiseTensorSliceTransfer_v3r1(
const SrcDesc& src_desc,
const Index& src_slice_origin,
+const SrcElementwiseOperation& src_element_op,
const DstDesc& dst_desc,
const Index& dst_slice_origin,
-const SrcElementwiseOperation& src_element_op)
+const DstElementwiseOperation& dst_element_op)
: src_coord_(make_tensor_coordinate(src_desc, src_slice_origin)),
dst_coord_(make_tensor_coordinate(dst_desc, dst_slice_origin)),
-src_element_op_(src_element_op)
+src_element_op_(src_element_op),
+dst_element_op_(dst_element_op)
{
}
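For context, the `DstElementwiseOperation` threaded through this constructor is just a per-scalar functor. A minimal sketch of the shape it needs, assuming the value-returning call style used further down in `RunWrite` (`PassThrough` and `Scale` are illustrative names, not taken from this commit):

```cpp
// Sketch of the functor shape DstElementwiseOperation is expected to have:
// a const call operator mapping one scalar to another, invoked below as
// dst_element_op_(x). PassThrough/Scale are hypothetical example ops.
struct PassThrough
{
    template <typename T>
    __device__ constexpr T operator()(const T& x) const { return x; }
};

struct Scale
{
    float alpha_;
    __device__ float operator()(const float& x) const { return alpha_ * x; }
};
```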
@@ -165,7 +168,7 @@ struct ThreadwiseTensorSliceTransfer_v3r2
static_for<1, nDim, 1>{}([&](auto i) {
index_t tmp = ordered_src_access_idx[I0];
-static_for<0, i, 1>{}([&](auto j) {
+static_for<1, i, 1>{}([&](auto j) {
tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_idx[j];
});
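The 0 → 1 fix matters because `tmp` is already seeded with `ordered_src_access_idx[I0]`; starting `j` at 0 would fold index 0 in twice. A stand-alone model of the row-major linearization this loop performs, with plain `std::array` in place of CK's compile-time sequences:

```cpp
#include <array>
#include <cstddef>

// Stand-alone model of the loop above: row-major linearization of an ordered
// multi-index against ordered access lengths. tmp is seeded with idx[0], so
// the inner loop must start at j = 1; starting at 0 counts idx[0] twice.
template <std::size_t N>
constexpr int linearize(const std::array<int, N>& len, const std::array<int, N>& idx)
{
    int tmp = idx[0];
    for(std::size_t j = 1; j < N; ++j)
        tmp = tmp * len[j] + idx[j];
    return tmp;
}

// lengths {2,3,4}, index {1,2,3}: ((1)*3 + 2)*4 + 3 = 23 = 1*12 + 2*4 + 3
static_assert(linearize<3>({2, 3, 4}, {1, 2, 3}) == 23, "row-major linearization");
```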
@@ -412,7 +415,7 @@ struct ThreadwiseTensorSliceTransfer_v3r2
static_for<1, nDim, 1>{}([&](auto i) {
index_t tmp = ordered_dst_access_idx[I0];
-static_for<0, i, 1>{}([&](auto j) {
+static_for<1, i, 1>{}([&](auto j) {
tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_idx[j];
});
@@ -442,13 +445,24 @@ struct ThreadwiseTensorSliceTransfer_v3r2
const bool is_dst_valid =
coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_);
-using dst_vector_t = typename vector_type_maker_t<DstData, DstScalarPerVector>::type;
+using dst_vector_type = vector_type_maker_t<DstData, DstScalarPerVector>;
+using dst_vector_t   = typename dst_vector_type::type;
-// copy data from dst_thread_scratch_ to dst_buf
+// copy data from dst_thread_scratch_ into dst_vector_container
+auto dst_vector_container = dst_vector_type{
+    dst_thread_scratch_.template GetAsType<dst_vector_t>(dst_data_idx_seq)};
+// apply DstElementwiseOperation on dst_vector_container
+static_for<0, DstScalarPerVector, 1>{}([&](auto i) {
+    dst_vector_container.template AsType<DstData>()(i) =
+        dst_element_op_(dst_vector_container.template AsType<DstData>()[i]);
+});
+// copy data from dst_vector_container to dst_buf
dst_buf.template Set<dst_vector_t>(
dst_coord_.GetOffset(),
is_dst_valid,
-dst_thread_scratch_.template GetAsType<dst_vector_t>(dst_data_idx_seq));
+dst_vector_container.template AsType<dst_vector_t>()[I0]);
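The rewritten write path is a read-modify-write over one vector: load `DstScalarPerVector` scalars from the thread scratch into a vector container, run `dst_element_op_` on each lane (`AsType<DstData>()(i)` yields a mutable lane reference, `[i]` a read), then store the whole vector with a single `Set`. A simplified stand-in using a plain array instead of `vector_type_maker_t` (all names here are hypothetical):

```cpp
// Simplified stand-in for the vector-container pattern above: apply an
// elementwise op per lane, then write the lanes back as one unit.
template <typename T, int N, typename Op>
__device__ void apply_op_then_store(T (&lanes)[N], const Op& op, T* dst)
{
#pragma unroll
    for(int i = 0; i < N; ++i)
        lanes[i] = op(lanes[i]); // per-scalar DstElementwiseOperation
#pragma unroll
    for(int i = 0; i < N; ++i)
        dst[i] = lanes[i];       // single vector-wide store in the real code
}
```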
constexpr auto move_on_dim = [&]() constexpr
{
@@ -498,7 +512,7 @@ struct ThreadwiseTensorSliceTransfer_v3r2
template <typename SrcBuffer>
__device__ void RunRead(const SrcDesc& src_desc, const SrcBuffer& src_buf)
{
-constexpr index_t ntransform_src = SrcDesc::GetNumOfTransform();
+constexpr index_t ntransform_src = remove_cvref_t<SrcDesc>::GetNumOfTransform();
constexpr auto zeros = typename uniform_sequence_gen<ntransform_src, 0>::type{};
@@ -512,7 +526,8 @@ struct ThreadwiseTensorSliceTransfer_v3r2
template <typename DstBuffer>
__device__ void RunWrite(const DstDesc& dst_desc, DstBuffer& dst_buf)
{
-constexpr index_t ntransform_dst = DstDesc::GetNumOfTransform();
+// TODO: why need remove_cvref_t ?
+constexpr index_t ntransform_dst = remove_cvref_t<DstDesc>::GetNumOfTransform();
constexpr auto zeros = typename uniform_sequence_gen<ntransform_dst, 0>::type{};
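A plausible answer to the TODO above: when `DstDesc` is deduced from a const or forwarding reference, `DstDesc::GetNumOfTransform()` is ill-formed because a reference type has no static members; stripping the cv/ref qualifiers first makes the lookup valid on the bare type. `remove_cvref_t` is a one-liner (standardized as `std::remove_cvref_t` in C++20):

```cpp
#include <type_traits>

// Equivalent definition (std::remove_cvref_t since C++20): strip the
// reference first, then any const/volatile qualification, so static members
// like GetNumOfTransform() can be looked up on the underlying type.
template <typename T>
using remove_cvref_t = std::remove_cv_t<std::remove_reference_t<T>>;

static_assert(std::is_same_v<remove_cvref_t<const int&>, int>, "cv/ref stripped");
```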
@@ -548,7 +563,7 @@ struct ThreadwiseTensorSliceTransfer_v3r2
static_for<1, nDim, 1>{}([&](auto i) {
index_t tmp = ordered_src_access_lengths[I0] - 1;
-static_for<0, i, 1>{}([&](auto j) {
+static_for<1, i, 1>{}([&](auto j) {
tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_lengths[j] - 1;
});
@@ -608,7 +623,7 @@ struct ThreadwiseTensorSliceTransfer_v3r2
static_for<1, nDim, 1>{}([&](auto i) {
index_t tmp = ordered_dst_access_lengths[I0] - 1;
-static_for<0, i, 1>{}([&](auto j) {
+static_for<1, i, 1>{}([&](auto j) {
tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_lengths[j] - 1;
});
@@ -811,6 +826,7 @@ struct ThreadwiseTensorSliceTransfer_v3r2
SrcCoord src_coord_;
DstCoord dst_coord_;
const SrcElementwiseOperation src_element_op_;
+const DstElementwiseOperation dst_element_op_;
};
} // namespace ck
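Taken together, the public surface visible in this diff is: construct with `(src_desc, src_origin, src_element_op, dst_desc, dst_origin, dst_element_op)`, then `RunRead` into the thread scratch and `RunWrite` out of it. A hypothetical driver, assuming only those members:

```cpp
// Hypothetical driver using only the members visible in this diff; the
// buffer and descriptor types are whatever the surrounding kernel provides.
template <typename Transfer, typename SrcDesc, typename SrcBuf,
          typename DstDesc, typename DstBuf>
__device__ void run_slice_copy(Transfer& transfer,
                               const SrcDesc& src_desc, const SrcBuf& src_buf,
                               const DstDesc& dst_desc, DstBuf& dst_buf)
{
    transfer.RunRead(src_desc, src_buf);  // src -> thread scratch (SrcElementwiseOperation)
    transfer.RunWrite(dst_desc, dst_buf); // thread scratch -> dst (DstElementwiseOperation)
}
```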
@@ -35,8 +35,8 @@
#include "dynamic_buffer.hpp"
#include "is_known_at_compile_time.hpp"
#include "transpose_vectors.hpp"
#include "inner_product.hpp"
#include "element_wise_operation.hpp"
// TODO: remove this
#if CK_USE_AMD_INLINE_ASM
@@ -24,12 +24,16 @@
#define CK_MIN_BLOCK_PER_CU 2
#endif
-// buffer resourse
+// GPU-specific parameters
#if defined(CK_AMD_GPU_GFX803) || defined(CK_AMD_GPU_GFX900) || defined(CK_AMD_GPU_GFX906) || \
defined(CK_AMD_GPU_GFX908) || defined(CK_AMD_GPU_GFX90A)
+// buffer resourse
#define CK_BUFFER_RESOURCE_3RD_DWORD 0x00020000
+// wave size
+#define CK_GPU_WAVE_SIZE 64
#elif defined(CK_AMD_GPU_GFX1030)
#define CK_BUFFER_RESOURCE_3RD_DWORD 0x31014000
+#define CK_GPU_WAVE_SIZE 32
#endif
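The new `CK_GPU_WAVE_SIZE` macro makes the wave width a per-target compile-time constant: 64 on gfx803/900/906/908/90a, 32 on gfx1030. A hypothetical illustration of why it must be compile-time rather than a runtime query:

```cpp
// Hypothetical example: anything partitioned per wave needs the width at
// compile time, and it differs across targets (wave64 on gfx9, wave32 on
// gfx1030), hence the macro.
__device__ void per_wave_example(float* scratch)
{
    constexpr int wave_size = CK_GPU_WAVE_SIZE; // 64 or 32 depending on target
    const int lane = threadIdx.x % wave_size;   // lane within the wave
    const int wave = threadIdx.x / wave_size;   // wave within the block
    scratch[wave * wave_size + lane] = 0.0f;
}
```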
// FMA instruction
@@ -141,6 +145,10 @@
#define CK_WORKAROUND_SWDEV_XXXXXX_THREAD_WISE_COPY_V1R4_TYPE_CONVERT_ISSUE 1
#endif
+#ifndef CK_WORKAROUND_SWDEV_XXXXXX_THREAD_WISE_COPY_V1R5_TYPE_CONVERT_ISSUE
+#define CK_WORKAROUND_SWDEV_XXXXXX_THREAD_WISE_COPY_V1R5_TYPE_CONVERT_ISSUE 1
+#endif
namespace ck {
enum InMemoryDataOperationEnum_t
@@ -152,7 +160,7 @@ enum InMemoryDataOperationEnum_t
enum ActivTypeEnum_t
{
-None = 0,
+None,
LeakyRelu,
Sigmoid
};
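`ActivTypeEnum_t` is a plain dispatch tag; a hypothetical compile-time dispatch over it (the activation formulas are the standard ones, not taken from this commit):

```cpp
// Hypothetical compile-time dispatch over ActivTypeEnum_t.
template <ActivTypeEnum_t Activ>
__device__ float apply_activ(float x, float alpha = 0.01f)
{
    if constexpr(Activ == LeakyRelu)
        return x > 0.0f ? x : alpha * x;
    else if constexpr(Activ == Sigmoid)
        return 1.0f / (1.0f + expf(-x));
    else
        return x; // None
}
```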
@@ -5,8 +5,12 @@
namespace ck {
+__device__ constexpr index_t get_wave_size() { return CK_GPU_WAVE_SIZE; }
__device__ index_t get_thread_local_1d_id() { return threadIdx.x; }
+__device__ index_t get_wave_local_1d_id() { return threadIdx.x / get_wave_size(); }
__device__ index_t get_block_1d_id() { return blockIdx.x; }
} // namespace ck
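The new helpers compose naturally: the lane within a wave is `threadIdx.x % get_wave_size()`, and the wave within a block comes from the new `get_wave_local_1d_id()`. A hypothetical usage:

```cpp
// Hypothetical usage of the helpers above: one output slot per wave per
// block (the kernel name and output layout are illustrative only).
__global__ void waves_example(float* out)
{
    const ck::index_t waves_per_block = blockDim.x / ck::get_wave_size();
    const ck::index_t wave_id         = ck::get_wave_local_1d_id(); // wave within block
    out[ck::get_block_1d_id() * waves_per_block + wave_id] = 1.0f;
}
```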