gaoqiong / composable_kernel, commit b79df771

Merge remote-tracking branch 'origin/develop' into cpu_avx2

Authored Jul 12, 2022 by carlushuang
Parents: 05d38218, 63914743
Changes: 450. Showing 20 changed files with 159 additions and 126 deletions (+159 −126).
include/ck/problem_transform/transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp (+3 −0)
include/ck/problem_transform/transform_forward_convolution_into_gemm_v6r1_nchw_kcyx_nkhw.hpp (+3 −0)
include/ck/stream_config.hpp (+3 −0)
include/ck/tensor/static_tensor.hpp (+3 −0)
include/ck/tensor_description/cluster_descriptor.hpp (+6 −5)
include/ck/tensor_description/multi_index_transform.hpp (+6 −5)
include/ck/tensor_description/multi_index_transform_helper.hpp (+6 −5)
include/ck/tensor_description/tensor_adaptor.hpp (+11 −6)
include/ck/tensor_description/tensor_descriptor.hpp (+13 −5)
include/ck/tensor_description/tensor_descriptor_helper.hpp (+7 −3)
include/ck/tensor_description/tensor_space_filling_curve.hpp (moved from include/ck/utility/) (+10 −9)
include/ck/tensor_operation/gpu/block/blockwise_gemm_dl_v2r3.hpp (+8 −4)
include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v2r2.hpp (+3 −0)
include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v3.hpp (+3 −0)
include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp (+16 −13)
include/ck/tensor_operation/gpu/block/blockwise_tensor_slice_transfer_v5r1.hpp (+9 −8)
include/ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp (+22 −48)
include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp (+9 −5)
include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp (+9 −5)
include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r2.hpp (+9 −5)
Too many changes to show: only 450 of 450+ changed files are displayed.
include/ck/problem_transform/transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp (+3 −0)

+ // SPDX-License-Identifier: MIT
+ // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
  #ifndef CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_GEMM_V4R4R4_NHWC_KYXC_NHWK_HPP
  #define CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_GEMM_V4R4R4_NHWC_KYXC_NHWK_HPP
  ...
include/ck/problem_transform/transform_forward_convolution_into_gemm_v6r1_nchw_kcyx_nkhw.hpp (+3 −0)

+ // SPDX-License-Identifier: MIT
+ // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
  #ifndef CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_CONTRACTION_V6R1_NCHW_KCYX_NKHW_HPP
  #define CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_CONTRACTION_V6R1_NCHW_KCYX_NKHW_HPP
  ...
include/ck/stream_config.hpp (+3 −0)

+ // SPDX-License-Identifier: MIT
+ // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
  #pragma once
  #ifndef CK_NOGPU
  ...
include/ck/tensor/static_tensor.hpp (+3 −0)

+ // SPDX-License-Identifier: MIT
+ // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
  #ifndef CK_STATIC_TENSOR_HPP
  #define CK_STATIC_TENSOR_HPP
  ...
include/ck/tensor_description/cluster_descriptor.hpp (+6 −5)

- #ifndef CK_CLUSTER_DESCRIPTOR_HPP
- #define CK_CLUSTER_DESCRIPTOR_HPP
+ // SPDX-License-Identifier: MIT
+ // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+ #pragma once
- #include "common_header.hpp"
- #include "tensor_adaptor.hpp"
+ #include "ck/utility/common_header.hpp"
+ #include "ck/tensor_description/tensor_adaptor.hpp"
  namespace ck {
  ...
@@ -30,4 +32,3 @@ __host__ __device__ constexpr auto make_cluster_descriptor(
  }
  } // namespace ck
- #endif
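Most header changes in this merge follow the two patterns visible above: classic include guards are replaced by an SPDX license header plus #pragma once, and includes written relative to the including file's directory become paths anchored at the project root. A minimal sketch of the pattern, using a hypothetical header name and assuming the build passes -I<repo>/include so the ck/-rooted paths resolve:

    // Before: guard macro, and an include resolved relative to this header's directory.
    #ifndef CK_EXAMPLE_HPP
    #define CK_EXAMPLE_HPP
    #include "common_header.hpp"
    #endif

    // After: SPDX header, #pragma once, and a project-root-relative include,
    // which resolves identically no matter which file includes this header.
    // SPDX-License-Identifier: MIT
    #pragma once
    #include "ck/utility/common_header.hpp"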
include/ck/tensor_description/multi_index_transform.hpp (+6 −5)

- #ifndef CK_MULTI_INDEX_TRANSFORM_HPP
- #define CK_MULTI_INDEX_TRANSFORM_HPP
+ // SPDX-License-Identifier: MIT
+ // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+ #pragma once
- #include "common_header.hpp"
- #include "multi_index.hpp"
+ #include "ck/utility/common_header.hpp"
+ #include "ck/utility/multi_index.hpp"
  namespace ck {
  ...
@@ -1950,4 +1952,3 @@ struct Modulo
      }
  };
  } // namespace ck
- #endif
include/ck/tensor_description/multi_index_transform_helper.hpp (+6 −5)

- #ifndef CK_MULTI_INDEX_TRANSFORM_HELPER_HPP
- #define CK_MULTI_INDEX_TRANSFORM_HELPER_HPP
+ // SPDX-License-Identifier: MIT
+ // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+ #pragma once
- #include "common_header.hpp"
- #include "multi_index_transform.hpp"
+ #include "ck/utility/common_header.hpp"
+ #include "ck/tensor_description/multi_index_transform.hpp"
  namespace ck {
  ...
@@ -126,4 +128,3 @@ __host__ __device__ constexpr auto make_modulo_transform(const Modulus& modulus,
      return Modulo<Modulus, UpLength>{modulus, up_length};
  }
  } // namespace ck
- #endif
include/ck/tensor_description/tensor_adaptor.hpp (+11 −6)

- #ifndef CK_TENSOR_ADAPTOR_HPP
- #define CK_TENSOR_ADAPTOR_HPP
+ // SPDX-License-Identifier: MIT
+ // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+ #pragma once
- #include "common_header.hpp"
- #include "tensor_descriptor.hpp"
- #include "tensor_descriptor_helper.hpp"
+ #include "ck/utility/common_header.hpp"
+ #include "ck/tensor_description/tensor_descriptor.hpp"
+ #include "ck/tensor_description/tensor_descriptor_helper.hpp"
  namespace ck {
  ...
@@ -136,7 +138,11 @@ struct TensorAdaptor
      using ElementSize = remove_cv_t<decltype(InitializeElementSize(Transforms{}))>;

      public:
+ #if 0 // workaround compiler complaint about constexpr
      __host__ __device__ constexpr TensorAdaptor() = default;
+ #else
+     __host__ __device__ constexpr TensorAdaptor() : transforms_{}, element_size_{} {}
+ #endif
      __host__ __device__ constexpr TensorAdaptor(const Transforms& transforms)
          : transforms_{transforms}, element_size_{InitializeElementSize(transforms)}
  ...
@@ -474,4 +480,3 @@ __host__ __device__ constexpr auto chain_tensor_adaptors(const X& x, const Xs&...
  }
  } // namespace ck
- #endif
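The hunk above (repeated for TensorDescriptor in the next file) works around a compiler complaint about defaulted constexpr default constructors: a `constexpr ... () = default;` that would leave members uninitialized is not usable in constant expressions, so the members are value-initialized explicitly instead. A minimal sketch of the issue with a hypothetical type, not CK code:

    // Sketch of the "workaround compiler complaint about constexpr" pattern.
    struct Example
    {
    #if 0
        constexpr Example() = default; // rejected by some compilers: x_/y_ left uninitialized
    #else
        constexpr Example() : x_{}, y_{} {} // explicit value-initialization of all members
    #endif
        int x_;
        float y_;
    };

    constexpr Example e{}; // OK: usable in a constant expression with the explicit form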
include/ck/tensor_description/tensor_descriptor.hpp (+13 −5)

- #ifndef CK_TENSOR_DESCRIPTOR_HPP
- #define CK_TENSOR_DESCRIPTOR_HPP
+ // SPDX-License-Identifier: MIT
+ // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+ #pragma once
- #include "common_header.hpp"
- #include "multi_index_transform.hpp"
+ #include "ck/utility/common_header.hpp"
+ #include "ck/tensor_description/multi_index_transform.hpp"
  namespace ck {
  ...
@@ -111,7 +113,14 @@ struct TensorDescriptor
      using ElementSize = remove_cv_t<decltype(InitializeElementSize(Transforms{}))>;

      public:
+ #if 0 // workaround compiler complaint about constexpr
      __host__ __device__ constexpr TensorDescriptor() = default;
+ #else
+     __host__ __device__ constexpr TensorDescriptor()
+         : transforms_{}, element_size_{}, element_space_size_{}
+     {
+     }
+ #endif
      __host__ __device__ constexpr TensorDescriptor(const Transforms& transforms,
                                                     ElementSpaceSize element_space_size)
  ...
@@ -602,4 +611,3 @@ using TensorCoordinateStep_t = decltype(make_tensor_coordinate_step(
      TensorDesc{}, MultiIndex<remove_cvref_t<TensorDesc>::GetNumOfDimension()>{}));
  } // namespace ck
- #endif
include/ck/tensor_description/tensor_descriptor_helper.hpp (+7 −3)

+ // SPDX-License-Identifier: MIT
+ // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
  #pragma once
- #include "common_header.hpp"
- #include "tensor_descriptor.hpp"
- #include "multi_index_transform_helper.hpp"
+ #include "ck/utility/common_header.hpp"
+ #include "ck/tensor_description/tensor_descriptor.hpp"
+ #include "ck/tensor_description/multi_index_transform_helper.hpp"
  namespace ck {
  ...
include/ck/utility/tensor_space_filling_curve.hpp → include/ck/tensor_description/tensor_space_filling_curve.hpp (+10 −9)

- #ifndef TENSOR_SPACE_FILLING_CURVE_HPP
- #define TENSOR_SPACE_FILLING_CURVE_HPP
+ // SPDX-License-Identifier: MIT
+ // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+ #pragma once
- #include "math.hpp"
- #include "sequence.hpp"
- #include "sequence_helper.hpp"
- #include "tensor_adaptor.hpp"
- #include "statically_indexed_array_multi_index.hpp"
- #include "tuple_helper.hpp"
+ #include "ck/utility/math.hpp"
+ #include "ck/utility/sequence.hpp"
+ #include "ck/utility/sequence_helper.hpp"
+ #include "ck/utility/statically_indexed_array_multi_index.hpp"
+ #include "ck/utility/tuple_helper.hpp"
+ #include "ck/tensor_description/tensor_adaptor.hpp"
  namespace ck {
  ...
@@ -156,4 +158,3 @@ struct SpaceFillingCurve
  };
  } // namespace ck
- #endif
include/ck/tensor_operation/gpu/block/blockwise_gemm_dl_v2r3.hpp (+8 −4)

+ // SPDX-License-Identifier: MIT
+ // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
  #pragma once
- #include "common_header.hpp"
- #include "tensor_adaptor.hpp"
- #include "threadwise_tensor_slice_transfer_v4r1.hpp"
- #include "threadwise_contraction_dl.hpp"
+ #include "ck/utility/common_header.hpp"
+ #include "ck/tensor_description/tensor_adaptor.hpp"
+ #include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v4r1.hpp"
+ #include "ck/tensor_operation/gpu/thread/threadwise_contraction_dl.hpp"
  namespace ck {
  ...
include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v2r2.hpp (+3 −0)

+ // SPDX-License-Identifier: MIT
+ // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
  #ifndef CK_BLOCKWISE_GEMM_DLOPS_V2R2_HPP
  #define CK_BLOCKWISE_GEMM_DLOPS_V2R2_HPP
  ...
include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v3.hpp (+3 −0)

+ // SPDX-License-Identifier: MIT
+ // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
  #ifndef CK_BLOCKWISE_GEMM_DLOPS_V3_HPP
  #define CK_BLOCKWISE_GEMM_DLOPS_V3_HPP
  ...
include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp (+16 −13)

+ // SPDX-License-Identifier: MIT
+ // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
  #pragma once
- #include "common_header.hpp"
- #include "threadwise_tensor_slice_transfer.hpp"
- #include "xdlops_gemm.hpp"
- #include "tensor_adaptor.hpp"
- #include "thread_group.hpp"
+ #include "ck/utility/common_header.hpp"
+ #include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
+ #include "ck/tensor_operation/gpu/warp/xdlops_gemm.hpp"
+ #include "ck/tensor_description/tensor_adaptor.hpp"
  namespace ck {
  ...
@@ -438,7 +441,7 @@ struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
                     make_tuple(n0, I0, I0, I0),
                     b_thread_buf);
      });
-     __builtin_amdgcn_sched_barrier();
+     __builtin_amdgcn_sched_barrier(0);
      // NOTE: Synchronize threads in a workgroup at the start of each MAC cluster, but except
      // the first, as we can shorten non-MAC cluster a bit and there's no observable negative
      // impact. The desired effect is waves in a workgroup executing MAC in sync. This avoids
  ...
@@ -448,7 +451,7 @@ struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
      if constexpr(k.value != 0 || KPerInnerLoop == KPerThread)
      {
          asm volatile("s_barrier" ::);
-         __builtin_amdgcn_sched_barrier();
+         __builtin_amdgcn_sched_barrier(0);
      }
      static_for<0, KPerInnerLoop, KPack>{}([&](auto k_) {
          static_for<0, MRepeat, 1>{}([&](auto m0) {
  ...
@@ -480,9 +483,9 @@ struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
                       k_.value == KPerInnerLoop - KPack && m0.value == MRepeat - 1 &&
                       n0.value == NRepeat - 1)
          {
-             __builtin_amdgcn_sched_barrier();
+             __builtin_amdgcn_sched_barrier(0);
              block_sync_lds();
-             __builtin_amdgcn_sched_barrier();
+             __builtin_amdgcn_sched_barrier(0);
          }
          // TODO: insert setprio in more precise manner since we
  ...
@@ -493,16 +496,16 @@ struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
                  c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
              if constexpr(k_.value == 0 && m0.value == 0 && n0.value == 0)
              {
-                 __builtin_amdgcn_sched_barrier();
+                 __builtin_amdgcn_sched_barrier(0);
                  __builtin_amdgcn_s_setprio(1);
-                 __builtin_amdgcn_sched_barrier();
+                 __builtin_amdgcn_sched_barrier(0);
              }
              });
          });
      });
-     __builtin_amdgcn_sched_barrier();
+     __builtin_amdgcn_sched_barrier(0);
      __builtin_amdgcn_s_setprio(0);
-     __builtin_amdgcn_sched_barrier();
+     __builtin_amdgcn_sched_barrier(0);
      });
  }
  ...
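Beyond the include cleanup, the functional change in this file is that every __builtin_amdgcn_sched_barrier() call gains an explicit argument of 0. This matches the Clang/LLVM AMDGPU builtin, which takes a mask describing which instruction kinds the backend scheduler may move across the barrier; per the LLVM documentation, a mask of 0 means no instructions may cross, preserving the hand-placed ordering around the s_barrier and setprio sequences. A minimal hedged sketch of the usage (HIP device code, AMDGPU targets only; the function name is illustrative):

    __device__ void fence_scheduler_example()
    {
        // mask 0: the instruction scheduler may not reorder anything across this point
        __builtin_amdgcn_sched_barrier(0);
        __builtin_amdgcn_s_setprio(1); // raise wave priority, e.g. around an MFMA cluster
        __builtin_amdgcn_sched_barrier(0);
    }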
include/ck/tensor_operation/gpu/block/blockwise_tensor_slice_transfer_v5r1.hpp (+9 −8)

- #ifndef CK_BLOCKWISE_TENSOR_SLICE_TRANSFER_V5R1_HPP
- #define CK_BLOCKWISE_TENSOR_SLICE_TRANSFER_V5R1_HPP
+ // SPDX-License-Identifier: MIT
+ // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+ #pragma once
- #include "common_header.hpp"
- #include "tensor_descriptor.hpp"
- #include "tensor_descriptor_helper.hpp"
- #include "cluster_descriptor.hpp"
- #include "threadwise_tensor_slice_transfer_v5r1.hpp"
+ #include "ck/utility/common_header.hpp"
+ #include "ck/tensor_description/tensor_descriptor.hpp"
+ #include "ck/tensor_description/tensor_descriptor_helper.hpp"
+ #include "ck/tensor_description/cluster_descriptor.hpp"
+ #include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v5r1.hpp"
  namespace ck {
  ...
@@ -152,4 +154,3 @@ struct BlockwiseTensorSliceTransfer_v5r1
  };
  } // namespace ck
- #endif
include/ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp (+22 −48)

- /*******************************************************************************
-  *
-  * MIT License
-  *
-  * Copyright (c) 2020 Advanced Micro Devices, Inc.
-  *
-  * Permission is hereby granted, free of charge, to any person obtaining a copy
-  * of this software and associated documentation files (the "Software"), to deal
-  * in the Software without restriction, including without limitation the rights
-  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-  * copies of the Software, and to permit persons to whom the Software is
-  * furnished to do so, subject to the following conditions:
-  *
-  * The above copyright notice and this permission notice shall be included in all
-  * copies or substantial portions of the Software.
-  *
-  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-  * SOFTWARE.
-  *
-  *******************************************************************************/
- #ifndef CK_REDUCTION_FUNCTIONS_BLOCKWISE_HPP
- #define CK_REDUCTION_FUNCTIONS_BLOCKWISE_HPP
+ // SPDX-License-Identifier: MIT
+ // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+ #pragma once
- #include "reduction_common.hpp"
- #include "reduction_functions_accumulate.hpp"
- #include "cluster_descriptor.hpp"
+ #include "ck/tensor_description/cluster_descriptor.hpp"
+ #include "ck/utility/reduction_common.hpp"
+ #include "ck/utility/reduction_functions_accumulate.hpp"
  namespace ck {
  ...
@@ -45,7 +21,9 @@ template <typename AccDataType,
            typename ThreadClusterLengths_M_K,
            typename ThreadClusterArrangeOrder,
            typename OpReduce,
-           bool PropagateNan>
+           bool PropagateNan,
+           typename Accumulation =
+               detail::AccumulateWithNanCheck<PropagateNan, OpReduce, AccDataType>>
  struct PartitionedBlockwiseReduction
  {
      static_assert(BlockSize == ThreadClusterLengths_M_K::At(0) * ThreadClusterLengths_M_K::At(1),
  ...
@@ -62,8 +40,6 @@ struct PartitionedBlockwiseReduction
      static constexpr auto thread_cluster_desc =
          make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{});
-     using Accumulation = detail::AccumulateWithNanCheck<PropagateNan, OpReduce, AccDataType>;
      template <typename BufferType>
      __device__ static void Reduce(BufferType& work_buffer, AccDataType& in_out_value)
      {
  ...
@@ -113,13 +89,16 @@
  // 3) in_out_value/in_out_index is the input data in vgpr from each thread
  // 4) in_out_value/in_out_index is the over-written reduced output in vgpr for each thread
  // clang-format on
  template <typename AccDataType,
            typename IndexDataType,
            index_t BlockSize,
            typename ThreadClusterLengths_M_K,
            typename ThreadClusterArrangeOrder,
            typename OpReduce,
-           bool PropagateNan>
+           bool PropagateNan,
+           typename Accumulation = detail::
+               AccumulateWithIndexAndNanCheck<PropagateNan, OpReduce, AccDataType, IndexDataType>>
  struct PartitionedBlockwiseReductionWithIndex
  {
      static_assert(BlockSize == ThreadClusterLengths_M_K::At(0) * ThreadClusterLengths_M_K::At(1),
  ...
@@ -136,9 +115,6 @@ struct PartitionedBlockwiseReductionWithIndex
      static constexpr auto thread_cluster_desc =
          make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{});
-     using Accumulation =
-         detail::AccumulateWithIndexAndNanCheck<PropagateNan, OpReduce, AccDataType, IndexDataType>;
      // This interface accumulates on both data values and indices
      template <typename BufferType, typename IdxBufferType>
      __device__ static void Reduce(BufferType& work_val_buffer,
  ...
@@ -193,6 +169,4 @@ struct PartitionedBlockwiseReductionWithIndex
          };
  };
- }; // end of namespace ck
+ } // namespace ck
- #endif
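The substantive change in this file, beyond the license and include cleanup, is that the accumulation policy moves from an internal `using Accumulation = ...;` alias to a defaulted template parameter, so callers can substitute their own accumulator while existing instantiations keep the NaN-checking default. A minimal sketch of the pattern with hypothetical names, not the CK types:

    #include <algorithm>

    // Stand-in for a default policy like detail::AccumulateWithNanCheck.
    template <typename T>
    struct MaxAccumulate
    {
        static void Calculate(T& acc, T v) { acc = std::max(acc, v); }
    };

    // Before the refactor the policy was a fixed alias inside the struct;
    // after, it is a defaulted template parameter, i.e. a customization point.
    template <typename T, typename Accumulation = MaxAccumulate<T>>
    struct Reduction
    {
        static T Reduce(const T* data, int n, T init)
        {
            T acc = init;
            for(int i = 0; i < n; ++i)
                Accumulation::Calculate(acc, data[i]); // policy call, as in the diff
            return acc;
        }
    };

Existing code that wrote Reduction<float> is unaffected, while a caller can now instantiate Reduction<float, SomeOtherPolicy> without touching this header.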
include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp (+9 −5)

+ // SPDX-License-Identifier: MIT
+ // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
  #pragma once
- #include "common_header.hpp"
- #include "tensor_descriptor.hpp"
- #include "tensor_descriptor_helper.hpp"
- #include "cluster_descriptor.hpp"
- #include "threadwise_tensor_slice_transfer_v3r1.hpp"
+ #include "ck/utility/common_header.hpp"
+ #include "ck/tensor_description/tensor_descriptor.hpp"
+ #include "ck/tensor_description/tensor_descriptor_helper.hpp"
+ #include "ck/tensor_description/cluster_descriptor.hpp"
+ #include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp"
  namespace ck {
  ...
include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp (+9 −5)

+ // SPDX-License-Identifier: MIT
+ // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
  #pragma once
- #include "common_header.hpp"
- #include "tensor_descriptor.hpp"
- #include "tensor_descriptor_helper.hpp"
- #include "cluster_descriptor.hpp"
- #include "threadwise_tensor_slice_transfer_v6r1.hpp"
+ #include "ck/utility/common_header.hpp"
+ #include "ck/tensor_description/tensor_descriptor.hpp"
+ #include "ck/tensor_description/tensor_descriptor_helper.hpp"
+ #include "ck/tensor_description/cluster_descriptor.hpp"
+ #include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r1.hpp"
  namespace ck {
  ...
include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r2.hpp (+9 −5)

+ // SPDX-License-Identifier: MIT
+ // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
  #pragma once
- #include "common_header.hpp"
- #include "tensor_descriptor.hpp"
- #include "tensor_descriptor_helper.hpp"
- #include "cluster_descriptor.hpp"
- #include "threadwise_tensor_slice_transfer_v6r2.hpp"
+ #include "ck/utility/common_header.hpp"
+ #include "ck/tensor_description/tensor_descriptor.hpp"
+ #include "ck/tensor_description/tensor_descriptor_helper.hpp"
+ #include "ck/tensor_description/cluster_descriptor.hpp"
+ #include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r2.hpp"
  namespace ck {
  ...