Merge remote-tracking branch 'upstream/develop' into gemm-layernorm-4

cba8f7f2 · Anthony Chang · cc50b687 · b653c5eb · cba8f7f2 · cba8f7f2
Commit cba8f7f2 authored Jun 26, 2022 by Anthony Chang
20 changed files
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v2r2.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v2r2.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #ifndef CK_BLOCKWISE_GEMM_DLOPS_V2R2_HPP
 #define CK_BLOCKWISE_GEMM_DLOPS_V2R2_HPP

--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v3.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v3.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #ifndef CK_BLOCKWISE_GEMM_DLOPS_V3_HPP
 #define CK_BLOCKWISE_GEMM_DLOPS_V3_HPP

--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
-#include "common_header.hpp"
-#include "threadwise_tensor_slice_transfer.hpp"
+#include "ck/utility/common_header.hpp"
-#include "xdlops_gemm.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
-#include "tensor_adaptor.hpp"
+#include "ck/tensor_operation/gpu/warp/xdlops_gemm.hpp"
-#include "thread_group.hpp"
+#include "ck/tensor_description/tensor_adaptor.hpp"
 namespace ck {
@@ -438,7 +441,7 @@ struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
                                   make_tuple(n0, I0, I0, I0),
                                   b_thread_buf);
            });
-            __builtin_amdgcn_sched_barrier();
+            __builtin_amdgcn_sched_barrier(0);
            // NOTE: Synchronize threads in a workgroup at the start of each MAC cluster, except
            // the first, as we can shorten non-MAC cluster a bit and there's no observable negative
            // impact. The desired effect is waves in a workgroup executing MAC in sync. This avoids
@@ -448,7 +451,7 @@ struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
            if constexpr(k.value != 0 || KPerInnerLoop == KPerThread)
            {
                asm volatile("s_barrier" ::);
-                __builtin_amdgcn_sched_barrier();
+                __builtin_amdgcn_sched_barrier(0);
            }
            static_for<0, KPerInnerLoop, KPack>{}([&](auto k_) {
                static_for<0, MRepeat, 1>{}([&](auto m0) {
@@ -480,9 +483,9 @@ struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
                                     k_.value == KPerInnerLoop - KPack && m0.value == MRepeat - 1 &&
                                     n0.value == NRepeat - 1)
                        {
-                            __builtin_amdgcn_sched_barrier();
+                            __builtin_amdgcn_sched_barrier(0);
                            block_sync_lds();
-                            __builtin_amdgcn_sched_barrier();
+                            __builtin_amdgcn_sched_barrier(0);
                        }
                        // TODO: insert setprio in a more precise manner since we
@@ -493,16 +496,16 @@ struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
                            c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
                        if constexpr(k_.value == 0 && m0.value == 0 && n0.value == 0)
                        {
-                            __builtin_amdgcn_sched_barrier();
+                            __builtin_amdgcn_sched_barrier(0);
                            __builtin_amdgcn_s_setprio(1);
-                            __builtin_amdgcn_sched_barrier();
+                            __builtin_amdgcn_sched_barrier(0);
                        }
                    });
                });
            });
-            __builtin_amdgcn_sched_barrier();
+            __builtin_amdgcn_sched_barrier(0);
            __builtin_amdgcn_s_setprio(0);
-            __builtin_amdgcn_sched_barrier();
+            __builtin_amdgcn_sched_barrier(0);
        });
    }

--- a/include/ck/tensor_operation/gpu/block/blockwise_tensor_slice_transfer_v5r1.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_tensor_slice_transfer_v5r1.hpp
-#ifndef CK_BLOCKWISE_TENSOR_SLICE_TRANSFER_V5R1_HPP
+// SPDX-License-Identifier: MIT
-#define CK_BLOCKWISE_TENSOR_SLICE_TRANSFER_V5R1_HPP
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-#include "common_header.hpp"
+#pragma once
-#include "tensor_descriptor.hpp"
-#include "tensor_descriptor_helper.hpp"
+#include "ck/utility/common_header.hpp"
-#include "cluster_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
-#include "threadwise_tensor_slice_transfer_v5r1.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_description/cluster_descriptor.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v5r1.hpp"
 namespace ck {
@@ -152,4 +154,3 @@ struct BlockwiseTensorSliceTransfer_v5r1
 };
 } // namespace ck
-#endif
--- a/include/ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp
+++ b/include/ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp
-/*******************************************************************************
+// SPDX-License-Identifier: MIT
- *
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
- * MIT License
- *
+#pragma once
- * Copyright (c) 2020 Advanced Micro Devices, Inc.
- *
+#include "ck/tensor_description/cluster_descriptor.hpp"
- * Permission is hereby granted, free of charge, to any person obtaining a copy
+#include "ck/utility/reduction_common.hpp"
- * of this software and associated documentation files (the "Software"), to deal
+#include "ck/utility/reduction_functions_accumulate.hpp"
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- *******************************************************************************/
-#ifndef CK_REDUCTION_FUNCTIONS_BLOCKWISE_HPP
-#define CK_REDUCTION_FUNCTIONS_BLOCKWISE_HPP
-#include "reduction_common.hpp"
-#include "reduction_functions_accumulate.hpp"
-#include "cluster_descriptor.hpp"
 namespace ck {
@@ -45,7 +21,9 @@ template <typename AccDataType,
          typename ThreadClusterLengths_M_K,
          typename ThreadClusterArrangeOrder,
          typename OpReduce,
-          bool PropagateNan>
+          bool PropagateNan,
+          typename Accumulation =
+              detail::AccumulateWithNanCheck<PropagateNan, OpReduce, AccDataType>>
 struct PartitionedBlockwiseReduction
 {
    static_assert(BlockSize == ThreadClusterLengths_M_K::At(0) * ThreadClusterLengths_M_K::At(1),
@@ -62,8 +40,6 @@ struct PartitionedBlockwiseReduction
    static constexpr auto thread_cluster_desc =
        make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{});
-    using Accumulation = detail::AccumulateWithNanCheck<PropagateNan, OpReduce, AccDataType>;
    template <typename BufferType>
    __device__ static void Reduce(BufferType& work_buffer, AccDataType& in_out_value)
    {
@@ -113,13 +89,16 @@ struct PartitionedBlockwiseReduction
 //  3) in_out_value/in_out_index is the input data in vgpr from each thread
 //  4) in_out_value/in_out_index is the over-written reduced output in vgpr for each thread
 // clang-format on
-template <typename AccDataType,
+template <
-          typename IndexDataType,
+    typename AccDataType,
-          index_t BlockSize,
+    typename IndexDataType,
-          typename ThreadClusterLengths_M_K,
+    index_t BlockSize,
-          typename ThreadClusterArrangeOrder,
+    typename ThreadClusterLengths_M_K,
-          typename OpReduce,
+    typename ThreadClusterArrangeOrder,
-          bool PropagateNan>
+    typename OpReduce,
+    bool PropagateNan,
+    typename Accumulation =
+        detail::AccumulateWithIndexAndNanCheck<PropagateNan, OpReduce, AccDataType, IndexDataType>>
 struct PartitionedBlockwiseReductionWithIndex
 {
    static_assert(BlockSize == ThreadClusterLengths_M_K::At(0) * ThreadClusterLengths_M_K::At(1),
@@ -136,9 +115,6 @@ struct PartitionedBlockwiseReductionWithIndex
    static constexpr auto thread_cluster_desc =
        make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{});
-    using Accumulation =
-        detail::AccumulateWithIndexAndNanCheck<PropagateNan, OpReduce, AccDataType, IndexDataType>;
    // This interface accumulates on both data values and indices
    template <typename BufferType, typename IdxBufferType>
    __device__ static void Reduce(BufferType& work_val_buffer,
@@ -193,6 +169,4 @@ struct PartitionedBlockwiseReductionWithIndex
    };
 };
-}; // end of namespace ck
+} // namespace ck
-#endif
--- a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp
+++ b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
-#include "common_header.hpp"
-#include "tensor_descriptor.hpp"
+#include "ck/utility/common_header.hpp"
-#include "tensor_descriptor_helper.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
-#include "cluster_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
-#include "threadwise_tensor_slice_transfer_v3r1.hpp"
+#include "ck/tensor_description/cluster_descriptor.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp"
 namespace ck {

--- a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp
+++ b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
-#include "common_header.hpp"
-#include "tensor_descriptor.hpp"
+#include "ck/utility/common_header.hpp"
-#include "tensor_descriptor_helper.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
-#include "cluster_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
-#include "threadwise_tensor_slice_transfer_v6r1.hpp"
+#include "ck/tensor_description/cluster_descriptor.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r1.hpp"
 namespace ck {

--- a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r2.hpp
+++ b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r2.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
-#include "common_header.hpp"
-#include "tensor_descriptor.hpp"
+#include "ck/utility/common_header.hpp"
-#include "tensor_descriptor_helper.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
-#include "cluster_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
-#include "threadwise_tensor_slice_transfer_v6r2.hpp"
+#include "ck/tensor_description/cluster_descriptor.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r2.hpp"
 namespace ck {

--- a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r3.hpp
+++ b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r3.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
-#include "common_header.hpp"
-#include "tensor_descriptor.hpp"
+#include "ck/utility/common_header.hpp"
-#include "tensor_descriptor_helper.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
-#include "cluster_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
-#include "threadwise_tensor_slice_transfer_v6r3.hpp"
+#include "ck/tensor_description/cluster_descriptor.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r3.hpp"
 namespace ck {

--- a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7.hpp
+++ b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
-#include "common_header.hpp"
+#include "ck/utility/common_header.hpp"
-#include "tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
-#include "tensor_descriptor_helper.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
-#include "cluster_descriptor.hpp"
+#include "ck/tensor_description/cluster_descriptor.hpp"
-#include "threadwise_tensor_slice_transfer_v7.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7.hpp"
 namespace ck {

--- a/include/ck/tensor_operation/gpu/device/convolution_backward_data_specialization.hpp
+++ b/include/ck/tensor_operation/gpu/device/convolution_backward_data_specialization.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #ifndef CONVOLUTION_BACKWARD_DATA_SPECIALIZATION
 #define CONVOLUTION_BACKWARD_DATA_SPECIALIZATION

--- a/include/ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp
+++ b/include/ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
 namespace ck {

--- a/include/ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp
+++ b/include/ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #ifndef CONVOLUTION_FORWARD_SPECIALIZATION
 #define CONVOLUTION_FORWARD_SPECIALIZATION

--- a/include/ck/tensor_operation/gpu/device/device_5ary_elementwise.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_5ary_elementwise.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
 #include <iostream>
 #include <sstream>
-#include "device.hpp"
+#include <vector>
-#include "device_base.hpp"
-#include "gridwise_5ary_Elementwise_1d.hpp"
+#include "ck/utility/common_header.hpp"
-#include "tensor_layout.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
-#include "tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
-#include "tensor_descriptor_helper.hpp"
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_5ary_Elementwise_1d.hpp"
+#include "ck/device_utility/device_prop.hpp"
+#include "ck/device_utility/kernel_launch.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -325,7 +332,7 @@ struct Device5AryElementwise : public BaseOperator
    static auto MakeInvoker() { return Invoker{}; }
    std::unique_ptr<BaseInvoker> MakeInvokerPointer() { return std::make_unique<Invoker>(); }
-}; // namespace device
+};
 } // namespace device
 } // namespace tensor_operation

--- a/include/ck/tensor_operation/gpu/device/device_base.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_base.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
 #include <string>
-#include "stream_config.hpp"
+#include "ck/stream_config.hpp"
 namespace ck {
 namespace tensor_operation {

--- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_reduce_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_reduce_xdl_cshuffle.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
 #include <iostream>
 #include <sstream>
-#include "device.hpp"
-#include "device_gemm_reduce.hpp"
+#include "ck/utility/common_header.hpp"
-#include "common_header.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
-#include "tensor_layout.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
-#include "tensor_descriptor.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "tensor_descriptor_helper.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_reduce.hpp"
-#include "gridwise_gemm_reduce_xdl_cshuffle_v1.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
-#include "gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp"
+#include "ck/device_utility/device_prop.hpp"
+#include "ck/device_utility/kernel_launch.hpp"
 namespace ck {
 namespace tensor_operation {

--- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp
-#ifndef DEVICE_BATCHED_GEMM_XDL_HPP
+// SPDX-License-Identifier: MIT
-#define DEVICE_BATCHED_GEMM_XDL_HPP
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
 #include <iostream>
 #include <sstream>
-#include "device.hpp"
-#include "device_base.hpp"
+#include "ck/utility/common_header.hpp"
-#include "device_gemm.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
-#include "common_header.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
-#include "tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "tensor_descriptor.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm.hpp"
-#include "tensor_descriptor_helper.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
-#include "gridwise_gemm_xdlops_v2r3.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp"
+#include "ck/device_utility/device_prop.hpp"
+#include "ck/device_utility/kernel_launch.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -616,4 +620,3 @@ struct DeviceBatchedGemmXdl
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/include/ck/tensor_operation/gpu/device/device_binary_elementwise.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_binary_elementwise.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
 #include <iostream>
 #include <vector>
-#include "device.hpp"
+#include "ck/device_utility/device_prop.hpp"
-#include "device_base.hpp"
+#include "ck/device_utility/kernel_launch.hpp"
-#include "gridwise_binary_elementwise_1d.hpp"
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_binary_elementwise_1d.hpp"
 namespace ck {
 namespace tensor_operation {

--- a/include/ck/tensor_operation/gpu/device/device_cgemm.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_cgemm.hpp
-/*******************************************************************************
+// SPDX-License-Identifier: MIT
- *
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
- * MIT License
- *
- * Copyright (c) 2022 Advanced Micro Devices, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- *******************************************************************************/
 #pragma once
 #include "device_base.hpp"

--- a/include/ck/tensor_operation/gpu/device/device_cgemm_4gemm_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_cgemm_4gemm_xdl_cshuffle.hpp
-/*******************************************************************************
+// SPDX-License-Identifier: MIT
- *
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
- * MIT License
- *
- * Copyright (c) 2022 Advanced Micro Devices, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- *******************************************************************************/
 #pragma once
 #include <iostream>
 #include <sstream>
-#include "device.hpp"
-#include "device_gemm.hpp"
+#include "ck/utility/common_header.hpp"
-#include "device_cgemm.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
-#include "common_header.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
-#include "tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "tensor_descriptor.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm.hpp"
-#include "tensor_descriptor_helper.hpp"
+#include "ck/tensor_operation/gpu/device/device_cgemm.hpp"
-#include "gridwise_gemm_xdl_cshuffle_v1.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
-#include "binary_element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp"
-#include "gridwise_binary_elementwise_1d.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_binary_elementwise_1d.hpp"
-#include "tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
+#include "ck/device_utility/device_prop.hpp"
+#include "ck/device_utility/kernel_launch.hpp"
 namespace ck {
 namespace tensor_operation {