Merge remote-tracking branch 'upstream/develop' into gemm-layernorm-4

cba8f7f2 · Anthony Chang · cc50b687 · b653c5eb · cba8f7f2 · cba8f7f2
Commit cba8f7f2 authored Jun 26, 2022 by Anthony Chang
20 changed files
--- a/include/ck/tensor_operation/gpu/device/device_softmax.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_softmax.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include <iostream>
+#include <sstream>
+#include "ck/utility/reduction_operator.hpp"
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
+#include "ck/tensor_operation/gpu/device/device_reduce.hpp"
+#include "ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp"
+#include "ck/tensor_operation/gpu/device/device_reduce_common.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_softmax.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp"
+#include "ck/device_utility/device_prop.hpp"
+#include "ck/device_utility/kernel_launch.hpp"
+namespace ck {
+namespace tensor_operation {
+namespace device {
+template <typename InDataType,
+          typename AccDataType,
+          typename OutDataType,
+          index_t Rank,
+          index_t NumReduceDim,
+          index_t BlockSize,
+          index_t MThreadClusterSize,
+          index_t KThreadClusterSize,
+          index_t MThreadSliceSize,
+          index_t KThreadSliceSize,
+          index_t InSrcVectorDim,
+          index_t InSrcVectorSize,
+          index_t OutDstVectorSize>
+struct DeviceSoftmax : public BaseOperator
+{
+    using PassThrough = tensor_operation::element_wise::PassThrough;
+    // Used for freeloading of some handy functions from DeviceReduceMultiBlock
+    using Reduction = DeviceReduceMultiBlock<InDataType,
+                                             AccDataType,
+                                             OutDataType,
+                                             Rank,
+                                             NumReduceDim,
+                                             reduce::Add,
+                                             PassThrough, // InElementwiseOperation
+                                             PassThrough, // AccElementwiseOperation
+                                             InMemoryDataOperationEnum::Set,
+                                             false, // PropagateNan
+                                             false, // OutputIndex
+                                             false, // HaveIndexInputIfOutputIndex
+                                             BlockSize,
+                                             MThreadClusterSize,
+                                             KThreadClusterSize,
+                                             MThreadSliceSize,
+                                             KThreadSliceSize,
+                                             InSrcVectorDim,
+                                             InSrcVectorSize,
+                                             1>; // OutDstVectorSize
+    using GridDesc_M_K = decltype(Reduction::MakeSrc2dDescriptor({1}, {1}, 1, 1));
+    using GridwiseReduce = GridwiseSoftmax_mk_to_mk<InDataType,
+                                                    OutDataType,
+                                                    AccDataType,
+                                                    GridDesc_M_K,
+                                                    BlockSize,
+                                                    MThreadClusterSize,
+                                                    KThreadClusterSize,
+                                                    MThreadSliceSize,
+                                                    KThreadSliceSize,
+                                                    InSrcVectorDim,
+                                                    InSrcVectorSize,
+                                                    OutDstVectorSize>;
+    struct Argument : public Reduction::Argument
+    {
+        Argument(const std::vector<index_t> inLengths,
+                 const std::vector<index_t> inStrides,
+                 const std::vector<index_t> reduceDims,
+                 AccDataType alpha,
+                 AccDataType beta,
+                 const InDataType* in_dev,
+                 OutDataType* out_dev)
+            : Reduction::Argument(inLengths,
+                                  inStrides,
+                                  {},
+                                  {},
+                                  reduceDims,
+                                  0.0f, // alpha
+                                  0.0f, // beta
+                                  in_dev,
+                                  nullptr,
+                                  out_dev,
+                                  nullptr,
+                                  PassThrough{},
+                                  PassThrough{}),
+              // FIXME: The base class DeviceReduceMultiBlock::Argument only supports alpha/beta of
+              // float32 precision. Make it support any data type so the fields can be removed.
+              alpha_(alpha),
+              beta_(beta)
+        {
+            // std::cout << "blkGroupSize= " << this->blkGroupSize
+            //           << ", numBlockTileIteration= " << this->numBlockTileIteration
+            //           << ", gridSize=" << this->gridSize
+            //           << ", invariant_total_length=" << this->invariant_total_length <<
+            //           std::endl;
+        }
+        AccDataType alpha_;
+        AccDataType beta_;
+    };
+    struct Invoker : public BaseInvoker
+    {
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        {
+            const auto in_grid_desc_m_k = Reduction::MakeSrc2dDescriptor(
+                arg.inLengths_, arg.inStrides_, arg.blkGroupSize, arg.numBlockTileIteration);
+            const auto out_grid_desc_m_k = Reduction::MakeSrc2dDescriptor(
+                arg.inLengths_, arg.inStrides_, arg.blkGroupSize, arg.numBlockTileIteration);
+            const auto kernel_main =
+                kernel_softmax<GridwiseReduce, InDataType, OutDataType, AccDataType, GridDesc_M_K>;
+            float avg_time = 0;
+            avg_time += launch_and_time_kernel(stream_config,
+                                               kernel_main,
+                                               dim3(arg.gridSize),
+                                               dim3(BlockSize),
+                                               0,
+                                               in_grid_desc_m_k,
+                                               out_grid_desc_m_k,
+                                               arg.blkGroupSize,
+                                               arg.numBlockTileIteration,
+                                               arg.alpha_,
+                                               arg.in_dev_,
+                                               arg.beta_,
+                                               arg.out_dev_);
+            return (avg_time);
+        };
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
+        {
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
+        };
+    };
+    bool IsSupportedArgument(const BaseArgument* p_arg) override
+    {
+        const Argument* p_arg_ = dynamic_cast<const Argument*>(p_arg);
+        if(!Reduction::IsSupportedArgument(p_arg_))
+        {
+            return false;
+        }
+        if(p_arg_->inLengths_[Rank - 1] % OutDstVectorSize != 0)
+        {
+            return false;
+        }
+        return true;
+    };
+    std::unique_ptr<BaseArgument> MakeArgumentPointer(const std::vector<index_t> inLengths,
+                                                      const std::vector<index_t> inStrides,
+                                                      const std::vector<int> reduceDims,
+                                                      AccDataType alpha,
+                                                      AccDataType beta,
+                                                      const void* in_dev,
+                                                      void* out_dev)
+    {
+        return std::make_unique<Argument>(inLengths,
+                                          inStrides,
+                                          reduceDims,
+                                          alpha,
+                                          beta,
+                                          static_cast<const InDataType*>(in_dev),
+                                          static_cast<OutDataType*>(out_dev));
+    };
+    std::unique_ptr<BaseInvoker> MakeInvokerPointer() { return std::make_unique<Invoker>(); };
+    std::string GetTypeString() const override
+    {
+        auto str = std::stringstream();
+        // clang-format off
+        str << "DeviceReduceSoftmax<" << BlockSize << ",";
+        str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ",";
+        str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ",";
+        str << "InSrcVectorDim_" << InSrcVectorDim << "_InSrcVectorSize_" << InSrcVectorSize << "_OutDstVectorSize_" << OutDstVectorSize << ">";
+        // clang-format on
+        return str.str();
+    }
+};
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/device_unary_elementwise.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_unary_elementwise.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
 #include <iostream>
 #include <vector>
-#include "device.hpp"
+#include "ck/device_utility/device_prop.hpp"
-#include "device_base.hpp"
+#include "ck/device_utility/kernel_launch.hpp"
-#include "gridwise_unary_elementwise_1d.hpp"
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_unary_elementwise_1d.hpp"
 namespace ck {
 namespace tensor_operation {

--- a/include/ck/tensor_operation/gpu/device/gemm_specialization.hpp
+++ b/include/ck/tensor_operation/gpu/device/gemm_specialization.hpp
-#ifndef GEMM_SPECIALIZATION
+// SPDX-License-Identifier: MIT
-#define GEMM_SPECIALIZATION
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
 namespace ck {
 namespace tensor_operation {
@@ -20,4 +22,3 @@ enum struct GemmSpecialization
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/include/ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp
+++ b/include/ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp
-/*******************************************************************************
+// SPDX-License-Identifier: MIT
- *
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
- * MIT License
- *
+#pragma once
- * Copyright (c) 2020 Advanced Micro Devices, Inc.
- *
+#include "ck/utility/reduction_operator.hpp"
- * Permission is hereby granted, free of charge, to any person obtaining a copy
+#include "ck/utility/reduction_enums.hpp"
- * of this software and associated documentation files (the "Software"), to deal
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
- * in the Software without restriction, including without limitation the rights
+// FIXME: can it be replaced with ck::Tuple?
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- *******************************************************************************/
-#ifndef CK_REDUCTION_OPERATOR_MAPPING_HPP
-#define CK_REDUCTION_OPERATOR_MAPPING_HPP
-#include "reduction_operator.hpp"
-#include "reduction_enums.hpp"
-#include "element_wise_operation.hpp"
 #include <tuple>
 namespace ck {
@@ -205,6 +183,4 @@ struct reduce_unary_operator<ReduceTensorOp::NORM2, false, true>
    };
 };
-} // end of namespace ck
+} // namespace ck
-#endif
--- a/include/ck/tensor_operation/gpu/device/tensor_layout.hpp
+++ b/include/ck/tensor_operation/gpu/device/tensor_layout.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
 namespace ck {

--- a/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
-/*******************************************************************************
+// SPDX-License-Identifier: MIT
- *
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
- * MIT License
- *
- * Copyright (c) 2022 Advanced Micro Devices, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- *******************************************************************************/
 #pragma once
-#include "data_type.hpp"
+#include "ck/utility/data_type.hpp"
 namespace ck {
 namespace tensor_operation {

--- a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
-#include "data_type.hpp"
+#include "ck/utility/data_type.hpp"
-#include "math_v2.hpp"
+#include "ck/utility/math_v2.hpp"
-#include "unary_element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
-#include "binary_element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
 namespace ck {
 namespace tensor_operation {

--- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
-#include "data_type.hpp"
+#include "ck/utility/data_type.hpp"
-#include "math_v2.hpp"
+#include "ck/utility/math_v2.hpp"
 namespace ck {
 namespace tensor_operation {

--- a/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
+++ b/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
-#ifndef UTILITY_BLOCK_TO_CTILE_MAP
+// SPDX-License-Identifier: MIT
-#define UTILITY_BLOCK_TO_CTILE_MAP
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-#include "utility/math.hpp"
+#pragma once
-#include "utility/number.hpp"
-#include "tensor_description/tensor_adaptor.hpp"
+#include "ck/utility/math.hpp"
-#include "tensor_description/multi_index_transform_helper.hpp"
+#include "ck/utility/number.hpp"
+#include "ck/tensor_description/tensor_adaptor.hpp"
+#include "ck/tensor_description/multi_index_transform_helper.hpp"
 namespace ck {
@@ -485,5 +487,3 @@ __host__ __device__ bool DefaultValidCTileIndex(const CTileIdx& c_tile_idx,
 }
 } // namespace ck
-#endif // UTILITY_BLOCK_TO_CTILE_MAP
--- a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock.hpp
-/*******************************************************************************
+// SPDX-License-Identifier: MIT
- *
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
- * MIT License
- *
+#pragma once
- * Copyright (c) 2020 Advanced Micro Devices, Inc.
- *
+#include "ck/utility/reduction_common.hpp"
- * Permission is hereby granted, free of charge, to any person obtaining a copy
+#include "ck/utility/reduction_operator.hpp"
- * of this software and associated documentation files (the "Software"), to deal
+#include "ck/utility/reduction_functions_accumulate.hpp"
- * in the Software without restriction, including without limitation the rights
+#include "ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp"
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+#include "ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp"
- * copies of the Software, and to permit persons to whom the Software is
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
- * furnished to do so, subject to the following conditions:
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- *******************************************************************************/
-#ifndef CK_GRIDWISE_2D_REDUCTION_MULTIBLOCK_HPP
-#define CK_GRIDWISE_2D_REDUCTION_MULTIBLOCK_HPP
-#include "reduction_common.hpp"
-#include "reduction_operator.hpp"
-#include "reduction_functions_accumulate.hpp"
-#include "reduction_functions_blockwise.hpp"
-#include "reduction_functions_threadwise.hpp"
-#include "threadwise_tensor_slice_transfer.hpp"
-#include "element_wise_operation.hpp"
 namespace ck {
@@ -635,4 +611,3 @@ struct GridwiseReduction_mk_to_m_multiblock
 };
 } // namespace ck
-#endif
--- a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp
-/*******************************************************************************
+// SPDX-License-Identifier: MIT
- *
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
- * MIT License
- *
+#pragma once
- * Copyright (c) 2021 Advanced Micro Devices, Inc.
- *
+#include "ck/utility/data_type.hpp"
- * Permission is hereby granted, free of charge, to any person obtaining a copy
+#include "ck/utility/reduction_common.hpp"
- * of this software and associated documentation files (the "Software"), to deal
+#include "ck/utility/reduction_operator.hpp"
- * in the Software without restriction, including without limitation the rights
+#include "ck/utility/reduction_functions_accumulate.hpp"
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+#include "ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp"
- * copies of the Software, and to permit persons to whom the Software is
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
- * furnished to do so, subject to the following conditions:
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- *******************************************************************************/
-#ifndef CK_GRIDWISE_2D_REDUCTION_THREADWISE_HPP
-#define CK_GRIDWISE_2D_REDUCTION_THREADWISE_HPP
-#include "data_type.hpp"
-#include "reduction_common.hpp"
-#include "reduction_operator.hpp"
-#include "reduction_functions_accumulate.hpp"
-#include "reduction_functions_threadwise.hpp"
-#include "threadwise_tensor_slice_transfer.hpp"
-#include "element_wise_operation.hpp"
 namespace ck {
@@ -495,4 +472,3 @@ struct GridwiseReduction_mk_to_m_threadwise
 };
 } // namespace ck
-#endif
--- a/include/ck/tensor_operation/gpu/grid/gridwise_5ary_Elementwise_1d.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_5ary_Elementwise_1d.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
-#include "cluster_descriptor.hpp"
+#include "ck/tensor_description/cluster_descriptor.hpp"
-#include "data_type.hpp"
+#include "ck/utility/data_type.hpp"
-#include "element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-#include "threadwise_tensor_slice_transfer.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
 namespace ck {

--- a/include/ck/tensor_operation/gpu/grid/gridwise_binary_elementwise_1d.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_binary_elementwise_1d.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
-#include "cluster_descriptor.hpp"
+#include "ck/utility/data_type.hpp"
-#include "data_type.hpp"
+#include "ck/tensor_description/cluster_descriptor.hpp"
-#include "element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-#include "threadwise_tensor_slice_transfer.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
 namespace ck {

--- a/include/ck/tensor_operation/gpu/grid/gridwise_contraction_dlops_v1r2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_contraction_dlops_v1r2.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #ifndef CK_GRIDWISE_CONTRACTION_DLOPS_V1R2_HPP
 #define CK_GRIDWISE_CONTRACTION_DLOPS_V1R2_HPP

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_bias_add_reduce_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_bias_add_reduce_xdl_cshuffle_v1.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
-#include "multi_index_transform_helper.hpp"
-#include "tensor_descriptor.hpp"
+#include "ck/utility/common_header.hpp"
-#include "tensor_descriptor_helper.hpp"
+#include "ck/tensor_description/multi_index_transform_helper.hpp"
-#include "tensor_operation/gpu/grid/block_to_ctile_map.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
-#include "blockwise_gemm_xdlops.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
-#include "thread_group_tensor_slice_transfer_v4r1.hpp"
+#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp"
-#include "thread_group_tensor_slice_transfer_v6r1.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp"
-#include "threadwise_tensor_slice_transfer.hpp"
+#include "ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp"
-#include "gridwise_gemm_pipeline_v1.hpp"
+#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp"
-#include "reduction_functions_threadwise.hpp"
+#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
+#include "ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 namespace ck {

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dl_v1r3.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dl_v1r3.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
-#include "common_header.hpp"
+#include "ck/utility/common_header.hpp"
-#include "multi_index_transform_helper.hpp"
+#include "ck/tensor_description/multi_index_transform_helper.hpp"
-#include "tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
-#include "tensor_descriptor_helper.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
-#include "tensor_operation/gpu/grid/block_to_ctile_map.hpp"
+#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp"
-#include "blockwise_gemm_dl_v2r3.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp"
-#include "blockwise_tensor_slice_transfer_v5r1.hpp"
+#include "ck/tensor_operation/gpu/block/blockwise_gemm_dl_v2r3.hpp"
-#include "threadwise_tensor_slice_transfer.hpp"
+#include "ck/tensor_operation/gpu/block/blockwise_tensor_slice_transfer_v5r1.hpp"
-#include "threadwise_tensor_slice_set.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
-#include "element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_set.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 namespace ck {

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v1r2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v1r2.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #ifndef CK_GRIDWISE_GEMM_DLOPS_V1R2_HPP
 #define CK_GRIDWISE_GEMM_DLOPS_V1R2_HPP

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v2.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #ifndef CK_GRIDWISE_GEMM_V2_HPP
 #define CK_GRIDWISE_GEMM_V2_HPP

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v3.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v3.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #ifndef CK_GRIDWISE_GEMM_V3_HPP
 #define CK_GRIDWISE_GEMM_V3_HPP

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
-#include "common_header.hpp"
+#include "ck/utility/common_header.hpp"
-#include "multi_index_transform_helper.hpp"
+#include "ck/tensor_description/multi_index_transform_helper.hpp"
-#include "tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
-#include "tensor_descriptor_helper.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
-#include "tensor_operation/gpu/grid/block_to_ctile_map.hpp"
+#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp"
-#include "blockwise_gemm_xdlops.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp"
-#include "thread_group_tensor_slice_transfer_v4r1.hpp"
+#include "ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp"
-#include "thread_group_tensor_slice_transfer_v7.hpp"
+#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp"
-#include "threadwise_tensor_slice_transfer.hpp"
+#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7.hpp"
-#include "gridwise_gemm_pipeline_v1.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 namespace ck {