Commit b79df771 authored by carlushuang

Merge remote-tracking branch 'origin/develop' into cpu_avx2

parents 05d38218 63914743
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #ifndef CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_GEMM_V4R4R4_NHWC_KYXC_NHWK_HPP
 #define CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_GEMM_V4R4R4_NHWC_KYXC_NHWK_HPP
...
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #ifndef CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_CONTRACTION_V6R1_NCHW_KCYX_NKHW_HPP
 #define CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_CONTRACTION_V6R1_NCHW_KCYX_NKHW_HPP
...
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
 #ifndef CK_NOGPU
...
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #ifndef CK_STATIC_TENSOR_HPP
 #define CK_STATIC_TENSOR_HPP
...
-#ifndef CK_CLUSTER_DESCRIPTOR_HPP
-#define CK_CLUSTER_DESCRIPTOR_HPP
-#include "common_header.hpp"
-#include "tensor_adaptor.hpp"
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_adaptor.hpp"
 namespace ck {
@@ -30,4 +32,3 @@ __host__ __device__ constexpr auto make_cluster_descriptor(
 }
 } // namespace ck
-#endif
-#ifndef CK_MULTI_INDEX_TRANSFORM_HPP
-#define CK_MULTI_INDEX_TRANSFORM_HPP
-#include "common_header.hpp"
-#include "multi_index.hpp"
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include "ck/utility/common_header.hpp"
+#include "ck/utility/multi_index.hpp"
 namespace ck {
@@ -1950,4 +1952,3 @@ struct Modulo
 }
 };
 } // namespace ck
-#endif
-#ifndef CK_MULTI_INDEX_TRANSFORM_HELPER_HPP
-#define CK_MULTI_INDEX_TRANSFORM_HELPER_HPP
-#include "common_header.hpp"
-#include "multi_index_transform.hpp"
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/multi_index_transform.hpp"
 namespace ck {
@@ -126,4 +128,3 @@ __host__ __device__ constexpr auto make_modulo_transform(const Modulus& modulus,
 return Modulo<Modulus, UpLength>{modulus, up_length};
 }
 } // namespace ck
-#endif
-#ifndef CK_TENSOR_ADAPTOR_HPP
-#define CK_TENSOR_ADAPTOR_HPP
-#include "common_header.hpp"
-#include "tensor_descriptor.hpp"
-#include "tensor_descriptor_helper.hpp"
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
 namespace ck {
@@ -136,7 +138,11 @@ struct TensorAdaptor
 using ElementSize = remove_cv_t<decltype(InitializeElementSize(Transforms{}))>;
 public:
+#if 0 // workaround compiler complaint about constexpr
 __host__ __device__ constexpr TensorAdaptor() = default;
+#else
+__host__ __device__ constexpr TensorAdaptor() : transforms_{}, element_size_{} {}
+#endif
 __host__ __device__ constexpr TensorAdaptor(const Transforms& transforms)
 : transforms_{transforms}, element_size_{InitializeElementSize(transforms)}
@@ -474,4 +480,3 @@ __host__ __device__ constexpr auto chain_tensor_adaptors(const X& x, const Xs&..
 }
 } // namespace ck
-#endif
-#ifndef CK_TENSOR_DESCRIPTOR_HPP
-#define CK_TENSOR_DESCRIPTOR_HPP
-#include "common_header.hpp"
-#include "multi_index_transform.hpp"
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/multi_index_transform.hpp"
 namespace ck {
@@ -111,7 +113,14 @@ struct TensorDescriptor
 using ElementSize = remove_cv_t<decltype(InitializeElementSize(Transforms{}))>;
 public:
+#if 0 // workaround compiler complaint about constexpr
 __host__ __device__ constexpr TensorDescriptor() = default;
+#else
+__host__ __device__ constexpr TensorDescriptor()
+    : transforms_{}, element_size_{}, element_space_size_{}
+{
+}
+#endif
 __host__ __device__ constexpr TensorDescriptor(const Transforms& transforms,
 ElementSpaceSize element_space_size)
@@ -602,4 +611,3 @@ using TensorCoordinateStep_t = decltype(make_tensor_coordinate_step(
 TensorDesc{}, MultiIndex<remove_cvref_t<TensorDesc>::GetNumOfDimension()>{}));
 } // namespace ck
-#endif
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
-#include "common_header.hpp"
-#include "tensor_descriptor.hpp"
-#include "multi_index_transform_helper.hpp"
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/multi_index_transform_helper.hpp"
 namespace ck {
...
-#ifndef TENSOR_SPACE_FILLING_CURVE_HPP
-#define TENSOR_SPACE_FILLING_CURVE_HPP
-#include "math.hpp"
-#include "sequence.hpp"
-#include "sequence_helper.hpp"
-#include "tensor_adaptor.hpp"
-#include "statically_indexed_array_multi_index.hpp"
-#include "tuple_helper.hpp"
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include "ck/utility/math.hpp"
+#include "ck/utility/sequence.hpp"
+#include "ck/utility/sequence_helper.hpp"
+#include "ck/utility/statically_indexed_array_multi_index.hpp"
+#include "ck/utility/tuple_helper.hpp"
+#include "ck/tensor_description/tensor_adaptor.hpp"
 namespace ck {
@@ -156,4 +158,3 @@ struct SpaceFillingCurve
 };
 } // namespace ck
-#endif
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
-#include "common_header.hpp"
-#include "tensor_adaptor.hpp"
-#include "threadwise_tensor_slice_transfer_v4r1.hpp"
-#include "threadwise_contraction_dl.hpp"
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_adaptor.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v4r1.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_contraction_dl.hpp"
 namespace ck {
...
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #ifndef CK_BLOCKWISE_GEMM_DLOPS_V2R2_HPP
 #define CK_BLOCKWISE_GEMM_DLOPS_V2R2_HPP
...
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #ifndef CK_BLOCKWISE_GEMM_DLOPS_V3_HPP
 #define CK_BLOCKWISE_GEMM_DLOPS_V3_HPP
...
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
-#include "common_header.hpp"
-#include "threadwise_tensor_slice_transfer.hpp"
-#include "xdlops_gemm.hpp"
-#include "tensor_adaptor.hpp"
-#include "thread_group.hpp"
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
+#include "ck/tensor_operation/gpu/warp/xdlops_gemm.hpp"
+#include "ck/tensor_description/tensor_adaptor.hpp"
 namespace ck {
@@ -438,7 +441,7 @@ struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
 make_tuple(n0, I0, I0, I0),
 b_thread_buf);
 });
-__builtin_amdgcn_sched_barrier();
+__builtin_amdgcn_sched_barrier(0);
 // NOTE: Synchronize threads in a workgroup at the start of each MAC cluster, but except
 // the first, as we can shorten non-MAC cluster a bit and there's no observable negative
 // impact. The desired effect is waves in a workgroup executing MAC in sync. This avoids
@@ -448,7 +451,7 @@ struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
 if constexpr(k.value != 0 || KPerInnerLoop == KPerThread)
 {
 asm volatile("s_barrier" ::);
-__builtin_amdgcn_sched_barrier();
+__builtin_amdgcn_sched_barrier(0);
 }
 static_for<0, KPerInnerLoop, KPack>{}([&](auto k_) {
 static_for<0, MRepeat, 1>{}([&](auto m0) {
@@ -480,9 +483,9 @@ struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
 k_.value == KPerInnerLoop - KPack && m0.value == MRepeat - 1 &&
 n0.value == NRepeat - 1)
 {
-__builtin_amdgcn_sched_barrier();
+__builtin_amdgcn_sched_barrier(0);
 block_sync_lds();
-__builtin_amdgcn_sched_barrier();
+__builtin_amdgcn_sched_barrier(0);
 }
 // TODO: insert setprio in more precise manner since we
@@ -493,16 +496,16 @@ struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
 c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
 if constexpr(k_.value == 0 && m0.value == 0 && n0.value == 0)
 {
-__builtin_amdgcn_sched_barrier();
+__builtin_amdgcn_sched_barrier(0);
 __builtin_amdgcn_s_setprio(1);
-__builtin_amdgcn_sched_barrier();
+__builtin_amdgcn_sched_barrier(0);
 }
 });
 });
 });
-__builtin_amdgcn_sched_barrier();
+__builtin_amdgcn_sched_barrier(0);
 __builtin_amdgcn_s_setprio(0);
-__builtin_amdgcn_sched_barrier();
+__builtin_amdgcn_sched_barrier(0);
 });
 }
...
-#ifndef CK_BLOCKWISE_TENSOR_SLICE_TRANSFER_V5R1_HPP
-#define CK_BLOCKWISE_TENSOR_SLICE_TRANSFER_V5R1_HPP
-#include "common_header.hpp"
-#include "tensor_descriptor.hpp"
-#include "tensor_descriptor_helper.hpp"
-#include "cluster_descriptor.hpp"
-#include "threadwise_tensor_slice_transfer_v5r1.hpp"
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_description/cluster_descriptor.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v5r1.hpp"
 namespace ck {
@@ -152,4 +154,3 @@ struct BlockwiseTensorSliceTransfer_v5r1
 };
 } // namespace ck
-#endif
-/*******************************************************************************
- *
- * MIT License
- *
- * Copyright (c) 2020 Advanced Micro Devices, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- *******************************************************************************/
-#ifndef CK_REDUCTION_FUNCTIONS_BLOCKWISE_HPP
-#define CK_REDUCTION_FUNCTIONS_BLOCKWISE_HPP
-#include "reduction_common.hpp"
-#include "reduction_functions_accumulate.hpp"
-#include "cluster_descriptor.hpp"
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include "ck/tensor_description/cluster_descriptor.hpp"
+#include "ck/utility/reduction_common.hpp"
+#include "ck/utility/reduction_functions_accumulate.hpp"
 namespace ck {
@@ -45,7 +21,9 @@ template <typename AccDataType,
 typename ThreadClusterLengths_M_K,
 typename ThreadClusterArrangeOrder,
 typename OpReduce,
-bool PropagateNan>
+bool PropagateNan,
+typename Accumulation =
+    detail::AccumulateWithNanCheck<PropagateNan, OpReduce, AccDataType>>
 struct PartitionedBlockwiseReduction
 {
 static_assert(BlockSize == ThreadClusterLengths_M_K::At(0) * ThreadClusterLengths_M_K::At(1),
@@ -62,8 +40,6 @@ struct PartitionedBlockwiseReduction
 static constexpr auto thread_cluster_desc =
 make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{});
-using Accumulation = detail::AccumulateWithNanCheck<PropagateNan, OpReduce, AccDataType>;
 template <typename BufferType>
 __device__ static void Reduce(BufferType& work_buffer, AccDataType& in_out_value)
 {
@@ -113,13 +89,16 @@ struct PartitionedBlockwiseReduction
 // 3) in_out_value/in_out_index is the input data in vgpr from each thread
 // 4) in_out_value/in_out_index is the over-written reduced output in vgpr for each thread
 // clang-format on
-template <typename AccDataType,
-          typename IndexDataType,
-          index_t BlockSize,
-          typename ThreadClusterLengths_M_K,
-          typename ThreadClusterArrangeOrder,
-          typename OpReduce,
-          bool PropagateNan>
+template <
+    typename AccDataType,
+    typename IndexDataType,
+    index_t BlockSize,
+    typename ThreadClusterLengths_M_K,
+    typename ThreadClusterArrangeOrder,
+    typename OpReduce,
+    bool PropagateNan,
+    typename Accumulation =
+        detail::AccumulateWithIndexAndNanCheck<PropagateNan, OpReduce, AccDataType, IndexDataType>>
 struct PartitionedBlockwiseReductionWithIndex
 {
 static_assert(BlockSize == ThreadClusterLengths_M_K::At(0) * ThreadClusterLengths_M_K::At(1),
@@ -136,9 +115,6 @@ struct PartitionedBlockwiseReductionWithIndex
 static constexpr auto thread_cluster_desc =
 make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{});
-using Accumulation =
-    detail::AccumulateWithIndexAndNanCheck<PropagateNan, OpReduce, AccDataType, IndexDataType>;
 // This interface accumulates on both data values and indices
 template <typename BufferType, typename IdxBufferType>
 __device__ static void Reduce(BufferType& work_val_buffer,
@@ -193,6 +169,4 @@ struct PartitionedBlockwiseReductionWithIndex
 };
 };
-}; // end of namespace ck
+} // namespace ck
-#endif
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
-#include "common_header.hpp"
-#include "tensor_descriptor.hpp"
-#include "tensor_descriptor_helper.hpp"
-#include "cluster_descriptor.hpp"
-#include "threadwise_tensor_slice_transfer_v3r1.hpp"
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_description/cluster_descriptor.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp"
 namespace ck {
...
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
-#include "common_header.hpp"
-#include "tensor_descriptor.hpp"
-#include "tensor_descriptor_helper.hpp"
-#include "cluster_descriptor.hpp"
-#include "threadwise_tensor_slice_transfer_v6r1.hpp"
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_description/cluster_descriptor.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r1.hpp"
 namespace ck {
...
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
-#include "common_header.hpp"
-#include "tensor_descriptor.hpp"
-#include "tensor_descriptor_helper.hpp"
-#include "cluster_descriptor.hpp"
-#include "threadwise_tensor_slice_transfer_v6r2.hpp"
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_description/cluster_descriptor.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r2.hpp"
 namespace ck {
...