Commit b79df771 authored by carlushuang

Merge remote-tracking branch 'origin/develop' into cpu_avx2

parents 05d38218 63914743
-#ifndef CK_THREADWISE_TENSOR_SET_HPP
-#define CK_THREADWISE_TENSOR_SET_HPP
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-#include "common_header.hpp"
-#include "tensor_descriptor.hpp"
-#include "tensor_descriptor_helper.hpp"
+#pragma once
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
 namespace ck {
@@ -56,4 +58,3 @@ struct ThreadwiseTensorSliceSet_v1
 };
 } // namespace ck
-#endif
-#ifndef CK_THREADWISE_TENSOR_SLICE_TRANSFER_HPP
-#define CK_THREADWISE_TENSOR_SLICE_TRANSFER_HPP
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-#include "common_header.hpp"
-#include "tensor_descriptor.hpp"
-#include "tensor_descriptor_helper.hpp"
-#include "tensor_space_filling_curve.hpp"
+#pragma once
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_space_filling_curve.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
 namespace ck {
@@ -234,9 +236,14 @@ template <typename SrcData,
           index_t SrcScalarPerVector,
           index_t SrcScalarStrideInVector,
           bool SrcResetCoordinateAfterRun,
+          bool InvalidElementAsNaN = false,
           typename enable_if<DstDesc::IsKnownAtCompileTime(), bool>::type = false>
 struct ThreadwiseTensorSliceTransfer_v2
 {
+    static_assert((InvalidElementAsNaN && !std::is_integral<DstData>::value) ||
+                      (!InvalidElementAsNaN),
+                  "Filling invalid element as NaN is only for floating point types");
     static constexpr index_t nDim = SliceLengths::Size();
     using Index = MultiIndex<nDim>;
@@ -316,8 +323,18 @@ struct ThreadwiseTensorSliceTransfer_v2
                 dst_desc.CalculateOffset(to_multi_index(dst_slice_origin_idx) + src_data_idx +
                                          i * src_scalar_step_in_vector);
-            dst_buf(Number<dst_offset>{}) =
-                type_convert<DstData>(src_vector.template AsType<SrcData>()[i]);
+            if constexpr(InvalidElementAsNaN)
+            {
+                dst_buf(Number<dst_offset>{}) =
+                    is_src_valid
+                        ? type_convert<DstData>(src_vector.template AsType<SrcData>()[i])
+                        : NumericLimits<DstData>::QuietNaN();
+            }
+            else
+            {
+                dst_buf(Number<dst_offset>{}) =
+                    type_convert<DstData>(src_vector.template AsType<SrcData>()[i]);
+            }
         });
         if constexpr(idx_1d.value != num_access - 1)
@@ -1168,4 +1185,3 @@ struct ThreadwiseTensorSliceTransfer_v4
 };
 } // namespace ck
-#endif
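For readers skimming the hunk above, here is a minimal standalone host-side sketch (not CK code; the function and variable names are invented for illustration) of what the new InvalidElementAsNaN switch does: an element whose source coordinate is invalid (out of bounds) is written to the destination as a quiet NaN instead of a converted value, which is why the added static_assert restricts it to floating-point destination types.

#include <cmath>
#include <cstddef>
#include <cstdio>
#include <limits>
#include <type_traits>
#include <vector>

// Hypothetical helper, not part of CK: copy src into dst, marking out-of-bounds reads.
template <bool InvalidElementAsNaN, typename DstData, typename SrcData>
void copy_with_bounds_check(const std::vector<SrcData>& src, std::vector<DstData>& dst)
{
    static_assert(!InvalidElementAsNaN || std::is_floating_point<DstData>::value,
                  "Filling invalid elements as NaN is only for floating point types");

    for(std::size_t i = 0; i < dst.size(); ++i)
    {
        const bool is_src_valid = i < src.size(); // stand-in for the coordinate validity check

        if constexpr(InvalidElementAsNaN)
            dst[i] = is_src_valid ? static_cast<DstData>(src[i])
                                  : std::numeric_limits<DstData>::quiet_NaN();
        else
            dst[i] = is_src_valid ? static_cast<DstData>(src[i]) : DstData{0};
    }
}

int main()
{
    std::vector<float> src{1.f, 2.f, 3.f};
    std::vector<float> dst(5);
    copy_with_bounds_check<true>(src, dst); // the last two elements have no valid source
    for(float v : dst)
    {
        if(std::isnan(v))
            std::printf("NaN (invalid source)\n");
        else
            std::printf("%f\n", v);
    }
}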
-#ifndef CK_THREADWISE_TENSOR_SLICE_TRANSFER_V3R1_HPP
-#define CK_THREADWISE_TENSOR_SLICE_TRANSFER_V3R1_HPP
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-#include "common_header.hpp"
-#include "tensor_descriptor.hpp"
-#include "tensor_descriptor_helper.hpp"
-#include "static_tensor.hpp"
+#pragma once
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor/static_tensor.hpp"
 namespace ck {
@@ -789,4 +791,3 @@ struct ThreadwiseTensorSliceTransfer_v3r1
 };
 } // namespace ck
-#endif
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #ifndef CK_THREADWISE_TENSOR_SLICE_TRANSFER_V3R3_HPP
 #define CK_THREADWISE_TENSOR_SLICE_TRANSFER_V3R3_HPP
...
-#ifndef CK_THREADWISE_TENSOR_SLICE_TRANSFER_V4R1_HPP
-#define CK_THREADWISE_TENSOR_SLICE_TRANSFER_V4R1_HPP
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-#include "common_header.hpp"
-#include "tensor_descriptor.hpp"
-#include "tensor_descriptor_helper.hpp"
+#pragma once
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
 namespace ck {
 // Assume:
@@ -171,4 +173,3 @@ struct ThreadwiseTensorSliceTransfer_v4r1
 };
 } // namespace ck
-#endif
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
-#include "common_header.hpp"
-#include "tensor_descriptor.hpp"
-#include "tensor_descriptor_helper.hpp"
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_description/tensor_space_filling_curve.hpp"
 namespace ck {
...
-#ifndef CK_THREADWISE_TENSOR_SLICE_TRANSFER_V6R1_HPP
-#define CK_THREADWISE_TENSOR_SLICE_TRANSFER_V6R1_HPP
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-#include "common_header.hpp"
-#include "tensor_descriptor.hpp"
-#include "tensor_descriptor_helper.hpp"
-#include "tensor_space_filling_curve.hpp"
+#pragma once
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_description/tensor_space_filling_curve.hpp"
 namespace ck {
@@ -206,7 +208,6 @@ struct ThreadwiseTensorSliceTransfer_v6r1
     SrcCoord src_coord_;
     DstCoord dst_coord_;
     const ElementwiseOperation element_op_;
-}; // namespace ck
+};
 } // namespace ck
-#endif
-#ifndef CK_THREADWISE_TENSOR_SLICE_TRANSFER_V6R2_HPP
-#define CK_THREADWISE_TENSOR_SLICE_TRANSFER_V6R2_HPP
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-#include "common_header.hpp"
-#include "tensor_descriptor.hpp"
-#include "tensor_descriptor_helper.hpp"
-#include "tensor_space_filling_curve.hpp"
+#pragma once
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_description/tensor_space_filling_curve.hpp"
 namespace ck {
@@ -256,4 +258,3 @@ struct ThreadwiseTensorSliceTransfer_v6r2
 };
 } // namespace ck
-#endif
-#ifndef CK_THREADWISE_TENSOR_SLICE_TRANSFER_V6R3_HPP
-#define CK_THREADWISE_TENSOR_SLICE_TRANSFER_V6R3_HPP
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-#include "common_header.hpp"
-#include "tensor_descriptor.hpp"
-#include "tensor_descriptor_helper.hpp"
-#include "tensor_space_filling_curve.hpp"
+#pragma once
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_description/tensor_space_filling_curve.hpp"
 namespace ck {
@@ -306,4 +308,3 @@ struct ThreadwiseTensorSliceTransfer_v6r3
 };
 } // namespace ck
-#endif
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/tensor_space_filling_curve.hpp"
namespace ck {
// Thread-level multi-source, multi-destination tensor slice data movement
// Assume:
// 1. All sources and destinations are DynamicBuffer
// 2. Same VectorDim and ScalarPerVector for all sources and destinations
// 3. DstInMemOps are per destination tensor
// 4. ThreadTransferSrcResetCoordinateAfterRunFlags are per source tensor
// 5. ThreadTransferDstResetCoordinateAfterRunFlags are per destination tensor
// 6. Does not need to know src_descs and dst_descs at compile-time
// 7. Does not need to know src_slice_origins and dst_slice_origins at compile-time
//
// Does the following things to avoid the scratch memory issue:
// 1. Use StaticallyIndexedArray or vector_type instead of a C array for the thread buffer
// 2. Pass tensor descriptors by reference (or tuple of references)
// 3. Does not keep a reference to a tensor descriptor
// 4. Does not construct a new tensor coordinate when Run() is called
// (A standalone sketch of the Run() data flow follows after this file.)
template <typename SrcDatas,
typename DstDatas,
typename SrcDescs,
typename DstDescs,
typename ElementwiseOperation,
typename DstInMemOps, // Sequence<InMemoryDataOperationEnum ...>
typename SliceLengths,
typename DimAccessOrder,
index_t VectorDim,
index_t ScalarPerVector,
typename SrcResetCoordinateAfterRunFlags, // Sequence<bool ...>
typename DstResetCoordinateAfterRunFlags> // Sequence<bool ...>
struct ThreadwiseTensorSliceTransfer_v7
{
static constexpr auto I0 = Number<0>{};
static constexpr index_t nDim = SliceLengths::Size();
static constexpr index_t nSrc = SrcDescs::Size();
static constexpr index_t nDst = DstDescs::Size();
using Index = MultiIndex<nDim>;
// return a tuple of coordinates for a tuple of tensors
template <typename Descs,
typename Indices,
enable_if_t<Descs::Size() == Indices::Size(), bool> = false>
static constexpr auto MakeCoordinates(const Descs& descs, const Indices& indices)
{
return generate_tuple([&](auto i) { return make_tensor_coordinate(descs[i], indices[i]); },
Number<Descs::Size()>{});
}
using SrcCoords = decltype(MakeCoordinates(SrcDescs{}, StaticallyIndexedArray<Index, nSrc>{}));
using DstCoords = decltype(MakeCoordinates(DstDescs{}, StaticallyIndexedArray<Index, nDst>{}));
// scalar per access on each dim
// FIXME: don't use lambda_scalar_per_access
static constexpr auto scalar_per_access = generate_sequence(
detail::lambda_scalar_per_access<VectorDim, ScalarPerVector>{}, Number<nDim>{});
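// e.g. (illustration, not from this commit): with nDim = 2, VectorDim = 1 and
// ScalarPerVector = 4, scalar_per_access becomes Sequence<1, 4>, i.e. each access
// moves 4 scalars along dim 1 and 1 scalar along dim 0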
using SpaceFillingCurve =
SpaceFillingCurve<SliceLengths, DimAccessOrder, remove_cv_t<decltype(scalar_per_access)>>;
__device__ constexpr ThreadwiseTensorSliceTransfer_v7(
const SrcDescs& src_descs,
const StaticallyIndexedArray<Index, nSrc>& src_slice_origins,
const DstDescs& dst_descs,
const StaticallyIndexedArray<Index, nDst>& dst_slice_origins,
const ElementwiseOperation& element_op)
: src_coords_(MakeCoordinates(src_descs, src_slice_origins)),
dst_coords_(MakeCoordinates(dst_descs, dst_slice_origins)),
element_op_(element_op)
{
static_assert(SliceLengths::At(Number<VectorDim>{}) % ScalarPerVector == 0,
"wrong! cannot evenly divide");
}
template <typename Indices, enable_if_t<SrcDescs::Size() == Indices::Size(), bool> = false>
__device__ void SetSrcSliceOrigins(const SrcDescs& src_descs,
const Indices& src_slice_origin_idxs)
{
static_for<0, nSrc, 1>{}([&](auto i) {
src_coords_(i) = make_tensor_coordinate(src_descs[i], src_slice_origin_idxs[i]);
});
}
template <typename Indices, enable_if_t<DstDescs::Size() == Indices::Size(), bool> = false>
__device__ void SetDstSliceOrigins(const DstDescs& dst_descs,
const Indices& dst_slice_origin_idxs)
{
static_for<0, nDst, 1>{}([&](auto i) {
dst_coords_(i) = make_tensor_coordinate(dst_descs[i], dst_slice_origin_idxs[i]);
});
}
// SrcDescs: Tuple<const SrcDesc0&, const SrcDesc1&, ...>
// SrcBuffers: Tuple<const SrcBuffer0&, const SrcBuffer1&, ...>
// DstDescs: Tuple<const DstDesc0&, const DstDesc1&, ...>
// DstBuffers: Tuple<const DstBuffer0&, const DstBuffer1&, ...>
template <typename SrcBuffers,
typename DstBuffers,
enable_if_t<SrcDescs::Size() == SrcBuffers::Size() &&
DstDescs::Size() == DstBuffers::Size(),
bool> = false>
__device__ void Run(const SrcDescs& src_descs,
const SrcBuffers& src_bufs,
const DstDescs& dst_descs,
DstBuffers dst_bufs)
{
auto generate_vectors = [&](auto data_types) {
constexpr index_t num = data_types.Size();
return generate_tuple(
[&](auto i) {
using DataType = remove_cvref_t<decltype(data_types[i])>;
return vector_type_maker_t<DataType, ScalarPerVector>{};
},
Number<num>{});
};
constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess();
// loop over space-filling curve
static_for<0, num_access, 1>{}([&](auto iAccess) {
auto src_vectors = generate_vectors(SrcDatas{});
auto dst_vectors = generate_vectors(DstDatas{});
// copy data from src_bufs into src_vectors
static_for<0, nSrc, 1>{}([&](auto i) {
using src_vector_t = typename remove_cvref_t<decltype(src_vectors[i])>::type;
const bool is_src_valid =
coordinate_has_valid_offset_assuming_visible_index_is_valid(src_descs[i],
src_coords_[i]);
src_vectors(i).template AsType<src_vector_t>()(I0) =
src_bufs[i].template Get<src_vector_t>(src_coords_[i].GetOffset(),
is_src_valid);
});
// apply pointwise function
static_for<0, ScalarPerVector, 1>{}([&](auto i) {
// get reference to src data
const auto src_data_refs = generate_tie(
// return type should be lvalue
[&](auto iSrc) -> const auto& {
using SrcData = remove_cvref_t<tuple_element_t<iSrc.value, SrcDatas>>;
return src_vectors[iSrc].template AsType<SrcData>()[i];
},
Number<nSrc>{});
// get reference to dst data
auto dst_data_refs = generate_tie(
// return type should be lvalue
[&](auto iDst) -> auto& {
using DstData = remove_cvref_t<tuple_element_t<iDst.value, DstDatas>>;
return dst_vectors(iDst).template AsType<DstData>()(i);
},
Number<nDst>{});
// apply pointwise function
// pointwise function signature:
// element_op_(dst_data_refs[I0],
// dst_data_refs[I1],
// ...,
// src_data_refs[I0],
// src_data_refs[I1],
// ...)
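// e.g. (hypothetical illustration, not part of this commit) for nDst = 1 and nSrc = 2,
// a matching elementwise op could be:
//     struct AddExample
//     {
//         template <typename D, typename S0, typename S1>
//         __host__ __device__ void operator()(D& d, const S0& s0, const S1& s1) const
//         {
//             d = type_convert<D>(s0) + type_convert<D>(s1);
//         }
//     };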
unpack2(element_op_, dst_data_refs, src_data_refs);
});
// copy data from dst_vectors into dst_bufs
static_for<0, nDst, 1>{}([&](auto i) {
using dst_vector_t = typename remove_cvref_t<decltype(dst_vectors[i])>::type;
const bool is_dst_valid =
coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_descs[i],
dst_coords_[i]);
constexpr InMemoryDataOperationEnum DstInMemOp =
static_cast<InMemoryDataOperationEnum>(DstInMemOps::At(i.value));
dst_bufs(i).template Update<DstInMemOp, dst_vector_t>(
dst_coords_[i].GetOffset(),
is_dst_valid,
dst_vectors[i].template AsType<dst_vector_t>()[I0]);
});
// move coordinate
if constexpr(iAccess.value != num_access - 1)
{
constexpr auto forward_step = SpaceFillingCurve::GetForwardStep(iAccess);
static_for<0, nSrc, 1>{}([&](auto i) {
move_tensor_coordinate(src_descs[i],
src_coords_(i),
make_tensor_coordinate_step(src_descs[i], forward_step));
});
static_for<0, nDst, 1>{}([&](auto i) {
move_tensor_coordinate(dst_descs[i],
dst_coords_(i),
make_tensor_coordinate_step(dst_descs[i], forward_step));
});
}
});
// move coordinate back to slice origin (or not)
static_for<0, nSrc, 1>{}([&](auto i) {
if constexpr(SrcResetCoordinateAfterRunFlags::At(i))
{
const auto src_reset_step =
make_tensor_coordinate_step(src_descs[i], GetCoordinateResetStep());
move_tensor_coordinate(src_descs[i], src_coords_(i), src_reset_step);
}
});
static_for<0, nDst, 1>{}([&](auto i) {
if constexpr(DstResetCoordinateAfterRunFlags::At(i))
{
const auto dst_reset_step =
make_tensor_coordinate_step(dst_descs[i], GetCoordinateResetStep());
move_tensor_coordinate(dst_descs[i], dst_coords_(i), dst_reset_step);
}
});
}
__device__ static constexpr auto GetCoordinateResetStep()
{
constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess();
if constexpr(num_access == 0)
{
return typename SpaceFillingCurve::Index{};
}
else
{
constexpr auto reset_step =
SpaceFillingCurve::GetStepBetween(Number<num_access - 1>{}, Number<0>{});
return reset_step;
}
}
// src_slice_origin_step_idx needs to be known at compile-time, for performance reasons
template <index_t ISrc>
__device__ void MoveSrcSliceWindow(const SrcDescs& src_descs,
Number<ISrc> iSrc,
const Index& src_slice_origin_step_idx)
{
// if src coord was not reset by Run(), then the step needs to be adjusted here
const auto adjusted_step_idx = SrcResetCoordinateAfterRunFlags::At(iSrc)
? src_slice_origin_step_idx
: src_slice_origin_step_idx + GetCoordinateResetStep();
// is it OK to construct a new step every time?
const auto adjusted_step = make_tensor_coordinate_step(src_descs[iSrc], adjusted_step_idx);
move_tensor_coordinate(src_descs[iSrc], src_coords_(iSrc), adjusted_step);
}
// dst_slice_origin_step_idx needs to be known at compile-time, for performance reasons
template <index_t IDst>
__device__ void MoveDstSliceWindow(const DstDescs& dst_descs,
Number<IDst> iDst,
const Index& dst_slice_origin_step_idx)
{
// if dst coord was not reset by Run(), then need to adjust the step here
const auto adjusted_step_idx = DstResetCoordinateAfterRunFlags::At(iDst)
? dst_slice_origin_step_idx
: dst_slice_origin_step_idx + GetCoordinateResetStep();
// is it OK to construct a new step every time?
const auto adjusted_step = make_tensor_coordinate_step(dst_descs[iDst], adjusted_step_idx);
move_tensor_coordinate(dst_descs[iDst], dst_coords_(iDst), adjusted_step);
}
private:
SrcCoords src_coords_;
DstCoords dst_coords_;
const ElementwiseOperation element_op_;
};
} // namespace ck
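As noted in the header comment of ThreadwiseTensorSliceTransfer_v7 above, here is a minimal standalone host-side sketch (not CK code; AddExample and all variable names are invented) of the Run() data flow: for each access, gather one element from every source buffer, apply the elementwise op with the destination references passed first and the source references second (the unpack2 call above), then scatter the results to every destination buffer.

#include <array>
#include <cstdio>

// Hypothetical elementwise op matching the "dst refs first, src refs second" signature.
struct AddExample
{
    void operator()(float& d, const float& s0, const float& s1) const { d = s0 + s1; }
};

int main()
{
    constexpr int slice_length = 4;

    // two sources, one destination (nSrc = 2, nDst = 1)
    std::array<float, slice_length> src0{1, 2, 3, 4};
    std::array<float, slice_length> src1{10, 20, 30, 40};
    std::array<float, slice_length> dst{};

    AddExample element_op;

    // one "access" per element; the real class instead walks a space-filling curve
    // and moves ScalarPerVector elements per access
    for(int i = 0; i < slice_length; ++i)
        element_op(dst[i], src0[i], src1[i]);

    for(float v : dst)
        std::printf("%f\n", v); // 11 22 33 44
}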
-#ifndef CK_XDLOPS_GEMM_HPP
-#define CK_XDLOPS_GEMM_HPP
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-#include "common_header.hpp"
-#include "math.hpp"
-#include "amd_xdlops.hpp"
+#pragma once
+#include "ck/utility/common_header.hpp"
+#include "ck/utility/math.hpp"
+#include "ck/utility/amd_xdlops.hpp"
 namespace ck {
@@ -786,4 +788,3 @@ struct XdlopsGemm
 };
 } // namespace ck
-#endif
-#ifndef CK_AMD_ADDRESS_SPACE_HPP
-#define CK_AMD_ADDRESS_SPACE_HPP
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-#include "config.hpp"
+#pragma once
+#include "ck/ck.hpp"
 #include "c_style_pointer_cast.hpp"
 // Address Space for AMDGCN
@@ -41,4 +43,3 @@ __host__ __device__ T CK_CONSTANT_ADDRESS_SPACE* cast_pointer_to_constant_addres
 }
 } // namespace ck
-#endif
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
 #include "data_type.hpp"
@@ -7,6 +10,8 @@ namespace ck {
 template <typename T>
 union BufferResource
 {
+    __device__ constexpr BufferResource() : content{} {}
     // 128 bit SGPRs to supply buffer resource in buffer instructions
     // https://rocm-documentation.readthedocs.io/en/latest/GCN_ISA_Manuals/testdocbook.html#vector-memory-buffer-instructions
     int32x4_t content;
...
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #ifndef CK_AMD_INLINE_ASM_HPP
 #define CK_AMD_INLINE_ASM_HPP
...
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #ifndef CK_AMD_LLVM_INTRINSIC_HPP
 #define CK_AMD_LLVM_INTRINSIC_HPP
...
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #ifndef CK_AMD_XDLOPS_HPP
 #define CK_AMD_XDLOPS_HPP
...
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #ifndef CK_ARRAY_HPP
 #define CK_ARRAY_HPP
...
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #ifndef CK_ARRAY_MULTI_INDEX_HPP
 #define CK_ARRAY_MULTI_INDEX_HPP
...
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #ifndef CK_C_STYLE_POINTER_CAST_HPP
 #define CK_C_STYLE_POINTER_CAST_HPP
...
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
 #include "config.hpp"
 #include "array.hpp"
@@ -31,19 +34,49 @@
 #include "thread_group.hpp"
 #include "debug.hpp"
-#include "amd_buffer_addressing.hpp"
-#include "generic_memory_space_atomic.hpp"
-#include "get_id.hpp"
-#include "synchronization.hpp"
-#include "amd_address_space.hpp"
-#include "static_buffer.hpp"
-#include "dynamic_buffer.hpp"
+#include "ck/ck.hpp"
+#include "ck/utility/array.hpp"
+#include "ck/utility/container_helper.hpp"
+#include "ck/utility/statically_indexed_array.hpp"
+#include "ck/utility/container_element_picker.hpp"
+#include "ck/utility/multi_index.hpp"
+#include "ck/utility/data_type.hpp"
+#include "ck/utility/functional.hpp"
+#include "ck/utility/functional2.hpp"
+#include "ck/utility/functional3.hpp"
+#include "ck/utility/functional4.hpp"
+#include "ck/utility/enable_if.hpp"
+#include "ck/utility/ignore.hpp"
+#include "ck/utility/integral_constant.hpp"
+#include "ck/utility/math.hpp"
+#include "ck/utility/number.hpp"
+#include "ck/utility/sequence.hpp"
+#include "ck/utility/sequence_helper.hpp"
+#include "ck/utility/tuple.hpp"
+#include "ck/utility/tuple_helper.hpp"
+#include "ck/utility/type.hpp"
+#include "ck/utility/magic_division.hpp"
+#include "ck/utility/c_style_pointer_cast.hpp"
+#include "ck/utility/is_known_at_compile_time.hpp"
+#include "ck/utility/transpose_vectors.hpp"
+#include "ck/utility/inner_product.hpp"
+#include "ck/utility/thread_group.hpp"
+#include "ck/utility/debug.hpp"
+#include "ck/utility/amd_buffer_addressing.hpp"
+#include "ck/utility/generic_memory_space_atomic.hpp"
+#include "ck/utility/get_id.hpp"
+#include "ck/utility/thread_group.hpp"
+#include "ck/utility/synchronization.hpp"
+#include "ck/utility/amd_address_space.hpp"
+#include "ck/utility/static_buffer.hpp"
+#include "ck/utility/dynamic_buffer.hpp"
 // TODO: remove this
 #if CK_USE_AMD_INLINE_ASM
-#include "amd_inline_asm.hpp"
+#include "ck/utility/amd_inline_asm.hpp"
 #endif
 #ifdef CK_USE_AMD_MFMA
-#include "amd_xdlops.hpp"
+#include "ck/utility/amd_xdlops.hpp"
 #endif