Commit ff2c373b authored by Chao Liu

amd build

parent e5874b3f
...@@ -60,7 +60,7 @@ __host__ __device__ constexpr auto ...@@ -60,7 +60,7 @@ __host__ __device__ constexpr auto
template <typename... Ts> template <typename... Ts>
__host__ __device__ constexpr auto __host__ __device__ constexpr auto
make_ConstantMatrixDescriptor(ConstantTensorDescriptor_deprecated<Ts...>) make_ConstantMatrixDescriptor(ConstantTensorDescriptor_deprecated<Ts...>)
{ {
using TDesc = ConstantTensorDescriptor_deprecated<Ts...>; using TDesc = ConstantTensorDescriptor_deprecated<Ts...>;
static_assert(TDesc::GetNumOfDimension() == 2, "wrong"); static_assert(TDesc::GetNumOfDimension() == 2, "wrong");
......
@@ -228,7 +228,7 @@ struct TensorCoordinate
    private:
    template <typename... Ts>
    __host__ __device__ static constexpr auto
    MakeDummyTensorCoordinate(NativeTensorDescriptor<Ts...>)
    {
        return NativeTensorCoordinate<NativeTensorDescriptor<Ts...>>(
            make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());
@@ -236,7 +236,7 @@ struct TensorCoordinate
    template <typename... Ts>
    __host__ __device__ static constexpr auto
    MakeDummyTensorCoordinate(TransformedTensorDescriptor<Ts...>)
    {
        return TransformedTensorCoordinate<TransformedTensorDescriptor<Ts...>>(
            make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());
...
@@ -327,14 +327,14 @@ struct TensorCoordinate_deprecated
    private:
    template <class... Ts>
    __host__ __device__ static constexpr auto
    MakeDummyTensorCoordinate(ConstantTensorDescriptor_deprecated<Ts...>)
    {
        return NormalTensorCoordinate_deprecated<ConstantTensorDescriptor_deprecated<Ts...>>();
    }

    template <class... Ts>
    __host__ __device__ static constexpr auto
    MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor_deprecated<Ts...>)
    {
        return MergedTensorCoordinate_deprecated<
            ConstantMergedTensorDescriptor_deprecated<Ts...>>();
...
@@ -64,10 +64,10 @@ template <typename LowerTensorDescriptor,
          index_t... LowerDimensionIds,
          index_t... UpperDimensionIds>
__host__ __device__ constexpr auto
reorder_transformed_tensor_descriptor_impl(LowerTensorDescriptor,
                                           Sequence<LowerLengths...>,
                                           Sequence<LowerDimensionIds...>,
                                           Sequence<UpperDimensionIds...>)
{
    return TransformedTensorDescriptor<LowerTensorDescriptor,
                                       Tuple<PassThrough<LowerLengths>...>,
@@ -78,7 +78,7 @@ reorder_transformed_tensor_descriptor_impl(LowerTensorDescriptor,
// reorder a NativeTensorDescriptor
template <typename... Ts, typename MapLower2Upper>
__host__ __device__ constexpr auto
reorder_tensor_descriptor_given_lower2upper(NativeTensorDescriptor<Ts...>, MapLower2Upper)
{
    static_assert(is_valid_sequence_map<MapLower2Upper>{},
                  "wrong! MapLower2Upper is not a valid map");
@@ -96,7 +96,7 @@ reorder_tensor_descriptor_given_lower2upper(NativeTensorDescriptor<Ts...>, MapLower2Upper)
// reorder a TransformedTensorDescriptor
template <typename... Ts, typename MapLower2Upper>
__host__ __device__ constexpr auto
reorder_tensor_descriptor_given_lower2upper(TransformedTensorDescriptor<Ts...>, MapLower2Upper)
{
    static_assert(is_valid_sequence_map<MapLower2Upper>{},
                  "wrong! MapLower2Upper is not a valid map");
...
@@ -166,28 +166,22 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
            // has the same padding situation
            if(dst_coord.IsUpperIndexMappedToValidOffset())
            {
-#if 0
-                static_if<!DoAtomicAdd>{}([&](auto) {
-                    static_if<DstAddressSpace == AddressSpace::global>{}([&](auto fwd) {
+#if 0 // debug
+                static_if<DstAddressSpace == AddressSpace::global>{}([&](auto fwd) {
#if CK_USE_AMD_BUFFER_ADDRESSING
                    amd_intrinsic_buffer_store<DstData, DstDataPerAccess>(
                        *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]),
                        fwd(p_dst),
                        dst_coord.GetOffset(),
                        0);
#else
                    *reinterpret_cast<dst_vector_t*>(&p_dst[dst_coord.GetOffset()]) =
                        *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
#endif
-                    }).Else([&](auto) {
-                        // dst can be all kinds of memory-space
-                        *reinterpret_cast<dst_vector_t*>(&p_dst[dst_coord.GetOffset()]) =
-                            *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
-                    });
                }).Else([&](auto) {
-                    atomicAdd(
-                        reinterpret_cast<dst_vector_t*>(&p_dst[dst_coord.GetOffset()]),
-                        *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]));
+                    // dst can be all kinds of memory-space
+                    *reinterpret_cast<dst_vector_t*>(&p_dst[dst_coord.GetOffset()]) =
+                        *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
                });
#else
                move_data<DstData,
...
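For reference, the live (non-debug) branch behind the #else above now funnels the store through the templated move_data helper introduced in this commit. The call site is truncated in the hunk, so the sketch below is only a plausible shape of that call; the template-parameter and variable names are assumptions taken from the surrounding code, not the literal source.

// Hypothetical call-site sketch (not the literal code from this file):
// the last template argument selects a plain store vs. an atomicAdd inside move_data.
move_data<DstData,
          DstDataPerAccess,
          AddressSpace::vgpr, // p_dst_long_vector lives in thread-private registers
          DstAddressSpace,
          DstInMemOp>(p_dst_long_vector, buffer_offset, p_dst, dst_coord.GetOffset());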
@@ -55,9 +55,16 @@ enum AddressSpace
{
    generic,
    global,
+    lds,
    vgpr
};

+enum InMemoryDataOperation
+{
+    none,
+    atomic_add
+};
+
#if CK_UNSIGNED_INDEX_TYPE
using index_t = uint32_t;
#else
...
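The new InMemoryDataOperation enum is intended to be carried as a compile-time template parameter, so a kernel can state whether its destination write is a plain store or an atomic accumulation. A minimal illustrative sketch of that kind of dispatch follows; the function below is made up for illustration, the real dispatcher is the move_data helper added later in this commit.

// Illustrative only: select the write policy at compile time.
template <InMemoryDataOperation Op>
__device__ void write_one(float* p_dst, index_t dst_offset, float value)
{
    // only the selected branch body is instantiated
    static_if<Op == InMemoryDataOperation::none>{}([&](auto) {
        p_dst[dst_offset] = value; // plain store
    });

    static_if<Op == InMemoryDataOperation::atomic_add>{}([&](auto) {
        atomicAdd(&p_dst[dst_offset], value); // accumulate into the destination
    });
}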
@@ -307,58 +307,5 @@ struct inner_product_with_conversion
    }
};
-template <DataMovement Movement, AddressSpace SrcAddressSpace, AddressSpace DstAddressSpace>
-void move_data(const float* p_src,
-               index_t src_offset,
-               float* p_dst,
-               index_t dst_offset,
-               integral_constant<DataMovement, Movement>,
-               integral_constant<AddressSpace, SrcAddressSpace> src_address_space,
-               integral_constant<AddressSpace, DstAddressSpace> dst_address_space)
-{
-    // TODO: use static_if::ElseIf
-    static_if<Movement == DataMovement::copy>{}([&](auto) {
-        copy_data(p_src, src_offset, p_dst, dst_offset, src_address_space, dst_address_space);
-    });
-
-    static_if<Movement == DataMovement::atomic_add>{}([&](auto) {
-        atomic_add_data(p_src, src_offset, p_dst, dst_offset, src_address_space, dst_address_space);
-    });
-}
-
-template <AddressSpace SrcAddressSpace, AddressSpace DstAddressSpace>
-void copy_data(const float* p_src,
-               index_t src_offset,
-               float* p_dst,
-               index_t dst_offset,
-               integral_constant<AddressSpace, SrcAddressSpace>,
-               integral_constant<AddressSpace, DstAddressSpace>)
-{
-    static_if<SrcAddressSpace == AddressSpace::vgpr && DstAddressSpace == AddressSpace::global>{}(
-        [&](auto fwd) {
-#if CK_USE_AMD_BUFFER_ADDRESSING
-            amd_intrinsic_buffer_store(p_src[src_offset], fwd(p_dst), dst_offset, 0);
-#else
-            p_dst[dst_offset] = p_src[src_offset];
-#endif
-        })
-        .Else([&](auto) { p_dst[dst_offset] = p_src[src_offset]; });
-}
-
-template <AddressSpace SrcAddressSpace, AddressSpace DstAddressSpace>
-void atomic_add_data(const float* p_src,
-                     index_t src_offset,
-                     float* p_dst,
-                     index_t dst_offset,
-                     integral_constant<AddressSpace, SrcAddressSpace>,
-                     integral_constant<AddressSpace, DstAddressSpace>)
-{
-    static_if<SrcAddressSpace == AddressSpace::vgpr && DstAddressSpace == AddressSpace::global>{}(
-        [&](auto fwd) { atomicAdd(&(p_dst[dst_offset]), p_src[src_offset]); })
-        .Else([&](auto fwd) {
-            static_assert(fwd(false), "atomic_add doesn't support this memory space");
-        });
-}
} // namespace ck
#endif
@@ -64,9 +64,8 @@ struct static_if<true>
    }

    template <typename F>
-    __host__ __device__ static constexpr auto Else(F)
+    __host__ __device__ static void Else(F)
    {
-        return Type{};
    }
};
@@ -82,14 +81,13 @@ struct static_if<false>
    }

    template <typename F>
-    __host__ __device__ static constexpr auto Else(F f)
+    __host__ __device__ static void Else(F f)
    {
        // This is a trick for compiler:
        // Pass forwarder to lambda "f" as "auto" argument, and make sure "f" will use it,
        // this will make "f" a generic lambda, so that "f" won't be compiled until being
        // instantiated here
        f(forwarder{});
-        return Type{};
    }
};
...
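The comment kept above is the key to why this pattern works: each branch body is a generic lambda taking an auto forwarder, so the not-taken branch is never instantiated and may legally contain code that only compiles for the other configuration. (Else itself now returns void, so nothing can be chained after it, which matches how it is used elsewhere in this commit.) A small usage sketch, assuming float data; UseBufferStore and store_one are hypothetical names, but the call shape of amd_intrinsic_buffer_store follows its other uses in this commit.

// Sketch only: compile-time branch between a buffer store and a plain store.
template <bool UseBufferStore>
__device__ void store_one(float* p_dst, index_t dst_offset, float value)
{
    static_if<UseBufferStore>{}([&](auto fwd) {
        // fwd(...) makes the body depend on the lambda's auto parameter,
        // so it is only compiled when this branch is actually taken
        amd_intrinsic_buffer_store<float, 1>(value, fwd(p_dst), dst_offset, 0);
    }).Else([&](auto) {
        p_dst[dst_offset] = value; // fallback plain store
    });
}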
#ifndef CK_IN_MEMORY_OPERATION_AMD_HPP
#define CK_IN_MEMORY_OPERATION_AMD_HPP
#include "float_type.hpp"
#include "amd_buffer_addressing.hpp"
namespace ck {
template <typename T,
index_t DataPerAccess,
AddressSpace SrcAddressSpace,
AddressSpace DstAddressSpace>
__device__ void copy_data(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset)
{
using vector_t = typename vector_type<T, DataPerAccess>::MemoryType;
#if CK_USE_AMD_BUFFER_ADDRESSING
// TODO: use static_if::ElseIf
static_if<SrcAddressSpace == AddressSpace::global && DstAddressSpace == vgpr>{}([&](auto) {
*reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
amd_intrinsic_buffer_load<T, DataPerAccess>(p_src, src_offset, 0);
}).Else([&](auto) {
static_if<SrcAddressSpace == AddressSpace::vgpr && DstAddressSpace == global>{}([&](auto) {
amd_intrinsic_buffer_store<T, DataPerAccess>(
*reinterpret_cast<const vector_t*>(&p_src[src_offset]), p_dst, dst_offset, 0);
}).Else([&](auto) {
*reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
*reinterpret_cast<const vector_t*>(&p_src[src_offset]);
});
});
#else
*reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
*reinterpret_cast<const vector_t*>(&p_src[src_offset]);
#endif
}
template <typename T,
index_t DataPerAccess,
AddressSpace SrcAddressSpace,
AddressSpace DstAddressSpace>
__device__ void atomic_add_data(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset)
{
using vector_t = typename vector_type<T, DataPerAccess>::MemoryType;
static_if<SrcAddressSpace == AddressSpace::vgpr &&
DstAddressSpace == AddressSpace::global>{}([&](auto) {
atomicAdd(reinterpret_cast<vector_t*>(&p_dst[dst_offset]),
*reinterpret_cast<const vector_t*>(&p_src[src_offset]));
}).Else([&](auto fwd) {
static_assert(fwd(false), "atomic_add doesn't support this memory space");
});
}
template <typename T,
index_t DataPerAccess,
AddressSpace SrcAddressSpace,
AddressSpace DstAddressSpace,
InMemoryDataOperation DstInMemOp>
__device__ void move_data(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset)
{
static_assert(DstInMemOp == InMemoryDataOperation::none ||
DstInMemOp == InMemoryDataOperation::atomic_add,
"wrong! InMemoryDataOperation not supported!");
// TODO: use static_if::ElseIf
static_if<DstInMemOp == InMemoryDataOperation::none>{}([&](auto) {
copy_data<T, DataPerAccess, SrcAddressSpace, DstAddressSpace>(
p_src, src_offset, p_dst, dst_offset);
});
static_if<DstInMemOp == InMemoryDataOperation::atomic_add>{}([&](auto) {
atomic_add_data<T, DataPerAccess, SrcAddressSpace, DstAddressSpace>(
p_src, src_offset, p_dst, dst_offset);
});
}
} // namespace ck
#endif
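A short usage sketch for the new header, assuming float data and one element per access; the wrapper function and pointer names below are illustrative only, the template arguments and call signature come from move_data as defined above.

// p_acc is a thread-private (vgpr) value, p_out points to global memory.
__device__ void flush_partial_result(const float* p_acc, float* p_out, index_t out_offset)
{
    // InMemoryDataOperation::none routes to copy_data (buffer store on AMD, plain store otherwise).
    move_data<float, 1, AddressSpace::vgpr, AddressSpace::global, InMemoryDataOperation::none>(
        p_acc, 0, p_out, out_offset);

    // InMemoryDataOperation::atomic_add routes to atomic_add_data (atomicAdd into global memory).
    move_data<float, 1, AddressSpace::vgpr, AddressSpace::global, InMemoryDataOperation::atomic_add>(
        p_acc, 0, p_out, out_offset);
}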
@@ -23,14 +23,13 @@ __device__ void atomic_add_data(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset)
{
    using vector_t = typename vector_type<T, DataPerAccess>::MemoryType;

-    static_if<SrcAddressSpace == AddressSpace::vgpr && DstAddressSpace == AddressSpace::global>{}(
-        [&](auto) {
-            atomicAdd(reinterpret_cast<vector_t*>(&p_dst[dst_offset]),
-                      *reinterpret_cast<const vector_t*>(&p_src[src_offset]));
-        })
-        .Else([&](auto fwd) {
-            static_assert(fwd(false), "atomic_add doesn't support this memory space");
-        });
+    static_if<SrcAddressSpace == AddressSpace::vgpr &&
+              DstAddressSpace == AddressSpace::global>{}([&](auto) {
+        atomicAdd(reinterpret_cast<vector_t*>(&p_dst[dst_offset]),
+                  *reinterpret_cast<const vector_t*>(&p_src[src_offset]));
+    }).Else([&](auto fwd) {
+        static_assert(fwd(false), "atomic_add doesn't support this memory space");
+    });
}

template <typename T,
...