Commit ff2c373b authored by Chao Liu

amd build

parent e5874b3f
......@@ -60,7 +60,7 @@ __host__ __device__ constexpr auto
template <typename... Ts>
__host__ __device__ constexpr auto
make_ConstantMatrixDescriptor(ConstantTensorDescriptor_deprecated<Ts...>)
{
using TDesc = ConstantTensorDescriptor_deprecated<Ts...>;
static_assert(TDesc::GetNumOfDimension() == 2, "wrong");
......
......@@ -64,7 +64,7 @@ template <typename LowerTensorDescriptor,
index_t... LowerDimensionIds,
index_t... UpperDimensionIds>
__host__ __device__ constexpr auto
reorder_transformed_tensor_descriptor_impl(LowerTensorDescriptor,
Sequence<LowerLengths...>,
Sequence<LowerDimensionIds...>,
Sequence<UpperDimensionIds...>)
......@@ -78,7 +78,7 @@ reorder_transformed_tensor_descriptor_impl(LowerTensorDescriptor,
// reorder a NativeTensorDescriptor
template <typename... Ts, typename MapLower2Upper>
__host__ __device__ constexpr auto
reorder_tensor_descriptor_given_lower2upper(NativeTensorDescriptor<Ts...>, MapLower2Upper)
{
static_assert(is_valid_sequence_map<MapLower2Upper>{},
"wrong! MapLower2Upper is not a valid map");
......@@ -96,7 +96,7 @@ reorder_tensor_descriptor_given_lower2upper(NativeTensorDescriptor<Ts...>, MapLo
// reorder a TransformedTensorDescriptor
template <typename... Ts, typename MapLower2Upper>
__host__ __device__ constexpr auto
reorder_tensor_descriptor_given_lower2upper(TransformedTensorDescriptor<Ts...>, MapLower2Upper)
{
static_assert(is_valid_sequence_map<MapLower2Upper>{},
"wrong! MapLower2Upper is not a valid map");
......
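The "valid map" requirement enforced by the static_asserts above means the Sequence must be a permutation of the lower dimension ids. The following host-side check is a self-contained illustration of that property only (names are made up; this is not the library's is_valid_sequence_map):
#include <array>
#include <cstddef>
// Illustrative only: a valid lower-to-upper map is a permutation of 0..N-1.
template <std::size_t N>
constexpr bool is_permutation_of_dimension_ids(const std::array<std::size_t, N>& map)
{
    for(std::size_t id = 0; id < N; ++id)
    {
        bool found = false;
        for(std::size_t i = 0; i < N; ++i)
        {
            found = found || (map[i] == id);
        }
        if(!found)
        {
            return false;
        }
    }
    return true;
}
static_assert(is_permutation_of_dimension_ids<4>({{2, 3, 1, 0}}), "a permutation is a valid map");
static_assert(!is_permutation_of_dimension_ids<4>({{0, 0, 2, 3}}), "a repeated id is not a valid map");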
......@@ -166,8 +166,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
// has the same padding situation
if(dst_coord.IsUpperIndexMappedToValidOffset())
{
#if 0
static_if<!DoAtomicAdd>{}([&](auto) {
#if 0 // debug
static_if<DstAddressSpace == AddressSpace::global>{}([&](auto fwd) {
#if CK_USE_AMD_BUFFER_ADDRESSING
amd_intrinsic_buffer_store<DstData, DstDataPerAccess>(
......@@ -184,11 +183,6 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
*reinterpret_cast<dst_vector_t*>(&p_dst[dst_coord.GetOffset()]) =
*reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
});
}).Else([&](auto) {
atomicAdd(
reinterpret_cast<dst_vector_t*>(&p_dst[dst_coord.GetOffset()]),
*reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]));
});
#else
move_data<DstData,
DstDataPerAccess,
......
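The #else branch is cut off by the diff, so the exact replacement call is not visible here. As a hedged reconstruction, assuming the copy functor forwards its address-space and InMemoryDataOperation template parameters (only DstData and DstDataPerAccess appear above; the rest are guesses), the consolidated store would look roughly like:
// sketch only: the trailing template arguments and the source address space are assumptions
move_data<DstData,
          DstDataPerAccess,
          AddressSpace::vgpr, // p_dst_long_vector is presumed to live in registers
          DstAddressSpace,
          DstInMemOp>(p_dst_long_vector, buffer_offset, p_dst, dst_coord.GetOffset());
A single compile-time-dispatched call then covers both the buffer/plain store and the atomicAdd path that the removed static_if<!DoAtomicAdd> chain spelled out by hand.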
......@@ -55,9 +55,16 @@ enum AddressSpace
{
generic,
global,
lds,
vgpr
};
enum InMemoryDataOperation
{
none,
atomic_add
};
#if CK_UNSIGNED_INDEX_TYPE
using index_t = uint32_t;
#else
......
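Both enums added above are plain compile-time tags, which is what lets them appear as non-type template parameters in the copy helpers later in this commit; no run-time branching is involved. A minimal self-contained sketch (the trait name is made up for illustration):
template <AddressSpace Space>
struct points_to_global
{
    static constexpr bool value = (Space == AddressSpace::global);
};
static_assert(points_to_global<AddressSpace::global>::value, "global pointers may take the AMD buffer path");
static_assert(!points_to_global<AddressSpace::vgpr>::value, "register data takes the generic path");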
......@@ -307,58 +307,5 @@ struct inner_product_with_conversion
}
};
template <DataMovement Movement, AddressSpace SrcAddressSpace, AddressSpace DstAddressSpace>
void move_data(const float* p_src,
index_t src_offset,
float* p_dst,
index_t dst_offset,
integral_constant<DataMovement, Movement>,
integral_constant<AddressSpace, SrcAddressSpace> src_address_space,
integral_constant<AddressSpace, DstAddressSpace> dst_address_space)
{
// TODO: use static_if::ElseIf
static_if<Movement == DataMovement::copy>{}([&](auto) {
copy_data(p_src, src_offset, p_dst, dst_offset, src_address_space, dst_address_space);
});
static_if<Movement == DataMovement::atomic_add>{}([&](auto) {
atomic_add_data(p_src, src_offset, p_dst, dst_offset, src_address_space, dst_address_space);
});
}
template <AddressSpace SrcAddressSpace, AddressSpace DstAddressSpace>
void copy_data(const float* p_src,
index_t src_offset,
float* p_dst,
index_t dst_offset,
integral_constant<AddressSpace, SrcAddressSpace>,
integral_constant<AddressSpace, DstAddressSpace>)
{
static_if<SrcAddressSpace == AddressSpace::vgpr && DstAddressSpace == AddressSpace::global>{}(
[&](auto fwd) {
#if CK_USE_AMD_BUFFER_ADDRESSING
amd_intrinsic_buffer_store(p_src[src_offset], fwd(p_dst), dst_offset, 0);
#else
p_dst[dst_offset] = p_src[src_offset];
#endif
})
.Else([&](auto) { p_dst[dst_offset] = p_src[src_offset]; });
}
template <AddressSpace SrcAddressSpace, AddressSpace DstAddressSpace>
void atomic_add_data(const float* p_src,
index_t src_offset,
float* p_dst,
index_t dst_offset,
integral_constant<AddressSpace, SrcAddressSpace>,
integral_constant<AddressSpace, DstAddressSpace>)
{
static_if<SrcAddressSpace == AddressSpace::vgpr && DstAddressSpace == AddressSpace::global>{}(
[&](auto fwd) { atomicAdd(&(p_dst[dst_offset]), p_src[src_offset]); })
.Else([&](auto fwd) {
static_assert(fwd(false), "atomic_add doesn't support this memory space");
});
}
} // namespace ck
#endif
......@@ -64,9 +64,8 @@ struct static_if<true>
}
template <typename F>
__host__ __device__ static constexpr auto Else(F)
__host__ __device__ static void Else(F)
{
return Type{};
}
};
......@@ -82,14 +81,13 @@ struct static_if<false>
}
template <typename F>
__host__ __device__ static constexpr auto Else(F f)
__host__ __device__ static void Else(F f)
{
// This is a trick for compiler:
// Pass forwarder to lambda "f" as "auto" argument, and make sure "f" will use it,
// this will make "f" a generic lambda, so that "f" won't be compiled until being
// instantiated here
f(forwarder{});
return Type{};
}
};
......
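The comment kept above is the crux of this pattern: because the lambda passed to Else is generic, its body stays dependent and is only compiled when that branch is actually instantiated. A minimal usage sketch (the function and its parameters are hypothetical), mirroring the fwd(false) idiom used by atomic_add_data in this commit:
template <AddressSpace Space>
__device__ float load_one(const float* p_src, index_t offset)
{
    float value = 0;

    static_if<Space == AddressSpace::global>{}([&](auto) { value = p_src[offset]; })
        .Else([&](auto fwd) {
            // fwd(false) is dependent on the generic lambda parameter, so this
            // static_assert only fires if the Else branch is instantiated
            static_assert(fwd(false), "only global loads are handled in this sketch");
        });

    return value;
}
Changing Else to return void only gives up chaining another clause after it, which the call sites shown in this commit do not do.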
#ifndef CK_IN_MEMORY_OPERATION_AMD_HPP
#define CK_IN_MEMORY_OPERATION_AMD_HPP
#include "float_type.hpp"
#include "amd_buffer_addressing.hpp"
namespace ck {
template <typename T,
index_t DataPerAccess,
AddressSpace SrcAddressSpace,
AddressSpace DstAddressSpace>
__device__ void copy_data(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset)
{
using vector_t = typename vector_type<T, DataPerAccess>::MemoryType;
#if CK_USE_AMD_BUFFER_ADDRESSING
// TODO: use static_if::ElseIf
static_if<SrcAddressSpace == AddressSpace::global && DstAddressSpace == AddressSpace::vgpr>{}([&](auto) {
*reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
amd_intrinsic_buffer_load<T, DataPerAccess>(p_src, src_offset, 0);
}).Else([&](auto) {
static_if<SrcAddressSpace == AddressSpace::vgpr && DstAddressSpace == AddressSpace::global>{}([&](auto) {
amd_intrinsic_buffer_store<T, DataPerAccess>(
*reinterpret_cast<const vector_t*>(&p_src[src_offset]), p_dst, dst_offset, 0);
}).Else([&](auto) {
*reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
*reinterpret_cast<const vector_t*>(&p_src[src_offset]);
});
});
#else
*reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
*reinterpret_cast<const vector_t*>(&p_src[src_offset]);
#endif
}
template <typename T,
index_t DataPerAccess,
AddressSpace SrcAddressSpace,
AddressSpace DstAddressSpace>
__device__ void atomic_add_data(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset)
{
using vector_t = typename vector_type<T, DataPerAccess>::MemoryType;
static_if<SrcAddressSpace == AddressSpace::vgpr &&
DstAddressSpace == AddressSpace::global>{}([&](auto) {
atomicAdd(reinterpret_cast<vector_t*>(&p_dst[dst_offset]),
*reinterpret_cast<const vector_t*>(&p_src[src_offset]));
}).Else([&](auto fwd) {
static_assert(fwd(false), "atomic_add doesn't support this memory space");
});
}
template <typename T,
index_t DataPerAccess,
AddressSpace SrcAddressSpace,
AddressSpace DstAddressSpace,
InMemoryDataOperation DstInMemOp>
__device__ void move_data(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset)
{
static_assert(DstInMemOp == InMemoryDataOperation::none ||
DstInMemOp == InMemoryDataOperation::atomic_add,
"wrong! InMemoryDataOperation not supported!");
// TODO: use static_if::ElseIf
static_if<DstInMemOp == InMemoryDataOperation::none>{}([&](auto) {
copy_data<T, DataPerAccess, SrcAddressSpace, DstAddressSpace>(
p_src, src_offset, p_dst, dst_offset);
});
static_if<DstInMemOp == InMemoryDataOperation::atomic_add>{}([&](auto) {
atomic_add_data<T, DataPerAccess, SrcAddressSpace, DstAddressSpace>(
p_src, src_offset, p_dst, dst_offset);
});
}
} // namespace ck
#endif
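A usage sketch of the new entry point (the caller and its buffers are hypothetical): the destination operation is a template parameter, so the same call site can either overwrite or atomically accumulate into global memory with no run-time branch.
// hypothetical caller: p_reg holds a thread-private result, p_global is the output buffer
template <InMemoryDataOperation DstInMemOp>
__device__ void write_result(const float* p_reg, float* p_global, index_t dst_offset)
{
    // InMemoryDataOperation::none dispatches to copy_data (plain/buffer store),
    // InMemoryDataOperation::atomic_add dispatches to atomic_add_data (atomicAdd)
    move_data<float, 1, AddressSpace::vgpr, AddressSpace::global, DstInMemOp>(
        p_reg, 0, p_global, dst_offset);
}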
......@@ -23,12 +23,11 @@ __device__ void atomic_add_data(const T* p_src, index_t src_offset, T* p_dst, in
{
using vector_t = typename vector_type<T, DataPerAccess>::MemoryType;
static_if<SrcAddressSpace == AddressSpace::vgpr && DstAddressSpace == AddressSpace::global>{}(
[&](auto) {
static_if<SrcAddressSpace == AddressSpace::vgpr &&
DstAddressSpace == AddressSpace::global>{}([&](auto) {
atomicAdd(reinterpret_cast<vector_t*>(&p_dst[dst_offset]),
*reinterpret_cast<const vector_t*>(&p_src[src_offset]));
})
.Else([&](auto fwd) {
}).Else([&](auto fwd) {
static_assert(fwd(false), "atomic_add doesn't support this memory space");
});
}
......