"graphbolt/src/git@developer.sourcefind.cn:OpenDAS/dgl.git" did not exist on "1e34f6648a61be5a337cd49ff1becf1b0d2c0f41"
Commit 3cb2a7d0 authored by Chao Liu

removing old implementation of tensor descriptor

parent 39d92e7d
@@ -431,9 +431,9 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
                         b_thread_data_on_global,
                         0})
 #if 1
-            .template Run_generic<Float, address_space_t::generic, address_space_t::global>
+            .template Run_generic<Float, Float, address_space_t::generic, address_space_t::global>
 #elif 1
-            .template Run_optimized_dst_address_calculation<Float, address_space_t::global>
+            .template Run_optimized_dst_address_calculation<Float, Float, address_space_t::global>
 #endif
             (p_out_thread, p_out_global);
     }
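Note on the call site above: the extra Float is the new destination element type, since Run_generic and the optimized variants now take separate source and destination data types (see the later hunks). Below is a minimal, self-contained sketch of that calling convention; copy_sketch is a hypothetical stand-in for the real member functions, not code from this repository.

#include <cstdio>

enum class address_space_t { generic, global };

// Hedged sketch: mirrors only the shape of the new
// <SrcData, DstData, SrcAddressSpace, DstAddressSpace> template interface.
template <typename SrcData,
          typename DstData,
          address_space_t SrcAddressSpace = address_space_t::generic,
          address_space_t DstAddressSpace = address_space_t::generic>
void copy_sketch(const SrcData* p_src, DstData* p_dst, int n)
{
    for(int i = 0; i < n; ++i)
    {
        p_dst[i] = static_cast<DstData>(p_src[i]); // element-wise type conversion
    }
}

int main()
{
    float  src[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    double dst[4] = {};

    // With a single Float/Float pair this behaves like the old single-type call;
    // splitting the parameters also allows mixed-type copies such as this one:
    copy_sketch<float, double, address_space_t::generic, address_space_t::global>(src, dst, 4);

    std::printf("%f\n", dst[0]);
    return 0;
}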
@@ -734,43 +734,46 @@ struct BlockwiseGenericTensorSliceCopy_v4
         return RegisterBufferDesc::GetElementSpace();
     }

-    template <typename TData, address_space_t SrcAddressSpace = address_space_t::generic>
-    __device__ void RunLoadRegisterBuffer(const TData* p_src, TData* p_buffer) const
+    template <typename SrcData, typename BufferData, address_space_t SrcAddressSpace = address_space_t::generic>
+    __device__ void RunLoadRegisterBuffer(const SrcData* p_src, BufferData* p_buffer) const
     {
 #if 1
-        mThreadwiseLoad.template Run_generic<TData, SrcAddressSpace, address_space_t::generic>(
+        mThreadwiseLoad.template Run_generic<SrcData, BufferData, SrcAddressSpace, address_space_t::generic>(
             p_src, p_buffer);
 #else
-        mThreadwiseLoad.template Run_optimized_src_address_calculation<TData,
+        mThreadwiseLoad.template Run_optimized_src_address_calculation<SrcData,
+                                                                       BufferData,
                                                                        SrcAddressSpace,
                                                                        address_space_t::generic>(
             p_src, p_buffer);
 #endif
     }

-    template <typename TData, address_space_t DstAddressSpace = address_space_t::generic>
-    __device__ void RunStoreRegisterBuffer(const TData* p_buffer, TData* p_dst) const
+    template <typename BufferData, typename DstData, address_space_t DstAddressSpace = address_space_t::generic>
+    __device__ void RunStoreRegisterBuffer(const BufferData* p_buffer, DstData* p_dst) const
     {
 #if 1
-        mThreadwiseStore.template Run_generic<TData, address_space_t::generic, DstAddressSpace>(
+        mThreadwiseStore.template Run_generic<BufferData, DstData, address_space_t::generic, DstAddressSpace>(
             p_buffer, p_dst);
 #else
-        mThreadwiseStore.template Run_optimized_dst_address_calculation<TData,
+        mThreadwiseStore.template Run_optimized_dst_address_calculation<BufferData,
+                                                                        DstData,
                                                                         address_space_t::generic,
                                                                         DstAddressSpace>(p_buffer,
                                                                                          p_dst);
 #endif
     }

-    template <typename TData,
+    template <typename SrcData,
+              typename DstData,
               address_space_t SrcAddressSpace = address_space_t::generic,
               address_space_t DstAddressSpace = address_space_t::generic>
-    __device__ void Run(const TData* p_src, TData* p_dst) const
+    __device__ void Run(const SrcData* p_src, DstData* p_dst) const
     {
-        TData p_buffer[GetRegisterBufferSize()];
+        SrcData p_src_buffer[GetRegisterBufferSize()];

-        RunLoadRegisterBuffer<TData, SrcAddressSpace>(p_src, p_buffer);
-        RunStoreRegisterBuffer<TData, DstAddressSpace>(p_buffer, p_dst);
+        RunLoadRegisterBuffer<SrcData, SrcData, SrcAddressSpace>(p_src, p_src_buffer);
+        RunStoreRegisterBuffer<SrcData, DstData, DstAddressSpace>(p_src_buffer, p_dst);
     }

     template <typename T, bool PositiveDirection>
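The Run() change above keeps the thread-private register buffer in the source type and defers the conversion to the store stage. The following is a hedged, host-side sketch of that two-stage pattern; kBufferSize and the two helper functions are simplified stand-ins for GetRegisterBufferSize(), RunLoadRegisterBuffer, and RunStoreRegisterBuffer, not the repository's implementation.

#include <cstddef>

constexpr std::size_t kBufferSize = 8; // stand-in for GetRegisterBufferSize()

// Load stage: copy from source memory into the staging buffer.
template <typename SrcData, typename BufferData>
void load_register_buffer_sketch(const SrcData* p_src, BufferData* p_buffer)
{
    for(std::size_t i = 0; i < kBufferSize; ++i)
        p_buffer[i] = static_cast<BufferData>(p_src[i]);
}

// Store stage: copy from the staging buffer into destination memory,
// converting the element type on the way out.
template <typename BufferData, typename DstData>
void store_register_buffer_sketch(const BufferData* p_buffer, DstData* p_dst)
{
    for(std::size_t i = 0; i < kBufferSize; ++i)
        p_dst[i] = static_cast<DstData>(p_buffer[i]);
}

template <typename SrcData, typename DstData>
void run_sketch(const SrcData* p_src, DstData* p_dst)
{
    // The intermediate buffer keeps the source type, as in Run() above;
    // the SrcData-to-DstData conversion happens in the store stage.
    SrcData p_src_buffer[kBufferSize];

    load_register_buffer_sketch<SrcData, SrcData>(p_src, p_src_buffer);
    store_register_buffer_sketch<SrcData, DstData>(p_src_buffer, p_dst);
}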
@@ -1179,13 +1179,14 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
     // Will do padding check on src data: Read 0 if src data is in padding area.
     // Will do padding check on dst data: No write if dst data is in padding area.
-    template <typename TData,
+    template <typename SrcData,
+              typename DstData,
               address_space_t SrcAddressSpace = address_space_t::generic,
               address_space_t DstAddressSpace = address_space_t::generic>
-    __device__ void Run_generic(const TData* p_src, TData* p_dst) const
+    __device__ void Run_generic(const SrcData* p_src, DstData* p_dst) const
     {
-        using src_vector_t = typename vector_type<TData, SrcDataPerAccess>::MemoryType;
-        using dst_vector_t = typename vector_type<TData, DstDataPerAccess>::MemoryType;
+        using src_vector_t = typename vector_type<SrcData, SrcDataPerAccess>::MemoryType;
+        using dst_vector_t = typename vector_type<DstData, DstDataPerAccess>::MemoryType;

         constexpr auto vector_access_dim = Number<VectorAccessDim>{};
@@ -1205,13 +1206,13 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
             long_vector_data_begin_id(vector_access_dim) =
                 long_vector_size * long_vector_access_id[vector_access_dim];

-            // buffer to hold a long-vector
-            TData p_long_vector[long_vector_size];
+            // buffer to hold a src long-vector
+            SrcData p_src_long_vector[long_vector_size];

             // zero out buffer
             for(index_t i = 0; i < long_vector_size; ++i)
             {
-                p_long_vector[i] = 0;
+                p_src_long_vector[i] = 0;
             }

             // load data from src to the long-vector buffer
@@ -1231,20 +1232,28 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                 {
                     static_if<SrcAddressSpace == address_space_t::global>{}([&](auto) {
 #if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE
-                        *reinterpret_cast<src_vector_t*>(&p_long_vector[buffer_offset]) =
-                            __buffer_load<TData, SrcDataPerAccess>(p_src, src_coord.GetOffset(), 0);
+                        *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
+                            __buffer_load<SrcData, SrcDataPerAccess>(p_src, src_coord.GetOffset(), 0);
 #else
-                        *reinterpret_cast<src_vector_t*>(&p_long_vector[buffer_offset]) =
+                        *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
                             *reinterpret_cast<const src_vector_t*>(&p_src[src_coord.GetOffset()]);
 #endif
                     }).Else([&](auto) {
                         // src can be all kinds of memory-space.
-                        *reinterpret_cast<src_vector_t*>(&p_long_vector[buffer_offset]) =
+                        *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
                             *reinterpret_cast<const src_vector_t*>(&p_src[src_coord.GetOffset()]);
                     });
                 }
             }

+            // SrcData to DstData conversion
+            DstData p_dst_long_vector[long_vector_size];
+
+            for(index_t i = 0; i < long_vector_size; ++i)
+            {
+                p_dst_long_vector[i] = type_convert<DstData>(p_src_long_vector[i]);
+            }
+
             // store data from the long-vector buffer to dst
             for(index_t i = 0; i < long_vector_size / dst_data_per_access; ++i)
             {
@@ -1262,19 +1271,19 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                 {
                     static_if<DstAddressSpace == address_space_t::global>{}([&](auto) {
 #if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE
-                        __buffer_store<TData, DstDataPerAccess>(
-                            *reinterpret_cast<dst_vector_t*>(&p_long_vector[buffer_offset]),
+                        __buffer_store<DstData, DstDataPerAccess>(
+                            *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]),
                             p_dst,
                             dst_coord.GetOffset(),
                             0);
 #else
                         *reinterpret_cast<dst_vector_t*>(&p_dst[dst_coord.GetOffset()]) =
-                            *reinterpret_cast<dst_vector_t*>(&p_long_vector[buffer_offset]);
+                            *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
 #endif
                     }).Else([&](auto) {
                         // dst can be all kinds of memory-space
                         *reinterpret_cast<dst_vector_t*>(&p_dst[dst_coord.GetOffset()]) =
-                            *reinterpret_cast<dst_vector_t*>(&p_long_vector[buffer_offset]);
+                            *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
                     });
                 }
             }
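Taken together, these hunks make the long-vector path stage loads into a SrcData buffer, convert element-wise into a DstData buffer, and issue the vectorized stores from the converted buffer. Below is a small sketch of just the added conversion step; the real type_convert helper is not shown in this diff, so static_cast is used as a plausible stand-in, and the fixed-size reference parameters are illustrative only.

#include <cstddef>

// Stand-in for the type_convert helper used in the new code above (assumption:
// the repository's actual implementation may differ).
template <typename DstData, typename SrcData>
DstData type_convert_sketch(SrcData v)
{
    return static_cast<DstData>(v);
}

// Convert a fully-loaded source long-vector buffer into the destination type,
// mirroring the loop added in this commit.
template <typename SrcData, typename DstData, std::size_t long_vector_size>
void convert_long_vector_sketch(const SrcData (&p_src_long_vector)[long_vector_size],
                                DstData (&p_dst_long_vector)[long_vector_size])
{
    for(std::size_t i = 0; i < long_vector_size; ++i)
    {
        p_dst_long_vector[i] = type_convert_sketch<DstData>(p_src_long_vector[i]);
    }
}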