"...gpu-base-image-build.git" did not exist on "b073e39e11b253fed15349c9f5a404656181f290"
Commit 00089cd6 authored by Chao Liu

clean up

parent b12bbcee
@@ -396,14 +396,8 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
                         0,
                         b_thread_data_on_global,
                         0})
-#if 0
-                    .Run
-#else // tweaking
-                    .template Run_optimized_address_calculation<Float,
-                                                                address_space_t::generic,
-                                                                address_space_t::global>
-#endif
-                    (p_out_thread, p_out_global);
+                    .template Run<Float, address_space_t::generic, address_space_t::global>(
+                        p_out_thread, p_out_global);
             }
         }
 };
...
@@ -427,10 +427,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
                         b_thread_data_on_global,
                         0})
 #if 1
-                    .template Run_generic<Float,
-                                          Float,
-                                          address_space_t::generic,
-                                          address_space_t::global>
+                    .template Run<Float, Float, address_space_t::generic, address_space_t::global>
 #else // tweaking
                     .template Run_optimized_dst_address_calculation<Float,
                                                                     Float,
...
@@ -391,14 +391,8 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer
             for(index_t nrepeat = 0; nrepeat < GemmNRepeat; ++nrepeat)
             {
                 threadwise_out_copy
-#if 1
-                    .Run
-#else // tweaking
-                    .template Run_optimized_address_calculation<Float,
-                                                                address_space_t::generic,
-                                                                address_space_t::global>
-#endif
-                    (p_out_thread, p_out_global);
+                    .template Run<Float, address_space_t::generic, address_space_t::global>(
+                        p_out_thread, p_out_global);
                 threadwise_out_copy.MoveSrcSliceWindow(Sequence<0, 0, GemmNPerThreadSubC>{}, True);
                 threadwise_out_copy.MoveDstSliceWindow(Sequence<0, 0, B1>{}, True);
...
@@ -390,10 +390,7 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf
                         b_thread_data_on_global / B1,
                         b_thread_data_on_global % B1})
 #if 1
-                    .template Run_generic<Float,
-                                          Float,
-                                          address_space_t::generic,
-                                          address_space_t::global>
+                    .template Run<Float, Float, address_space_t::generic, address_space_t::global>
 #else // tweaking
                     .template Run_optimized_dst_address_calculation<Float,
                                                                     Float,
...
@@ -73,8 +73,8 @@ struct BlockwiseGenericTensorSliceCopy_v4
     __device__ void RunLoadThreadBuffer(const BlockSrcData* p_block_src,
                                         ThreadBufferData* p_thread_buffer) const
     {
-#if 1
-        mThreadwiseLoad.template Run_generic<BlockSrcData,
+#if 0
+        mThreadwiseLoad.template Run<BlockSrcData,
                                              ThreadBufferData,
                                              BlockSrcAddressSpace,
                                              ThreadBufferAddressSpace>(p_block_src,
@@ -95,8 +95,8 @@ struct BlockwiseGenericTensorSliceCopy_v4
     __device__ void RunStoreThreadBuffer(const ThreadBufferData* p_thread_buffer,
                                          BlockDstData* p_block_dst) const
     {
-#if 1
-        mThreadwiseStore.template Run_generic<ThreadBufferData,
+#if 0
+        mThreadwiseStore.template Run<ThreadBufferData,
                                               BlockDstData,
                                               ThreadBufferAddressSpace,
                                               BlockDstAddressSpace>(p_thread_buffer, p_block_dst);
...
@@ -78,7 +78,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
               typename DstData,
               address_space_t SrcAddressSpace = address_space_t::generic,
               address_space_t DstAddressSpace = address_space_t::generic>
-    __device__ void Run_generic(const SrcData* p_src, DstData* p_dst) const
+    __device__ void Run(const SrcData* p_src, DstData* p_dst) const
     {
         using src_vector_t = typename vector_type<SrcData, SrcDataPerAccess>::MemoryType;
         using dst_vector_t = typename vector_type<DstData, DstDataPerAccess>::MemoryType;
@@ -130,7 +130,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
 #if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE
                 *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
                     __buffer_load<SrcData, SrcDataPerAccess>(
-                        p_src, src_coord.GetOffset(), 0);
+                        p_src, 0, src_coord.GetOffset());
 #else
                 *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
                     *reinterpret_cast<const src_vector_t*>(&p_src[src_coord.GetOffset()]);
@@ -172,8 +172,8 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                 __buffer_store<DstData, DstDataPerAccess>(
                     *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]),
                     p_dst,
-                    dst_coord.GetOffset(),
-                    0);
+                    0,
+                    dst_coord.GetOffset());
 #else
                 *reinterpret_cast<dst_vector_t*>(&p_dst[dst_coord.GetOffset()]) =
                     *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
...
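
For reference, the generic "#else" branch in the two ThreadwiseGenericTensorSliceCopy_v4r2 hunks above boils down to a vector-typed copy through reinterpret_cast. Below is a minimal sketch of that pattern only; float4_t and copy_as_vectors are invented names standing in for vector_type<Float, DataPerAccess>::MemoryType and the coordinate-derived offsets, not code from this commit.

    #include <cstddef>

    // clang-style 4-wide float vector, a stand-in for vector_type<float, 4>::MemoryType
    typedef float float4_t __attribute__((ext_vector_type(4)));

    // Copy num_floats floats, 4 per access, the way the generic branch does it.
    // The reinterpret_cast aliasing mirrors what the kernel code itself relies on.
    inline void copy_as_vectors(const float* p_src, float* p_dst, std::size_t num_floats)
    {
        for(std::size_t i = 0; i + 4 <= num_floats; i += 4)
        {
            *reinterpret_cast<float4_t*>(&p_dst[i]) =
                *reinterpret_cast<const float4_t*>(&p_src[i]);
        }
    }
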
@@ -538,235 +538,10 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
         }
     };
template <typename TData>
__device__ void Run(const TData* p_src, TData* p_dst) const
{
constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{});
TData p_buffer_[buffer_desc.GetElementSpace()];
TData* p_buffer = p_buffer_;
// copy data from src into buffer
{
using src_vector_t = typename vector_type<TData, SrcDataPerAccess>::MemoryType;
constexpr auto src_vector_access_dim = Number<SrcVectorAccessDim>{};
constexpr auto src_data_per_access = Number<SrcDataPerAccess>{};
constexpr auto src_access_lengths = SliceLengths::Modify(
src_vector_access_dim,
SliceLengths::Get(src_vector_access_dim) / src_data_per_access);
// Offsets w.r.t. merged dimensions need to be calculated at run-time, while offsets
// w.r.t. normal dimensions are known at compile-time.
// Below is a hack to separate merged-dimension ids from normal-dimension ids, so the
// corresponding offsets can be calculated separately at run-time and compile-time:
// src_merged_dim_access_lengths has the same value as src_access_lengths on src's
// merged dimensions, and has value = 1 on normal dimensions;
// src_normal_dim_access_lengths has the same value as src_access_lengths on src's
// normal dimensions, and has value = 1 on merged dimensions.
constexpr auto src_merged_dim_access_lengths = typename sequence_gen<
nDim,
IsolateMergedDimLengths<SrcDesc, decltype(src_access_lengths)>>::type{};
constexpr auto src_normal_dim_access_lengths =
src_access_lengths + Number<1>{} - src_merged_dim_access_lengths;
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
// offsets w.r.t. merged dimensions need to be computed at run-time
static_ford<decltype(src_merged_dim_access_lengths), SrcDimAccessOrder>{}([&](
auto src_merged_dim_access_id_) {
constexpr auto src_merged_dim_access_id = decltype(src_merged_dim_access_id_){};
constexpr auto src_merged_dim_data_id = src_merged_dim_access_id.Modify(
src_vector_access_dim,
src_merged_dim_access_id[src_vector_access_dim] * src_data_per_access);
const TData* p_src_tmp =
p_src + (mSrcSliceOrigin + src_merged_dim_data_id).GetOffset();
// offset w.r.t. normal dimension can be computed at compile-time
static_ford<decltype(src_normal_dim_access_lengths), SrcDimAccessOrder>{}([&](
auto src_normal_dim_access_id_) {
constexpr auto src_normal_dim_access_id = decltype(src_normal_dim_access_id_){};
constexpr auto src_normal_dim_data_id = src_normal_dim_access_id.Modify(
src_vector_access_dim,
src_normal_dim_access_id[src_vector_access_dim] * src_data_per_access);
constexpr index_t src_normal_offset =
SrcDesc::GetOffsetFromMultiIndex(src_normal_dim_data_id);
// load vector from src
const src_vector_t vector_data =
*reinterpret_cast<const src_vector_t*>(&p_src_tmp[src_normal_offset]);
// unpack vector into buffer
static_for<0, SrcDataPerAccess, 1>{}([&](auto i) {
constexpr auto scalar_id =
typename uniform_sequence_gen<nDim, 0>::type{}.Modify(
src_vector_access_dim, i);
constexpr index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex(
src_merged_dim_data_id + src_normal_dim_data_id + scalar_id);
p_buffer[buffer_offset] = reinterpret_cast<const TData*>(&vector_data)[i];
});
});
});
#else
ford<decltype(src_merged_dim_access_lengths), SrcDimAccessOrder>{}([&](
auto src_merged_dim_access_id) {
auto src_merged_dim_data_id = src_merged_dim_access_id;
src_merged_dim_data_id(src_vector_access_dim) =
src_merged_dim_access_id[src_vector_access_dim] * src_data_per_access;
const TData* p_src_tmp =
p_src + (mSrcSliceOrigin + src_merged_dim_data_id).GetOffset();
// these should be compile-time known
ford<decltype(src_normal_dim_access_lengths), SrcDimAccessOrder>{}([&](
auto src_normal_dim_access_id) {
auto src_normal_dim_data_id = src_normal_dim_access_id;
src_normal_dim_data_id(src_vector_access_dim) =
src_normal_dim_access_id[src_vector_access_dim] * src_data_per_access;
const index_t src_normal_offset =
SrcDesc::GetOffsetFromMultiIndex(src_normal_dim_data_id);
// load vector from src
const src_vector_t vector_data =
*reinterpret_cast<const src_vector_t*>(&p_src_tmp[src_normal_offset]);
// unpack vector into buffer
for(index_t i = 0; i < SrcDataPerAccess; ++i)
{
auto scalar_id = make_zero_array<index_t, nDim>();
scalar_id(src_vector_access_dim) = i;
const index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex(
src_merged_dim_data_id + src_normal_dim_data_id + scalar_id);
p_buffer[buffer_offset] = reinterpret_cast<const TData*>(&vector_data)[i];
}
});
});
#endif
}
// copy data from buffer into dst
{
using dst_vector_t = typename vector_type<TData, DstDataPerAccess>::MemoryType;
constexpr auto dst_vector_access_dim = Number<DstVectorAccessDim>{};
constexpr auto dst_data_per_access = Number<DstDataPerAccess>{};
constexpr auto dst_access_lengths = SliceLengths::Modify(
dst_vector_access_dim,
SliceLengths::Get(dst_vector_access_dim) / dst_data_per_access);
constexpr auto dst_merged_dim_access_lengths = typename sequence_gen<
nDim,
IsolateMergedDimLengths<DstDesc, decltype(dst_access_lengths)>>::type{};
constexpr auto dst_normal_dim_access_lengths =
dst_access_lengths + Number<1>{} - dst_merged_dim_access_lengths;
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
// offsets w.r.t. merged dimensions need to be computed at run-time
static_ford<decltype(dst_merged_dim_access_lengths), DstDimAccessOrder>{}([&](
auto dst_merged_dim_access_id_) {
constexpr auto dst_merged_dim_access_id = decltype(dst_merged_dim_access_id_){};
constexpr auto dst_merged_dim_data_id = dst_merged_dim_access_id.Modify(
dst_vector_access_dim,
dst_merged_dim_access_id[dst_vector_access_dim] * dst_data_per_access);
TData* p_dst_tmp = p_dst + (mDstSliceOrigin + dst_merged_dim_data_id).GetOffset();
// offset w.r.t. normal dimension can be computed at compile-time
static_ford<decltype(dst_normal_dim_access_lengths), DstDimAccessOrder>{}([&](
auto dst_normal_dim_access_id_) {
constexpr auto dst_normal_dim_access_id = decltype(dst_normal_dim_access_id_){};
constexpr auto dst_normal_dim_data_id = dst_normal_dim_access_id.Modify(
dst_vector_access_dim,
dst_normal_dim_access_id[dst_vector_access_dim] * dst_data_per_access);
dst_vector_t vector_data;
// pack vector from buffer
static_for<0, DstDataPerAccess, 1>{}([&](auto i) {
constexpr auto scalar_id =
typename uniform_sequence_gen<nDim, 0>::type{}.Modify(
dst_vector_access_dim, i);
constexpr index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex(
dst_merged_dim_data_id + dst_normal_dim_data_id + scalar_id);
reinterpret_cast<TData*>(&vector_data)[i] = p_buffer[buffer_offset];
});
constexpr index_t dst_normal_offset =
DstDesc::GetOffsetFromMultiIndex(dst_normal_dim_data_id);
// write vector into dst
*reinterpret_cast<dst_vector_t*>(&p_dst_tmp[dst_normal_offset]) = vector_data;
});
});
#else
// offsets w.r.t. merged dimensions need to be computed at run-time
ford<decltype(dst_merged_dim_access_lengths), DstDimAccessOrder>{}([&](
auto dst_merged_dim_access_id) {
auto dst_merged_dim_data_id = dst_merged_dim_access_id;
dst_merged_dim_data_id(dst_vector_access_dim) =
dst_merged_dim_access_id[dst_vector_access_dim] * dst_data_per_access;
TData* p_dst_tmp = p_dst + (mDstSliceOrigin + dst_merged_dim_data_id).GetOffset();
// offset w.r.t. normal dimension can be computed at compile-time
ford<decltype(dst_normal_dim_access_lengths), DstDimAccessOrder>{}([&](
auto dst_normal_dim_access_id) {
auto dst_normal_dim_data_id = dst_normal_dim_access_id;
dst_normal_dim_data_id(dst_vector_access_dim) =
dst_normal_dim_access_id[dst_vector_access_dim] * dst_data_per_access;
dst_vector_t vector_data;
// pack vector from buffer
for(index_t i = 0; i < DstDataPerAccess; ++i)
{
auto scalar_id = make_zero_array<index_t, nDim>();
scalar_id(dst_vector_access_dim) = i;
const index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex(
dst_merged_dim_data_id + dst_normal_dim_data_id + scalar_id);
reinterpret_cast<TData*>(&vector_data)[i] = p_buffer[buffer_offset];
}
const index_t dst_normal_offset =
DstDesc::GetOffsetFromMultiIndex(dst_normal_dim_data_id);
// write vector into dst
*reinterpret_cast<dst_vector_t*>(&p_dst_tmp[dst_normal_offset]) = vector_data;
});
});
#endif
}
}
     template <typename TData,
               address_space_t SrcAddressSpace = address_space_t::generic,
               address_space_t DstAddressSpace = address_space_t::generic>
-    __device__ void Run_optimized_address_calculation(const TData* p_src, TData* p_dst) const
+    __device__ void Run(const TData* p_src, TData* p_dst) const
     {
         constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{});
@@ -841,9 +616,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
                 static_if<SrcAddressSpace == address_space_t::global>{}([&](auto) {
 #if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE
                     vector_data = __buffer_load<TData, SrcDataPerAccess>(
-                        p_src,
-                        static_cast<uint32_t>(src_merged_offset),
-                        static_cast<uint32_t>(src_normal_offset));
+                        p_src, src_merged_offset, src_normal_offset);
 #else
                     vector_data = *reinterpret_cast<const src_vector_t*>(
                         &p_src[src_normal_offset + src_merged_offset]);
...
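
One idea behind the large Run() body removed above (and behind the surviving Run overload) is that each access index is split into a merged-dimension part, whose offset must be computed at run time, and a normal-dimension part, whose offset is a compile-time constant; the two offsets are then simply added. Below is a minimal sketch of that decomposition with made-up names, and a plain run-time stride standing in for the div/mod arithmetic a real merged dimension needs; it is an illustration, not code from this commit.

    #include <cstddef>

    // Compile-time portion: the stride is a template parameter, so the offset folds to a
    // constant whenever the index is a constant expression (plays the role of
    // SrcDesc::GetOffsetFromMultiIndex on normal dimensions).
    template <std::size_t Stride>
    constexpr std::size_t normal_offset(std::size_t idx)
    {
        return idx * Stride;
    }

    // Run-time portion: stands in for the offset of a merged dimension, which the real code
    // obtains from (mSrcSliceOrigin + src_merged_dim_data_id).GetOffset().
    inline std::size_t merged_offset(std::size_t idx, std::size_t runtime_stride)
    {
        return idx * runtime_stride;
    }

    inline const float* element_ptr(const float* p_src,
                                    std::size_t normal_idx,
                                    std::size_t merged_idx,
                                    std::size_t merged_stride)
    {
        // total offset = compile-time part + run-time part
        return p_src + normal_offset<8>(normal_idx) + merged_offset(merged_idx, merged_stride);
    }
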
@@ -448,7 +448,7 @@ int main(int argc, char* argv[])
                                                          ConvStrides{},
                                                          ConvDilations{},
                                                          nrepeat);
-#elif 0
+#elif 1
     device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(in_nchw_desc,
                                                                 in_nchw,
                                                                 wei_kcyx_desc,
@@ -490,7 +490,7 @@ int main(int argc, char* argv[])
                                                          ConvStrides{},
                                                          ConvDilations{},
                                                          nrepeat);
-#elif 1
+#elif 0
     device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded(in_nchw_desc,
                                                                 in_nchw,
                                                                 wei_kcyx_desc,
...