Commit 6c2c50b0 authored by Chao Liu


done: explicitly separate offset component into compile-time, block-invariant and per-thread components. Experimenting
parent 51884fc2
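
To make the intent of the commit concrete, here is a hypothetical, standalone sketch (none of these names are the library's, and the real descriptors also keep a separate block-invariant component, which this toy collapses into the run-time part): the total offset of an access is split into a run-time per-thread component and a compile-time component, so only the cheap compile-time part varies inside the inner copy loop.

// Hypothetical standalone sketch, not the library's API.
#include <cstdio>

template <int Stride, int LinearStep>
struct SplitOffset
{
    // compile-time component: depends only on template parameters
    static constexpr int linear_offset = LinearStep * Stride;

    // run-time, per-thread component: depends on the thread's slice origin
    static int nonlinear_offset(int thread_origin) { return thread_origin * Stride; }
};

int main()
{
    using S = SplitOffset</*Stride=*/8, /*LinearStep=*/3>;
    const int thread_origin = 5; // would come from the thread/block id on the GPU
    const int total = S::nonlinear_offset(thread_origin) + S::linear_offset;
    std::printf("total offset = %d\n", total); // 5*8 + 3*8 = 64
    return 0;
}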
......@@ -438,7 +438,14 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
0,
b_thread_data_on_global,
0})
.template Run_amd_experiment<Float, 0, 2>(p_out_thread, p_out_global);
#if 0
.Run_generic
#elif 1
.template Run_generic<Float, address_space_t::generic, address_space_t::global>
#elif 1
.template Run_optimized_dst_address_calculation<Float, address_space_t::global>
#endif
(p_out_thread, p_out_global);
}
}
};
......
......@@ -742,12 +742,15 @@ struct BlockwiseGenericTensorSliceCopy_v4
__device__ void RunLoadRegisterBuffer(const TData* p_src, TData* p_buffer) const
{
#if 0
mThreadwiseLoad.Run(p_src, p_buffer);
mThreadwiseLoad.Run_generic(p_src, p_buffer);
#elif 1
mThreadwiseLoad.Run_access_order_optimized_for_source_index_calculation(p_src, p_buffer);
#elif 0
// hardcoded: global to register
mThreadwiseLoad.template Run_amd_experiment<TData, 2, 0>(p_src, p_buffer);
// hardcoded: src is global memory
mThreadwiseLoad.template Run_generic<TData, address_space_t::global>(p_src, p_buffer);
#elif 1
// hardcoded: src is global memory
mThreadwiseLoad
.template Run_optimized_src_address_calculation<TData, address_space_t::global>(
p_src, p_buffer);
#endif
}
......@@ -755,10 +758,15 @@ struct BlockwiseGenericTensorSliceCopy_v4
__device__ void RunStoreRegisterBuffer(const TData* p_buffer, TData* p_dst) const
{
#if 0
mThreadwiseStore.Run(p_buffer, p_dst);
mThreadwiseStore.Run_generic(p_buffer, p_dst);
#elif 1
// hardcoded: register to LDS
mThreadwiseStore.template Run_amd_experiment<TData, 0, 1>(p_buffer, p_dst);
// hardcoded: dst is lds
mThreadwiseStore.template Run_generic<TData, address_space_t::lds>(p_buffer, p_dst);
#elif 1
// hardcoded: dst is lds
mThreadwiseStore
.template Run_optimized_dst_address_calculation<TData, address_space_t::lds>(p_buffer,
p_dst);
#endif
}
......
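The RunLoadRegisterBuffer/RunStoreRegisterBuffer changes above hard-code the address space (global for the load, LDS for the store) as a template argument. A self-contained sketch of that compile-time dispatch, assuming illustrative names and using `if constexpr` in place of the library's `static_if` and device intrinsics:

// Hypothetical sketch; enum values mirror the ones added in this commit.
#include <cstdio>

enum address_space_t
{
    generic = 0,
    vgpr    = 1,
    lds     = 2,
    global  = 3
};

// Copy one element, choosing the code path from the address-space tag at compile time.
template <typename T, address_space_t AddrSpace = generic>
void copy_one(const T* p_src, T* p_dst)
{
    if constexpr(AddrSpace == global)
    {
        *p_dst = *p_src; // the real device path would issue a buffer_load/buffer_store here
    }
    else
    {
        *p_dst = *p_src; // generic / vgpr / lds: plain dereference
    }
}

int main()
{
    float src = 1.0f, buf = 0.0f, dst = 0.0f;
    copy_one<float, global>(&src, &buf); // cf. Run_generic<TData, address_space_t::global>
    copy_one<float, lds>(&buf, &dst);    // cf. Run_generic<TData, address_space_t::lds>
    std::printf("%f\n", dst);            // prints 1.000000
    return 0;
}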
......@@ -21,10 +21,6 @@
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0
#endif
#ifndef CK_EXPERIMENTAL_USE_AMD_BUFFER_LOAD_STORE_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
#define CK_EXPERIMENTAL_USE_AMD_BUFFER_LOAD_STORE_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0
#endif
namespace ck {
// This threadwise copy allows vector access of src and dst.
......@@ -36,11 +32,11 @@ namespace ck {
// device memory or LDS.
// When copying a large amount of data, let's hope the compiler will reduce the registers
// used for the buffer.
template <class SrcDesc,
class DstDesc,
class SliceLengths,
class SrcDimAccessOrder,
class DstDimAccessOrder,
template <typename SrcDesc,
typename DstDesc,
typename SliceLengths,
typename SrcDimAccessOrder,
typename DstDimAccessOrder,
index_t SrcVectorAccessDim,
index_t DstVectorAccessDim,
index_t SrcDataPerAccess,
......@@ -114,7 +110,7 @@ struct ThreadwiseGenericTensorSliceCopy_v1r1
mDstSliceOrigin = dst_slice_origin;
}
template <class TData>
template <typename TData>
__device__ void Run(const TData* p_src, TData* p_dst) const
{
constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{});
......@@ -262,10 +258,10 @@ struct ThreadwiseGenericTensorSliceCopy_v1r1
// The dimension access order should be the same on src and dst.
// It is designed for cases where one of src and dst is a register, and
// the other is device memory or LDS
template <class SrcDesc,
class DstDesc,
class SliceLengths,
class DimAccessOrder,
template <typename SrcDesc,
typename DstDesc,
typename SliceLengths,
typename DimAccessOrder,
index_t VectorAccessDim,
index_t SrcDataPerAccess,
index_t DstDataPerAccess>
......@@ -328,7 +324,7 @@ struct ThreadwiseGenericTensorSliceCopy_v1r2
mDstSliceOrigin = dst_slice_origin;
}
template <class TData>
template <typename TData>
__device__ void Run(const TData* p_src, TData* p_dst) const
{
using src_vector_t = typename vector_type<TData, SrcDataPerAccess>::MemoryType;
......@@ -443,11 +439,11 @@ struct ThreadwiseGenericTensorSliceCopy_v1r2
// device memory or LDS.
// When copying a large amount of data, let's hope the compiler will reduce the registers
// used for the buffer.
template <class SrcDesc,
class DstDesc,
class SliceLengths,
class SrcDimAccessOrder,
class DstDimAccessOrder,
template <typename SrcDesc,
typename DstDesc,
typename SliceLengths,
typename SrcDimAccessOrder,
typename DstDimAccessOrder,
index_t SrcVectorAccessDim,
index_t DstVectorAccessDim,
index_t SrcDataPerAccess,
......@@ -526,17 +522,17 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
mDstSliceOrigin = dst_slice_origin;
}
template <class TDesc, class Lengths>
template <typename TDesc, class Lengths>
struct IsolateMergedDimLengths
{
template <class IDim>
template <typename IDim>
__device__ constexpr index_t operator()(IDim idim) const
{
return TDesc::ContainMultipleOriginalDimensions(idim) ? Lengths{}[idim] : 1;
}
};
template <class TData>
template <typename TData>
__device__ void Run(const TData* p_src, TData* p_dst) const
{
constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{});
......@@ -765,7 +761,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
// 0: VGPR
// 1: LDS
// 2: global-memory
template <class TData, index_t SrcMemorySpace, index_t DstMemorySpace>
template <typename TData, index_t SrcMemorySpace, index_t DstMemorySpace>
__device__ void Run_amd_experiment(const TData* p_src, TData* p_dst) const
{
constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{});
......@@ -839,8 +835,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
// 2. src_normal_offset must be calculated at compile time (guaranteed)
// 3. src_merged_offset can be a runtime value (no assumption imposed)
static_if<SrcMemorySpace == 2>{}([&](auto) {
#if CK_USE_AMD_INTRINSIC && \
CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
#if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE
vector_data = __buffer_load<TData, SrcDataPerAccess>(
p_src,
static_cast<uint32_t>(src_merged_offset),
......@@ -940,8 +935,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
// 2. dst_normal_offset must be calculated at compile time (guaranteed)
// 3. dst_merged_offset can be a runtime value (no assumption imposed)
static_if<DstMemorySpace == 2>{}([&](auto) {
#if CK_USE_AMD_INTRINSIC && \
CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
#if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE
__buffer_store<TData, DstDataPerAccess>(
vector_data, p_dst, dst_merged_offset, dst_normal_offset);
#else
......@@ -959,7 +953,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
}
// T can be Sequence or Array
template <class T, bool PositiveDirection>
template <typename T, bool PositiveDirection>
__device__ void MoveSrcSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
{
static_if<PositiveDirection>{}([&](auto) {
......@@ -967,7 +961,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
}).Else([&](auto) { mSrcSliceOrigin -= step_sizes; });
}
template <class T, bool PositiveDirection>
template <typename T, bool PositiveDirection>
__device__ void MoveDstSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
{
static_if<PositiveDirection>{}([&](auto) {
......@@ -981,11 +975,11 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
};
// this version uses TensorView and TensorCoordinate
template <class SrcTensor,
class DstTensor,
class SliceLengths,
class SrcDimAccessOrder,
class DstDimAccessOrder,
template <typename SrcTensor,
typename DstTensor,
typename SliceLengths,
typename SrcDimAccessOrder,
typename DstDimAccessOrder,
index_t SrcVectorAccessDim,
index_t DstVectorAccessDim,
index_t SrcDataPerAccess,
......@@ -1105,13 +1099,13 @@ struct ThreadwiseGenericTensorSliceCopy_v3r1
}
// T can be Sequence or Array
template <class T, bool PositiveDirection>
template <typename T, bool PositiveDirection>
__device__ void MoveSrcSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
{
mSrc.MoveSliceWindow(mSrcSlice, step_sizes, integral_constant<bool, PositiveDirection>{});
}
template <class T, bool PositiveDirection>
template <typename T, bool PositiveDirection>
__device__ void MoveDstSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
{
mDst.MoveSliceWindow(mDstSlice, step_sizes, integral_constant<bool, PositiveDirection>{});
......@@ -1187,8 +1181,12 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
mDstSliceOrigin = dst_slice_origin;
}
template <class TData>
__device__ void Run(const TData* p_src, TData* p_dst) const
// Will do padding check on src data: Read 0 if src data is in the padding area.
// Will do padding check on dst data: No write if dst data is in the padding area.
template <typename TData,
address_space_t SrcAddressSpace = address_space_t::generic,
address_space_t DstAddressSpace = address_space_t::generic>
__device__ void Run_generic(const TData* p_src, TData* p_dst) const
{
using src_vector_t = typename vector_type<TData, SrcDataPerAccess>::MemoryType;
using dst_vector_t = typename vector_type<TData, DstDataPerAccess>::MemoryType;
......@@ -1214,7 +1212,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
// buffer to hold a long-vector
TData p_long_vector[long_vector_size];
// set 0
// zero out buffer
for(index_t i = 0; i < long_vector_size; ++i)
{
p_long_vector[i] = 0;
......@@ -1226,18 +1224,29 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
auto scalar_id = make_zero_array<index_t, nDim>();
scalar_id(vector_access_dim) = i * src_data_per_access;
const index_t buffer_offset = i * src_data_per_access;
const auto src_coord = mSrcSliceOrigin + (long_vector_data_begin_id + scalar_id);
// check for padding
// TODO: still kind of messy
// Check the src vector's padding situation; only the first element of this src
// vector is checked. It is the user's responsibility to make sure all data in
// the src vector has the same padding situation.
// TODO: not sure a dedicated IsAnyLevelIndexInPaddingArea() function is necessary
if(!src_coord.IsAnyLevelIndexInPaddingArea())
{
const index_t src_offset = src_coord.GetOffset();
const index_t buffer_offset = i * src_data_per_access;
static_if<SrcAddressSpace == address_space_t::global>{}([&](auto) {
#if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE
*reinterpret_cast<src_vector_t*>(&p_long_vector[buffer_offset]) =
*reinterpret_cast<const src_vector_t*>(&p_src[src_offset]);
__buffer_load<TData, SrcDataPerAccess>(p_src, src_coord.GetOffset(), 0);
#else
*reinterpret_cast<src_vector_t*>(&p_long_vector[buffer_offset]) =
*reinterpret_cast<const src_vector_t*>(&p_src[src_coord.GetOffset()]);
#endif
}).Else([&](auto) {
// src can be in any memory-space.
*reinterpret_cast<src_vector_t*>(&p_long_vector[buffer_offset]) =
*reinterpret_cast<const src_vector_t*>(&p_src[src_coord.GetOffset()]);
});
}
}
......@@ -1249,24 +1258,53 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
const index_t buffer_offset = i * dst_data_per_access;
const index_t dst_offset =
(mDstSliceOrigin + (long_vector_data_begin_id + scalar_id)).GetOffset();
const auto dst_coord = mDstSliceOrigin + (long_vector_data_begin_id + scalar_id);
*reinterpret_cast<dst_vector_t*>(&p_dst[dst_offset]) =
// Check the dst vector's padding situation; only the first element of this dst
// vector is checked. It is the user's responsibility to make sure all data in
// the dst vector has the same padding situation.
// TODO: not sure a dedicated IsAnyLevelIndexInPaddingArea() function is necessary
#if 0 // tuning
if(!dst_coord.IsAnyLevelIndexInPaddingArea())
#endif
{
static_if<DstAddressSpace == address_space_t::global>{}([&](auto) {
#if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE
__buffer_store<TData, DstDataPerAccess>(
*reinterpret_cast<dst_vector_t*>(&p_long_vector[buffer_offset]),
p_dst,
dst_coord.GetOffset(),
0);
#else
*reinterpret_cast<dst_vector_t*>(&p_dst[dst_coord.GetOffset()]) =
*reinterpret_cast<dst_vector_t*>(&p_long_vector[buffer_offset]);
#endif
}).Else([&](auto) {
// dst can be in any memory-space.
*reinterpret_cast<dst_vector_t*>(&p_dst[dst_coord.GetOffset()]) =
*reinterpret_cast<dst_vector_t*>(&p_long_vector[buffer_offset]);
});
}
}
});
}
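A minimal, hypothetical sketch of the padding behaviour Run_generic's comments describe (zero-fill the buffer first, then skip the load when the vector's first element lies in the padding area); the function name and signature below are illustrative only:

#include <array>

// Load N contiguous elements, returning zeros when the access falls in the padding area.
template <int N, typename T>
std::array<T, N> load_with_padding_check(const T* p_src, int offset, bool in_padding_area)
{
    std::array<T, N> buf{}; // zero-initialised, like p_long_vector
    if(!in_padding_area)    // cf. !src_coord.IsAnyLevelIndexInPaddingArea()
    {
        for(int i = 0; i < N; ++i)
        {
            buf[i] = p_src[offset + i];
        }
    }
    return buf; // all zeros when the access was in the padding area
}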
// Modify a Length to 1 if the corresponding Mask entry is false.
// Used for isolating linear dimensions from non-linear dimensions.
template <index_t... Lengths, index_t... Mask>
__device__ static constexpr auto mask_lengths(Sequence<Lengths...>, Sequence<Mask...>)
{
return Sequence<(Mask ? Lengths : 1)...>{};
}
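For illustration, a standalone version of mask_lengths with a toy Sequence (assumed names; the real ck::Sequence has more machinery), showing how a mask of {1, 0, 1} isolates the masked-off dimension by collapsing its length to 1:

#include <type_traits>

template <int... Is>
struct Seq
{
};

// Keep a length where the mask is non-zero, collapse it to 1 otherwise.
template <int... Lengths, int... Mask>
constexpr auto toy_mask_lengths(Seq<Lengths...>, Seq<Mask...>)
{
    return Seq<(Mask ? Lengths : 1)...>{};
}

// Lengths {4, 8, 2} with mask {1, 0, 1}: dimension 1 is collapsed to 1.
static_assert(std::is_same_v<decltype(toy_mask_lengths(Seq<4, 8, 2>{}, Seq<1, 0, 1>{})),
                             Seq<4, 1, 2>>,
              "masked lengths");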
template <class TData>
__device__ void Run_access_order_optimized_for_source_index_calculation(const TData* p_src,
TData* p_dst) const
// p_src must be in global-memory; p_dst can be in any memory-space.
// The user should make sure p_src is a block-invariant pointer, because
// buffer_load is used for loading from global-memory into the register buffer.
// Will do padding check on src data: Read 0 if src data is in the padding area.
// Will do padding check on dst data: No write if dst data is in the padding area.
// This version is optimized for address calculation of the src tensor.
template <typename TData, address_space_t SrcAddressSpace = address_space_t::generic>
__device__ void Run_optimized_src_address_calculation(const TData* p_src, TData* p_dst) const
{
using src_vector_t = typename vector_type<TData, SrcDataPerAccess>::MemoryType;
using dst_vector_t = typename vector_type<TData, DstDataPerAccess>::MemoryType;
......@@ -1281,11 +1319,16 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
constexpr auto long_vector_access_lengths = SliceLengths::Modify(
vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size);
// TODO:: don't use hack
// TODO: stop using this hack once TransformedTensorDescriptor::GetLinearDimensionMask()
// is implemented
constexpr auto src_linear_dim_mask = SrcLinearDimensionMask{};
constexpr auto src_nonlinear_dim_mask = SrcNonLinearDimensionMask{};
// separate steps into linear and non-linear components
static_assert(
src_linear_dim_mask.At(VectorAccessDim) || long_vector_size == SrcDataPerAccess,
"Warning! VectorAccessDim is not SrcDesc's linear dimension, performance would drop");
// separate steps into linear and non-linear components, according to the src tensor
constexpr auto linear_long_vector_access_lengths =
mask_lengths(long_vector_access_lengths, src_linear_dim_mask);
......@@ -1293,20 +1336,21 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
mask_lengths(long_vector_access_lengths, src_nonlinear_dim_mask);
// loop over src's non-linear dimensions
ford<decltype(nonlinear_long_vector_access_lengths)>{}(
[&](auto nonlinear_dim_long_vector_access_id) {
ford<decltype(nonlinear_long_vector_access_lengths)>{}([&](
auto nonlinear_dim_long_vector_access_id) {
// step-sizes along src's nonlinear dimensions
// calculate step-sizes along src's nonlinear dimensions
auto nonlinear_dim_data_steps = nonlinear_dim_long_vector_access_id;
nonlinear_dim_data_steps(vector_access_dim) =
long_vector_size * nonlinear_dim_long_vector_access_id[vector_access_dim];
// move src coordinate along nonlinear dimensions
// this coordinate contains the run-time per-thread offset
const auto src_nonlinear_coord = mSrcSliceOrigin + nonlinear_dim_data_steps;
// loop over src's linear dimensions
ford<decltype(linear_long_vector_access_lengths)>{}(
[&](auto linear_dim_long_vector_access_id) {
ford<decltype(linear_long_vector_access_lengths)>{}([&](
auto linear_dim_long_vector_access_id) {
// step-sizes along src's linear dimensions
auto linear_dim_data_steps = linear_dim_long_vector_access_id;
......@@ -1316,36 +1360,57 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
// buffer to hold a long-vector
TData p_long_vector[long_vector_size];
// set 0
// zero out buffer
for(index_t i = 0; i < long_vector_size; ++i)
{
p_long_vector[i] = 0;
}
// load data from src to the long-vector buffer
// Loop over VectorAccessDim, and load data from src to the
// long-vector buffer.
// If VectorAccessDim is src's linear dimension, then src's
// offset-diff due to this looping is known at compile-time. If
// VectorAccessDim is src's nonlinear dimension, then src's
// offset-diff due to this looping is only known at run-time. For best
// performance, VectorAccessDim should be src's linear dimension.
for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i)
{
auto scalar_id = make_zero_array<index_t, nDim>();
scalar_id(vector_access_dim) = i * src_data_per_access;
const index_t buffer_offset = i * src_data_per_access;
// move src coordinate along linear dimensions
const auto src_coord =
src_nonlinear_coord + (linear_dim_data_steps + scalar_id);
// TODO: good implementation?
const index_t src_linear_offset_diff =
// this is the src compile-time offset component
// TODO: is this a good implementation?
const index_t src_linear_offset =
src_coord.GetOffset() - src_nonlinear_coord.GetOffset();
// check for padding
// TODO: still kind of messy
// Check the src vector's padding situation; only the first element of
// this src vector is checked. It is the user's responsibility to make sure
// all data in the src vector has the same padding situation.
// TODO: not sure a dedicated IsAnyLevelIndexInPaddingArea() function is
// necessary
if(!src_coord.IsAnyLevelIndexInPaddingArea())
{
const index_t src_offset = src_coord.GetOffset();
const index_t buffer_offset = i * src_data_per_access;
static_if<SrcAddressSpace == address_space_t::global>{}([&](auto) {
#if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE
*reinterpret_cast<src_vector_t*>(&p_long_vector[buffer_offset]) =
*reinterpret_cast<const src_vector_t*>(&p_src[src_offset]);
__buffer_load<TData, SrcDataPerAccess>(
p_src, src_nonlinear_coord.GetOffset(), src_linear_offset);
#else
*reinterpret_cast<src_vector_t*>(&p_long_vector[buffer_offset]) =
*reinterpret_cast<const src_vector_t*>(
&p_src[src_nonlinear_coord.GetOffset() + src_linear_offset]);
#endif
}).Else([&](auto) {
*reinterpret_cast<src_vector_t*>(&p_long_vector[buffer_offset]) =
*reinterpret_cast<const src_vector_t*>(
&p_src[src_nonlinear_coord.GetOffset() + src_linear_offset]);
});
}
}
......@@ -1357,24 +1422,36 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
const index_t buffer_offset = i * dst_data_per_access;
const index_t dst_offset =
(mDstSliceOrigin +
(nonlinear_dim_data_steps + linear_dim_data_steps + scalar_id))
.GetOffset();
*reinterpret_cast<dst_vector_t*>(&p_dst[dst_offset]) =
// dst offset is calculated here, without explicitly separating it into
// compile-time and per-thread components
const auto dst_coord = mDstSliceOrigin + (nonlinear_dim_data_steps +
linear_dim_data_steps + scalar_id);
// Check the dst vector's padding situation; only the first element of
// this dst vector is checked. It is the user's responsibility to make sure
// all data in the dst vector has the same padding situation.
// TODO: not sure a dedicated IsAnyLevelIndexInPaddingArea() function is
// necessary
#if 0 // tuning
if(!dst_coord.IsAnyLevelIndexInPaddingArea())
#endif
{
*reinterpret_cast<dst_vector_t*>(&p_dst[dst_coord.GetOffset()]) =
*reinterpret_cast<dst_vector_t*>(&p_long_vector[buffer_offset]);
}
}
});
});
}
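Why src_linear_offset above may be treated as a compile-time quantity, sketched on a purely strided toy descriptor (the real transformed descriptors are more general, so this is only an illustration): the run-time per-thread origin cancels out of the offset difference, leaving only the step contribution.

// Toy 1-D strided descriptor: offset(idx) = idx * stride.
constexpr int stride = 16;

constexpr int toy_offset(int idx) { return idx * stride; }

// Offset difference between (origin + step) and origin along a linear dimension:
// the run-time origin cancels, so the result depends only on the compile-time step.
constexpr int linear_offset_diff(int origin, int step)
{
    return toy_offset(origin + step) - toy_offset(origin); // == step * stride
}

static_assert(linear_offset_diff(/*origin=*/7, /*step=*/3) == 3 * stride, "");
static_assert(linear_offset_diff(/*origin=*/100, /*step=*/3) == 3 * stride, "");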
// memory-space
// 0: VGPR
// 1: LDS
// 2: global-memory
template <class TData, index_t SrcMemorySpace, index_t DstMemorySpace>
__device__ void Run_amd_experiment(const TData* p_src, TData* p_dst) const
// p_src can be in any memory-space; p_dst must be in global-memory.
// The user should make sure p_dst is a block-invariant pointer, because
// buffer_store is used for storing data from the register buffer into global-memory.
// Will do padding check on src data: Read 0 if src data is in the padding area.
// Will do padding check on dst data: No write if dst data is in the padding area.
// This version is optimized for address calculation of the dst tensor.
template <typename TData, address_space_t DstAddressSpace = address_space_t::generic>
__device__ void Run_optimized_dst_address_calculation(const TData* p_src, TData* p_dst) const
{
using src_vector_t = typename vector_type<TData, SrcDataPerAccess>::MemoryType;
using dst_vector_t = typename vector_type<TData, DstDataPerAccess>::MemoryType;
......@@ -1389,54 +1466,81 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
constexpr auto long_vector_access_lengths = SliceLengths::Modify(
vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size);
ford<decltype(long_vector_access_lengths), DimAccessOrder>{}([&](
auto long_vector_access_id) {
// TODO: stop using this hack once TransformedTensorDescriptor::GetLinearDimensionMask()
// is implemented
constexpr auto dst_linear_dim_mask = DstLinearDimensionMask{};
constexpr auto dst_nonlinear_dim_mask = DstNonLinearDimensionMask{};
// data id w.r.t slicing-window
auto long_vector_data_begin_id = long_vector_access_id;
long_vector_data_begin_id(vector_access_dim) =
long_vector_size * long_vector_access_id[vector_access_dim];
static_assert(
dst_linear_dim_mask.At(VectorAccessDim) || long_vector_size == DstDataPerAccess,
"Warning! VectorAccessDim is not DstDesc's linear dimension, performance would drop");
// separate steps into linear and non-linear components, according to the dst tensor
constexpr auto linear_long_vector_access_lengths =
mask_lengths(long_vector_access_lengths, dst_linear_dim_mask);
constexpr auto nonlinear_long_vector_access_lengths =
mask_lengths(long_vector_access_lengths, dst_nonlinear_dim_mask);
// loop over dst's non-linear dimensions
ford<decltype(nonlinear_long_vector_access_lengths)>{}([&](
auto nonlinear_dim_long_vector_access_id) {
// calculate step-sizes along dst's nonlinear dimensions
auto nonlinear_dim_data_steps = nonlinear_dim_long_vector_access_id;
nonlinear_dim_data_steps(vector_access_dim) =
long_vector_size * nonlinear_dim_long_vector_access_id[vector_access_dim];
// move dst coordinate along nonlinear dimensions
// this coordinate contains the run-time per-thread offset
const auto dst_nonlinear_coord = mDstSliceOrigin + nonlinear_dim_data_steps;
// loop over dst's linear dimensions
ford<decltype(linear_long_vector_access_lengths)>{}([&](
auto linear_dim_long_vector_access_id) {
// step-sizes along dst's linear dimensions
auto linear_dim_data_steps = linear_dim_long_vector_access_id;
linear_dim_data_steps(vector_access_dim) =
long_vector_size * linear_dim_long_vector_access_id[vector_access_dim];
// buffer to hold a long-vector
TData p_long_vector[long_vector_size];
// set 0
// zero out buffer
for(index_t i = 0; i < long_vector_size; ++i)
{
p_long_vector[i] = 0;
}
// load data from src to the long-vector buffer
// Loop over VectorAccessDim, and load data from src to the
// long-vector buffer.
// If VectorAccessDim is dst's linear dimension, then dst's
// offset-diff due to this looping is known at compile-time. If
// VectorAccessDim is dst's nonlinear dimension, then dst's
// offset-diff due to this looping is only known at run-time. For best
// performance, VectorAccessDim should be dst's linear dimension.
for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i)
{
auto scalar_id = make_zero_array<index_t, nDim>();
scalar_id(vector_access_dim) = i * src_data_per_access;
const auto src_coord = mSrcSliceOrigin + (long_vector_data_begin_id + scalar_id);
const index_t buffer_offset = i * src_data_per_access;
// check for padding
// TODO: still kind of messy
// src offset is calculated here, without explicitly separating it into
// compile-time and per-thread components
const auto src_coord = mSrcSliceOrigin + (nonlinear_dim_data_steps +
linear_dim_data_steps + scalar_id);
// Check the src vector's padding situation; only the first element of
// this src vector is checked. It is the user's responsibility to make sure
// all data in the src vector has the same padding situation.
// TODO: not sure a dedicated IsAnyLevelIndexInPaddingArea() function is
// necessary
if(!src_coord.IsAnyLevelIndexInPaddingArea())
{
const index_t src_offset = src_coord.GetOffset();
const index_t buffer_offset = i * src_data_per_access;
static_if<SrcMemorySpace == 2>{}([&](auto) {
#if CK_USE_AMD_INTRINSIC && \
CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
*reinterpret_cast<src_vector_t*>(&p_long_vector[buffer_offset]) =
__buffer_load<TData, SrcDataPerAccess>(
p_src, static_cast<uint32_t>(src_offset), static_cast<uint32_t>(0));
#else
*reinterpret_cast<src_vector_t*>(&p_long_vector[buffer_offset]) =
*reinterpret_cast<const src_vector_t*>(&p_src[src_offset]);
#endif
}).Else([&](auto) {
// src can be all kinds of memory-space.
*reinterpret_cast<src_vector_t*>(&p_long_vector[buffer_offset]) =
*reinterpret_cast<const src_vector_t*>(&p_src[src_offset]);
});
*reinterpret_cast<const src_vector_t*>(&p_src[src_coord.GetOffset()]);
}
}
......@@ -1448,31 +1552,48 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
const index_t buffer_offset = i * dst_data_per_access;
const index_t dst_offset =
(mDstSliceOrigin + (long_vector_data_begin_id + scalar_id)).GetOffset();
static_if<DstMemorySpace == 2>{}([&](auto) {
#if CK_USE_AMD_INTRINSIC && \
CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
// move dst coordinate along linear dimensions
const auto dst_coord =
dst_nonlinear_coord + (linear_dim_data_steps + scalar_id);
// this is the dst compile-time offset component
// TODO: is this a good implementation?
const index_t dst_linear_offset =
dst_coord.GetOffset() - dst_nonlinear_coord.GetOffset();
// Check the dst vector's padding situation; only the first element of
// this dst vector is checked. It is the user's responsibility to make sure
// all data in the dst vector has the same padding situation.
// TODO: not sure a dedicated IsAnyLevelIndexInPaddingArea() function is
// necessary
#if 0 // tuning
if(!dst_coord.IsAnyLevelIndexInPaddingArea())
#endif
{
static_if<DstAddressSpace == address_space_t::global>{}([&](auto) {
#if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE
__buffer_store<TData, DstDataPerAccess>(
*reinterpret_cast<dst_vector_t*>(&p_long_vector[buffer_offset]),
p_dst,
dst_offset,
0);
dst_nonlinear_coord.GetOffset(),
dst_linear_offset);
#else
*reinterpret_cast<dst_vector_t*>(&p_dst[dst_offset]) =
*reinterpret_cast<dst_vector_t*>(
&p_dst[dst_nonlinear_coord.GetOffset() + dst_linear_offset]) =
*reinterpret_cast<dst_vector_t*>(&p_long_vector[buffer_offset]);
#endif
}).Else([&](auto) {
// dst can be in any memory-space.
*reinterpret_cast<dst_vector_t*>(&p_dst[dst_offset]) =
*reinterpret_cast<dst_vector_t*>(
&p_dst[dst_nonlinear_coord.GetOffset() + dst_linear_offset]) =
*reinterpret_cast<dst_vector_t*>(&p_long_vector[buffer_offset]);
});
}
}
});
});
}
template <class T, bool PositiveDirection>
template <typename T, bool PositiveDirection>
__device__ void MoveSrcSliceWindow(const T& step_sizes_,
integral_constant<bool, PositiveDirection>)
{
......@@ -1483,7 +1604,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
}).Else([&](auto) { mSrcSliceOrigin -= step_sizes; });
}
template <class T, bool PositiveDirection>
template <typename T, bool PositiveDirection>
__device__ void MoveDstSliceWindow(const T& step_sizes_,
integral_constant<bool, PositiveDirection>)
{
......
......@@ -22,7 +22,7 @@
#include "amd_inline_asm.hpp"
#endif
#if CK_USE_AMD_INTRINCIS
#if CK_USE_AMD_INTRINSIC
#include "amd_intrinsic.hpp"
#endif
......
......@@ -8,7 +8,7 @@
#define CK_DEVICE_BACKEND_AMD 1
#define CK_USE_AMD_INTRINSIC 1
#define CK_USE_AMD_INLINE_ASM 1
#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 1
#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE 1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
......@@ -16,6 +16,14 @@
namespace ck {
enum address_space_t
{
generic = 0,
vgpr = 1,
lds = 2,
global = 3
};
#if CK_UNSIGNED_INDEX_TYPE
using index_t = uint32_t;
#else
......
......@@ -10,7 +10,7 @@
#define CK_DEVICE_BACKEND_NVIDIA 1
#define CK_USE_AMD_INTRINSIC 0
#define CK_USE_AMD_INLINE_ASM 0
#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0
#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
......@@ -18,6 +18,11 @@
namespace ck {
enum address_space_t
{
generic = 0
};
#if CK_UNSIGNED_INDEX_TYPE
using index_t = uint32_t;
#else
......