Commit 8e5085fa authored by Chao Liu's avatar Chao Liu

fix scratch memory issue

parent b50fa980
......@@ -96,11 +96,14 @@ template <index_t BlockSize,
index_t BlockCopyDataPerAccess_GemmN>
struct DynamicGridwiseCol2Im_gemmkgemmn_nchw
{
// this version has a scratch memory issue, because:
// 1. ThreadwiseDynamicTensorSliceTransfer_v1r1 keeps a reference to the tensor descriptor
// 2. threadwise_dynamic_tensor_slice_transfer_v1r1 constructs a new tensor coordinate
template <typename... Col, typename... Img>
__device__ void Run(const float* const __restrict__ p_col_global,
float* const __restrict__ p_img_global,
const DynamicTensorDescriptor<Col...>& col_gemmk_gemmn_global_desc,
const DynamicTensorDescriptor<Img...>& img_gemmk_gemmn_global_desc) const
__device__ void Run_r1(const float* const __restrict__ p_col_global,
float* const __restrict__ p_img_global,
const DynamicTensorDescriptor<Col...>& col_gemmk_gemmn_global_desc,
const DynamicTensorDescriptor<Img...>& img_gemmk_gemmn_global_desc) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
......@@ -117,53 +120,52 @@ struct DynamicGridwiseCol2Im_gemmkgemmn_nchw
const index_t gemmn_block_data_on_global = block_work_id * GemmNPerBlock;
// blockwise atomic accumulation
auto blockwise_copy =
#if 1
BlockwiseDynamicTensorSliceTransfer_v1<BlockSize,
float,
float,
decltype(col_gemmk_gemmn_global_desc),
decltype(img_gemmk_gemmn_global_desc),
Sequence<GemmKPerBlock, GemmNPerBlock>,
BlockCopySubLengths_GemmK_GemmN,
BlockCopyClusterLengths_GemmK_GemmN,
BlockCopyThreadClusterArrangeOrder,
BlockCopySrcAccessOrder,
1,
BlockCopyDataPerAccess_GemmN,
BlockCopyDataPerAccess_GemmN,
AddressSpace::Global,
AddressSpace::Global,
InMemoryDataOperation::AtomicAdd,
1,
1>(
#else
BlockwiseDynamicTensorSliceTransfer_v2<BlockSize,
float,
float,
decltype(col_gemmk_gemmn_global_desc),
decltype(img_gemmk_gemmn_global_desc),
Sequence<GemmKPerBlock, GemmNPerBlock>,
BlockCopySubLengths_GemmK_GemmN,
BlockCopyClusterLengths_GemmK_GemmN,
BlockCopyThreadClusterArrangeOrder,
BlockCopySrcAccessOrder,
BlockCopyDstAccessOrder,
1,
1,
BlockCopyDataPerAccess_GemmN,
BlockCopyDataPerAccess_GemmN,
AddressSpace::Global,
AddressSpace::Global,
InMemoryDataOperation::AtomicAdd,
1,
1>(
BlockwiseDynamicTensorSliceTransfer_v1r1<BlockSize,
float,
float,
decltype(col_gemmk_gemmn_global_desc),
decltype(img_gemmk_gemmn_global_desc),
Sequence<GemmKPerBlock, GemmNPerBlock>,
BlockCopySubLengths_GemmK_GemmN,
BlockCopyClusterLengths_GemmK_GemmN,
BlockCopyThreadClusterArrangeOrder,
BlockCopySrcAccessOrder,
1,
BlockCopyDataPerAccess_GemmN,
BlockCopyDataPerAccess_GemmN,
AddressSpace::Global,
AddressSpace::Global,
InMemoryDataOperation::AtomicAdd,
1,
1>
#elif 1
BlockwiseDynamicTensorSliceTransfer_v2r1<BlockSize,
float,
float,
decltype(col_gemmk_gemmn_global_desc),
decltype(img_gemmk_gemmn_global_desc),
Sequence<GemmKPerBlock, GemmNPerBlock>,
BlockCopySubLengths_GemmK_GemmN,
BlockCopyClusterLengths_GemmK_GemmN,
BlockCopyThreadClusterArrangeOrder,
BlockCopySrcAccessOrder,
BlockCopyDstAccessOrder,
1,
1,
BlockCopyDataPerAccess_GemmN,
BlockCopyDataPerAccess_GemmN,
AddressSpace::Global,
AddressSpace::Global,
InMemoryDataOperation::AtomicAdd,
1,
1>
#endif
col_gemmk_gemmn_global_desc,
make_multi_index(0, gemmn_block_data_on_global),
img_gemmk_gemmn_global_desc,
make_multi_index(0, gemmn_block_data_on_global));
(col_gemmk_gemmn_global_desc,
make_multi_index(0, gemmn_block_data_on_global),
img_gemmk_gemmn_global_desc,
make_multi_index(0, gemmn_block_data_on_global));
for(index_t gemmk = 0; gemmk < GemmK; gemmk += GemmKPerBlock)
{
......@@ -173,6 +175,80 @@ struct DynamicGridwiseCol2Im_gemmkgemmn_nchw
blockwise_copy.MoveDstSliceWindow(make_multi_index(GemmKPerBlock, 0));
}
}
// this version does not have the scratch memory issue, because:
// 1. ThreadwiseDynamicTensorSliceTransfer_v1r2 does not keep a reference to the tensor descriptor
// 2. threadwise_dynamic_tensor_slice_transfer_v1r2 does not construct a new tensor coordinate
template <typename... Col, typename... Img>
__device__ void Run_r2(const float* const __restrict__ p_col_global,
float* const __restrict__ p_img_global,
const DynamicTensorDescriptor<Col...>& col_gemmk_gemmn_global_desc,
const DynamicTensorDescriptor<Img...>& img_gemmk_gemmn_global_desc) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
const index_t GemmK = col_gemmk_gemmn_global_desc.GetLength(I0);
const index_t GemmN = col_gemmk_gemmn_global_desc.GetLength(I1);
// divide block work by GemmN
const index_t GemmNBlockWork = GemmN / GemmNPerBlock;
const index_t block_work_id = get_block_1d_id();
const index_t gemmn_block_data_on_global = block_work_id * GemmNPerBlock;
auto blockwise_copy =
BlockwiseDynamicTensorSliceTransfer_v2r2<BlockSize,
float,
float,
decltype(col_gemmk_gemmn_global_desc),
decltype(img_gemmk_gemmn_global_desc),
Sequence<GemmKPerBlock, GemmNPerBlock>,
BlockCopySubLengths_GemmK_GemmN,
BlockCopyClusterLengths_GemmK_GemmN,
BlockCopyThreadClusterArrangeOrder,
BlockCopySrcAccessOrder,
BlockCopyDstAccessOrder,
1,
1,
BlockCopyDataPerAccess_GemmN,
BlockCopyDataPerAccess_GemmN,
AddressSpace::Global,
AddressSpace::Global,
InMemoryDataOperation::AtomicAdd,
1,
1>(
col_gemmk_gemmn_global_desc,
make_multi_index(0, gemmn_block_data_on_global),
img_gemmk_gemmn_global_desc,
make_multi_index(0, gemmn_block_data_on_global));
for(index_t gemmk = 0; gemmk < GemmK; gemmk += GemmKPerBlock)
{
blockwise_copy.Run(col_gemmk_gemmn_global_desc,
p_col_global,
img_gemmk_gemmn_global_desc,
p_img_global);
blockwise_copy.MoveSrcSliceWindow(col_gemmk_gemmn_global_desc,
make_multi_index(GemmKPerBlock, 0));
blockwise_copy.MoveDstSliceWindow(img_gemmk_gemmn_global_desc,
make_multi_index(GemmKPerBlock, 0));
}
}
template <typename... Col, typename... Img>
__device__ void Run(const float* const __restrict__ p_col_global,
float* const __restrict__ p_img_global,
const DynamicTensorDescriptor<Col...>& col_gemmk_gemmn_global_desc,
const DynamicTensorDescriptor<Img...>& img_gemmk_gemmn_global_desc) const
{
Run_r2(
p_col_global, p_img_global, col_gemmk_gemmn_global_desc, img_gemmk_gemmn_global_desc);
}
};
} // namespace ck
......
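// The r1/r2 split above comes down to how the transfer object holds its descriptor and
// coordinates. Below is a minimal sketch of the two shapes: SimpleDesc, SimpleCoord and
// move_coord are hypothetical stand-ins, not library types, and the sketch only illustrates
// why the r1 shape tends to be spilled to scratch memory while the r2 shape can stay in
// registers.
struct SimpleDesc
{
    int stride;
};
struct SimpleCoord
{
    int offset;
};
__device__ void move_coord(const SimpleDesc& d, SimpleCoord& c, int di0, int di1)
{
    c.offset += di0 * d.stride + di1;
}
// r1-style: stores a reference to a runtime descriptor and re-creates a coordinate on
// every call, which the compiler may fail to keep in registers
struct TransferSketch_r1
{
    __device__ TransferSketch_r1(const SimpleDesc& desc, SimpleCoord origin)
        : desc_(desc), origin_(origin)
    {
    }
    __device__ void Run(const float* p_src, float* p_dst) const
    {
        SimpleCoord coord = origin_;    // new coordinate constructed per call
        move_coord(desc_, coord, 0, 0); // stepped through the stored descriptor reference
        p_dst[coord.offset] = p_src[coord.offset];
    }
    const SimpleDesc& desc_; // reference member to the descriptor
    SimpleCoord origin_;
};
// r2-style: no descriptor is stored; the caller passes it to every member function, and
// the stored coordinate is moved in place instead of being rebuilt
struct TransferSketch_r2
{
    __device__ TransferSketch_r2(const SimpleDesc&, SimpleCoord origin) : origin_(origin) {}
    __device__ void Run(const SimpleDesc&, const float* p_src, float* p_dst) const
    {
        p_dst[origin_.offset] = p_src[origin_.offset];
    }
    __device__ void MoveSliceWindow(const SimpleDesc& desc, int di0, int di1)
    {
        move_coord(desc, origin_, di0, di1); // mutate the stored coordinate in place
    }
    SimpleCoord origin_;
};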
......@@ -20,12 +20,11 @@ template <index_t BlockSize,
index_t BlockCopyDataPerAccess_GemmN>
struct DynamicGridwiseCopy_gemmkgemmn
{
#if 1
template <typename... Src, typename... Dst>
__device__ void Run(const float* const __restrict__ p_src_global,
float* const __restrict__ p_dst_global,
const DynamicTensorDescriptor<Src...>& src_gemmk_gemmn_global_desc,
const DynamicTensorDescriptor<Dst...>& dst_gemmk_gemmn_global_desc) const
__device__ void Run_r1(const float* const __restrict__ p_src_global,
float* const __restrict__ p_dst_global,
const DynamicTensorDescriptor<Src...>& src_gemmk_gemmn_global_desc,
const DynamicTensorDescriptor<Dst...>& dst_gemmk_gemmn_global_desc) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
......@@ -43,50 +42,50 @@ struct DynamicGridwiseCopy_gemmkgemmn
// blockwise copy
auto blockwise_copy =
#if 0
BlockwiseDynamicTensorSliceTransfer_v1<BlockSize,
float,
float,
decltype(src_gemmk_gemmn_global_desc),
decltype(dst_gemmk_gemmn_global_desc),
Sequence<GemmKPerBlock, GemmNPerBlock>,
BlockCopySubLengths_GemmK_GemmN,
BlockCopyClusterLengths_GemmK_GemmN,
BlockCopyThreadClusterArrangeOrder,
BlockCopySrcAccessOrder,
1,
BlockCopyDataPerAccess_GemmN,
BlockCopyDataPerAccess_GemmN,
AddressSpace::Global,
AddressSpace::Global,
InMemoryDataOperation::Set,
1,
1>(
#else
BlockwiseDynamicTensorSliceTransfer_v2<BlockSize,
float,
float,
decltype(src_gemmk_gemmn_global_desc),
decltype(dst_gemmk_gemmn_global_desc),
Sequence<GemmKPerBlock, GemmNPerBlock>,
BlockCopySubLengths_GemmK_GemmN,
BlockCopyClusterLengths_GemmK_GemmN,
BlockCopyThreadClusterArrangeOrder,
BlockCopySrcAccessOrder,
BlockCopyDstAccessOrder,
1,
1,
BlockCopyDataPerAccess_GemmN,
BlockCopyDataPerAccess_GemmN,
AddressSpace::Global,
AddressSpace::Global,
InMemoryDataOperation::Set,
1,
1>(
BlockwiseDynamicTensorSliceTransfer_v1r1<BlockSize,
float,
float,
decltype(src_gemmk_gemmn_global_desc),
decltype(dst_gemmk_gemmn_global_desc),
Sequence<GemmKPerBlock, GemmNPerBlock>,
BlockCopySubLengths_GemmK_GemmN,
BlockCopyClusterLengths_GemmK_GemmN,
BlockCopyThreadClusterArrangeOrder,
BlockCopySrcAccessOrder,
1,
BlockCopyDataPerAccess_GemmN,
BlockCopyDataPerAccess_GemmN,
AddressSpace::Global,
AddressSpace::Global,
InMemoryDataOperation::Set,
1,
1>
#elif 1
BlockwiseDynamicTensorSliceTransfer_v2r1<BlockSize,
float,
float,
decltype(src_gemmk_gemmn_global_desc),
decltype(dst_gemmk_gemmn_global_desc),
Sequence<GemmKPerBlock, GemmNPerBlock>,
BlockCopySubLengths_GemmK_GemmN,
BlockCopyClusterLengths_GemmK_GemmN,
BlockCopyThreadClusterArrangeOrder,
BlockCopySrcAccessOrder,
BlockCopyDstAccessOrder,
1,
1,
BlockCopyDataPerAccess_GemmN,
BlockCopyDataPerAccess_GemmN,
AddressSpace::Global,
AddressSpace::Global,
InMemoryDataOperation::Set,
1,
1>
#endif
src_gemmk_gemmn_global_desc,
make_multi_index(0, gemmn_block_data_on_global),
dst_gemmk_gemmn_global_desc,
make_multi_index(0, gemmn_block_data_on_global));
(src_gemmk_gemmn_global_desc,
make_multi_index(0, gemmn_block_data_on_global),
dst_gemmk_gemmn_global_desc,
make_multi_index(0, gemmn_block_data_on_global));
for(index_t gemmk = 0; gemmk < GemmK; gemmk += GemmKPerBlock)
{
......@@ -96,12 +95,12 @@ struct DynamicGridwiseCopy_gemmkgemmn
blockwise_copy.MoveDstSliceWindow(make_multi_index(GemmKPerBlock, 0));
}
}
#else
template <typename... Src, typename... Dst>
__device__ void Run(const float* const __restrict__ p_src_global,
float* const __restrict__ p_dst_global,
const DynamicTensorDescriptor<Src...>& src_gemmk_gemmn_global_desc,
const DynamicTensorDescriptor<Dst...>& dst_gemmk_gemmn_global_desc) const
__device__ void Run_r2(const float* const __restrict__ p_src_global,
float* const __restrict__ p_dst_global,
const DynamicTensorDescriptor<Src...>& src_gemmk_gemmn_global_desc,
const DynamicTensorDescriptor<Dst...>& dst_gemmk_gemmn_global_desc) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
......@@ -114,53 +113,58 @@ struct DynamicGridwiseCopy_gemmkgemmn
const index_t block_work_id = get_block_1d_id();
// divide thread work by GemmK, GemmN
static constexpr auto thread_cluster_desc = make_cluster_descriptor(
BlockCopyClusterLengths_GemmK_GemmN{}, BlockCopyThreadClusterArrangeOrder{});
const auto thread_work_id =
thread_cluster_desc.CalculateClusterIndex(get_thread_local_1d_id());
// gemmk, gemmn
constexpr index_t GemmKPerThread = BlockCopySubLengths_GemmK_GemmN::At(I0);
constexpr index_t GemmNPerThread = BlockCopySubLengths_GemmK_GemmN::At(I1);
const index_t gemmk_thread_data_on_global =
thread_work_id[I0] * BlockCopySubLengths_GemmK_GemmN::At(I0);
const index_t gemmn_thread_data_on_global =
block_work_id * GemmNPerBlock +
thread_work_id[I1] * BlockCopySubLengths_GemmK_GemmN::At(I1);
auto src_coord = make_dynamic_tensor_coordinate(
src_gemmk_gemmn_global_desc,
make_multi_index(gemmk_thread_data_on_global, gemmn_thread_data_on_global));
auto dst_coord = make_dynamic_tensor_coordinate(
dst_gemmk_gemmn_global_desc,
make_multi_index(gemmk_thread_data_on_global, gemmn_thread_data_on_global));
threadwise_dynamic_tensor_slice_transfer_v1<float,
float,
decltype(src_gemmk_gemmn_global_desc),
decltype(dst_gemmk_gemmn_global_desc),
BlockCopySubLengths_GemmK_GemmN,
BlockCopyThreadClusterArrangeOrder,
1,
1,
1,
AddressSpace::Global,
AddressSpace::Global,
InMemoryDataOperation::Set,
1,
1>(src_gemmk_gemmn_global_desc,
src_coord,
p_src_global,
dst_gemmk_gemmn_global_desc,
dst_coord,
p_dst_global);
const index_t gemmn_block_data_on_global = block_work_id * GemmNPerBlock;
// blockwise copy
auto blockwise_copy =
BlockwiseDynamicTensorSliceTransfer_v2r2<BlockSize,
float,
float,
decltype(src_gemmk_gemmn_global_desc),
decltype(dst_gemmk_gemmn_global_desc),
Sequence<GemmKPerBlock, GemmNPerBlock>,
BlockCopySubLengths_GemmK_GemmN,
BlockCopyClusterLengths_GemmK_GemmN,
BlockCopyThreadClusterArrangeOrder,
BlockCopySrcAccessOrder,
BlockCopyDstAccessOrder,
1,
1,
BlockCopyDataPerAccess_GemmN,
BlockCopyDataPerAccess_GemmN,
AddressSpace::Global,
AddressSpace::Global,
InMemoryDataOperation::Set,
1,
1>(
src_gemmk_gemmn_global_desc,
make_multi_index(0, gemmn_block_data_on_global),
dst_gemmk_gemmn_global_desc,
make_multi_index(0, gemmn_block_data_on_global));
for(index_t gemmk = 0; gemmk < GemmK; gemmk += GemmKPerBlock)
{
blockwise_copy.Run(src_gemmk_gemmn_global_desc,
p_src_global,
dst_gemmk_gemmn_global_desc,
p_dst_global);
blockwise_copy.MoveSrcSliceWindow(src_gemmk_gemmn_global_desc,
make_multi_index(GemmKPerBlock, 0));
blockwise_copy.MoveDstSliceWindow(dst_gemmk_gemmn_global_desc,
make_multi_index(GemmKPerBlock, 0));
}
}
template <typename... Src, typename... Dst>
__device__ void Run(const float* const __restrict__ p_src_global,
float* const __restrict__ p_dst_global,
const DynamicTensorDescriptor<Src...>& src_gemmk_gemmn_global_desc,
const DynamicTensorDescriptor<Dst...>& dst_gemmk_gemmn_global_desc) const
{
Run_r2(
p_src_global, p_dst_global, src_gemmk_gemmn_global_desc, dst_gemmk_gemmn_global_desc);
}
#endif
};
} // namespace ck
......
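// For reference, both Run_r1 and Run_r2 use the same work decomposition: a 1-D split of
// GemmN across blocks plus a serial loop over GemmK. A worked example with hypothetical
// sizes, chosen only for illustration:
//
//   GemmK = 256, GemmN = 1024, GemmKPerBlock = 8, GemmNPerBlock = 128
//   GemmNBlockWork = GemmN / GemmNPerBlock = 8 blocks in the grid
//   block_work_id = 3 starts its slice window at column 3 * 128 = 384, then runs
//   GemmK / GemmKPerBlock = 32 iterations, moving both slice windows by (8, 0) each time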
......@@ -9,6 +9,7 @@
namespace ck {
// this version does not have the scratch memory issue, which is good, but I don't know why
template <index_t BlockSize,
typename BlockSrcData,
typename BlockDstData,
......@@ -27,17 +28,18 @@ template <index_t BlockSize,
InMemoryDataOperation DstInMemOp,
index_t SrcDataStride,
index_t DstDataStride>
struct BlockwiseDynamicTensorSliceTransfer_v1
struct BlockwiseDynamicTensorSliceTransfer_v1r1
{
static constexpr index_t nDim =
remove_reference_t<remove_cv_t<BlockSrcDesc>>::GetNumOfDimension();
using Index = MultiIndex<nDim>;
__device__ constexpr BlockwiseDynamicTensorSliceTransfer_v1(const BlockSrcDesc& block_src_desc,
const Index& src_block_slice_origin,
const BlockDstDesc& block_dst_desc,
const Index& dst_block_slice_origin)
__device__ constexpr BlockwiseDynamicTensorSliceTransfer_v1r1(
const BlockSrcDesc& block_src_desc,
const Index& src_block_slice_origin,
const BlockDstDesc& block_dst_desc,
const Index& dst_block_slice_origin)
: threadwise_transfer_(block_src_desc,
make_zero_multi_index<nDim>(),
block_dst_desc,
......@@ -103,22 +105,25 @@ struct BlockwiseDynamicTensorSliceTransfer_v1
static constexpr auto thread_cluster_desc_ =
make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});
using ThreadwiseTransfer = ThreadwiseDynamicTensorSliceTransfer_v1<BlockSrcDesc,
BlockDstDesc,
ThreadSliceLengths,
SrcDstDimAccessOrder,
SrcDstVectoReadDim,
SrcDataPerRead,
DstDataPerWrite,
SrcAddressSpace,
DstAddressSpace,
DstInMemOp,
SrcDataStride,
DstDataStride>;
using ThreadwiseTransfer = ThreadwiseDynamicTensorSliceTransfer_v1r1<BlockSrcDesc,
BlockDstDesc,
ThreadSliceLengths,
SrcDstDimAccessOrder,
SrcDstVectoReadDim,
SrcDataPerRead,
DstDataPerWrite,
SrcAddressSpace,
DstAddressSpace,
DstInMemOp,
SrcDataStride,
DstDataStride>;
ThreadwiseTransfer threadwise_transfer_;
};
// this version has a scratch memory issue, because:
// 1. ThreadwiseDynamicTensorSliceTransfer_v1r1 keeps a reference to the tensor descriptor
// 2. threadwise_dynamic_tensor_slice_transfer_v1r1 constructs a new tensor coordinate
template <index_t BlockSize,
typename BlockSrcData,
typename BlockDstData,
......@@ -139,17 +144,18 @@ template <index_t BlockSize,
InMemoryDataOperation DstInMemOp,
index_t SrcDataStride,
index_t DstDataStride>
struct BlockwiseDynamicTensorSliceTransfer_v2
struct BlockwiseDynamicTensorSliceTransfer_v2r1
{
static constexpr index_t nDim =
remove_reference_t<remove_cv_t<BlockSrcDesc>>::GetNumOfDimension();
using Index = MultiIndex<nDim>;
__device__ constexpr BlockwiseDynamicTensorSliceTransfer_v2(const BlockSrcDesc& block_src_desc,
const Index& src_block_slice_origin,
const BlockDstDesc& block_dst_desc,
const Index& dst_block_slice_origin)
__device__ constexpr BlockwiseDynamicTensorSliceTransfer_v2r1(
const BlockSrcDesc& block_src_desc,
const Index& src_block_slice_origin,
const BlockDstDesc& block_dst_desc,
const Index& dst_block_slice_origin)
: threadwise_read_(block_src_desc,
make_zero_multi_index<nDim>(),
thread_buffer_desc_,
......@@ -246,31 +252,206 @@ struct BlockwiseDynamicTensorSliceTransfer_v2
static constexpr auto thread_buffer_desc_ =
make_dynamic_native_tensor_descriptor_packed<nDim>(to_multi_index(ThreadSliceLengths{}));
using ThreadwiseRead = ThreadwiseDynamicTensorSliceTransfer_v1<BlockSrcDesc,
decltype(thread_buffer_desc_),
ThreadSliceLengths,
SrcDimAccessOrder,
SrcVectorReadDim,
SrcDataPerRead,
1,
SrcAddressSpace,
AddressSpace::Vgpr,
InMemoryDataOperation::Set,
SrcDataStride,
1>;
using ThreadwiseWrite = ThreadwiseDynamicTensorSliceTransfer_v1<decltype(thread_buffer_desc_),
BlockDstDesc,
ThreadSliceLengths,
DstDimAccessOrder,
DstVectorWriteDim,
1,
DstDataPerWrite,
AddressSpace::Vgpr,
DstAddressSpace,
DstInMemOp,
1,
DstDataStride>;
using ThreadwiseRead = ThreadwiseDynamicTensorSliceTransfer_v1r1<BlockSrcDesc,
decltype(thread_buffer_desc_),
ThreadSliceLengths,
SrcDimAccessOrder,
SrcVectorReadDim,
SrcDataPerRead,
1,
SrcAddressSpace,
AddressSpace::Vgpr,
InMemoryDataOperation::Set,
SrcDataStride,
1>;
using ThreadwiseWrite = ThreadwiseDynamicTensorSliceTransfer_v1r1<decltype(thread_buffer_desc_),
BlockDstDesc,
ThreadSliceLengths,
DstDimAccessOrder,
DstVectorWriteDim,
1,
DstDataPerWrite,
AddressSpace::Vgpr,
DstAddressSpace,
DstInMemOp,
1,
DstDataStride>;
ThreadwiseRead threadwise_read_;
ThreadwiseWrite threadwise_write_;
static constexpr index_t thread_buffer_element_size_ =
thread_buffer_desc_.GetElementSpaceSize();
BlockSrcData p_thread_buffer_[thread_buffer_element_size_];
};
// this version does not have the scratch memory issue, because:
// 1. ThreadwiseDynamicTensorSliceTransfer_v1r2 does not keep a reference to the tensor descriptor
// 2. threadwise_dynamic_tensor_slice_transfer_v1r2 does not construct a new tensor coordinate
template <index_t BlockSize,
typename BlockSrcData,
typename BlockDstData,
typename BlockSrcDesc,
typename BlockDstDesc,
typename BlockSliceLengths,
typename ThreadSliceLengths,
typename ThreadClusterLengths,
typename ThreadClusterArrangeOrder,
typename SrcDimAccessOrder,
typename DstDimAccessOrder,
index_t SrcVectorReadDim,
index_t DstVectorWriteDim,
index_t SrcDataPerRead,
index_t DstDataPerWrite,
AddressSpace SrcAddressSpace,
AddressSpace DstAddressSpace,
InMemoryDataOperation DstInMemOp,
index_t SrcDataStride,
index_t DstDataStride>
struct BlockwiseDynamicTensorSliceTransfer_v2r2
{
static constexpr index_t nDim =
remove_reference_t<remove_cv_t<BlockSrcDesc>>::GetNumOfDimension();
using Index = MultiIndex<nDim>;
__device__ constexpr BlockwiseDynamicTensorSliceTransfer_v2r2(
const BlockSrcDesc& block_src_desc,
const Index& src_block_slice_origin,
const BlockDstDesc& block_dst_desc,
const Index& dst_block_slice_origin)
: threadwise_read_(block_src_desc,
make_zero_multi_index<nDim>(),
thread_buffer_desc_,
make_zero_multi_index<nDim>()),
threadwise_write_(thread_buffer_desc_,
make_zero_multi_index<nDim>(),
block_dst_desc,
make_zero_multi_index<nDim>())
{
static_assert(
nDim == remove_reference_t<remove_cv_t<BlockSrcDesc>>::GetNumOfDimension() &&
nDim == remove_reference_t<remove_cv_t<BlockDstDesc>>::GetNumOfDimension() &&
nDim == BlockSliceLengths::Size() && nDim == ThreadSliceLengths::Size() &&
nDim == ThreadClusterLengths::Size() && nDim == ThreadClusterArrangeOrder::Size() &&
nDim == SrcDimAccessOrder::Size() && nDim == DstDimAccessOrder::Size(),
"wrong! nDim not consistent");
static_assert(
is_same<BlockSliceLengths, decltype(ThreadSliceLengths{} * ThreadClusterLengths{})>{},
"wrong! threads should be mapped to cover entire slicing window");
static_assert(BlockSize >= thread_cluster_desc_.GetElementSize(),
"wrong! BlockSize too small");
if(BlockSize == thread_cluster_desc_.GetElementSize() or
get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
{
const auto thread_cluster_id =
thread_cluster_desc_.CalculateClusterIndex(get_thread_local_1d_id());
const auto thread_data_id_begin = thread_cluster_id * ThreadSliceLengths{};
threadwise_read_.SetSrcSliceOrigin(block_src_desc,
src_block_slice_origin + thread_data_id_begin);
threadwise_read_.SetDstSliceOrigin(thread_buffer_desc_, make_zero_multi_index<nDim>());
threadwise_write_.SetSrcSliceOrigin(thread_buffer_desc_, make_zero_multi_index<nDim>());
threadwise_write_.SetDstSliceOrigin(block_dst_desc,
dst_block_slice_origin + thread_data_id_begin);
}
}
__device__ void RunRead(const BlockSrcDesc& block_src_desc, const BlockSrcData* p_block_src)
{
if(BlockSize == thread_cluster_desc_.GetElementSize() or
get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
{
threadwise_read_.Run(
block_src_desc, p_block_src, thread_buffer_desc_, p_thread_buffer_);
}
}
__device__ void RunWrite(const BlockDstDesc& block_dst_desc, BlockDstData* p_block_dst)
{
if(BlockSize == thread_cluster_desc_.GetElementSize() or
get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
{
threadwise_write_.Run(
thread_buffer_desc_, p_thread_buffer_, block_dst_desc, p_block_dst);
}
}
__device__ void Run(const BlockSrcDesc& block_src_desc,
const BlockSrcData* p_block_src,
const BlockDstDesc& block_dst_desc,
BlockDstData* p_block_dst)
{
if(BlockSize == thread_cluster_desc_.GetElementSize() or
get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
{
threadwise_read_.Run(
block_src_desc, p_block_src, thread_buffer_desc_, p_thread_buffer_);
// if there is a type conversion, it is done during the write
threadwise_write_.Run(
thread_buffer_desc_, p_thread_buffer_, block_dst_desc, p_block_dst);
}
}
__device__ void MoveSrcSliceWindow(const BlockSrcDesc& block_src_desc, const Index& step)
{
if(BlockSize == thread_cluster_desc_.GetElementSize() or
get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
{
threadwise_read_.MoveSrcSliceWindow(block_src_desc, step);
}
}
__device__ void MoveDstSliceWindow(const BlockDstDesc& block_dst_desc, const Index& step)
{
if(BlockSize == thread_cluster_desc_.GetElementSize() or
get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
{
threadwise_write_.MoveDstSliceWindow(block_dst_desc, step);
}
}
private:
static constexpr auto thread_cluster_desc_ =
make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});
static constexpr auto thread_buffer_desc_ =
make_dynamic_native_tensor_descriptor_packed<nDim>(to_multi_index(ThreadSliceLengths{}));
using ThreadwiseRead = ThreadwiseDynamicTensorSliceTransfer_v1r2<BlockSrcDesc,
decltype(thread_buffer_desc_),
ThreadSliceLengths,
SrcDimAccessOrder,
SrcVectorReadDim,
SrcDataPerRead,
1,
SrcAddressSpace,
AddressSpace::Vgpr,
InMemoryDataOperation::Set,
SrcDataStride,
1>;
using ThreadwiseWrite = ThreadwiseDynamicTensorSliceTransfer_v1r2<decltype(thread_buffer_desc_),
BlockDstDesc,
ThreadSliceLengths,
DstDimAccessOrder,
DstVectorWriteDim,
1,
DstDataPerWrite,
AddressSpace::Vgpr,
DstAddressSpace,
DstInMemOp,
1,
DstDataStride>;
ThreadwiseRead threadwise_read_;
ThreadwiseWrite threadwise_write_;
......
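// The static_asserts and the thread-cluster guard in v2r2 encode a simple decomposition:
// each thread owns a ThreadSliceLengths sub-tile and the thread cluster tiles the whole
// BlockSliceLengths window. A worked example with hypothetical template arguments,
// assuming a row-major ThreadClusterArrangeOrder (Sequence<0, 1>):
//
//   ThreadClusterLengths = Sequence<4, 64>, ThreadSliceLengths = Sequence<2, 2>
//   => BlockSliceLengths must be Sequence<8, 128> and BlockSize must be >= 4 * 64 = 256
//
//   thread 70: thread_cluster_id    = (70 / 64, 70 % 64) = (1, 6)
//              thread_data_id_begin = (1 * 2, 6 * 2)     = (2, 12)
//
// so thread 70 reads and writes the 2x2 sub-tile at offset (2, 12) from the block slice
// origin on both the source and the destination side; with BlockSize == 256 every thread
// passes the "BlockSize == thread_cluster_desc_.GetElementSize() or ..." guard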
......@@ -7,6 +7,8 @@
namespace ck {
// threadwise_dynamic_tensor_slice_transfer_v1r1 has a scratch memory issue because
// it constructs new tensor coordinates
template <typename SrcData,
typename DstData,
typename SrcDesc,
......@@ -21,7 +23,7 @@ template <typename SrcData,
InMemoryDataOperation DstInMemOp,
index_t SrcScalarStrideInVector,
index_t DstScalarStrideInVector>
__host__ __device__ constexpr void threadwise_dynamic_tensor_slice_transfer_v1(
__host__ __device__ constexpr void threadwise_dynamic_tensor_slice_transfer_v1r1(
const SrcDesc& src_desc,
const DynamicTensorCoordinate_t<SrcDesc>& src_origin_coord,
const SrcData* p_src,
......@@ -29,6 +31,7 @@ __host__ __device__ constexpr void threadwise_dynamic_tensor_slice_transfer_v1(
const DynamicTensorCoordinate_t<DstDesc>& dst_origin_coord,
DstData* p_dst)
{
// note: constructing new tensor coordinates here seems to cause the scratch memory issue
auto src_coord = src_origin_coord;
auto dst_coord = dst_origin_coord;
......@@ -64,8 +67,7 @@ __host__ __device__ constexpr void threadwise_dynamic_tensor_slice_transfer_v1(
#pragma unroll
for(index_t j1 = 0; j1 < J1; ++j1)
{
// do work
#if 0
// do work
transfer_data<SrcData,
1,
SrcAddressSpace,
......@@ -81,35 +83,115 @@ __host__ __device__ constexpr void threadwise_dynamic_tensor_slice_transfer_v1(
dst_coord.GetOffset(),
coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord),
dst_desc.GetElementSpaceSize());
#else
SrcData tmp;
// move dim1 iterator
if(j1 < J1 - 1)
{
if(forward_dim1)
{
move_dynamic_tensor_coordinate(src_desc, src_coord, src_step_0_p1);
move_dynamic_tensor_coordinate(dst_desc, dst_coord, dst_step_0_p1);
}
else
{
move_dynamic_tensor_coordinate(src_desc, src_coord, src_step_0_m1);
move_dynamic_tensor_coordinate(dst_desc, dst_coord, dst_step_0_m1);
}
}
}
// switch dim1 iteration direction
forward_dim1 = !forward_dim1;
// move dim0 iterator
if(j0 < J0 - 1)
{
if(forward_dim0)
{
move_dynamic_tensor_coordinate(src_desc, src_coord, src_step_p1_0);
move_dynamic_tensor_coordinate(dst_desc, dst_coord, dst_step_p1_0);
}
else
{
move_dynamic_tensor_coordinate(src_desc, src_coord, src_step_m1_0);
move_dynamic_tensor_coordinate(dst_desc, dst_coord, dst_step_m1_0);
}
}
}
}
// threadwise_dynamic_tensor_slice_transfer_v1r2 does not have the scratch memory issue because
// it does not construct new tensor coordinates
template <typename SrcData,
typename DstData,
typename SrcDesc,
typename DstDesc,
typename SliceLengths,
typename SrcDstDimAccessOrder,
index_t SrcDstVectorAccessDim,
index_t SrcScalarPerVector,
index_t DstScalarPerVector,
AddressSpace SrcAddressSpace,
AddressSpace DstAddressSpace,
InMemoryDataOperation DstInMemOp,
index_t SrcScalarStrideInVector,
index_t DstScalarStrideInVector>
__host__ __device__ constexpr void
threadwise_dynamic_tensor_slice_transfer_v1r2(const SrcDesc& src_desc,
DynamicTensorCoordinate_t<SrcDesc>& src_coord,
const SrcData* p_src,
const DstDesc& dst_desc,
DynamicTensorCoordinate_t<DstDesc>& dst_coord,
DstData* p_dst)
{
// TODO: use constexpr for the coordinate steps to make sure the compiler behaves correctly
const auto src_step_0_p1 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 1));
const auto src_step_0_m1 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, -1));
const auto src_step_p1_0 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(1, 0));
const auto src_step_m1_0 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(-1, 0));
const auto dst_step_0_p1 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 1));
const auto dst_step_0_m1 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, -1));
const auto dst_step_p1_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(1, 0));
const auto dst_step_m1_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-1, 0));
constexpr index_t J0 = SliceLengths{}[0];
constexpr index_t J1 = SliceLengths{}[1];
bool forward_dim0 = true;
bool forward_dim1 = true;
// hardcoded for 2d loop for now
#pragma unroll
for(index_t j0 = 0; j0 < J0; ++j0)
{
#pragma unroll
for(index_t j1 = 0; j1 < J1; ++j1)
{
// do work
transfer_data<SrcData,
1,
SrcAddressSpace,
AddressSpace::Vgpr,
InMemoryDataOperation::Set,
1,
1>(
DstAddressSpace,
DstInMemOp,
SrcScalarStrideInVector,
DstScalarStrideInVector>(
p_src,
src_coord.GetOffset(),
coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc, src_coord),
src_desc.GetElementSpaceSize(),
&tmp,
0,
true,
1);
transfer_data<DstData, 1, AddressSpace::Vgpr, DstAddressSpace, DstInMemOp, 1, 1>(
&tmp,
0,
true,
1,
p_dst,
dst_coord.GetOffset(),
coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord),
dst_desc.GetElementSpaceSize());
#endif
// move dim1 iterator
if(j1 < J1 - 1)
......@@ -145,8 +227,34 @@ __host__ __device__ constexpr void threadwise_dynamic_tensor_slice_transfer_v1(
}
}
}
// move src and dst coordinates back to their origins
// hardcoded for 2d loop
if constexpr(J0 % 2 == 0)
{
const auto src_step_back =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(-(J0 - 1), 0));
const auto dst_step_back =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-(J0 - 1), 0));
move_dynamic_tensor_coordinate(src_desc, src_coord, src_step_back);
move_dynamic_tensor_coordinate(dst_desc, dst_coord, dst_step_back);
}
else
{
const auto src_step_back =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(-(J0 - 1), -(J1 - 1)));
const auto dst_step_back =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-(J0 - 1), -(J1 - 1)));
move_dynamic_tensor_coordinate(src_desc, src_coord, src_step_back);
move_dynamic_tensor_coordinate(dst_desc, dst_coord, dst_step_back);
}
}
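// The parity check on J0 above matters because of the snake-order traversal: each row
// flips the dim1 direction, so where the coordinate ends up after the last element
// depends on whether the number of rows is even or odd. A worked example with
// hypothetical slice lengths:
//
//   J0 = 3, J1 = 2 (odd J0):  visit order (0,0) (0,1) (1,1) (1,0) (2,0) (2,1)
//                             final index (J0-1, J1-1) = (2, 1)
//                             step back by (-(J0-1), -(J1-1)) = (-2, -1)
//
//   J0 = 2, J1 = 2 (even J0): visit order (0,0) (0,1) (1,1) (1,0)
//                             final index (J0-1, 0) = (1, 0)
//                             step back by (-(J0-1), 0) = (-1, 0)
//
// Restoring the origin here is what lets MoveSrcSliceWindow / MoveDstSliceWindow simply
// advance the stored coordinates instead of rebuilding them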
// this version has a scratch memory issue, because:
// 1. ThreadwiseDynamicTensorSliceTransfer_v1r1 keeps a reference to the tensor descriptor
// 2. threadwise_dynamic_tensor_slice_transfer_v1r1 constructs a new tensor coordinate
template <typename SrcDesc,
typename DstDesc,
typename SliceLengths,
......@@ -159,7 +267,7 @@ template <typename SrcDesc,
InMemoryDataOperation DstInMemOp,
index_t SrcScalarStrideInVector,
index_t DstScalarStrideInVector>
struct ThreadwiseDynamicTensorSliceTransfer_v1
struct ThreadwiseDynamicTensorSliceTransfer_v1r1
{
static constexpr index_t nDim = SliceLengths::Size();
using Index = MultiIndex<nDim>;
......@@ -170,10 +278,10 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1
using SrcCoordStep = decltype(make_dynamic_tensor_coordinate_step(SrcDesc{}, Index{}));
using DstCoordStep = decltype(make_dynamic_tensor_coordinate_step(DstDesc{}, Index{}));
__device__ constexpr ThreadwiseDynamicTensorSliceTransfer_v1(const SrcDesc& src_desc,
const Index& src_slice_origin,
const DstDesc& dst_desc,
const Index& dst_slice_origin)
__device__ constexpr ThreadwiseDynamicTensorSliceTransfer_v1r1(const SrcDesc& src_desc,
const Index& src_slice_origin,
const DstDesc& dst_desc,
const Index& dst_slice_origin)
: src_desc_(src_desc),
src_slice_origin_(make_dynamic_tensor_coordinate(src_desc, src_slice_origin)),
dst_desc_(dst_desc),
......@@ -181,8 +289,8 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1
{
}
__device__ constexpr ThreadwiseDynamicTensorSliceTransfer_v1()
: ThreadwiseDynamicTensorSliceTransfer_v1(
__device__ constexpr ThreadwiseDynamicTensorSliceTransfer_v1r1()
: ThreadwiseDynamicTensorSliceTransfer_v1r1(
SrcDesc{}, make_zero_multi_index<nDim>(), DstDesc{}, make_zero_multi_index<nDim>())
{
}
......@@ -190,20 +298,20 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1
template <typename SrcData, typename DstData>
__device__ void Run(const SrcData* p_src, DstData* p_dst) const
{
threadwise_dynamic_tensor_slice_transfer_v1<SrcData,
DstData,
SrcDesc,
DstDesc,
SliceLengths,
SrcDstDimAccessOrder,
SrcDstVectorAccessDim,
SrcScalarPerVector,
DstScalarPerVector,
SrcAddressSpace,
DstAddressSpace,
DstInMemOp,
SrcScalarStrideInVector,
DstScalarStrideInVector>(
threadwise_dynamic_tensor_slice_transfer_v1r1<SrcData,
DstData,
SrcDesc,
DstDesc,
SliceLengths,
SrcDstDimAccessOrder,
SrcDstVectorAccessDim,
SrcScalarPerVector,
DstScalarPerVector,
SrcAddressSpace,
DstAddressSpace,
DstInMemOp,
SrcScalarStrideInVector,
DstScalarStrideInVector>(
src_desc_, src_slice_origin_, p_src, dst_desc_, dst_slice_origin_, p_dst);
}
......@@ -243,5 +351,104 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1
DstCoord dst_slice_origin_;
};
// this version does not have the scratch memory issue, because:
// 1. ThreadwiseDynamicTensorSliceTransfer_v1r2 does not keep a reference to the tensor descriptor
// 2. threadwise_dynamic_tensor_slice_transfer_v1r2 does not construct a new tensor coordinate
template <typename SrcDesc,
typename DstDesc,
typename SliceLengths,
typename SrcDstDimAccessOrder,
index_t SrcDstVectorAccessDim,
index_t SrcScalarPerVector,
index_t DstScalarPerVector,
AddressSpace SrcAddressSpace,
AddressSpace DstAddressSpace,
InMemoryDataOperation DstInMemOp,
index_t SrcScalarStrideInVector,
index_t DstScalarStrideInVector>
struct ThreadwiseDynamicTensorSliceTransfer_v1r2
{
static constexpr index_t nDim = SliceLengths::Size();
using Index = MultiIndex<nDim>;
using SrcCoord = decltype(make_dynamic_tensor_coordinate(SrcDesc{}, Index{}));
using DstCoord = decltype(make_dynamic_tensor_coordinate(DstDesc{}, Index{}));
using SrcCoordStep = decltype(make_dynamic_tensor_coordinate_step(SrcDesc{}, Index{}));
using DstCoordStep = decltype(make_dynamic_tensor_coordinate_step(DstDesc{}, Index{}));
__device__ constexpr ThreadwiseDynamicTensorSliceTransfer_v1r2(const SrcDesc& src_desc,
const Index& src_slice_origin,
const DstDesc& dst_desc,
const Index& dst_slice_origin)
: src_slice_origin_(make_dynamic_tensor_coordinate(src_desc, src_slice_origin)),
dst_slice_origin_(make_dynamic_tensor_coordinate(dst_desc, dst_slice_origin))
{
}
__device__ constexpr ThreadwiseDynamicTensorSliceTransfer_v1r2()
: ThreadwiseDynamicTensorSliceTransfer_v1r2(
SrcDesc{}, make_zero_multi_index<nDim>(), DstDesc{}, make_zero_multi_index<nDim>())
{
}
template <typename SrcData, typename DstData>
__device__ void
Run(const SrcDesc& src_desc, const SrcData* p_src, const DstDesc& dst_desc, DstData* p_dst)
{
threadwise_dynamic_tensor_slice_transfer_v1r2<SrcData,
DstData,
SrcDesc,
DstDesc,
SliceLengths,
SrcDstDimAccessOrder,
SrcDstVectorAccessDim,
SrcScalarPerVector,
DstScalarPerVector,
SrcAddressSpace,
DstAddressSpace,
DstInMemOp,
SrcScalarStrideInVector,
DstScalarStrideInVector>(
src_desc, src_slice_origin_, p_src, dst_desc, dst_slice_origin_, p_dst);
}
__device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx)
{
src_slice_origin_ = make_dynamic_tensor_coordinate(src_desc, src_slice_origin_idx);
}
__device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx)
{
dst_slice_origin_ = make_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_idx);
}
// src_slice_origin_step_idx needs to be known at compile time, for performance reasons
__device__ void MoveSrcSliceWindow(const SrcDesc& src_desc,
const Index& src_slice_origin_step_idx)
{
// is it OK to do this every time?
const auto src_slice_origin_step =
make_dynamic_tensor_coordinate_step(src_desc, src_slice_origin_step_idx);
move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_slice_origin_step);
}
// dst_slice_origin_step_idx needs to be known at compile time, for performance reasons
__device__ void MoveDstSliceWindow(const DstDesc& dst_desc,
const Index& dst_slice_origin_step_idx)
{
// is it OK to do this every time?
const auto dst_slice_origin_step =
make_dynamic_tensor_coordinate_step(dst_desc, dst_slice_origin_step_idx);
move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_slice_origin_step);
}
private:
SrcCoord src_slice_origin_;
DstCoord dst_slice_origin_;
};
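// A hedged usage sketch of the r2 interface above: the helper name, the tile sizes and
// the template arguments below are placeholders chosen for illustration (in practice they
// come from the blockwise layer). The point is only the shape of the API: the descriptors
// are passed to every call, and the object itself stores nothing but its two coordinates.
template <typename SrcDesc, typename DstDesc>
__device__ void example_threadwise_r2_usage(const SrcDesc& src_desc,
                                            const float* p_src,
                                            const DstDesc& dst_desc,
                                            float* p_dst,
                                            index_t K)
{
    using Transfer = ThreadwiseDynamicTensorSliceTransfer_v1r2<SrcDesc,
                                                               DstDesc,
                                                               Sequence<4, 8>, // SliceLengths
                                                               Sequence<0, 1>, // dim access order
                                                               1,              // vector access dim
                                                               1,              // SrcScalarPerVector
                                                               1,              // DstScalarPerVector
                                                               AddressSpace::Global,
                                                               AddressSpace::Global,
                                                               InMemoryDataOperation::Set,
                                                               1,  // SrcScalarStrideInVector
                                                               1>; // DstScalarStrideInVector
    auto transfer =
        Transfer(src_desc, make_multi_index(0, 0), dst_desc, make_multi_index(0, 0));
    for(index_t k = 0; k < K; k += 4)
    {
        // each call receives the descriptors; the stored coordinates are moved in place
        transfer.Run(src_desc, p_src, dst_desc, p_dst);
        transfer.MoveSrcSliceWindow(src_desc, make_multi_index(4, 0));
        transfer.MoveDstSliceWindow(dst_desc, make_multi_index(4, 0));
    }
}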
} // namespace ck
#endif
......@@ -19,6 +19,20 @@ int main(int argc, char* argv[])
using namespace ck;
#if 1
constexpr index_t N = 128;
constexpr index_t C = 128;
constexpr index_t HI = 17;
constexpr index_t WI = 17;
constexpr index_t K = 128;
constexpr index_t Y = 3;
constexpr index_t X = 3;
using ConvStrides = Sequence<2, 2>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<1, 1>;
using RightPads = Sequence<1, 1>;
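// For this configuration the output spatial size follows the usual convolution formula
// (a quick sanity check, not part of the original code):
//   HO = (HI + LeftPad + RightPad - DilationY * (Y - 1) - 1) / StrideY + 1
//      = (17 + 1 + 1 - 2 - 1) / 2 + 1 = 9
//   WO = (17 + 1 + 1 - 2 - 1) / 2 + 1 = 9
// so each 128x128x17x17 input maps to a 128x128x9x9 output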
#elif 0
// 3x3, 71x71
constexpr index_t N = 128;
constexpr index_t C = 192;
......
......@@ -561,7 +561,7 @@ int main(int argc, char* argv[])
LeftPads{},
RightPads{},
nrepeat);
#elif 1
#elif 0
device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(in_nchw_desc,
in_nchw,
wei_kcyx_desc,
......