Commit 8e5085fa authored by Chao Liu

fix scratch memory issue

parent b50fa980
@@ -96,11 +96,14 @@ template <index_t BlockSize,
index_t BlockCopyDataPerAccess_GemmN>
struct DynamicGridwiseCol2Im_gemmkgemmn_nchw
{
// this version has a scratch memory issue, because:
// 1. ThreadwiseDynamicTensorSliceTransfer_v1r1 keeps a reference to the tensor descriptor
// 2. threadwise_dynamic_tensor_slice_transfer_v1r1 constructs a new tensor coordinate
template <typename... Col, typename... Img>
__device__ void Run_r1(const float* const __restrict__ p_col_global,
float* const __restrict__ p_img_global,
const DynamicTensorDescriptor<Col...>& col_gemmk_gemmn_global_desc,
const DynamicTensorDescriptor<Img...>& img_gemmk_gemmn_global_desc) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
@@ -117,53 +120,52 @@ struct DynamicGridwiseCol2Im_gemmkgemmn_nchw
const index_t gemmn_block_data_on_global = block_work_id * GemmNPerBlock;
// blockwise atomic accumulation
auto blockwise_copy =
#if 1
BlockwiseDynamicTensorSliceTransfer_v1r1<BlockSize,
float,
float,
decltype(col_gemmk_gemmn_global_desc),
decltype(img_gemmk_gemmn_global_desc),
Sequence<GemmKPerBlock, GemmNPerBlock>,
BlockCopySubLengths_GemmK_GemmN,
BlockCopyClusterLengths_GemmK_GemmN,
BlockCopyThreadClusterArrangeOrder,
BlockCopySrcAccessOrder,
1,
BlockCopyDataPerAccess_GemmN,
BlockCopyDataPerAccess_GemmN,
AddressSpace::Global,
AddressSpace::Global,
InMemoryDataOperation::AtomicAdd,
1,
1>
#elif 1
BlockwiseDynamicTensorSliceTransfer_v2r1<BlockSize,
float,
float,
decltype(col_gemmk_gemmn_global_desc),
decltype(img_gemmk_gemmn_global_desc),
Sequence<GemmKPerBlock, GemmNPerBlock>,
BlockCopySubLengths_GemmK_GemmN,
BlockCopyClusterLengths_GemmK_GemmN,
BlockCopyThreadClusterArrangeOrder,
BlockCopySrcAccessOrder,
BlockCopyDstAccessOrder,
1,
1,
BlockCopyDataPerAccess_GemmN,
BlockCopyDataPerAccess_GemmN,
AddressSpace::Global,
AddressSpace::Global,
InMemoryDataOperation::AtomicAdd,
1,
1>
#endif
(col_gemmk_gemmn_global_desc,
make_multi_index(0, gemmn_block_data_on_global),
img_gemmk_gemmn_global_desc,
make_multi_index(0, gemmn_block_data_on_global));
for(index_t gemmk = 0; gemmk < GemmK; gemmk += GemmKPerBlock)
{
@@ -173,6 +175,80 @@ struct DynamicGridwiseCol2Im_gemmkgemmn_nchw
blockwise_copy.MoveDstSliceWindow(make_multi_index(GemmKPerBlock, 0));
}
}
// this version does not have the scratch memory issue, because:
// 1. ThreadwiseDynamicTensorSliceTransfer_v1r2 does not keep a reference to the tensor descriptor
// 2. threadwise_dynamic_tensor_slice_transfer_v1r2 does not construct a new tensor coordinate
template <typename... Col, typename... Img>
__device__ void Run_r2(const float* const __restrict__ p_col_global,
float* const __restrict__ p_img_global,
const DynamicTensorDescriptor<Col...>& col_gemmk_gemmn_global_desc,
const DynamicTensorDescriptor<Img...>& img_gemmk_gemmn_global_desc) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
const index_t GemmK = col_gemmk_gemmn_global_desc.GetLength(I0);
const index_t GemmN = col_gemmk_gemmn_global_desc.GetLength(I1);
// divide block work by GemmN
const index_t GemmNBlockWork = GemmN / GemmNPerBlock;
const index_t block_work_id = get_block_1d_id();
const index_t gemmn_block_data_on_global = block_work_id * GemmNPerBlock;
auto blockwise_copy =
BlockwiseDynamicTensorSliceTransfer_v2r2<BlockSize,
float,
float,
decltype(col_gemmk_gemmn_global_desc),
decltype(img_gemmk_gemmn_global_desc),
Sequence<GemmKPerBlock, GemmNPerBlock>,
BlockCopySubLengths_GemmK_GemmN,
BlockCopyClusterLengths_GemmK_GemmN,
BlockCopyThreadClusterArrangeOrder,
BlockCopySrcAccessOrder,
BlockCopyDstAccessOrder,
1,
1,
BlockCopyDataPerAccess_GemmN,
BlockCopyDataPerAccess_GemmN,
AddressSpace::Global,
AddressSpace::Global,
InMemoryDataOperation::AtomicAdd,
1,
1>(
col_gemmk_gemmn_global_desc,
make_multi_index(0, gemmn_block_data_on_global),
img_gemmk_gemmn_global_desc,
make_multi_index(0, gemmn_block_data_on_global));
for(index_t gemmk = 0; gemmk < GemmK; gemmk += GemmKPerBlock)
{
blockwise_copy.Run(col_gemmk_gemmn_global_desc,
p_col_global,
img_gemmk_gemmn_global_desc,
p_img_global);
blockwise_copy.MoveSrcSliceWindow(col_gemmk_gemmn_global_desc,
make_multi_index(GemmKPerBlock, 0));
blockwise_copy.MoveDstSliceWindow(img_gemmk_gemmn_global_desc,
make_multi_index(GemmKPerBlock, 0));
}
}
template <typename... Col, typename... Img>
__device__ void Run(const float* const __restrict__ p_col_global,
float* const __restrict__ p_img_global,
const DynamicTensorDescriptor<Col...>& col_gemmk_gemmn_global_desc,
const DynamicTensorDescriptor<Img...>& img_gemmk_gemmn_global_desc) const
{
Run_r2(
p_col_global, p_img_global, col_gemmk_gemmn_global_desc, img_gemmk_gemmn_global_desc);
}
};
} // namespace ck
......
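The difference the comments in this file describe can be boiled down to what state each transfer object has to keep alive between calls. The following is a plain-C++ skeleton with hypothetical type and member names, not code from this commit: the r1-style transfer stores a reference to the descriptor and builds a fresh coordinate inside every Run call, while the r2-style transfer stores only the coordinates and receives the descriptor as a per-call argument.

template <typename Desc, typename Coord>
struct TransferR1Style
{
    TransferR1Style(const Desc& desc, const Coord& origin) : desc_(desc), origin_(origin) {}

    void Run(const float* /*p_src*/, float* /*p_dst*/)
    {
        Coord coord = origin_; // a fresh working coordinate is built on every call
        // ... walk `coord` over the slice using desc_ ...
        (void)coord;
    }

    const Desc& desc_; // reference member: the descriptor object must stay alive elsewhere
    Coord origin_;
};

template <typename Desc, typename Coord>
struct TransferR2Style
{
    TransferR2Style(const Desc&, const Coord& origin) : coord_(origin) {}

    void Run(const Desc& /*desc*/, const float* /*p_src*/, float* /*p_dst*/)
    {
        // ... walk coord_ in place using the descriptor passed in, then step it back to its origin ...
    }

    Coord coord_; // the coordinate is the only state kept across calls
};

In the r2 style nothing descriptor-sized lives in the object between calls, which is what the comments above credit for avoiding the scratch memory spills.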
@@ -20,12 +20,11 @@ template <index_t BlockSize,
index_t BlockCopyDataPerAccess_GemmN>
struct DynamicGridwiseCopy_gemmkgemmn
{
template <typename... Src, typename... Dst>
__device__ void Run_r1(const float* const __restrict__ p_src_global,
float* const __restrict__ p_dst_global,
const DynamicTensorDescriptor<Src...>& src_gemmk_gemmn_global_desc,
const DynamicTensorDescriptor<Dst...>& dst_gemmk_gemmn_global_desc) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
@@ -43,50 +42,50 @@ struct DynamicGridwiseCopy_gemmkgemmn
// blockwise copy
auto blockwise_copy =
#if 0
BlockwiseDynamicTensorSliceTransfer_v1r1<BlockSize,
float,
float,
decltype(src_gemmk_gemmn_global_desc),
decltype(dst_gemmk_gemmn_global_desc),
Sequence<GemmKPerBlock, GemmNPerBlock>,
BlockCopySubLengths_GemmK_GemmN,
BlockCopyClusterLengths_GemmK_GemmN,
BlockCopyThreadClusterArrangeOrder,
BlockCopySrcAccessOrder,
1,
BlockCopyDataPerAccess_GemmN,
BlockCopyDataPerAccess_GemmN,
AddressSpace::Global,
AddressSpace::Global,
InMemoryDataOperation::Set,
1,
1>
#elif 1
BlockwiseDynamicTensorSliceTransfer_v2r1<BlockSize,
float,
float,
decltype(src_gemmk_gemmn_global_desc),
decltype(dst_gemmk_gemmn_global_desc),
Sequence<GemmKPerBlock, GemmNPerBlock>,
BlockCopySubLengths_GemmK_GemmN,
BlockCopyClusterLengths_GemmK_GemmN,
BlockCopyThreadClusterArrangeOrder,
BlockCopySrcAccessOrder,
BlockCopyDstAccessOrder,
1,
1,
BlockCopyDataPerAccess_GemmN,
BlockCopyDataPerAccess_GemmN,
AddressSpace::Global,
AddressSpace::Global,
InMemoryDataOperation::Set,
1,
1>
#endif
(src_gemmk_gemmn_global_desc,
make_multi_index(0, gemmn_block_data_on_global),
dst_gemmk_gemmn_global_desc,
make_multi_index(0, gemmn_block_data_on_global));
for(index_t gemmk = 0; gemmk < GemmK; gemmk += GemmKPerBlock)
{
@@ -96,12 +95,12 @@ struct DynamicGridwiseCopy_gemmkgemmn
blockwise_copy.MoveDstSliceWindow(make_multi_index(GemmKPerBlock, 0));
}
}
template <typename... Src, typename... Dst>
__device__ void Run_r2(const float* const __restrict__ p_src_global,
float* const __restrict__ p_dst_global,
const DynamicTensorDescriptor<Src...>& src_gemmk_gemmn_global_desc,
const DynamicTensorDescriptor<Dst...>& dst_gemmk_gemmn_global_desc) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
@@ -114,53 +113,58 @@ struct DynamicGridwiseCopy_gemmkgemmn
const index_t block_work_id = get_block_1d_id();
const index_t gemmn_block_data_on_global = block_work_id * GemmNPerBlock;
// blockwise copy
auto blockwise_copy =
BlockwiseDynamicTensorSliceTransfer_v2r2<BlockSize,
float,
float,
decltype(src_gemmk_gemmn_global_desc),
decltype(dst_gemmk_gemmn_global_desc),
Sequence<GemmKPerBlock, GemmNPerBlock>,
BlockCopySubLengths_GemmK_GemmN,
BlockCopyClusterLengths_GemmK_GemmN,
BlockCopyThreadClusterArrangeOrder,
BlockCopySrcAccessOrder,
BlockCopyDstAccessOrder,
1,
1,
BlockCopyDataPerAccess_GemmN,
BlockCopyDataPerAccess_GemmN,
AddressSpace::Global,
AddressSpace::Global,
InMemoryDataOperation::Set,
1,
1>(
src_gemmk_gemmn_global_desc,
make_multi_index(0, gemmn_block_data_on_global),
dst_gemmk_gemmn_global_desc,
make_multi_index(0, gemmn_block_data_on_global));
for(index_t gemmk = 0; gemmk < GemmK; gemmk += GemmKPerBlock)
{
blockwise_copy.Run(src_gemmk_gemmn_global_desc,
p_src_global,
dst_gemmk_gemmn_global_desc,
p_dst_global);
blockwise_copy.MoveSrcSliceWindow(src_gemmk_gemmn_global_desc,
make_multi_index(GemmKPerBlock, 0));
blockwise_copy.MoveDstSliceWindow(dst_gemmk_gemmn_global_desc,
make_multi_index(GemmKPerBlock, 0));
}
}
template <typename... Src, typename... Dst>
__device__ void Run(const float* const __restrict__ p_src_global,
float* const __restrict__ p_dst_global,
const DynamicTensorDescriptor<Src...>& src_gemmk_gemmn_global_desc,
const DynamicTensorDescriptor<Dst...>& dst_gemmk_gemmn_global_desc) const
{
Run_r2(
p_src_global, p_dst_global, src_gemmk_gemmn_global_desc, dst_gemmk_gemmn_global_desc);
}
};
} // namespace ck
......
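For orientation, the work decomposition in this gridwise copy is one workgroup per GemmNPerBlock strip of GemmN, with each workgroup sweeping the full GemmK extent through its slice window. A small host-side sketch with made-up problem and tile sizes (not values from this commit):

#include <cstdio>

int main()
{
    // hypothetical problem and tile sizes
    const int GemmK = 1152, GemmN = 10368;
    const int GemmKPerBlock = 8, GemmNPerBlock = 128;

    const int GemmNBlockWork = GemmN / GemmNPerBlock; // one workgroup per GemmN strip
    const int gemmk_steps    = GemmK / GemmKPerBlock; // slice-window moves per workgroup

    std::printf("launch %d workgroups; each runs %d GemmK iterations\n",
                GemmNBlockWork, gemmk_steps);
    return 0;
}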
@@ -9,6 +9,7 @@
namespace ck {
// this version does not have the scratch memory issue, which is good, but I don't know why
template <index_t BlockSize,
typename BlockSrcData,
typename BlockDstData,
@@ -27,17 +28,18 @@ template <index_t BlockSize,
InMemoryDataOperation DstInMemOp,
index_t SrcDataStride,
index_t DstDataStride>
struct BlockwiseDynamicTensorSliceTransfer_v1r1
{
static constexpr index_t nDim =
remove_reference_t<remove_cv_t<BlockSrcDesc>>::GetNumOfDimension();
using Index = MultiIndex<nDim>;
__device__ constexpr BlockwiseDynamicTensorSliceTransfer_v1r1(
const BlockSrcDesc& block_src_desc,
const Index& src_block_slice_origin,
const BlockDstDesc& block_dst_desc,
const Index& dst_block_slice_origin)
: threadwise_transfer_(block_src_desc,
make_zero_multi_index<nDim>(),
block_dst_desc,
@@ -103,22 +105,25 @@ struct BlockwiseDynamicTensorSliceTransfer_v1
static constexpr auto thread_cluster_desc_ =
make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});
using ThreadwiseTransfer = ThreadwiseDynamicTensorSliceTransfer_v1r1<BlockSrcDesc,
BlockDstDesc,
ThreadSliceLengths,
SrcDstDimAccessOrder,
SrcDstVectoReadDim,
SrcDataPerRead,
DstDataPerWrite,
SrcAddressSpace,
DstAddressSpace,
DstInMemOp,
SrcDataStride,
DstDataStride>;
ThreadwiseTransfer threadwise_transfer_;
};
// this version has a scratch memory issue, because:
// 1. ThreadwiseDynamicTensorSliceTransfer_v1r1 keeps a reference to the tensor descriptor
// 2. threadwise_dynamic_tensor_slice_transfer_v1r1 constructs a new tensor coordinate
template <index_t BlockSize,
typename BlockSrcData,
typename BlockDstData,
@@ -139,17 +144,18 @@ template <index_t BlockSize,
InMemoryDataOperation DstInMemOp,
index_t SrcDataStride,
index_t DstDataStride>
struct BlockwiseDynamicTensorSliceTransfer_v2r1
{
static constexpr index_t nDim =
remove_reference_t<remove_cv_t<BlockSrcDesc>>::GetNumOfDimension();
using Index = MultiIndex<nDim>;
__device__ constexpr BlockwiseDynamicTensorSliceTransfer_v2r1(
const BlockSrcDesc& block_src_desc,
const Index& src_block_slice_origin,
const BlockDstDesc& block_dst_desc,
const Index& dst_block_slice_origin)
: threadwise_read_(block_src_desc,
make_zero_multi_index<nDim>(),
thread_buffer_desc_,
@@ -246,31 +252,206 @@ struct BlockwiseDynamicTensorSliceTransfer_v2
static constexpr auto thread_buffer_desc_ =
make_dynamic_native_tensor_descriptor_packed<nDim>(to_multi_index(ThreadSliceLengths{}));
using ThreadwiseRead = ThreadwiseDynamicTensorSliceTransfer_v1r1<BlockSrcDesc,
decltype(thread_buffer_desc_),
ThreadSliceLengths,
SrcDimAccessOrder,
SrcVectorReadDim,
SrcDataPerRead,
1,
SrcAddressSpace,
AddressSpace::Vgpr,
InMemoryDataOperation::Set,
SrcDataStride,
1>;
using ThreadwiseWrite = ThreadwiseDynamicTensorSliceTransfer_v1r1<decltype(thread_buffer_desc_),
BlockDstDesc,
ThreadSliceLengths,
DstDimAccessOrder,
DstVectorWriteDim,
1,
DstDataPerWrite,
AddressSpace::Vgpr,
DstAddressSpace,
DstInMemOp,
1,
DstDataStride>;
ThreadwiseRead threadwise_read_;
ThreadwiseWrite threadwise_write_;
static constexpr index_t thread_buffer_element_size_ =
thread_buffer_desc_.GetElementSpaceSize();
BlockSrcData p_thread_buffer_[thread_buffer_element_size_];
};
// this version does not have the scratch memory issue, because:
// 1. ThreadwiseDynamicTensorSliceTransfer_v1r2 does not keep a reference to the tensor descriptor
// 2. threadwise_dynamic_tensor_slice_transfer_v1r2 does not construct a new tensor coordinate
template <index_t BlockSize,
typename BlockSrcData,
typename BlockDstData,
typename BlockSrcDesc,
typename BlockDstDesc,
typename BlockSliceLengths,
typename ThreadSliceLengths,
typename ThreadClusterLengths,
typename ThreadClusterArrangeOrder,
typename SrcDimAccessOrder,
typename DstDimAccessOrder,
index_t SrcVectorReadDim,
index_t DstVectorWriteDim,
index_t SrcDataPerRead,
index_t DstDataPerWrite,
AddressSpace SrcAddressSpace,
AddressSpace DstAddressSpace,
InMemoryDataOperation DstInMemOp,
index_t SrcDataStride,
index_t DstDataStride>
struct BlockwiseDynamicTensorSliceTransfer_v2r2
{
static constexpr index_t nDim =
remove_reference_t<remove_cv_t<BlockSrcDesc>>::GetNumOfDimension();
using Index = MultiIndex<nDim>;
__device__ constexpr BlockwiseDynamicTensorSliceTransfer_v2r2(
const BlockSrcDesc& block_src_desc,
const Index& src_block_slice_origin,
const BlockDstDesc& block_dst_desc,
const Index& dst_block_slice_origin)
: threadwise_read_(block_src_desc,
make_zero_multi_index<nDim>(),
thread_buffer_desc_,
make_zero_multi_index<nDim>()),
threadwise_write_(thread_buffer_desc_,
make_zero_multi_index<nDim>(),
block_dst_desc,
make_zero_multi_index<nDim>())
{
static_assert(
nDim == remove_reference_t<remove_cv_t<BlockSrcDesc>>::GetNumOfDimension() &&
nDim == remove_reference_t<remove_cv_t<BlockDstDesc>>::GetNumOfDimension() &&
nDim == BlockSliceLengths::Size() && nDim == ThreadSliceLengths::Size() &&
nDim == ThreadClusterLengths::Size() && nDim == ThreadClusterArrangeOrder::Size() &&
nDim == SrcDimAccessOrder::Size() && nDim == DstDimAccessOrder::Size(),
"wrong! nDim not consistent");
static_assert(
is_same<BlockSliceLengths, decltype(ThreadSliceLengths{} * ThreadClusterLengths{})>{},
"wrong! threads should be mapped to cover entire slicing window");
static_assert(BlockSize >= thread_cluster_desc_.GetElementSize(),
"wrong! BlockSize too small");
if(BlockSize == thread_cluster_desc_.GetElementSize() or
get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
{
const auto thread_cluster_id =
thread_cluster_desc_.CalculateClusterIndex(get_thread_local_1d_id());
const auto thread_data_id_begin = thread_cluster_id * ThreadSliceLengths{};
threadwise_read_.SetSrcSliceOrigin(block_src_desc,
src_block_slice_origin + thread_data_id_begin);
threadwise_read_.SetDstSliceOrigin(thread_buffer_desc_, make_zero_multi_index<nDim>());
threadwise_write_.SetSrcSliceOrigin(thread_buffer_desc_, make_zero_multi_index<nDim>());
threadwise_write_.SetDstSliceOrigin(block_dst_desc,
dst_block_slice_origin + thread_data_id_begin);
}
}
__device__ void RunRead(const BlockSrcDesc& block_src_desc, const BlockSrcData* p_block_src)
{
if(BlockSize == thread_cluster_desc_.GetElementSize() or
get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
{
threadwise_read_.Run(
block_src_desc, p_block_src, thread_buffer_desc_, p_thread_buffer_);
}
}
__device__ void RunWrite(const BlockDstDesc& block_dst_desc, BlockDstData* p_block_dst)
{
if(BlockSize == thread_cluster_desc_.GetElementSize() or
get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
{
threadwise_write_.Run(
thread_buffer_desc_, p_thread_buffer_, block_dst_desc, p_block_dst);
}
}
__device__ void Run(const BlockSrcDesc& block_src_desc,
const BlockSrcData* p_block_src,
const BlockDstDesc& block_dst_desc,
BlockDstData* p_block_dst)
{
if(BlockSize == thread_cluster_desc_.GetElementSize() or
get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
{
threadwise_read_.Run(
block_src_desc, p_block_src, thread_buffer_desc_, p_thread_buffer_);
// if there is type conversion, it's done during write
threadwise_write_.Run(
thread_buffer_desc_, p_thread_buffer_, block_dst_desc, p_block_dst);
}
}
__device__ void MoveSrcSliceWindow(const BlockSrcDesc& block_src_desc, const Index& step)
{
if(BlockSize == thread_cluster_desc_.GetElementSize() or
get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
{
threadwise_read_.MoveSrcSliceWindow(block_src_desc, step);
}
}
__device__ void MoveDstSliceWindow(const BlockDstDesc& block_dst_desc, const Index& step)
{
if(BlockSize == thread_cluster_desc_.GetElementSize() or
get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
{
threadwise_write_.MoveDstSliceWindow(block_dst_desc, step);
}
}
private:
static constexpr auto thread_cluster_desc_ =
make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});
static constexpr auto thread_buffer_desc_ =
make_dynamic_native_tensor_descriptor_packed<nDim>(to_multi_index(ThreadSliceLengths{}));
using ThreadwiseRead = ThreadwiseDynamicTensorSliceTransfer_v1r2<BlockSrcDesc,
decltype(thread_buffer_desc_),
ThreadSliceLengths,
SrcDimAccessOrder,
SrcVectorReadDim,
SrcDataPerRead,
1,
SrcAddressSpace,
AddressSpace::Vgpr,
InMemoryDataOperation::Set,
SrcDataStride,
1>;
using ThreadwiseWrite = ThreadwiseDynamicTensorSliceTransfer_v1r2<decltype(thread_buffer_desc_),
BlockDstDesc,
ThreadSliceLengths,
DstDimAccessOrder,
DstVectorWriteDim,
1,
DstDataPerWrite,
AddressSpace::Vgpr,
DstAddressSpace,
DstInMemOp,
1,
DstDataStride>;
ThreadwiseRead threadwise_read_;
ThreadwiseWrite threadwise_write_;
......
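To make the thread-to-slice mapping in the blockwise transfers above concrete, here is a standalone host-side sketch with assumed cluster and slice lengths (none of these numbers come from the commit). It mirrors the logic in the v2r2 constructor: a thread's cluster index, obtained from its 1D id under an assumed row-major arrange order, is scaled by ThreadSliceLengths to get the origin of its sub-slice, and threads beyond the cluster size simply do not participate.

#include <cstdio>

int main()
{
    const int BlockSize = 256;                // threads launched per block (assumed)
    const int cluster_k = 2, cluster_n = 64;  // ThreadClusterLengths (assumed)
    const int slice_k = 4, slice_n = 1;       // ThreadSliceLengths (assumed)
    // covered block slice: [cluster_k * slice_k, cluster_n * slice_n] = [8, 64]

    for(int tid = 0; tid < BlockSize; ++tid)
    {
        // threads beyond the cluster size are masked out, as in the guard above
        if(tid >= cluster_k * cluster_n)
            continue;

        // CalculateClusterIndex under an assumed row-major arrange order
        const int cid_k = tid / cluster_n;
        const int cid_n = tid % cluster_n;

        // thread_data_id_begin = thread_cluster_id * ThreadSliceLengths
        const int origin_k = cid_k * slice_k;
        const int origin_n = cid_n * slice_n;

        if(tid < 3 || tid == cluster_k * cluster_n - 1)
            std::printf("tid %3d -> per-thread slice origin (%d, %d)\n", tid, origin_k, origin_n);
    }
    return 0;
}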
@@ -7,6 +7,8 @@
namespace ck {
// threadwise_dynamic_tensor_slice_transfer_v1r1 has a scratch memory issue because
// it constructs new tensor coordinates
template <typename SrcData,
typename DstData,
typename SrcDesc,
@@ -21,7 +23,7 @@ template <typename SrcData,
InMemoryDataOperation DstInMemOp,
index_t SrcScalarStrideInVector,
index_t DstScalarStrideInVector>
__host__ __device__ constexpr void threadwise_dynamic_tensor_slice_transfer_v1r1(
const SrcDesc& src_desc,
const DynamicTensorCoordinate_t<SrcDesc>& src_origin_coord,
const SrcData* p_src,
@@ -29,6 +31,7 @@ __host__ __device__ constexpr void threadwise_dynamic_tensor_slice_transfer_v1(
const DynamicTensorCoordinate_t<DstDesc>& dst_origin_coord,
DstData* p_dst)
{
// comment: constructing the tensor coordinates here seems to cause the scratch memory issue
auto src_coord = src_origin_coord;
auto dst_coord = dst_origin_coord;
@@ -64,8 +67,7 @@ __host__ __device__ constexpr void threadwise_dynamic_tensor_slice_transfer_v1(
#pragma unroll
for(index_t j1 = 0; j1 < J1; ++j1)
{
// do work
transfer_data<SrcData,
1,
SrcAddressSpace,
@@ -81,35 +83,115 @@ __host__ __device__ constexpr void threadwise_dynamic_tensor_slice_transfer_v1(
dst_coord.GetOffset(),
coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord),
dst_desc.GetElementSpaceSize());
// move dim1 iterator
if(j1 < J1 - 1)
{
if(forward_dim1)
{
move_dynamic_tensor_coordinate(src_desc, src_coord, src_step_0_p1);
move_dynamic_tensor_coordinate(dst_desc, dst_coord, dst_step_0_p1);
}
else
{
move_dynamic_tensor_coordinate(src_desc, src_coord, src_step_0_m1);
move_dynamic_tensor_coordinate(dst_desc, dst_coord, dst_step_0_m1);
}
}
}
// switch dim1 iteration direction
forward_dim1 = !forward_dim1;
// move dim0 iterator
if(j0 < J0 - 1)
{
if(forward_dim0)
{
move_dynamic_tensor_coordinate(src_desc, src_coord, src_step_p1_0);
move_dynamic_tensor_coordinate(dst_desc, dst_coord, dst_step_p1_0);
}
else
{
move_dynamic_tensor_coordinate(src_desc, src_coord, src_step_m1_0);
move_dynamic_tensor_coordinate(dst_desc, dst_coord, dst_step_m1_0);
}
}
}
}
// threadwise_dynamic_tensor_slice_transfer_v1r2 does not have the scratch memory issue because
// it does not construct new tensor coordinates
template <typename SrcData,
typename DstData,
typename SrcDesc,
typename DstDesc,
typename SliceLengths,
typename SrcDstDimAccessOrder,
index_t SrcDstVectorAccessDim,
index_t SrcScalarPerVector,
index_t DstScalarPerVector,
AddressSpace SrcAddressSpace,
AddressSpace DstAddressSpace,
InMemoryDataOperation DstInMemOp,
index_t SrcScalarStrideInVector,
index_t DstScalarStrideInVector>
__host__ __device__ constexpr void
threadwise_dynamic_tensor_slice_transfer_v1r2(const SrcDesc& src_desc,
DynamicTensorCoordinate_t<SrcDesc>& src_coord,
const SrcData* p_src,
const DstDesc& dst_desc,
DynamicTensorCoordinate_t<DstDesc>& dst_coord,
DstData* p_dst)
{
// TODO: use constexpr for the coordinate steps to make sure the compiler behaves correctly
const auto src_step_0_p1 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 1));
const auto src_step_0_m1 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, -1));
const auto src_step_p1_0 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(1, 0));
const auto src_step_m1_0 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(-1, 0));
const auto dst_step_0_p1 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 1));
const auto dst_step_0_m1 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, -1));
const auto dst_step_p1_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(1, 0));
const auto dst_step_m1_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-1, 0));
constexpr index_t J0 = SliceLengths{}[0];
constexpr index_t J1 = SliceLengths{}[1];
bool forward_dim0 = true;
bool forward_dim1 = true;
// hardcoded for 2d loop for now
#pragma unroll
for(index_t j0 = 0; j0 < J0; ++j0)
{
#pragma unroll
for(index_t j1 = 0; j1 < J1; ++j1)
{
// do work
transfer_data<SrcData,
1,
SrcAddressSpace,
DstAddressSpace,
DstInMemOp,
SrcScalarStrideInVector,
DstScalarStrideInVector>(
p_src,
src_coord.GetOffset(),
coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc, src_coord),
src_desc.GetElementSpaceSize(),
p_dst,
dst_coord.GetOffset(),
coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord),
dst_desc.GetElementSpaceSize());
// move dim1 iterator
if(j1 < J1 - 1)
@@ -145,8 +227,34 @@ __host__ __device__ constexpr void threadwise_dynamic_tensor_slice_transfer_v1(
}
}
}
// move the src and dst coordinates back to their origins
// hardcoded for the 2d loop
if constexpr(J0 % 2 == 0)
{
const auto src_step_back =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(-(J0 - 1), 0));
const auto dst_step_back =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-(J0 - 1), 0));
move_dynamic_tensor_coordinate(src_desc, src_coord, src_step_back);
move_dynamic_tensor_coordinate(dst_desc, dst_coord, dst_step_back);
}
else
{
const auto src_step_back =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(-(J0 - 1), -(J1 - 1)));
const auto dst_step_back =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-(J0 - 1), -(J1 - 1)));
move_dynamic_tensor_coordinate(src_desc, src_coord, src_step_back);
move_dynamic_tensor_coordinate(dst_desc, dst_coord, dst_step_back);
}
}
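The parity split above follows from the zig-zag traversal: dim0 only ever advances, while dim1 reverses direction after every row, so the last row ends at dim1 == 0 when J0 is even and at dim1 == J1 - 1 when J0 is odd. The following is a standalone host-side C++14 check, not code from this commit, that replays the same loop structure and confirms the end position the step-back has to undo.

// zigzag_end replays the loop structure of v1r2: dim0 always advances forward,
// dim1 alternates direction after every row.
struct EndPos
{
    int i0;
    int i1;
};

constexpr EndPos zigzag_end(int J0, int J1)
{
    int i0 = 0;
    int i1 = 0;
    bool forward_dim1 = true;

    for(int j0 = 0; j0 < J0; ++j0)
    {
        for(int j1 = 0; j1 < J1; ++j1)
        {
            if(j1 < J1 - 1)
                i1 += forward_dim1 ? 1 : -1; // move dim1 iterator
        }

        forward_dim1 = !forward_dim1; // switch dim1 iteration direction

        if(j0 < J0 - 1)
            i0 += 1; // move dim0 iterator
    }

    return {i0, i1};
}

// even J0: the last row is walked backward, so dim1 ends back at 0
static_assert(zigzag_end(4, 8).i0 == 3 && zigzag_end(4, 8).i1 == 0, "step back is (-(J0-1), 0)");
// odd J0: the last row is walked forward, so dim1 ends at J1 - 1
static_assert(zigzag_end(3, 8).i0 == 2 && zigzag_end(3, 8).i1 == 7, "step back is (-(J0-1), -(J1-1))");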
// this version has a scratch memory issue, because:
// 1. ThreadwiseDynamicTensorSliceTransfer_v1r1 keeps a reference to the tensor descriptor
// 2. threadwise_dynamic_tensor_slice_transfer_v1r1 constructs a new tensor coordinate
template <typename SrcDesc,
typename DstDesc,
typename SliceLengths,
@@ -159,7 +267,7 @@ template <typename SrcDesc,
InMemoryDataOperation DstInMemOp,
index_t SrcScalarStrideInVector,
index_t DstScalarStrideInVector>
struct ThreadwiseDynamicTensorSliceTransfer_v1r1
{
static constexpr index_t nDim = SliceLengths::Size();
using Index = MultiIndex<nDim>;
@@ -170,10 +278,10 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1
using SrcCoordStep = decltype(make_dynamic_tensor_coordinate_step(SrcDesc{}, Index{}));
using DstCoordStep = decltype(make_dynamic_tensor_coordinate_step(DstDesc{}, Index{}));
__device__ constexpr ThreadwiseDynamicTensorSliceTransfer_v1r1(const SrcDesc& src_desc,
const Index& src_slice_origin,
const DstDesc& dst_desc,
const Index& dst_slice_origin)
: src_desc_(src_desc),
src_slice_origin_(make_dynamic_tensor_coordinate(src_desc, src_slice_origin)),
dst_desc_(dst_desc),
@@ -181,8 +289,8 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1
{
}
__device__ constexpr ThreadwiseDynamicTensorSliceTransfer_v1r1()
: ThreadwiseDynamicTensorSliceTransfer_v1r1(
SrcDesc{}, make_zero_multi_index<nDim>(), DstDesc{}, make_zero_multi_index<nDim>())
{
}
@@ -190,20 +298,20 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1
template <typename SrcData, typename DstData>
__device__ void Run(const SrcData* p_src, DstData* p_dst) const
{
threadwise_dynamic_tensor_slice_transfer_v1r1<SrcData,
DstData,
SrcDesc,
DstDesc,
SliceLengths,
SrcDstDimAccessOrder,
SrcDstVectorAccessDim,
SrcScalarPerVector,
DstScalarPerVector,
SrcAddressSpace,
DstAddressSpace,
DstInMemOp,
SrcScalarStrideInVector,
DstScalarStrideInVector>(
src_desc_, src_slice_origin_, p_src, dst_desc_, dst_slice_origin_, p_dst);
}
@@ -243,5 +351,104 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1
DstCoord dst_slice_origin_;
};
// this version does not have the scratch memory issue, because:
// 1. ThreadwiseDynamicTensorSliceTransfer_v1r2 does not keep a reference to the tensor descriptor
// 2. threadwise_dynamic_tensor_slice_transfer_v1r2 does not construct a new tensor coordinate
template <typename SrcDesc,
typename DstDesc,
typename SliceLengths,
typename SrcDstDimAccessOrder,
index_t SrcDstVectorAccessDim,
index_t SrcScalarPerVector,
index_t DstScalarPerVector,
AddressSpace SrcAddressSpace,
AddressSpace DstAddressSpace,
InMemoryDataOperation DstInMemOp,
index_t SrcScalarStrideInVector,
index_t DstScalarStrideInVector>
struct ThreadwiseDynamicTensorSliceTransfer_v1r2
{
static constexpr index_t nDim = SliceLengths::Size();
using Index = MultiIndex<nDim>;
using SrcCoord = decltype(make_dynamic_tensor_coordinate(SrcDesc{}, Index{}));
using DstCoord = decltype(make_dynamic_tensor_coordinate(DstDesc{}, Index{}));
using SrcCoordStep = decltype(make_dynamic_tensor_coordinate_step(SrcDesc{}, Index{}));
using DstCoordStep = decltype(make_dynamic_tensor_coordinate_step(DstDesc{}, Index{}));
__device__ constexpr ThreadwiseDynamicTensorSliceTransfer_v1r2(const SrcDesc& src_desc,
const Index& src_slice_origin,
const DstDesc& dst_desc,
const Index& dst_slice_origin)
: src_slice_origin_(make_dynamic_tensor_coordinate(src_desc, src_slice_origin)),
dst_slice_origin_(make_dynamic_tensor_coordinate(dst_desc, dst_slice_origin))
{
}
__device__ constexpr ThreadwiseDynamicTensorSliceTransfer_v1r2()
: ThreadwiseDynamicTensorSliceTransfer_v1r2(
SrcDesc{}, make_zero_multi_index<nDim>(), DstDesc{}, make_zero_multi_index<nDim>())
{
}
template <typename SrcData, typename DstData>
__device__ void
Run(const SrcDesc& src_desc, const SrcData* p_src, const DstDesc& dst_desc, DstData* p_dst)
{
threadwise_dynamic_tensor_slice_transfer_v1r2<SrcData,
DstData,
SrcDesc,
DstDesc,
SliceLengths,
SrcDstDimAccessOrder,
SrcDstVectorAccessDim,
SrcScalarPerVector,
DstScalarPerVector,
SrcAddressSpace,
DstAddressSpace,
DstInMemOp,
SrcScalarStrideInVector,
DstScalarStrideInVector>(
src_desc, src_slice_origin_, p_src, dst_desc, dst_slice_origin_, p_dst);
}
__device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx)
{
src_slice_origin_ = make_dynamic_tensor_coordinate(src_desc, src_slice_origin_idx);
}
__device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx)
{
dst_slice_origin_ = make_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_idx);
}
// src_slice_origin_step_idx needs to be known at compile time, for performance reasons
__device__ void MoveSrcSliceWindow(const SrcDesc& src_desc,
const Index& src_slice_origin_step_idx)
{
// is it OK to do this every time?
const auto src_slice_origin_step =
make_dynamic_tensor_coordinate_step(src_desc, src_slice_origin_step_idx);
move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_slice_origin_step);
}
// dst_slice_origin_step_idx needs to be known at compile time, for performance reasons
__device__ void MoveDstSliceWindow(const DstDesc& dst_desc,
const Index& dst_slice_origin_step_idx)
{
// is it OK to do this every time?
const auto dst_slice_origin_step =
make_dynamic_tensor_coordinate_step(dst_desc, dst_slice_origin_step_idx);
move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_slice_origin_step);
}
private:
SrcCoord src_slice_origin_;
DstCoord dst_slice_origin_;
};
} // namespace ck
#endif
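For reference, a usage sketch of the r2 interface assembled from the signatures introduced in this header: the caller owns the descriptors and passes them into every call, and the transfer object carries only the two coordinates that the MoveSrcSliceWindow / MoveDstSliceWindow calls advance in place. The tile shape, access order, and helper name are made up, and the sketch assumes a translation unit that includes this header and its dependencies.

template <typename SrcDesc, typename DstDesc>
__device__ void copy_k_slices(const SrcDesc& src_desc,
                              const float* p_src,
                              const DstDesc& dst_desc,
                              float* p_dst,
                              index_t k_total,
                              index_t k_per_step)
{
    // hypothetical 2d per-thread tile
    using SliceLengths = Sequence<2, 4>;

    auto transfer = ThreadwiseDynamicTensorSliceTransfer_v1r2<SrcDesc,
                                                              DstDesc,
                                                              SliceLengths,
                                                              Sequence<0, 1>, // dim access order
                                                              1,              // vector access dim
                                                              1,              // src scalars per vector
                                                              1,              // dst scalars per vector
                                                              AddressSpace::Global,
                                                              AddressSpace::Global,
                                                              InMemoryDataOperation::Set,
                                                              1,
                                                              1>(
        src_desc, make_multi_index(0, 0), dst_desc, make_multi_index(0, 0));

    for(index_t k = 0; k < k_total; k += k_per_step)
    {
        // descriptors are passed into every call; only the coordinates live in `transfer`
        transfer.Run(src_desc, p_src, dst_desc, p_dst);

        transfer.MoveSrcSliceWindow(src_desc, make_multi_index(k_per_step, 0));
        transfer.MoveDstSliceWindow(dst_desc, make_multi_index(k_per_step, 0));
    }
}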
@@ -19,6 +19,20 @@ int main(int argc, char* argv[])
using namespace ck;
#if 1
constexpr index_t N = 128;
constexpr index_t C = 128;
constexpr index_t HI = 17;
constexpr index_t WI = 17;
constexpr index_t K = 128;
constexpr index_t Y = 3;
constexpr index_t X = 3;
using ConvStrides = Sequence<2, 2>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<1, 1>;
using RightPads = Sequence<1, 1>;
#elif 0
// 3x3, 71x71
constexpr index_t N = 128;
constexpr index_t C = 192;
......
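As a quick host-side check of the newly added configuration (standard convolution output-size formula; the GEMM mapping shown is the usual forward implicit-GEMM convention and is an assumption here, not something stated in the diff), the 17x17 input with a 3x3 filter, stride 2, dilation 1, and padding 1 produces a 9x9 output:

#include <cstdio>

int main()
{
    // the configuration added above
    const int N = 128, C = 128, HI = 17, WI = 17, K = 128, Y = 3, X = 3;
    const int stride = 2, dilation = 1, left_pad = 1, right_pad = 1;

    // standard convolution output-size formula
    const int HO = (HI + left_pad + right_pad - dilation * (Y - 1) - 1) / stride + 1; // 9
    const int WO = (WI + left_pad + right_pad - dilation * (X - 1) - 1) / stride + 1; // 9

    // usual forward implicit-GEMM mapping (assumed): GemmM = K, GemmK = C*Y*X, GemmN = N*HO*WO
    std::printf("output %dx%d; GemmM = %d, GemmK = %d, GemmN = %d\n",
                HO, WO, K, C * Y * X, N * HO * WO);
    return 0;
}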
@@ -561,7 +561,7 @@ int main(int argc, char* argv[])
LeftPads{},
RightPads{},
nrepeat);
#elif 0
device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(in_nchw_desc,
in_nchw,
wei_kcyx_desc,
......