gaoqiong / composable_kernel / Commits / 8e5085fa

Commit 8e5085fa, authored Nov 19, 2020 by Chao Liu
Parent: b50fa980

    fix scratch memory issue

Showing 6 changed files with 718 additions and 236 deletions (+718 -236)
composable_kernel/include/kernel_algorithm/dynamic_gridwise_col2im_gemmkgemmn_nchw.hpp    +124 -48
composable_kernel/include/kernel_algorithm/dynamic_gridwise_copy_gemmkgemmn.hpp           +103 -99
composable_kernel/include/tensor_operation/blockwise_dynamic_tensor_slice_transfer.hpp    +228 -47
composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer.hpp   +248 -41
driver/src/col2im_driver.cpp                                                               +14  -0
driver/src/conv_driver.cpp                                                                  +1  -1
composable_kernel/include/kernel_algorithm/dynamic_gridwise_col2im_gemmkgemmn_nchw.hpp

@@ -96,11 +96,14 @@ template <index_t BlockSize,
           index_t BlockCopyDataPerAccess_GemmN>
 struct DynamicGridwiseCol2Im_gemmkgemmn_nchw
 {
+    // this version has scratch memory issue, due to:
+    // 1. ThreadwiseDynamicTensorSliceTransfer_v1r1 keeps reference to tensor descriptor
+    // 2. threadwise_dynamic_tensor_slice_transfer_v1r1 constructs new tensor coordinate
     template <typename... Col, typename... Img>
-    __device__ void Run(const float* const __restrict__ p_col_global,
+    __device__ void Run_r1(const float* const __restrict__ p_col_global,
                         float* const __restrict__ p_img_global,
                         const DynamicTensorDescriptor<Col...>& col_gemmk_gemmn_global_desc,
                         const DynamicTensorDescriptor<Img...>& img_gemmk_gemmn_global_desc) const
     {
         constexpr auto I0 = Number<0>{};
         constexpr auto I1 = Number<1>{};

@@ -117,53 +120,52 @@ struct DynamicGridwiseCol2Im_gemmkgemmn_nchw
         const index_t gemmn_block_data_on_global = block_work_id * GemmNPerBlock;

         // blockwise atomic accumulation
         auto blockwise_copy =
 #if 1
-            BlockwiseDynamicTensorSliceTransfer_v1<
+            BlockwiseDynamicTensorSliceTransfer_v1r1<
                 BlockSize, float, float,
                 decltype(col_gemmk_gemmn_global_desc), decltype(img_gemmk_gemmn_global_desc),
                 Sequence<GemmKPerBlock, GemmNPerBlock>,
                 BlockCopySubLengths_GemmK_GemmN, BlockCopyClusterLengths_GemmK_GemmN,
                 BlockCopyThreadClusterArrangeOrder, BlockCopySrcAccessOrder,
                 1, BlockCopyDataPerAccess_GemmN, BlockCopyDataPerAccess_GemmN,
                 AddressSpace::Global, AddressSpace::Global,
-                InMemoryDataOperation::AtomicAdd, 1, 1>(
+                InMemoryDataOperation::AtomicAdd, 1, 1>
-#else
+#elif 1
-            BlockwiseDynamicTensorSliceTransfer_v2<
+            BlockwiseDynamicTensorSliceTransfer_v2r1<
                 BlockSize, float, float,
                 decltype(col_gemmk_gemmn_global_desc), decltype(img_gemmk_gemmn_global_desc),
                 Sequence<GemmKPerBlock, GemmNPerBlock>,
                 BlockCopySubLengths_GemmK_GemmN, BlockCopyClusterLengths_GemmK_GemmN,
                 BlockCopyThreadClusterArrangeOrder, BlockCopySrcAccessOrder, BlockCopyDstAccessOrder,
                 1, 1, BlockCopyDataPerAccess_GemmN, BlockCopyDataPerAccess_GemmN,
                 AddressSpace::Global, AddressSpace::Global,
-                InMemoryDataOperation::AtomicAdd, 1, 1>(
+                InMemoryDataOperation::AtomicAdd, 1, 1>
 #endif
-                col_gemmk_gemmn_global_desc,
+                (col_gemmk_gemmn_global_desc,
                 make_multi_index(0, gemmn_block_data_on_global),
                 img_gemmk_gemmn_global_desc,
                 make_multi_index(0, gemmn_block_data_on_global));

         for(index_t gemmk = 0; gemmk < GemmK; gemmk += GemmKPerBlock)
         {

@@ -173,6 +175,80 @@ struct DynamicGridwiseCol2Im_gemmkgemmn_nchw
             blockwise_copy.MoveDstSliceWindow(make_multi_index(GemmKPerBlock, 0));
         }
     }

+    // this version does not have scratch memory issue, due to:
+    // 1. ThreadwiseDynamicTensorSliceTransfer_v1r2 does not keep reference to tensor descriptor
+    // 2. threadwise_dynamic_tensor_slice_transfer_v1r2 does not construct new tensor coordinate
+    template <typename... Col, typename... Img>
+    __device__ void Run_r2(const float* const __restrict__ p_col_global,
+                           float* const __restrict__ p_img_global,
+                           const DynamicTensorDescriptor<Col...>& col_gemmk_gemmn_global_desc,
+                           const DynamicTensorDescriptor<Img...>& img_gemmk_gemmn_global_desc) const
+    {
+        constexpr auto I0 = Number<0>{};
+        constexpr auto I1 = Number<1>{};
+        constexpr auto I2 = Number<2>{};
+        constexpr auto I3 = Number<3>{};
+
+        const index_t GemmK = col_gemmk_gemmn_global_desc.GetLength(I0);
+        const index_t GemmN = col_gemmk_gemmn_global_desc.GetLength(I1);
+
+        // divide block work by GemmN
+        const index_t GemmNBlockWork = GemmN / GemmNPerBlock;
+
+        const index_t block_work_id = get_block_1d_id();
+
+        const index_t gemmn_block_data_on_global = block_work_id * GemmNPerBlock;
+
+        auto blockwise_copy = BlockwiseDynamicTensorSliceTransfer_v2r2<
+            BlockSize, float, float,
+            decltype(col_gemmk_gemmn_global_desc), decltype(img_gemmk_gemmn_global_desc),
+            Sequence<GemmKPerBlock, GemmNPerBlock>,
+            BlockCopySubLengths_GemmK_GemmN, BlockCopyClusterLengths_GemmK_GemmN,
+            BlockCopyThreadClusterArrangeOrder, BlockCopySrcAccessOrder, BlockCopyDstAccessOrder,
+            1, 1, BlockCopyDataPerAccess_GemmN, BlockCopyDataPerAccess_GemmN,
+            AddressSpace::Global, AddressSpace::Global,
+            InMemoryDataOperation::AtomicAdd, 1, 1>(
+            col_gemmk_gemmn_global_desc,
+            make_multi_index(0, gemmn_block_data_on_global),
+            img_gemmk_gemmn_global_desc,
+            make_multi_index(0, gemmn_block_data_on_global));
+
+        for(index_t gemmk = 0; gemmk < GemmK; gemmk += GemmKPerBlock)
+        {
+            blockwise_copy.Run(col_gemmk_gemmn_global_desc, p_col_global,
+                               img_gemmk_gemmn_global_desc, p_img_global);
+
+            blockwise_copy.MoveSrcSliceWindow(col_gemmk_gemmn_global_desc,
+                                              make_multi_index(GemmKPerBlock, 0));
+            blockwise_copy.MoveDstSliceWindow(img_gemmk_gemmn_global_desc,
+                                              make_multi_index(GemmKPerBlock, 0));
+        }
+    }
+
+    template <typename... Col, typename... Img>
+    __device__ void Run(const float* const __restrict__ p_col_global,
+                        float* const __restrict__ p_img_global,
+                        const DynamicTensorDescriptor<Col...>& col_gemmk_gemmn_global_desc,
+                        const DynamicTensorDescriptor<Img...>& img_gemmk_gemmn_global_desc) const
+    {
+        Run_r2(p_col_global, p_img_global, col_gemmk_gemmn_global_desc, img_gemmk_gemmn_global_desc);
+    }
 };

 } // namespace ck
...
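Note on the scratch-memory fix: the comments in the kernel above attribute the problem to the r1 transfer keeping a reference to the tensor descriptor and constructing fresh coordinates on every call, while the r2 transfer stores only coordinates and takes the descriptor as an argument. The sketch below is illustrative only; Descriptor, the member names, and the spilling behaviour are assumptions for the sake of the example, not composable_kernel code.

// Illustrative sketch of the two ownership patterns (plain C++, hypothetical types).
struct Descriptor
{
    int stride;
};

// r1-style: the transfer object holds a reference to a descriptor that lives in the
// caller's frame. In device code this can prevent the compiler from keeping the
// coordinate math in registers, so state may be spilled to scratch memory.
struct TransferR1
{
    const Descriptor& desc_; // reference member kept across calls
    int offset_;

    TransferR1(const Descriptor& desc, int origin) : desc_(desc), offset_(origin) {}
    void Move(int step) { offset_ += step * desc_.stride; }
};

// r2-style: the object stores only its coordinate; the descriptor is passed into every
// call, so nothing from the caller's frame has to stay addressable between calls.
struct TransferR2
{
    int offset_;

    explicit TransferR2(int origin) : offset_(origin) {}
    void Move(const Descriptor& desc, int step) { offset_ += step * desc.stride; }
};

On AMD GPUs, "scratch" is per-thread private memory backed by off-chip memory, so this kind of spill generally shows up directly in kernel latency.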
composable_kernel/include/kernel_algorithm/dynamic_gridwise_copy_gemmkgemmn.hpp

@@ -20,12 +20,11 @@ template <index_t BlockSize,
           index_t BlockCopyDataPerAccess_GemmN>
 struct DynamicGridwiseCopy_gemmkgemmn
 {
-#if 1
     template <typename... Src, typename... Dst>
-    __device__ void Run(const float* const __restrict__ p_src_global,
+    __device__ void Run_r1(const float* const __restrict__ p_src_global,
                         float* const __restrict__ p_dst_global,
                         const DynamicTensorDescriptor<Src...>& src_gemmk_gemmn_global_desc,
                         const DynamicTensorDescriptor<Dst...>& dst_gemmk_gemmn_global_desc) const
     {
         constexpr auto I0 = Number<0>{};
         constexpr auto I1 = Number<1>{};

@@ -43,50 +42,50 @@ struct DynamicGridwiseCopy_gemmkgemmn
         // blockwise atomic accumulation
         auto blockwise_copy =
 #if 0
-            BlockwiseDynamicTensorSliceTransfer_v1<
+            BlockwiseDynamicTensorSliceTransfer_v1r1<
                 BlockSize, float, float,
                 decltype(src_gemmk_gemmn_global_desc), decltype(dst_gemmk_gemmn_global_desc),
                 Sequence<GemmKPerBlock, GemmNPerBlock>,
                 BlockCopySubLengths_GemmK_GemmN, BlockCopyClusterLengths_GemmK_GemmN,
                 BlockCopyThreadClusterArrangeOrder, BlockCopySrcAccessOrder,
                 1, BlockCopyDataPerAccess_GemmN, BlockCopyDataPerAccess_GemmN,
                 AddressSpace::Global, AddressSpace::Global,
-                InMemoryDataOperation::Set, 1, 1>(
+                InMemoryDataOperation::Set, 1, 1>
-#else
+#elif 1
-            BlockwiseDynamicTensorSliceTransfer_v2<
+            BlockwiseDynamicTensorSliceTransfer_v2r1<
                 BlockSize, float, float,
                 decltype(src_gemmk_gemmn_global_desc), decltype(dst_gemmk_gemmn_global_desc),
                 Sequence<GemmKPerBlock, GemmNPerBlock>,
                 BlockCopySubLengths_GemmK_GemmN, BlockCopyClusterLengths_GemmK_GemmN,
                 BlockCopyThreadClusterArrangeOrder, BlockCopySrcAccessOrder, BlockCopyDstAccessOrder,
                 1, 1, BlockCopyDataPerAccess_GemmN, BlockCopyDataPerAccess_GemmN,
                 AddressSpace::Global, AddressSpace::Global,
-                InMemoryDataOperation::Set, 1, 1>(
+                InMemoryDataOperation::Set, 1, 1>
 #endif
-                src_gemmk_gemmn_global_desc,
+                (src_gemmk_gemmn_global_desc,
                 make_multi_index(0, gemmn_block_data_on_global),
                 dst_gemmk_gemmn_global_desc,
                 make_multi_index(0, gemmn_block_data_on_global));

         for(index_t gemmk = 0; gemmk < GemmK; gemmk += GemmKPerBlock)
         {

@@ -96,12 +95,12 @@ struct DynamicGridwiseCopy_gemmkgemmn
             blockwise_copy.MoveDstSliceWindow(make_multi_index(GemmKPerBlock, 0));
         }
     }

-#else
     template <typename... Src, typename... Dst>
-    __device__ void Run(const float* const __restrict__ p_src_global,
+    __device__ void Run_r2(const float* const __restrict__ p_src_global,
                         float* const __restrict__ p_dst_global,
                         const DynamicTensorDescriptor<Src...>& src_gemmk_gemmn_global_desc,
                         const DynamicTensorDescriptor<Dst...>& dst_gemmk_gemmn_global_desc) const
     {
         constexpr auto I0 = Number<0>{};
         constexpr auto I1 = Number<1>{};

@@ -114,53 +113,58 @@ struct DynamicGridwiseCopy_gemmkgemmn
         const index_t block_work_id = get_block_1d_id();

-        // divide thread work by GemmK, GemmN
-        static constexpr auto thread_cluster_desc = make_cluster_descriptor(
-            BlockCopyClusterLengths_GemmK_GemmN{}, BlockCopyThreadClusterArrangeOrder{});
-
-        const auto thread_work_id =
-            thread_cluster_desc.CalculateClusterIndex(get_thread_local_1d_id());
-
-        // gemmk, gemmn
-        constexpr index_t GemmKPerThread = BlockCopySubLengths_GemmK_GemmN::At(I0);
-        constexpr index_t GemmNPerThread = BlockCopySubLengths_GemmK_GemmN::At(I1);
-
-        const index_t gemmk_thread_data_on_global =
-            thread_work_id[I0] * BlockCopySubLengths_GemmK_GemmN::At(I0);
-
-        const index_t gemmn_thread_data_on_global =
-            block_work_id * GemmNPerBlock +
-            thread_work_id[I1] * BlockCopySubLengths_GemmK_GemmN::At(I1);
-
-        auto src_coord = make_dynamic_tensor_coordinate(
-            src_gemmk_gemmn_global_desc,
-            make_multi_index(gemmk_thread_data_on_global, gemmn_thread_data_on_global));
-
-        auto dst_coord = make_dynamic_tensor_coordinate(
-            dst_gemmk_gemmn_global_desc,
-            make_multi_index(gemmk_thread_data_on_global, gemmn_thread_data_on_global));
-
-        threadwise_dynamic_tensor_slice_transfer_v1<
-            float, float,
-            decltype(src_gemmk_gemmn_global_desc), decltype(dst_gemmk_gemmn_global_desc),
-            BlockCopySubLengths_GemmK_GemmN, BlockCopyThreadClusterArrangeOrder,
-            1, 1, 1,
-            AddressSpace::Global, AddressSpace::Global, InMemoryDataOperation::Set,
-            1, 1>(
-            src_gemmk_gemmn_global_desc, src_coord, p_src_global,
-            dst_gemmk_gemmn_global_desc, dst_coord, p_dst_global);
-    }
-#endif
+        const index_t gemmn_block_data_on_global = block_work_id * GemmNPerBlock;
+
+        // blockwise atomic accumulation
+        auto blockwise_copy = BlockwiseDynamicTensorSliceTransfer_v2r2<
+            BlockSize, float, float,
+            decltype(src_gemmk_gemmn_global_desc), decltype(dst_gemmk_gemmn_global_desc),
+            Sequence<GemmKPerBlock, GemmNPerBlock>,
+            BlockCopySubLengths_GemmK_GemmN, BlockCopyClusterLengths_GemmK_GemmN,
+            BlockCopyThreadClusterArrangeOrder, BlockCopySrcAccessOrder, BlockCopyDstAccessOrder,
+            1, 1, BlockCopyDataPerAccess_GemmN, BlockCopyDataPerAccess_GemmN,
+            AddressSpace::Global, AddressSpace::Global,
+            InMemoryDataOperation::Set, 1, 1>(
+            src_gemmk_gemmn_global_desc, make_multi_index(0, gemmn_block_data_on_global),
+            dst_gemmk_gemmn_global_desc, make_multi_index(0, gemmn_block_data_on_global));
+
+        for(index_t gemmk = 0; gemmk < GemmK; gemmk += GemmKPerBlock)
+        {
+            blockwise_copy.Run(src_gemmk_gemmn_global_desc, p_src_global,
+                               dst_gemmk_gemmn_global_desc, p_dst_global);
+
+            blockwise_copy.MoveSrcSliceWindow(src_gemmk_gemmn_global_desc,
+                                              make_multi_index(GemmKPerBlock, 0));
+            blockwise_copy.MoveDstSliceWindow(dst_gemmk_gemmn_global_desc,
+                                              make_multi_index(GemmKPerBlock, 0));
+        }
+    }
+
+    template <typename... Src, typename... Dst>
+    __device__ void Run(const float* const __restrict__ p_src_global,
+                        float* const __restrict__ p_dst_global,
+                        const DynamicTensorDescriptor<Src...>& src_gemmk_gemmn_global_desc,
+                        const DynamicTensorDescriptor<Dst...>& dst_gemmk_gemmn_global_desc) const
+    {
+        Run_r2(p_src_global, p_dst_global, src_gemmk_gemmn_global_desc, dst_gemmk_gemmn_global_desc);
+    }
 };

 } // namespace ck
...
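The old Run body removed above decomposed the block's GemmK x GemmN window into per-thread tiles by hand: a thread's cluster index times its per-thread sub-lengths gives its data origin inside the window. The self-contained sketch below reproduces that arithmetic; the concrete lengths and the row-major cluster mapping are assumed example values, and the BlockCopyThreadClusterArrangeOrder permutation used by the real code is ignored for simplicity.

#include <array>
#include <cstdio>

// Assumed example values; the kernels take these from BlockCopySubLengths_GemmK_GemmN
// and BlockCopyClusterLengths_GemmK_GemmN.
constexpr int GemmKPerThread = 4;
constexpr int GemmNPerThread = 1;
constexpr int ClusterK = 2;
constexpr int ClusterN = 128;

// thread id -> (cluster_k, cluster_n), row-major over the cluster lengths
std::array<int, 2> cluster_index(int tid) { return {tid / ClusterN, tid % ClusterN}; }

// per-thread data origin inside the block's GemmK x GemmN slice window
std::array<int, 2> thread_origin(int tid)
{
    const auto c = cluster_index(tid);
    return {c[0] * GemmKPerThread, c[1] * GemmNPerThread};
}

int main()
{
    const auto o = thread_origin(130);                   // thread 130 of a 256-thread block
    std::printf("origin = (%d, %d)\n", o[0], o[1]);      // prints (4, 2)
}

The r2 refactor moves exactly this bookkeeping out of the gridwise kernel and into BlockwiseDynamicTensorSliceTransfer_v2r2, which performs it in its constructor.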
composable_kernel/include/tensor_operation/blockwise_dynamic_tensor_slice_transfer.hpp

@@ -9,6 +9,7 @@
 namespace ck {

+// this version does not have scratch memory issue, which is good, but I don't know why
 template <index_t BlockSize,
           typename BlockSrcData,
           typename BlockDstData,

@@ -27,17 +28,18 @@ template <index_t BlockSize,
           InMemoryDataOperation DstInMemOp,
           index_t SrcDataStride,
           index_t DstDataStride>
-struct BlockwiseDynamicTensorSliceTransfer_v1
+struct BlockwiseDynamicTensorSliceTransfer_v1r1
 {
     static constexpr index_t nDim =
         remove_reference_t<remove_cv_t<BlockSrcDesc>>::GetNumOfDimension();

     using Index = MultiIndex<nDim>;

-    __device__ constexpr BlockwiseDynamicTensorSliceTransfer_v1(const BlockSrcDesc& block_src_desc,
-                                                                const Index& src_block_slice_origin,
-                                                                const BlockDstDesc& block_dst_desc,
-                                                                const Index& dst_block_slice_origin)
+    __device__ constexpr BlockwiseDynamicTensorSliceTransfer_v1r1(
+        const BlockSrcDesc& block_src_desc,
+        const Index& src_block_slice_origin,
+        const BlockDstDesc& block_dst_desc,
+        const Index& dst_block_slice_origin)
         : threadwise_transfer_(block_src_desc,
                                make_zero_multi_index<nDim>(),
                                block_dst_desc,

@@ -103,22 +105,25 @@ struct BlockwiseDynamicTensorSliceTransfer_v1
     static constexpr auto thread_cluster_desc_ =
         make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});

-    using ThreadwiseTransfer = ThreadwiseDynamicTensorSliceTransfer_v1<
+    using ThreadwiseTransfer = ThreadwiseDynamicTensorSliceTransfer_v1r1<
         BlockSrcDesc, BlockDstDesc, ThreadSliceLengths, SrcDstDimAccessOrder, SrcDstVectoReadDim,
         SrcDataPerRead, DstDataPerWrite, SrcAddressSpace, DstAddressSpace, DstInMemOp,
         SrcDataStride, DstDataStride>;

     ThreadwiseTransfer threadwise_transfer_;
 };

+// this version has scratch memory issue, due to:
+// 1. ThreadwiseDynamicTensorSliceTransfer_v1r1 keeps reference to tensor descriptor
+// 2. threadwise_dynamic_tensor_slice_transfer_v1r1 constructs new tensor coordinate
 template <index_t BlockSize,
           typename BlockSrcData,
           typename BlockDstData,

@@ -139,17 +144,18 @@ template <index_t BlockSize,
           InMemoryDataOperation DstInMemOp,
           index_t SrcDataStride,
           index_t DstDataStride>
-struct BlockwiseDynamicTensorSliceTransfer_v2
+struct BlockwiseDynamicTensorSliceTransfer_v2r1
 {
     static constexpr index_t nDim =
         remove_reference_t<remove_cv_t<BlockSrcDesc>>::GetNumOfDimension();

     using Index = MultiIndex<nDim>;

-    __device__ constexpr BlockwiseDynamicTensorSliceTransfer_v2(const BlockSrcDesc& block_src_desc,
-                                                                const Index& src_block_slice_origin,
-                                                                const BlockDstDesc& block_dst_desc,
-                                                                const Index& dst_block_slice_origin)
+    __device__ constexpr BlockwiseDynamicTensorSliceTransfer_v2r1(
+        const BlockSrcDesc& block_src_desc,
+        const Index& src_block_slice_origin,
+        const BlockDstDesc& block_dst_desc,
+        const Index& dst_block_slice_origin)
         : threadwise_read_(block_src_desc,
                            make_zero_multi_index<nDim>(),
                            thread_buffer_desc_,

@@ -246,31 +252,206 @@ struct BlockwiseDynamicTensorSliceTransfer_v2
     static constexpr auto thread_buffer_desc_ =
         make_dynamic_native_tensor_descriptor_packed<nDim>(to_multi_index(ThreadSliceLengths{}));

-    using ThreadwiseRead = ThreadwiseDynamicTensorSliceTransfer_v1<
+    using ThreadwiseRead = ThreadwiseDynamicTensorSliceTransfer_v1r1<
         BlockSrcDesc, decltype(thread_buffer_desc_), ThreadSliceLengths, SrcDimAccessOrder,
         SrcVectorReadDim, SrcDataPerRead, 1, SrcAddressSpace, AddressSpace::Vgpr,
         InMemoryDataOperation::Set, SrcDataStride, 1>;

-    using ThreadwiseWrite = ThreadwiseDynamicTensorSliceTransfer_v1<
+    using ThreadwiseWrite = ThreadwiseDynamicTensorSliceTransfer_v1r1<
         decltype(thread_buffer_desc_), BlockDstDesc, ThreadSliceLengths, DstDimAccessOrder,
         DstVectorWriteDim, 1, DstDataPerWrite, AddressSpace::Vgpr, DstAddressSpace, DstInMemOp,
         1, DstDataStride>;

     ThreadwiseRead threadwise_read_;
     ThreadwiseWrite threadwise_write_;

     static constexpr index_t thread_buffer_element_size_ = thread_buffer_desc_.GetElementSpaceSize();

     BlockSrcData p_thread_buffer_[thread_buffer_element_size_];
 };

+// this version does not have scratch memory issue, due to:
+// 1. ThreadwiseDynamicTensorSliceTransfer_v1r2 does not keep reference to tensor descriptor
+// 2. threadwise_dynamic_tensor_slice_transfer_v1r2 does not construct new tensor coordinate
+template <index_t BlockSize,
+          typename BlockSrcData,
+          typename BlockDstData,
+          typename BlockSrcDesc,
+          typename BlockDstDesc,
+          typename BlockSliceLengths,
+          typename ThreadSliceLengths,
+          typename ThreadClusterLengths,
+          typename ThreadClusterArrangeOrder,
+          typename SrcDimAccessOrder,
+          typename DstDimAccessOrder,
+          index_t SrcVectorReadDim,
+          index_t DstVectorWriteDim,
+          index_t SrcDataPerRead,
+          index_t DstDataPerWrite,
+          AddressSpace SrcAddressSpace,
+          AddressSpace DstAddressSpace,
+          InMemoryDataOperation DstInMemOp,
+          index_t SrcDataStride,
+          index_t DstDataStride>
+struct BlockwiseDynamicTensorSliceTransfer_v2r2
+{
+    static constexpr index_t nDim =
+        remove_reference_t<remove_cv_t<BlockSrcDesc>>::GetNumOfDimension();
+
+    using Index = MultiIndex<nDim>;
+
+    __device__ constexpr BlockwiseDynamicTensorSliceTransfer_v2r2(
+        const BlockSrcDesc& block_src_desc,
+        const Index& src_block_slice_origin,
+        const BlockDstDesc& block_dst_desc,
+        const Index& dst_block_slice_origin)
+        : threadwise_read_(block_src_desc,
+                           make_zero_multi_index<nDim>(),
+                           thread_buffer_desc_,
+                           make_zero_multi_index<nDim>()),
+          threadwise_write_(thread_buffer_desc_,
+                            make_zero_multi_index<nDim>(),
+                            block_dst_desc,
+                            make_zero_multi_index<nDim>())
+    {
+        static_assert(nDim == remove_reference_t<remove_cv_t<BlockSrcDesc>>::GetNumOfDimension() &&
+                          nDim == remove_reference_t<remove_cv_t<BlockDstDesc>>::GetNumOfDimension() &&
+                          nDim == BlockSliceLengths::Size() && nDim == ThreadSliceLengths::Size() &&
+                          nDim == ThreadClusterLengths::Size() &&
+                          nDim == ThreadClusterArrangeOrder::Size() &&
+                          nDim == SrcDimAccessOrder::Size() && nDim == DstDimAccessOrder::Size(),
+                      "wrong! nDim not consistent");
+
+        static_assert(
+            is_same<BlockSliceLengths, decltype(ThreadSliceLengths{} * ThreadClusterLengths{})>{},
+            "wrong! threads should be mapped to cover entire slicing window");
+
+        static_assert(BlockSize >= thread_cluster_desc_.GetElementSize(),
+                      "wrong! BlockSize too small");
+
+        if(BlockSize == thread_cluster_desc_.GetElementSize() or
+           get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
+        {
+            const auto thread_cluster_id =
+                thread_cluster_desc_.CalculateClusterIndex(get_thread_local_1d_id());
+
+            const auto thread_data_id_begin = thread_cluster_id * ThreadSliceLengths{};
+
+            threadwise_read_.SetSrcSliceOrigin(block_src_desc,
+                                               src_block_slice_origin + thread_data_id_begin);
+            threadwise_read_.SetDstSliceOrigin(thread_buffer_desc_, make_zero_multi_index<nDim>());
+
+            threadwise_write_.SetSrcSliceOrigin(thread_buffer_desc_, make_zero_multi_index<nDim>());
+            threadwise_write_.SetDstSliceOrigin(block_dst_desc,
+                                                dst_block_slice_origin + thread_data_id_begin);
+        }
+    }
+
+    __device__ void RunRead(const BlockSrcDesc& block_src_desc, const BlockSrcData* p_block_src)
+    {
+        if(BlockSize == thread_cluster_desc_.GetElementSize() or
+           get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
+        {
+            threadwise_read_.Run(block_src_desc, p_block_src, thread_buffer_desc_, p_thread_buffer_);
+        }
+    }
+
+    __device__ void RunWrite(const BlockDstDesc& block_dst_desc, BlockDstData* p_block_dst)
+    {
+        if(BlockSize == thread_cluster_desc_.GetElementSize() or
+           get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
+        {
+            threadwise_write_.Run(thread_buffer_desc_, p_thread_buffer_, block_dst_desc, p_block_dst);
+        }
+    }
+
+    __device__ void Run(const BlockSrcDesc& block_src_desc,
+                        const BlockSrcData* p_block_src,
+                        const BlockDstDesc& block_dst_desc,
+                        BlockDstData* p_block_dst)
+    {
+        if(BlockSize == thread_cluster_desc_.GetElementSize() or
+           get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
+        {
+            threadwise_read_.Run(block_src_desc, p_block_src, thread_buffer_desc_, p_thread_buffer_);
+
+            // if there is type conversion, it's done during write
+            threadwise_write_.Run(thread_buffer_desc_, p_thread_buffer_, block_dst_desc, p_block_dst);
+        }
+    }
+
+    __device__ void MoveSrcSliceWindow(const BlockSrcDesc& block_src_desc, const Index& step)
+    {
+        if(BlockSize == thread_cluster_desc_.GetElementSize() or
+           get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
+        {
+            threadwise_read_.MoveSrcSliceWindow(block_src_desc, step);
+        }
+    }
+
+    __device__ void MoveDstSliceWindow(const BlockDstDesc& block_dst_desc, const Index& step)
+    {
+        if(BlockSize == thread_cluster_desc_.GetElementSize() or
+           get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
+        {
+            threadwise_write_.MoveDstSliceWindow(block_dst_desc, step);
+        }
+    }
+
+    private:
+    static constexpr auto thread_cluster_desc_ =
+        make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});
+
+    static constexpr auto thread_buffer_desc_ =
+        make_dynamic_native_tensor_descriptor_packed<nDim>(to_multi_index(ThreadSliceLengths{}));
+
+    using ThreadwiseRead = ThreadwiseDynamicTensorSliceTransfer_v1r2<
+        BlockSrcDesc, decltype(thread_buffer_desc_), ThreadSliceLengths, SrcDimAccessOrder,
+        SrcVectorReadDim, SrcDataPerRead, 1, SrcAddressSpace, AddressSpace::Vgpr,
+        InMemoryDataOperation::Set, SrcDataStride, 1>;
+
+    using ThreadwiseWrite = ThreadwiseDynamicTensorSliceTransfer_v1r2<
+        decltype(thread_buffer_desc_), BlockDstDesc, ThreadSliceLengths, DstDimAccessOrder,
+        DstVectorWriteDim, 1, DstDataPerWrite, AddressSpace::Vgpr, DstAddressSpace, DstInMemOp,
+        1, DstDataStride>;
+
+    ThreadwiseRead threadwise_read_;
+    ThreadwiseWrite threadwise_write_;
...
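The new BlockwiseDynamicTensorSliceTransfer_v2r2 keeps only the per-thread coordinates and a small thread-local buffer in the object and receives the descriptors as arguments of every RunRead/RunWrite/Move call. The snippet below is a compact host-side analogue of that structure, not the library's API: the descriptor, the 1-D stride math, and the buffer are stand-ins chosen to keep the example runnable.

#include <cstddef>
#include <cstdio>

struct Desc { std::size_t stride; }; // stand-in for a dynamic tensor descriptor

template <std::size_t N>
struct ThreadTransfer
{
    std::size_t src_off = 0, dst_off = 0; // only coordinates live in the object
    float buf[N] = {};                    // stand-in for the per-thread (VGPR) buffer

    void RunRead(const Desc& src, const float* p_src)
    {
        for(std::size_t i = 0; i < N; ++i) buf[i] = p_src[src_off + i * src.stride];
    }
    void RunWrite(const Desc& dst, float* p_dst)
    {
        // the col2im kernel accumulates (AtomicAdd); the copy kernel would assign instead
        for(std::size_t i = 0; i < N; ++i) p_dst[dst_off + i * dst.stride] += buf[i];
    }
    void MoveSrcSliceWindow(const Desc& src, std::size_t step) { src_off += step * src.stride; }
    void MoveDstSliceWindow(const Desc& dst, std::size_t step) { dst_off += step * dst.stride; }
};

int main()
{
    float src[8] = {1, 2, 3, 4, 5, 6, 7, 8}, dst[8] = {};
    ThreadTransfer<4> t;
    const Desc d{1};
    t.RunRead(d, src);  t.RunWrite(d, dst);
    t.MoveSrcSliceWindow(d, 4);  t.MoveDstSliceWindow(d, 4);
    t.RunRead(d, src);  t.RunWrite(d, dst);
    std::printf("dst[0]=%g dst[7]=%g\n", dst[0], dst[7]); // 1 and 8
}

Because nothing in the object refers back to the caller's frame, the compiler is free to keep the coordinates and the buffer in registers, which is the point of the r2 redesign.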
composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer.hpp

@@ -7,6 +7,8 @@
 namespace ck {

+// threadwise_dynamic_tensor_slice_transfer_v1r1 has scratch memory issue, due to
+// it constructs new tensor coordinate
 template <typename SrcData,
           typename DstData,
           typename SrcDesc,

@@ -21,7 +23,7 @@ template <typename SrcData,
           InMemoryDataOperation DstInMemOp,
           index_t SrcScalarStrideInVector,
           index_t DstScalarStrideInVector>
-__host__ __device__ constexpr void threadwise_dynamic_tensor_slice_transfer_v1(
+__host__ __device__ constexpr void threadwise_dynamic_tensor_slice_transfer_v1r1(
     const SrcDesc& src_desc,
     const DynamicTensorCoordinate_t<SrcDesc>& src_origin_coord,
     const SrcData* p_src,

@@ -29,6 +31,7 @@
     const DynamicTensorCoordinate_t<DstDesc>& dst_origin_coord,
     DstData* p_dst)
 {
+    // comment: construction tensor coordinate here seems to cause scratch memory issue
     auto src_coord = src_origin_coord;
     auto dst_coord = dst_origin_coord;

@@ -64,8 +67,7 @@
 #pragma unroll
         for(index_t j1 = 0; j1 < J1; ++j1)
         {
             // do work
-#if 0
             transfer_data<SrcData,
                           1,
                           SrcAddressSpace,

@@ -81,35 +83,115 @@
                 dst_coord.GetOffset(),
                 coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord),
                 dst_desc.GetElementSpaceSize());
-#else
-            SrcData tmp;
-
-            transfer_data<SrcData, 1, SrcAddressSpace, AddressSpace::Vgpr,
-                          InMemoryDataOperation::Set, 1, 1>(
-                p_src,
-                src_coord.GetOffset(),
-                coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc, src_coord),
-                src_desc.GetElementSpaceSize(),
-                &tmp, 0, true, 1);
-
-            transfer_data<DstData, 1, AddressSpace::Vgpr, DstAddressSpace, DstInMemOp, 1, 1>(
-                &tmp, 0, true, 1,
-                p_dst,
-                dst_coord.GetOffset(),
-                coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord),
-                dst_desc.GetElementSpaceSize());
-#endif

             // move dim1 iterator
             if(j1 < J1 - 1)
...

+// threadwise_dynamic_tensor_slice_transfer_v1r2 does not have scratch memory issue, due to
+// it does not construct new tensor coordinate
+template <typename SrcData,
+          typename DstData,
+          typename SrcDesc,
+          typename DstDesc,
+          typename SliceLengths,
+          typename SrcDstDimAccessOrder,
+          index_t SrcDstVectorAccessDim,
+          index_t SrcScalarPerVector,
+          index_t DstScalarPerVector,
+          AddressSpace SrcAddressSpace,
+          AddressSpace DstAddressSpace,
+          InMemoryDataOperation DstInMemOp,
+          index_t SrcScalarStrideInVector,
+          index_t DstScalarStrideInVector>
+__host__ __device__ constexpr void threadwise_dynamic_tensor_slice_transfer_v1r2(
+    const SrcDesc& src_desc,
+    DynamicTensorCoordinate_t<SrcDesc>& src_coord,
+    const SrcData* p_src,
+    const DstDesc& dst_desc,
+    DynamicTensorCoordinate_t<DstDesc>& dst_coord,
+    DstData* p_dst)
+{
+    // TODO use constexpr for coordinate-step to make sure compiler behave correctly
+    const auto src_step_0_p1 = make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 1));
+    const auto src_step_0_m1 = make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, -1));
+    const auto src_step_p1_0 = make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(1, 0));
+    const auto src_step_m1_0 = make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(-1, 0));
+
+    const auto dst_step_0_p1 = make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 1));
+    const auto dst_step_0_m1 = make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, -1));
+    const auto dst_step_p1_0 = make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(1, 0));
+    const auto dst_step_m1_0 = make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-1, 0));
+
+    constexpr index_t J0 = SliceLengths{}[0];
+    constexpr index_t J1 = SliceLengths{}[1];
+
+    bool forward_dim0 = true;
+    bool forward_dim1 = true;
+
+    // hardcoded for 2d loop for now
+#pragma unroll
+    for(index_t j0 = 0; j0 < J0; ++j0)
+    {
+#pragma unroll
+        for(index_t j1 = 0; j1 < J1; ++j1)
+        {
+            // do work
+            transfer_data<SrcData, 1, SrcAddressSpace, DstAddressSpace, DstInMemOp,
+                          SrcScalarStrideInVector, DstScalarStrideInVector>(
+                p_src,
+                src_coord.GetOffset(),
+                coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc, src_coord),
+                src_desc.GetElementSpaceSize(),
+                p_dst,
+                dst_coord.GetOffset(),
+                coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord),
+                dst_desc.GetElementSpaceSize());
+
+            // move dim1 iterator
+            if(j1 < J1 - 1)
+            {
+                if(forward_dim1)
+                {
+                    move_dynamic_tensor_coordinate(src_desc, src_coord, src_step_0_p1);
+                    move_dynamic_tensor_coordinate(dst_desc, dst_coord, dst_step_0_p1);
+                }
+                else
+                {
+                    move_dynamic_tensor_coordinate(src_desc, src_coord, src_step_0_m1);
+                    move_dynamic_tensor_coordinate(dst_desc, dst_coord, dst_step_0_m1);
+                }
+            }
+        }
+
+        // switch dim1 iteration direction
+        forward_dim1 = !forward_dim1;
+
+        // move dim0 iterator
+        if(j0 < J0 - 1)
+        {
+            if(forward_dim0)
+            {
+                move_dynamic_tensor_coordinate(src_desc, src_coord, src_step_p1_0);
+                move_dynamic_tensor_coordinate(dst_desc, dst_coord, dst_step_p1_0);
+            }
+            else
+            {
+                move_dynamic_tensor_coordinate(src_desc, src_coord, src_step_m1_0);
+                move_dynamic_tensor_coordinate(dst_desc, dst_coord, dst_step_m1_0);
+            }
+        }
+    }
+
+    // move src and dst coordinate back to their origins
+    // hardcoded for 2d loop
+    if constexpr(J0 % 2 == 0)
+    {
+        const auto src_step_back =
+            make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(-(J0 - 1), 0));
+        const auto dst_step_back =
+            make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-(J0 - 1), 0));
+
+        move_dynamic_tensor_coordinate(src_desc, src_coord, src_step_back);
+        move_dynamic_tensor_coordinate(dst_desc, dst_coord, dst_step_back);
+    }
+    else
+    {
+        const auto src_step_back =
+            make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(-(J0 - 1), -(J1 - 1)));
+        const auto dst_step_back =
+            make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-(J0 - 1), -(J1 - 1)));
+
+        move_dynamic_tensor_coordinate(src_desc, src_coord, src_step_back);
+        move_dynamic_tensor_coordinate(dst_desc, dst_coord, dst_step_back);
+    }
+}

+// this version has scratch memory issue, due to:
+// 1. ThreadwiseDynamicTensorSliceTransfer_v1r1 keeps reference to tensor descriptor
+// 2. threadwise_dynamic_tensor_slice_transfer_v1r1 constructs new tensor coordinate
 template <typename SrcDesc,
           typename DstDesc,
           typename SliceLengths,

@@ -159,7 +267,7 @@ template <typename SrcDesc,
           InMemoryDataOperation DstInMemOp,
           index_t SrcScalarStrideInVector,
           index_t DstScalarStrideInVector>
-struct ThreadwiseDynamicTensorSliceTransfer_v1
+struct ThreadwiseDynamicTensorSliceTransfer_v1r1
 {
     static constexpr index_t nDim = SliceLengths::Size();

     using Index = MultiIndex<nDim>;

@@ -170,10 +278,10 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1
     using SrcCoordStep = decltype(make_dynamic_tensor_coordinate_step(SrcDesc{}, Index{}));
     using DstCoordStep = decltype(make_dynamic_tensor_coordinate_step(DstDesc{}, Index{}));

-    __device__ constexpr ThreadwiseDynamicTensorSliceTransfer_v1(const SrcDesc& src_desc,
-                                                                 const Index& src_slice_origin,
-                                                                 const DstDesc& dst_desc,
-                                                                 const Index& dst_slice_origin)
+    __device__ constexpr ThreadwiseDynamicTensorSliceTransfer_v1r1(const SrcDesc& src_desc,
+                                                                   const Index& src_slice_origin,
+                                                                   const DstDesc& dst_desc,
+                                                                   const Index& dst_slice_origin)
         : src_desc_(src_desc),
           src_slice_origin_(make_dynamic_tensor_coordinate(src_desc, src_slice_origin)),
           dst_desc_(dst_desc),

@@ -181,8 +289,8 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1
     {
     }

-    __device__ constexpr ThreadwiseDynamicTensorSliceTransfer_v1()
-        : ThreadwiseDynamicTensorSliceTransfer_v1(
+    __device__ constexpr ThreadwiseDynamicTensorSliceTransfer_v1r1()
+        : ThreadwiseDynamicTensorSliceTransfer_v1r1(
              SrcDesc{}, make_zero_multi_index<nDim>(), DstDesc{}, make_zero_multi_index<nDim>())
     {
     }

@@ -190,20 +298,20 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1
     template <typename SrcData, typename DstData>
     __device__ void Run(const SrcData* p_src, DstData* p_dst) const
     {
-        threadwise_dynamic_tensor_slice_transfer_v1<
+        threadwise_dynamic_tensor_slice_transfer_v1r1<
             SrcData, DstData, SrcDesc, DstDesc, SliceLengths, SrcDstDimAccessOrder,
             SrcDstVectorAccessDim, SrcScalarPerVector, DstScalarPerVector,
             SrcAddressSpace, DstAddressSpace, DstInMemOp,
             SrcScalarStrideInVector, DstScalarStrideInVector>(
             src_desc_, src_slice_origin_, p_src, dst_desc_, dst_slice_origin_, p_dst);
     }

@@ -243,5 +351,104 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1
     DstCoord dst_slice_origin_;
 };

+// this version does not have scratch memory issue, due to:
+// 1. ThreadwiseDynamicTensorSliceTransfer_v1r2 does not keep reference to tensor descriptor
+// 2. threadwise_dynamic_tensor_slice_transfer_v1r2 does not construct new tensor coordinate
+template <typename SrcDesc,
+          typename DstDesc,
+          typename SliceLengths,
+          typename SrcDstDimAccessOrder,
+          index_t SrcDstVectorAccessDim,
+          index_t SrcScalarPerVector,
+          index_t DstScalarPerVector,
+          AddressSpace SrcAddressSpace,
+          AddressSpace DstAddressSpace,
+          InMemoryDataOperation DstInMemOp,
+          index_t SrcScalarStrideInVector,
+          index_t DstScalarStrideInVector>
+struct ThreadwiseDynamicTensorSliceTransfer_v1r2
+{
+    static constexpr index_t nDim = SliceLengths::Size();
+
+    using Index = MultiIndex<nDim>;
+
+    using SrcCoord = decltype(make_dynamic_tensor_coordinate(SrcDesc{}, Index{}));
+    using DstCoord = decltype(make_dynamic_tensor_coordinate(DstDesc{}, Index{}));
+
+    using SrcCoordStep = decltype(make_dynamic_tensor_coordinate_step(SrcDesc{}, Index{}));
+    using DstCoordStep = decltype(make_dynamic_tensor_coordinate_step(DstDesc{}, Index{}));
+
+    __device__ constexpr ThreadwiseDynamicTensorSliceTransfer_v1r2(const SrcDesc& src_desc,
+                                                                   const Index& src_slice_origin,
+                                                                   const DstDesc& dst_desc,
+                                                                   const Index& dst_slice_origin)
+        : src_slice_origin_(make_dynamic_tensor_coordinate(src_desc, src_slice_origin)),
+          dst_slice_origin_(make_dynamic_tensor_coordinate(dst_desc, dst_slice_origin))
+    {
+    }
+
+    __device__ constexpr ThreadwiseDynamicTensorSliceTransfer_v1r2()
+        : ThreadwiseDynamicTensorSliceTransfer_v1r2(
+              SrcDesc{}, make_zero_multi_index<nDim>(), DstDesc{}, make_zero_multi_index<nDim>())
+    {
+    }
+
+    template <typename SrcData, typename DstData>
+    __device__ void Run(const SrcDesc& src_desc,
+                        const SrcData* p_src,
+                        const DstDesc& dst_desc,
+                        DstData* p_dst)
+    {
+        threadwise_dynamic_tensor_slice_transfer_v1r2<
+            SrcData, DstData, SrcDesc, DstDesc, SliceLengths, SrcDstDimAccessOrder,
+            SrcDstVectorAccessDim, SrcScalarPerVector, DstScalarPerVector,
+            SrcAddressSpace, DstAddressSpace, DstInMemOp,
+            SrcScalarStrideInVector, DstScalarStrideInVector>(
+            src_desc, src_slice_origin_, p_src, dst_desc, dst_slice_origin_, p_dst);
+    }
+
+    __device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx)
+    {
+        src_slice_origin_ = make_dynamic_tensor_coordinate(src_desc, src_slice_origin_idx);
+    }
+
+    __device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx)
+    {
+        dst_slice_origin_ = make_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_idx);
+    }
+
+    // src_slice_origin_step_idx need to be known at compile-time, for performance reason
+    __device__ void MoveSrcSliceWindow(const SrcDesc& src_desc, const Index& src_slice_origin_step_idx)
+    {
+        // is it OK to do this every time?
+        const auto src_slice_origin_step =
+            make_dynamic_tensor_coordinate_step(src_desc, src_slice_origin_step_idx);
+
+        move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_slice_origin_step);
+    }
+
+    // dst_slice_origin_step_idx need to be known at compile-time, for performance reason
+    __device__ void MoveDstSliceWindow(const DstDesc& dst_desc, const Index& dst_slice_origin_step_idx)
+    {
+        // is it OK to do this every time?
+        const auto dst_slice_origin_step =
+            make_dynamic_tensor_coordinate_step(dst_desc, dst_slice_origin_step_idx);
+
+        move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_slice_origin_step);
+    }
+
+    private:
+    SrcCoord src_slice_origin_;
+    DstCoord dst_slice_origin_;
+};

 } // namespace ck
 #endif
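threadwise_dynamic_tensor_slice_transfer_v1r2 mutates the caller's coordinates in place, walks the 2-D slice back and forth so that every move is one of the precomputed +-1 steps, and finally steps the coordinates back to the slice origin so the caller sees them unchanged. The index-only sketch below reproduces that traversal and the parity rule behind the final step-back; it is plain C++ with no tensor descriptors, and the dim0 direction flag is dropped because dim0 only ever moves forward in the 2-D case.

#include <cstdio>

int main()
{
    const int J0 = 3, J1 = 4;
    int i0 = 0, i1 = 0;          // the "coordinate", mutated in place
    bool forward_dim1 = true;

    for(int j0 = 0; j0 < J0; ++j0)
    {
        for(int j1 = 0; j1 < J1; ++j1)
        {
            std::printf("visit (%d,%d)\n", i0, i1);
            if(j1 < J1 - 1) i1 += forward_dim1 ? 1 : -1; // +-1 step along dim1
        }
        forward_dim1 = !forward_dim1;                    // reverse dim1 direction per row
        if(j0 < J0 - 1) i0 += 1;                         // step along dim0
    }

    // step back to the origin, mirroring the tail of v1r2:
    // dim0 always needs -(J0-1); dim1 needs -(J1-1) only when J0 is odd,
    // because an odd number of rows leaves the zigzag at the far end of dim1.
    i0 -= (J0 - 1);
    if(J0 % 2 != 0) i1 -= (J1 - 1);
    std::printf("back at (%d,%d)\n", i0, i1);            // prints (0,0)
}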
driver/src/col2im_driver.cpp

@@ -19,6 +19,20 @@ int main(int argc, char* argv[])
     using namespace ck;

 #if 1
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 128;
+    constexpr index_t HI = 17;
+    constexpr index_t WI = 17;
+    constexpr index_t K  = 128;
+    constexpr index_t Y  = 3;
+    constexpr index_t X  = 3;
+
+    using ConvStrides   = Sequence<2, 2>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads  = Sequence<1, 1>;
+    using RightPads = Sequence<1, 1>;
+#elif 0
     // 3x3, 71x71
     constexpr index_t N = 128;
     constexpr index_t C = 192;
...
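For the new 17x17 input, 3x3 filter, stride-2, pad-1 configuration enabled above, the standard convolution output-size formula gives a 9x9 output. The GemmK = C*Y*X and GemmN = N*Ho*Wo mapping in the check below is the usual im2col/col2im convention and is stated here as an assumption for illustration, not something this diff defines.

#include <cstdio>

int main()
{
    const int Hi = 17, Wi = 17, Y = 3, X = 3;
    const int stride = 2, dilation = 1, left_pad = 1, right_pad = 1;

    // standard conv output size: (in + pads - effective_filter) / stride + 1
    const int Ho = (Hi + left_pad + right_pad - (dilation * (Y - 1) + 1)) / stride + 1;
    const int Wo = (Wi + left_pad + right_pad - (dilation * (X - 1) + 1)) / stride + 1;

    const int N = 128, C = 128;
    const long long GemmK = 1LL * C * Y * X;   // column-matrix rows: C * Y * X
    const long long GemmN = 1LL * N * Ho * Wo; // column-matrix cols: N * Ho * Wo

    std::printf("Ho=%d Wo=%d GemmK=%lld GemmN=%lld\n", Ho, Wo, GemmK, GemmN); // 9 9 1152 10368
}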
driver/src/conv_driver.cpp

@@ -561,7 +561,7 @@ int main(int argc, char* argv[])
                                                             LeftPads{},
                                                             RightPads{},
                                                             nrepeat);
-#elif 1
+#elif 0
     device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(in_nchw_desc,
                                                          in_nchw,
                                                          wei_kcyx_desc,
...