gaoqiong / composable_kernel_ROCM · Commits · c01af899

Commit c01af899, authored Aug 03, 2019 by Chao Liu

    added new tensor copy operator

parent e9733a9f

Showing 9 changed files with 214 additions and 195 deletions (+214 −195)
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp (+1 −27)
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp (+66 −19)
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp (+1 −27)
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp (+93 −63)
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp (+17 −39)
composable_kernel/include/utility/Sequence.hpp (+33 −18)
driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp (+1 −1)
driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp (+1 −0)
driver/src/driver.cpp (+1 −1)
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp

@@ -295,27 +295,8 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
         // do work
         for(index_t e = 0; e < E; e += EPerBlock)
         {
-#if 0 // debug
             blockwise_in_copy.Run(p_in_global, p_in_block);
             blockwise_wei_copy.Run(p_wei_global, p_wei_block);
-#else
-            using InSrcMergedDimSubLengthsHack = Sequence<InBlockCopySubLengths_E_N1_B_N2{}[0],
-                                                          1,
-                                                          InBlockCopySubLengths_E_N1_B_N2{}[2],
-                                                          1>;
-            using InDstMergedDimSubLengthsHack = Sequence<1, 1, 1, 1>;
-
-            blockwise_in_copy.Run_hack(
-                p_in_global, p_in_block, InSrcMergedDimSubLengthsHack{}, InDstMergedDimSubLengthsHack{});
-
-            using WeiSrcMergedDimSubLengthsHack = Sequence<1, 1>;
-            using WeiDstMergedDimSubLengthsHack = Sequence<1, 1>;
-
-            blockwise_wei_copy.Run_hack(
-                p_wei_global, p_wei_block, WeiSrcMergedDimSubLengthsHack{}, WeiDstMergedDimSubLengthsHack{});
-#endif

             __syncthreads();

@@ -391,10 +372,6 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
                                                 arithmetic_sequence_gen<0, 8, 1>::type{},
                                                 Number<1>{});
 #else
-        using OutSrcMergedDimSliceLengthsHack = Sequence<1, 1, 1, 1, 1, 1, 1, 1>;
-        using OutDstMergedDimSliceLengthsHack = Sequence<1, 1, 1, 1, 1, 1, 1, 1>;
-
         ThreadwiseGenericTensorSliceCopy_v2<
             Float,
             decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc),

@@ -403,10 +380,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
             MergedTensorCoordinate<decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc)>,
             decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths())>(
             {0, 0, 0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 0, 0, 0})
-            .Run_hack(p_out_thread,
-                      p_out_thread_on_global,
-                      OutSrcMergedDimSliceLengthsHack{},
-                      OutDstMergedDimSliceLengthsHack{});
+            .Run(p_out_thread, p_out_thread_on_global);
 #endif
     }
 }
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp

@@ -155,6 +155,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
         static_assert(in_e_n1_b_n2_block_desc.GetStride(I1) % GemmDataPerReadB == 0,
                       "GemmDataPerReadB alignment requirement is not satisfied");

+#if 1 // debug
         // input blockwise copy
         // slice a merged tensor, reorder and copy to a normal tensor
         // this copy operator already has blockwise offset built-in

@@ -172,6 +173,19 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
                 InBlockCopySrcDataPerRead_B,
                 InBlockCopyDstDataPerWrite_N2>({0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0});
+#else
+        auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v2<
+            BlockSize,
+            Float,
+            decltype(in_e_n1_b_n2_global_merged_desc),
+            decltype(in_e_n1_b_n2_block_desc),
+            MergedTensorCoordinate<decltype(in_e_n1_b_n2_global_merged_desc)>,
+            NormalTensorCoordinate<decltype(in_e_n1_b_n2_block_desc)>,
+            decltype(in_e_n1_b_n2_block_desc.GetLengths()),
+            InBlockCopySubLengths_E_N1_B_N2,
+            InBlockCopyClusterLengths_E_N1_B_N2,
+            InBlockCopyThreadClusterArrangeOrder>({0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0});
+#endif

         // weight tensor
         // tensor descriptor in device memory, src of blockwise copy

@@ -184,6 +198,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
             Sequence<EPerBlock, KPerBlock>{},
             Number<math::lcm(WeiBlockCopyDstDataPerWrite_K, GemmDataPerReadA)>{});

+#if 1 // debug
         // operator for blockwise copy of weight into LDS
         // slice a tensor, and copy it into another tensor
         // this copy operator already have blockwise offset built-in

@@ -201,6 +216,19 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
                 WeiBlockCopySrcDataPerRead_E,
                 WeiBlockCopyDstDataPerWrite_K>({0, k_block_data_on_global}, {0, 0});
+#else
+        auto blockwise_wei_copy = BlockwiseGenericTensorSliceCopy_v2<
+            BlockSize,
+            Float,
+            decltype(wei_e_k_global_desc),
+            decltype(wei_e_k_block_desc),
+            NormalTensorCoordinate<decltype(wei_e_k_global_desc)>,
+            NormalTensorCoordinate<decltype(wei_e_k_block_desc)>,
+            decltype(wei_e_k_block_desc.GetLengths()),
+            WeiBlockCopySubLengths_E_K,
+            WeiBlockCopyClusterLengths_E_K,
+            WeiBlockCopyThreadClusterArrangeOrder>({0, k_block_data_on_global}, {0, 0});
+#endif

         // GEMM definition
         // c_mtx += transpose(a_mtx) * b_mtx

@@ -291,54 +319,61 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
             Float* p_wei_block_next =
                 even_loop ? p_wei_block_double + wei_block_space : p_wei_block_double;

-            Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()];
-            Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()];
+            Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()];
+            Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()];

+#if 1
             blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
             p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStride(I0);
+#else
+            blockwise_in_copy.MoveSrcSlicingWindow({EPerBlock, 0, 0, 0}, true);
+            blockwise_wei_copy.MoveSrcSlicingWindow({EPerBlock, 0}, true);
+#endif

             __syncthreads();

             // LDS doubel buffer: load next data from device mem
-            blockwise_in_copy.RunLoadRegisterClipboard(p_in_global, p_in_register_clipboard);
-            blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_block_on_global, p_wei_register_clipboard);
+            blockwise_in_copy.RunLoadRegisterBuffer(p_in_global, p_in_register_buffer);
+            blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_block_on_global, p_wei_register_buffer);

             // LDS double buffer: GEMM on current data
             blockwise_gemm.Run(p_wei_block_now, p_in_block_now, p_out_thread);

             // LDS double buffer: store next data to LDS
-            blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, p_in_block_next);
-            blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard, p_wei_block_next);
+            blockwise_in_copy.RunStoreRegisterBuffer(p_in_register_buffer, p_in_block_next);
+            blockwise_wei_copy.RunStoreRegisterBuffer(p_wei_register_buffer, p_wei_block_next);
         }
     }

     // LDS double buffer: tail
     {
-        Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()];
-        Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()];
+        Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()];
+        Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()];

         // even iteration
+#if 1
         blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
         p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStride(I0);
+#else
+        blockwise_in_copy.MoveSrcSlicingWindow({EPerBlock, 0, 0, 0}, true);
+        blockwise_wei_copy.MoveSrcSlicingWindow({EPerBlock, 0}, true);
+#endif

         __syncthreads();

         // LDS doubel buffer: load next data from device mem
-        blockwise_in_copy.RunLoadRegisterClipboard(p_in_global, p_in_register_clipboard);
-        blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_block_on_global, p_wei_register_clipboard);
+        blockwise_in_copy.RunLoadRegisterBuffer(p_in_global, p_in_register_buffer);
+        blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_block_on_global, p_wei_register_buffer);

         // LDS double buffer: GEMM on current data
         blockwise_gemm.Run(p_wei_block_double, p_in_block_double, p_out_thread);

         // LDS double buffer: store next data to LDS
-        blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, p_in_block_double + in_block_space);
-        blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard, p_wei_block_double + wei_block_space);
+        blockwise_in_copy.RunStoreRegisterBuffer(p_in_register_buffer, p_in_block_double + in_block_space);
+        blockwise_wei_copy.RunStoreRegisterBuffer(p_wei_register_buffer, p_wei_block_double + wei_block_space);

         // odd iteration
         __syncthreads();

@@ -396,6 +431,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
             out_k_n1_b_n2_global_merged_desc.GetOffsetFromMultiIndex(
                 k_thread_data_on_global, 0, b_thread_data_on_global, 0);

+#if 1 // debug
         threadwise_generic_tensor_slice_copy_v1(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc,
                                                 p_out_thread,

@@ -406,6 +442,17 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
                                                 out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths(),
                                                 arithmetic_sequence_gen<0, 8, 1>::type{},
                                                 Number<1>{});
+#else
+        ThreadwiseGenericTensorSliceCopy_v2<
+            Float,
+            decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc),
+            decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc),
+            NormalTensorCoordinate<decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc)>,
+            MergedTensorCoordinate<decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc)>,
+            decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths())>(
+            {0, 0, 0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 0, 0, 0})
+            .Run(p_out_thread, p_out_thread_on_global);
+#endif
     }
 }
};
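The comments in the loop above spell out the schedule: advance the slicing window, prefetch the next tile from global memory into a per-thread register buffer, run the GEMM on the tile already resident in LDS, then store the prefetched tile into the other LDS buffer. Below is a minimal single-threaded C++ sketch of that double-buffer schedule; the names (consume, double_buffered_pipeline) are illustrative and not part of the kernel, and src is assumed to hold at least tile * num_tiles elements.

#include <algorithm>
#include <array>
#include <cstddef>
#include <vector>

// Stand-in for blockwise_gemm.Run: do some work on the resident tile.
static float consume(const std::vector<float>& tile_data)
{
    float acc = 0.f;
    for(float v : tile_data)
        acc += v;
    return acc;
}

// Single-threaded analogue of the LDS double-buffer schedule: while the GEMM
// consumes the tile resident in one buffer, the next tile is staged through a
// register-like buffer and then committed to the other buffer.
float double_buffered_pipeline(const std::vector<float>& src, std::size_t tile, std::size_t num_tiles)
{
    std::array<std::vector<float>, 2> lds = {std::vector<float>(tile), std::vector<float>(tile)};
    std::vector<float> reg(tile); // plays the role of p_*_register_buffer

    std::copy(src.begin(), src.begin() + tile, lds[0].begin()); // preload tile 0

    float acc = 0.f;
    for(std::size_t i = 0; i + 1 < num_tiles; ++i)
    {
        const std::size_t cur = i % 2, nxt = 1 - cur;

        // "RunLoadRegisterBuffer": prefetch the next tile from global memory
        std::copy(src.begin() + (i + 1) * tile, src.begin() + (i + 2) * tile, reg.begin());

        // "blockwise_gemm.Run": work on the tile that is already resident
        acc += consume(lds[cur]);

        // "RunStoreRegisterBuffer": commit the prefetched tile to the other buffer
        lds[nxt] = reg;
    }

    acc += consume(lds[(num_tiles - 1) % 2]); // tail iteration on the last tile
    return acc;
}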
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp

@@ -11,7 +11,7 @@
 namespace ck {

-// B = merge(N, H, W)
+// B = merge(N, Ho, Wo)
 template <index_t GridSize,
           index_t BlockSize,
           class Float,

@@ -237,24 +237,8 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw
         for(index_t e_block_data_begin = 0; e_block_data_begin < E; e_block_data_begin += EPerBlock)
         {
-#if 0
             blockwise_in_copy.Run(p_in_global, p_in_block);
             blockwise_wei_copy.Run(p_wei_global, p_wei_block);
-#else
-            using InSrcMergedDimSubLengthsHack = InBlockCopySubLengths_E_B;
-            using InDstMergedDimSubLengthsHack = Sequence<1, 1>;
-
-            blockwise_in_copy.Run_hack(
-                p_in_global, p_in_block, InSrcMergedDimSubLengthsHack{}, InDstMergedDimSubLengthsHack{});
-
-            using WeiSrcMergedDimSubLengthsHack = Sequence<1, 1>;
-            using WeiDstMergedDimSubLengthsHack = Sequence<1, 1>;
-
-            blockwise_wei_copy.Run_hack(
-                p_wei_global, p_wei_block, WeiSrcMergedDimSubLengthsHack{}, WeiDstMergedDimSubLengthsHack{});
-#endif

             __syncthreads();

@@ -318,17 +302,7 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw
             for(index_t nrepeat = 0; nrepeat < GemmNRepeat; ++nrepeat)
             {
-#if 0
                 threadwise_out_copy.Run(p_out_thread, p_out_global);
-#else
-                using OutSrcMergedDimSubLengthsHack = Sequence<1, 1, 1>;
-                using OutDstMergedDimSubLengthsHack = Sequence<1, 1, OutThreadCopySliceLengths{}[2]>;
-
-                threadwise_out_copy.Run_hack(p_out_thread,
-                                             p_out_global,
-                                             OutSrcMergedDimSubLengthsHack{},
-                                             OutDstMergedDimSubLengthsHack{});
-#endif

                 threadwise_out_copy.MoveSrcSlicingWindow({0, 0, GemmNPerThreadSubC}, true);
                 threadwise_out_copy.MoveDstSlicingWindow({0, 0, B1}, true);
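In the nrepeat loop above, one threadwise copy object is reused: each Run is followed by moving the source window by GemmNPerThreadSubC and the destination window by B1 along the last dimension. A small host-side sketch of this move-window-then-copy pattern, under the assumption of flat 1-D buffers; copy_with_sliding_windows and its parameters are illustrative names, not repository code.

#include <cstddef>

// Host-side analogue of the repeated copy in the nrepeat loop: each iteration
// copies one slice, then slides the source and destination windows forward by
// (possibly different) strides.
void copy_with_sliding_windows(const float* src, float* dst,
                               std::size_t slice_len,
                               std::size_t src_step, // plays the role of GemmNPerThreadSubC
                               std::size_t dst_step, // plays the role of B1
                               std::size_t nrepeat)
{
    std::size_t src_origin = 0; // MoveSrcSlicingWindow accumulates into this
    std::size_t dst_origin = 0; // MoveDstSlicingWindow accumulates into this

    for(std::size_t r = 0; r < nrepeat; ++r)
    {
        for(std::size_t i = 0; i < slice_len; ++i) // threadwise_out_copy.Run
            dst[dst_origin + i] = src[src_origin + i];

        src_origin += src_step; // MoveSrcSlicingWindow({0, 0, GemmNPerThreadSubC}, true)
        dst_origin += dst_step; // MoveDstSlicingWindow({0, 0, B1}, true)
    }
}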
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp

@@ -24,7 +24,7 @@ template <index_t BlockSize,
           class DstDesc,
           class SliceLengths,
           class SubLengths,
-          class DataClusterLengths,
+          class ThreadClusterLengths,
           class ThreadClusterArrangeOrder,
           class SrcAccessOrder,
           class DstAccessOrder,

@@ -65,7 +65,8 @@ struct BlockwiseGenericTensorSliceCopy_v1
        // check NDim consistency
        static_assert(nDim == SrcDesc::GetNumOfDimension() &&
                          nDim == DstDesc::GetNumOfDimension() &&
                          nDim == SliceLengths::GetSize() &&
-                         nDim == SubLengths::GetSize() && nDim == DataClusterLengths::GetSize() &&
+                         nDim == SubLengths::GetSize() &&
+                         nDim == ThreadClusterLengths::GetSize() &&
                          nDim == ThreadClusterArrangeOrder::GetSize() &&
                          nDim == SrcAccessOrder::GetSize() && nDim == DstAccessOrder::GetSize(),
                      "wrong");

@@ -78,13 +79,13 @@ struct BlockwiseGenericTensorSliceCopy_v1
        // thread cluster
        constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_packed(
-           DataClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));
+           ThreadClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));

        // BlockSize
        static_assert(BlockSize == thread_cluster_desc.GetElementSize(), "wrong! BlockSize");

        // divide work
-       constexpr auto data_per_cluster_per_dims = SubLengths{} * DataClusterLengths{};
+       constexpr auto data_per_cluster_per_dims = SubLengths{} * ThreadClusterLengths{};

        static_for<0, nDim, 1>{}([&](auto IDim) {
            static_assert(SliceLengths::Get(IDim) % SubLengths::Get(IDim) == 0,

@@ -160,9 +161,9 @@ struct BlockwiseGenericTensorSliceCopy_v1
            mThreadDstPartialOffsets, math::plus<index_t>{}, static_cast<index_t>(0));
    }

-   __device__ static constexpr index_t GetRegisterClipboardSize()
+   __device__ static constexpr index_t GetRegisterBufferSize()
    {
-       constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * DataClusterLengths{});
+       constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ThreadClusterLengths{});

        constexpr auto thread_tensor_desc =
            make_ConstantTensorDescriptor_packed(SubLengths{} * repeat_lengths);

@@ -170,14 +171,15 @@ struct BlockwiseGenericTensorSliceCopy_v1
        return thread_tensor_desc.GetElementSpace();
    }

-   __device__ void RunLoadRegisterClipboard(const Float* __restrict__ p_src,
-                                            Float* __restrict__ p_clipboard) const
+   __device__ void RunLoadRegisterBuffer(const Float* __restrict__ p_src,
+                                         Float* __restrict__ p_Buffer) const
    {
        constexpr auto thread_sub_tensor_lengths = SubLengths{};

-       constexpr auto data_per_cluster_per_dims = thread_sub_tensor_lengths * DataClusterLengths{};
+       constexpr auto data_per_cluster_per_dims =
+           thread_sub_tensor_lengths * ThreadClusterLengths{};

-       constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * DataClusterLengths{});
+       constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ThreadClusterLengths{});

        constexpr auto thread_tensor_desc =
            make_ConstantTensorDescriptor_packed(thread_sub_tensor_lengths * repeat_lengths);

@@ -187,25 +189,24 @@ struct BlockwiseGenericTensorSliceCopy_v1
            constexpr auto src_thread_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims;

-           constexpr auto clipboard_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;
+           constexpr auto Buffer_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;

            constexpr index_t src_offset =
                SrcDesc::GetOffsetFromMultiIndex(src_thread_data_multi_id_begin);

-           constexpr index_t clipboard_offset =
-               thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin);
+           constexpr index_t Buffer_offset =
+               thread_tensor_desc.GetOffsetFromMultiIndex(Buffer_data_multi_id_begin);
 #else
        ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id) {
            const auto src_thread_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims;

-           const auto clipboard_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;
+           const auto Buffer_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;

            const index_t src_offset = SrcDesc::GetOffsetFromMultiIndex(src_thread_data_multi_id_begin);

-           const index_t clipboard_offset =
-               thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin);
+           const index_t Buffer_offset =
+               thread_tensor_desc.GetOffsetFromMultiIndex(Buffer_data_multi_id_begin);
 #endif

            // By position the origin of the per-thread window at the point, where multi-index

@@ -219,7 +220,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
                p_src + src_offset + mThreadSrcOffset,
                make_zero_array<index_t, nDim>(),
                thread_tensor_desc,
-               p_clipboard + clipboard_offset,
+               p_Buffer + Buffer_offset,
                make_zero_array<index_t, nDim>(),
                thread_sub_tensor_lengths,
                SrcAccessOrder{},

@@ -227,38 +228,38 @@ struct BlockwiseGenericTensorSliceCopy_v1
        });
    }

-   __device__ void RunStoreRegisterClipboard(const Float* __restrict__ p_clipboard,
-                                             Float* __restrict__ p_dst) const
+   __device__ void RunStoreRegisterBuffer(const Float* __restrict__ p_Buffer,
+                                          Float* __restrict__ p_dst) const
    {
        constexpr auto thread_sub_tensor_lengths = SubLengths{};

-       constexpr auto data_per_cluster_per_dims = thread_sub_tensor_lengths * DataClusterLengths{};
+       constexpr auto data_per_cluster_per_dims =
+           thread_sub_tensor_lengths * ThreadClusterLengths{};

-       constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * DataClusterLengths{});
+       constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ThreadClusterLengths{});

        constexpr auto thread_tensor_desc =
            make_ConstantTensorDescriptor_packed(thread_sub_tensor_lengths * repeat_lengths);

 #if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
        static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id) {
-           constexpr auto clipboard_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;
+           constexpr auto Buffer_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;

            constexpr auto dst_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims;

-           constexpr index_t clipboard_offset =
-               thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin);
+           constexpr index_t Buffer_offset =
+               thread_tensor_desc.GetOffsetFromMultiIndex(Buffer_data_multi_id_begin);

            constexpr index_t dst_offset = DstDesc::GetOffsetFromMultiIndex(dst_data_multi_id_begin);
 #else
        ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id) {
-           const auto clipboard_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;
+           const auto Buffer_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;

            const auto dst_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims;

-           const index_t clipboard_offset =
-               thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin);
+           const index_t Buffer_offset =
+               thread_tensor_desc.GetOffsetFromMultiIndex(Buffer_data_multi_id_begin);

            const index_t dst_offset = DstDesc::GetOffsetFromMultiIndex(dst_data_multi_id_begin);
 #endif

@@ -271,7 +272,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
            // If in the future, you want to enable SubLengths > 1 at the merged dimension,
            // special care in implementation is needed
            threadwise_generic_tensor_slice_copy_v1(thread_tensor_desc,
-                                                   p_clipboard + clipboard_offset,
+                                                   p_Buffer + Buffer_offset,
                                                    make_zero_array<index_t, nDim>(),
                                                    DstDesc{},
                                                    p_dst + dst_offset + mThreadDstOffset,

@@ -284,10 +285,10 @@ struct BlockwiseGenericTensorSliceCopy_v1
    __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
    {
-       Float p_clipboard[GetRegisterClipboardSize()];
+       Float p_Buffer[GetRegisterBufferSize()];

-       RunLoadRegisterClipboard(p_src, p_clipboard);
-       RunStoreRegisterClipboard(p_clipboard, p_dst);
+       RunLoadRegisterBuffer(p_src, p_Buffer);
+       RunStoreRegisterBuffer(p_Buffer, p_dst);
    }

    // When moving the slicing windows along a merged dimension, if the strides of the

@@ -382,24 +383,30 @@ template <index_t BlockSize,
          class DstCoordinate,
          class SliceLengths,
          class SubLengths,
-         class DataClusterLengths,
+         class ThreadClusterLengths,
          class ThreadClusterArrangeOrder>
 struct BlockwiseGenericTensorSliceCopy_v2
 {
-   using ThreadwiseCopy = ThreadwiseGenericTensorSliceCopy_v2<TData,
-                                                              SrcDesc,
-                                                              DstDesc,
-                                                              SrcCoordinate,
-                                                              DstCoordinate,
-                                                              SubLengths>;
-
    static constexpr index_t nDim = SrcDesc::GetNumOfDimension();

    __device__ constexpr BlockwiseGenericTensorSliceCopy_v2(SrcCoordinate src_block_slice_origin,
                                                            DstCoordinate dst_block_slice_origin)
    {
        static_assert(nDim == SrcDesc::GetNumOfDimension() &&
                          nDim == DstDesc::GetNumOfDimension() &&
                          nDim == SliceLengths::GetSize() && nDim == SubLengths::GetSize() &&
                          nDim == ThreadClusterLengths::GetSize() &&
                          nDim == ThreadClusterArrangeOrder::GetSize(),
                      "wrong! nDim not consistent");

        static_assert(is_same<SliceLengths, decltype(SubLengths{} * ThreadClusterLengths{})>{},
                      "wrong! threads should be mapped to cover entire slicing window");

        constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_packed(
-           DataClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));
+           ThreadClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));

        static_assert(BlockSize == thread_cluster_desc.GetElementSize(),
                      "wrong! BlockSize not consistent with ThreadClusterLengths");

        const auto thread_cluster_multi_id =
            thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id());

@@ -409,43 +416,66 @@ struct BlockwiseGenericTensorSliceCopy_v2
        const auto thread_data_multi_id_begin = data_cluster_multi_id * SubLengths{};

-       mThreadwiseCopy.SetSrcSliceOrigin(src_block_slice_origin + thread_data_multi_id_begin);
-       mThreadwiseCopy.SetDstSliceOrigin(dst_block_slice_origin + thread_data_multi_id_begin);
+       mThreadwiseLoad.SetSrcSliceOrigin(src_block_slice_origin + thread_data_multi_id_begin);
+       mThreadwiseLoad.SetDstSliceOrigin(make_zero_array<index_t, nDim>());
+
+       mThreadwiseStore.SetSrcSliceOrigin(make_zero_array<index_t, nDim>());
+       mThreadwiseStore.SetDstSliceOrigin(dst_block_slice_origin + thread_data_multi_id_begin);
    }

-   __device__ void Run(const TData* p_src, TData* p_dst) const
+   __device__ static constexpr index_t GetRegisterBufferSize()
    {
-       mThreadwiseCopy.Run(p_src, p_dst);
+       return RegisterBufferDesc::GetElementSpace();
    }

-   template <class SrcMergedDimSubLengthsHack, class DstMergedDimSubLengthsHack>
-   __device__ void Run_hack(const TData* p_src,
-                            TData* p_dst,
-                            SrcMergedDimSubLengthsHack,
-                            DstMergedDimSubLengthsHack) const
+   __device__ void RunLoadRegisterBuffer(const TData* p_src, TData* p_buffer) const
    {
-       // hacks to isolate merged dimension from normal dimensions, and calculate their offset
-       // seperately
-       // SrcMergedDimSliceLengthsHack has entry same as SliceLengths on src merged dimensions,
-       //   but 1 on normal dimensions;
-       // SrcNormalDimSliceLengthsHack has entry same as SliceLengths on src normal dimensions,
-       //   but 1 on merged dimensions;
-       mThreadwiseCopy.Run_hack(
-           p_src, p_dst, SrcMergedDimSubLengthsHack{}, DstMergedDimSubLengthsHack{});
+       mThreadwiseLoad.Run(p_src, p_buffer);
+   }
+
+   __device__ void RunStoreRegisterBuffer(const TData* p_buffer, TData* p_dst) const
+   {
+       mThreadwiseStore.Run(p_buffer, p_dst);
+   }
+
+   __device__ void Run(const TData* p_src, TData* p_dst) const
+   {
+       TData p_buffer[GetRegisterBufferSize()];
+
+       mThreadwiseLoad.Run(p_src, p_buffer);
+       mThreadwiseStore.Run(p_buffer, p_dst);
    }

    __device__ void MoveSrcSlicingWindow(Array<index_t, nDim> step_sizes, bool positive_direction)
    {
-       mThreadwiseCopy.MoveSrcSlicingWindow(step_sizes, positive_direction);
+       mThreadwiseLoad.MoveSrcSlicingWindow(step_sizes, positive_direction);
    }

    __device__ void MoveDstSlicingWindow(Array<index_t, nDim> step_sizes, bool positive_direction)
    {
-       mThreadwiseCopy.MoveDstSlicingWindow(step_sizes, positive_direction);
+       mThreadwiseStore.MoveDstSlicingWindow(step_sizes, positive_direction);
    }

-   // private:
-   ThreadwiseCopy mThreadwiseCopy;
+   private:
+   using RegisterBufferDesc = decltype(make_ConstantTensorDescriptor_packed(SubLengths{}));
+
+   using ThreadwiseLoad = ThreadwiseGenericTensorSliceCopy_v2<TData,
+                                                              SrcDesc,
+                                                              RegisterBufferDesc,
+                                                              SrcCoordinate,
+                                                              NormalTensorCoordinate<RegisterBufferDesc>,
+                                                              SubLengths>;
+
+   using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v2<TData,
+                                                               RegisterBufferDesc,
+                                                               DstDesc,
+                                                               NormalTensorCoordinate<RegisterBufferDesc>,
+                                                               DstCoordinate,
+                                                               SubLengths>;
+
+   ThreadwiseLoad mThreadwiseLoad;
+   ThreadwiseStore mThreadwiseStore;
 };

 } // namespace ck
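The reworked BlockwiseGenericTensorSliceCopy_v2 above is what the commit message calls the new tensor copy operator: a copy is decomposed into a load into a per-thread register buffer (mThreadwiseLoad) and a store from that buffer (mThreadwiseStore), so Run is simply the two stages back to back, and the double-buffered kernel can drive the stages separately. Here is a stripped-down, host-side sketch of that composition; the struct and member names are illustrative, not the repository types.

#include <cstddef>
#include <vector>

// Minimal sketch of the load/store split: Run() is nothing more than
// RunLoadRegisterBuffer() followed by RunStoreRegisterBuffer(), and callers
// that want software pipelining can invoke the two stages separately.
struct TwoStageCopy
{
    std::size_t count; // number of elements in the slice (product of SubLengths)

    void RunLoadRegisterBuffer(const float* src, float* buffer) const
    {
        for(std::size_t i = 0; i < count; ++i)
            buffer[i] = src[i]; // corresponds to mThreadwiseLoad.Run(p_src, p_buffer)
    }

    void RunStoreRegisterBuffer(const float* buffer, float* dst) const
    {
        for(std::size_t i = 0; i < count; ++i)
            dst[i] = buffer[i]; // corresponds to mThreadwiseStore.Run(p_buffer, p_dst)
    }

    void Run(const float* src, float* dst) const
    {
        std::vector<float> buffer(count); // GetRegisterBufferSize() elements
        RunLoadRegisterBuffer(src, buffer.data());
        RunStoreRegisterBuffer(buffer.data(), dst);
    }
};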
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp

@@ -138,47 +138,17 @@ struct ThreadwiseGenericTensorSliceCopy_v2
        mDstSliceOrigin = dst_slice_origin;
    }

-   __device__ void Run(const TData* p_src, TData* p_dst) const
+   template <class TDesc, class Seq>
+   struct IsolateMergedDimSliceLengthsHack
    {
-       constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{});
-
-       TData p_buffer_[buffer_desc.GetElementSpace()];
-       TData* p_buffer = p_buffer_;
-
-#if 0
-       static_ford<SliceLengths>{}([&](auto data_id) {
-           p_buffer[buffer_desc.GetOffsetFromMultiIndex(data_id)] =
-               p_src[(mSrcSliceOrigin + data_id).GetOffset()];
-       });
-
-       static_ford<SliceLengths>{}([&](auto data_id) {
-           p_dst[(mDstSliceOrigin + data_id).GetOffset()] =
-               p_buffer[buffer_desc.GetOffsetFromMultiIndex(data_id)];
-       });
-#elif 1
-       auto src_slice_origin = mSrcSliceOrigin;
-       auto dst_slice_origin = mDstSliceOrigin;
-
-       const TData* p_src_tmp = p_src + src_slice_origin.RepositionOrigin();
-       TData* p_dst_tmp       = p_dst + dst_slice_origin.RepositionOrigin();
-
-       static_ford<SliceLengths>{}([&](auto data_id) {
-           p_buffer[buffer_desc.GetOffsetFromMultiIndex(data_id)] =
-               p_src_tmp[(src_slice_origin + data_id).GetOffset()];
-       });
-
-       static_ford<SliceLengths>{}([&](auto data_id) {
-           p_dst_tmp[(dst_slice_origin + data_id).GetOffset()] =
-               p_buffer[buffer_desc.GetOffsetFromMultiIndex(data_id)];
-       });
-#endif
-   }
+       template <class IDim>
+       __device__ constexpr index_t operator()(IDim idim) const
+       {
+           return TDesc::ContainMultipleOriginalDimensions(idim) ? Seq{}[idim] : 1;
+       }
+   };

-   template <class SrcMergedDimSliceLengthsHack, class DstMergedDimSliceLengthsHack>
-   __device__ void Run_hack(const TData* p_src,
-                            TData* p_dst,
-                            SrcMergedDimSliceLengthsHack,
-                            DstMergedDimSliceLengthsHack) const
+   __device__ void Run(const TData* p_src, TData* p_dst) const
    {
        constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{});

@@ -191,6 +161,10 @@ struct ThreadwiseGenericTensorSliceCopy_v2
        //   but 1 on normal dimensions;
        // SrcNormalDimSliceLengthsHack has entry same as SliceLengths on src normal dimensions,
        //   but 1 on merged dimensions;
+       using SrcMergedDimSliceLengthsHack = typename sequence_gen<
+           SliceLengths::GetSize(), IsolateMergedDimSliceLengthsHack<SrcDesc, SliceLengths>>::type;
+       using SrcNormalDimSliceLengthsHack =
+           decltype((SliceLengths{} + Number<1>{}) - SrcMergedDimSliceLengthsHack{});

@@ -216,6 +190,10 @@ struct ThreadwiseGenericTensorSliceCopy_v2
        //   but 1 on normal dimensions;
        // DstNormalDimSliceLengthsHack has entry same as SliceLengths on dst normal dimensions,
        //   but 1 on merged dimensions;
+       using DstMergedDimSliceLengthsHack = typename sequence_gen<
+           SliceLengths::GetSize(), IsolateMergedDimSliceLengthsHack<DstDesc, SliceLengths>>::type;
+       using DstNormalDimSliceLengthsHack =
+           decltype((SliceLengths{} + Number<1>{}) - DstMergedDimSliceLengthsHack{});
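The IsolateMergedDimSliceLengthsHack functor above replaces the hand-written *MergedDimSliceLengthsHack sequences: per dimension it keeps the slice length where the descriptor dimension is merged and substitutes 1 where it is normal, and the complementary "normal" sequence is (SliceLengths + 1) - merged, so the two masks multiply back to SliceLengths. Below is a tiny compile-time sketch of that per-dimension selection (C++17, with a plain is_merged mask standing in for TDesc::ContainMultipleOriginalDimensions).

#include <array>
#include <cstddef>

// For each dimension, keep the slice length on merged dimensions and 1 on
// normal dimensions; the complementary "normal" lengths are (len + 1) - merged,
// so the two masks multiply back to the original slice lengths.
template <std::size_t N>
constexpr std::array<int, N> merged_dim_lengths(const std::array<int, N>& slice_lengths,
                                                const std::array<bool, N>& is_merged)
{
    std::array<int, N> out{};
    for(std::size_t i = 0; i < N; ++i)
        out[i] = is_merged[i] ? slice_lengths[i] : 1;
    return out;
}

// Example: slice lengths {2, 1, 4, 1} with only dimension 2 merged gives the
// merged mask {1, 1, 4, 1}; the normal mask would be {2, 1, 1, 1}.
constexpr std::array<int, 4> lengths{2, 1, 4, 1};
constexpr std::array<bool, 4> merged{false, false, true, false};
constexpr auto merged_mask = merged_dim_lengths(lengths, merged);
static_assert(merged_mask[2] == 4 && merged_mask[0] == 1, "merged dims keep the slice length");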
composable_kernel/include/utility/Sequence.hpp

@@ -128,48 +128,63 @@ struct sequence_merge<Sequence<Xs...>, Sequence<Ys...>>
    using type = Sequence<Xs..., Ys...>;
 };

-// arithmetic sqeuence
-template <index_t IBegin, index_t NSize, index_t Increment>
-struct arithmetic_sequence_gen_impl
+// generate sequence
+template <index_t IBegin, index_t NRemain, class F>
+struct sequence_gen_impl
 {
-   static constexpr index_t NSizeLeft = NSize / 2;
+   static constexpr index_t NRemainLeft  = NRemain / 2;
+   static constexpr index_t NRemainRight = NRemain - NRemainLeft;
+
+   static constexpr index_t IMiddle = IBegin + NRemainLeft;

-   using type = typename sequence_merge<
-       typename arithmetic_sequence_gen_impl<IBegin, NSizeLeft, Increment>::type,
-       typename arithmetic_sequence_gen_impl<IBegin + NSizeLeft * Increment,
-                                             NSize - NSizeLeft,
-                                             Increment>::type>::type;
+   using type =
+       typename sequence_merge<typename sequence_gen_impl<IBegin, NRemainLeft, F>::type,
+                               typename sequence_gen_impl<IMiddle, NRemainRight, F>::type>::type;
 };

-template <index_t IBegin, index_t Increment>
-struct arithmetic_sequence_gen_impl<IBegin, 1, Increment>
+template <index_t I, class F>
+struct sequence_gen_impl<I, 1, F>
 {
-   using type = Sequence<IBegin>;
+   static constexpr index_t Is = F{}(Number<I>{});
+
+   using type = Sequence<Is>;
 };

-template <index_t IBegin, index_t Increment>
-struct arithmetic_sequence_gen_impl<IBegin, 0, Increment>
+template <index_t I, class F>
+struct sequence_gen_impl<I, 0, F>
 {
    using type = Sequence<>;
 };

+template <index_t NSize, class F>
+struct sequence_gen
+{
+   using type = typename sequence_gen_impl<0, NSize, F>::type;
+};
+
 // arithmetic sequence
 template <index_t IBegin, index_t IEnd, index_t Increment>
 struct arithmetic_sequence_gen
 {
-   using type = typename arithmetic_sequence_gen_impl<IBegin, IEnd - IBegin, Increment>::type;
+   struct F
+   {
+       __host__ __device__ constexpr index_t operator()(index_t i) const
+       {
+           return i * Increment + IBegin;
+       }
+   };
+
+   using type = typename sequence_gen<(IEnd - IBegin) / Increment, F>::type;
 };

 // uniform sequence
 template <index_t NSize, index_t I>
 struct uniform_sequence_gen
 {
-   struct return_constant
+   struct F
    {
        __host__ __device__ constexpr index_t operator()(index_t) const { return I; }
    };

-   using type = decltype(
-       typename arithmetic_sequence_gen<0, NSize, 1>::type{}.Transform(return_constant{}));
+   using type = typename sequence_gen<NSize, F>::type;
 };

 // reverse inclusive scan (with init) sequence
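With the change above, sequence generation is driven by a functor: sequence_gen_impl splits the index range, evaluates F at each index, and merges the halves, so arithmetic_sequence_gen becomes F(i) = i * Increment + IBegin and uniform_sequence_gen becomes F(i) = I. Here is a self-contained sketch of the same idea using std::integer_sequence instead of the repository's Sequence type; the names below are illustrative only.

#include <cstddef>
#include <type_traits>
#include <utility>

// Simplified model of the new sequence_gen: evaluate a functor F at indices
// 0..NSize-1 and collect the results. The repository version does the same
// with its own Sequence type and a divide-and-conquer merge.
template <class F, std::size_t... Is>
auto sequence_gen_impl(std::index_sequence<Is...>)
    -> std::integer_sequence<int, F{}(Is)...>;

template <std::size_t NSize, class F>
using sequence_gen = decltype(sequence_gen_impl<F>(std::make_index_sequence<NSize>{}));

// arithmetic_sequence_gen<IBegin, IEnd, Increment> reduces to a functor with
// F(i) = i * Increment + IBegin, exactly as in the diff above.
struct ArithmeticF
{
    constexpr int operator()(std::size_t i) const { return static_cast<int>(i) * 2 + 1; }
};

static_assert(std::is_same<sequence_gen<4, ArithmeticF>,
                           std::integer_sequence<int, 1, 3, 5, 7>>::value,
              "F(i) = 2*i + 1 yields {1, 3, 5, 7}");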
driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp

@@ -139,7 +139,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
    for(index_t i = 0; i < nrepeat; ++i)
    {
        constexpr auto gridwise_conv =
-#if 1
+#if 0
            GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
 #else
            GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp

@@ -4,6 +4,7 @@
 #include "tensor.hpp"
 #include "gridwise_convolution_kernel_wrapper.hpp"
 #include "gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
+//#include "gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp"

 using namespace ck;
driver/src/driver.cpp

@@ -532,7 +532,7 @@ int main(int argc, char* argv[])
 #elif 0
    device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(in_nchw_desc,
                                                       in_nchw,
                                                       wei_kcyx_desc,
                                                       wei_kcyx,
                                                       out_nkhw_desc,
                                                       out_nkhw_device,
                                                       nrepeat);
-#elif 0
+#elif 1
    device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc,
                                                         in_nchw,
                                                         wei_kcyx_desc,