Commit a9b2b1dc, authored Aug 08, 2019 by Chao Liu

added ThreadwiseGenericTensorSliceCopy_v2r1

Parent: 701b7341

Showing 8 changed files with 457 additions and 29 deletions.
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp   +29 −10
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp   +27 −8
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp   +35 −1
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp   +312 −0
composable_kernel/include/utility/config_amd.hpp.in   +1 −0
composable_kernel/include/utility/config_nvidia.hpp.in   +1 −0
driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp   +5 −2
driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp   +47 −8
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp

@@ -47,7 +47,8 @@ template <index_t GridSize,
           class WeiBlockCopySrcAccessOrder,
           class WeiBlockCopyDstAccessOrder,
           index_t WeiBlockCopySrcDataPerRead_E,
-          index_t WeiBlockCopyDstDataPerWrite_K>
+          index_t WeiBlockCopyDstDataPerWrite_K,
+          index_t OutThreadCopyDataPerAccess_W>
 struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
 {
     __device__ void Run(const Float* const __restrict__ p_in_global,
@@ -155,7 +156,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
         static_assert(in_e_n1_b_n2_block_desc.GetStride(I1) % GemmDataPerReadB == 0,
                       "GemmDataPerReadB alignment requirement is not satisfied");

-#if 1
+#if 0
         // input blockwise copy
         // slice a merged tensor, reorder and copy to a normal tensor
         // this copy operator already has blockwise offset built-in
@@ -184,7 +185,13 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
             decltype(in_e_n1_b_n2_block_desc.GetLengths()),
             InBlockCopySubLengths_E_N1_B_N2,
             InBlockCopyClusterLengths_E_N1_B_N2,
-            InBlockCopyThreadClusterArrangeOrder>(
+            InBlockCopyThreadClusterArrangeOrder,
+            InBlockCopySrcAccessOrder,
+            InBlockCopyDstAccessOrder,
+            2,
+            3,
+            InBlockCopySrcDataPerRead_B,
+            InBlockCopyDstDataPerWrite_N2>(
             {0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0});
 #endif

         // weight tensor
@@ -198,7 +205,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
             Sequence<EPerBlock, KPerBlock>{},
             Number<math::lcm(WeiBlockCopyDstDataPerWrite_K, GemmDataPerReadA)>{});

-#if 1
+#if 0
         // operator for blockwise copy of weight into LDS
         // slice a tensor, and copy it into another tensor
         // this copy operator already have blockwise offset built-in
@@ -227,7 +234,13 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
             decltype(wei_e_k_block_desc.GetLengths()),
             WeiBlockCopySubLengths_E_K,
             WeiBlockCopyClusterLengths_E_K,
-            WeiBlockCopyThreadClusterArrangeOrder>(
+            WeiBlockCopyThreadClusterArrangeOrder,
+            WeiBlockCopySrcAccessOrder,
+            WeiBlockCopyDstAccessOrder,
+            0,
+            1,
+            WeiBlockCopySrcDataPerRead_E,
+            WeiBlockCopyDstDataPerWrite_K>(
             {0, k_block_data_on_global}, {0, 0});
 #endif

         // GEMM definition
@@ -322,7 +335,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
         Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()];
         Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()];

-#if 1
+#if 0
             blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
             // blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{},
             // True);
@@ -354,7 +367,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
             Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()];
             Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()];

-#if 1
+#if 0
             blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
             // blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
             p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStride(I0);
@@ -434,7 +447,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
             out_k_n1_b_n2_global_merged_desc.GetOffsetFromMultiIndex(
                 k_thread_data_on_global, 0, b_thread_data_on_global, 0);

-#if 1
+#if 0
         ThreadwiseGenericTensorSliceCopy_v1r2<
             decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc),
             decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc),
@@ -445,12 +458,18 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
             1>(make_zero_array<index_t, 8>(), make_zero_array<index_t, 8>())
             .Run(p_out_thread, p_out_thread_on_global);
 #elif 1
-        ThreadwiseGenericTensorSliceCopy_v2<
+        ThreadwiseGenericTensorSliceCopy_v2r1<
             decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc),
             decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc),
             NormalTensorCoordinate<decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc)>,
             MergedTensorCoordinate<decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc)>,
-            decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths())>(
+            decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths()),
+            arithmetic_sequence_gen<0, 8, 1>::type,
+            arithmetic_sequence_gen<0, 8, 1>::type,
+            7,
+            7,
+            1,
+            1>(
             {0, 0, 0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 0, 0, 0})
             .Run(p_out_thread, p_out_thread_on_global);
 #endif
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp

@@ -44,7 +44,8 @@ template <index_t GridSize,
           class WeiBlockCopySrcAccessOrder,
           class WeiBlockCopyDstAccessOrder,
           index_t WeiBlockCopySrcDataPerRead_E,
-          index_t WeiBlockCopyDstDataPerWrite_K>
+          index_t WeiBlockCopyDstDataPerWrite_K,
+          index_t OutThreadCopyDataPerAccess_B>
 struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer
 {
     __device__ void Run(const Float* const __restrict__ p_in_global,
@@ -138,7 +139,13 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer
             decltype(in_e_b_block_desc.GetLengths()),
             InBlockCopySubLengths_E_B,
             InBlockCopyClusterLengths_E_B,
-            InBlockCopyThreadClusterArrangeOrder>(
+            InBlockCopyThreadClusterArrangeOrder,
+            InBlockCopySrcAccessOrder,
+            InBlockCopyDstAccessOrder,
+            1,
+            1,
+            InBlockCopyDataPerAccess_B,
+            InBlockCopyDataPerAccess_B>(
             {0, b_block_data_on_global}, {0, 0});

         // weight tensor
@@ -164,7 +171,13 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer
             decltype(wei_e_k_block_desc.GetLengths()),
             WeiBlockCopySubLengths_E_K,
             WeiBlockCopyClusterLengths_E_K,
-            WeiBlockCopyThreadClusterArrangeOrder>(
+            WeiBlockCopyThreadClusterArrangeOrder,
+            WeiBlockCopySrcAccessOrder,
+            WeiBlockCopyDstAccessOrder,
+            0,
+            1,
+            WeiBlockCopySrcDataPerRead_E,
+            WeiBlockCopyDstDataPerWrite_K>(
             {0, k_block_data_on_global}, {0, 0});

         // GEMM definition
         //   c_mtx += transpose(a_mtx) * b_mtx
@@ -349,15 +362,21 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer
         using OutThreadCopySliceLengths =
             Sequence<GemmMRepeat, GemmMPerThreadSubC, GemmNPerThreadSubC>;

-        auto threadwise_out_copy = ThreadwiseGenericTensorSliceCopy_v2<
+        auto threadwise_out_copy = ThreadwiseGenericTensorSliceCopy_v2r1<
             decltype(out_k0_k1_b_thread_desc),
             decltype(out_k0_k1_b_global_desc),
             NormalTensorCoordinate<decltype(out_k0_k1_b_thread_desc)>,
             MergedTensorCoordinate<decltype(out_k0_k1_b_global_desc)>,
-            OutThreadCopySliceLengths>({0, 0, 0},
-                                       {k_thread_data_on_global / K1,
-                                        k_thread_data_on_global % K1,
-                                        b_thread_data_on_global});
+            OutThreadCopySliceLengths,
+            arithmetic_sequence_gen<0, 3, 1>::type,
+            arithmetic_sequence_gen<0, 3, 1>::type,
+            2,
+            2,
+            OutThreadCopyDataPerAccess_B,
+            OutThreadCopyDataPerAccess_B>({0, 0, 0},
+                                          {k_thread_data_on_global / K1,
+                                           k_thread_data_on_global % K1,
+                                           b_thread_data_on_global});

         for(index_t nrepeat = 0; nrepeat < GemmNRepeat; ++nrepeat)
         {
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp

@@ -412,7 +412,13 @@ template <index_t BlockSize,
           class SliceLengths,
           class SubLengths,
           class ThreadClusterLengths,
-          class ThreadClusterArrangeOrder>
+          class ThreadClusterArrangeOrder,
+          class SrcDimAccessOrder,
+          class DstDimAccessOrder,
+          index_t SrcVectorAccessDim,
+          index_t DstVectorAccessDim,
+          index_t SrcDataPerAccess,
+          index_t DstDataPerAccess>
 struct BlockwiseGenericTensorSliceCopy_v2
 {
     static constexpr index_t nDim = SrcDesc::GetNumOfDimension();
@@ -496,6 +502,7 @@ struct BlockwiseGenericTensorSliceCopy_v2
     private:
     using RegisterBufferDesc = decltype(make_ConstantTensorDescriptor_packed(SubLengths{}));

+#if 0
     using ThreadwiseLoad =
         ThreadwiseGenericTensorSliceCopy_v2<SrcDesc,
                                             RegisterBufferDesc,
@@ -509,6 +516,33 @@ struct BlockwiseGenericTensorSliceCopy_v2
                                             NormalTensorCoordinate<RegisterBufferDesc>,
                                             DstCoordinate,
                                             SubLengths>;
+#else
+    using ThreadwiseLoad =
+        ThreadwiseGenericTensorSliceCopy_v2r1<SrcDesc,
+                                              RegisterBufferDesc,
+                                              SrcCoordinate,
+                                              NormalTensorCoordinate<RegisterBufferDesc>,
+                                              SubLengths,
+                                              SrcDimAccessOrder,
+                                              SrcDimAccessOrder,
+                                              SrcVectorAccessDim,
+                                              SrcVectorAccessDim,
+                                              SrcDataPerAccess,
+                                              1>;
+
+    using ThreadwiseStore =
+        ThreadwiseGenericTensorSliceCopy_v2r1<RegisterBufferDesc,
+                                              DstDesc,
+                                              NormalTensorCoordinate<RegisterBufferDesc>,
+                                              DstCoordinate,
+                                              SubLengths,
+                                              DstDimAccessOrder,
+                                              DstDimAccessOrder,
+                                              DstVectorAccessDim,
+                                              DstVectorAccessDim,
+                                              1,
+                                              DstDataPerAccess>;
+#endif

     ThreadwiseLoad mThreadwiseLoad;
     ThreadwiseStore mThreadwiseStore;
 };
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp

@@ -18,6 +18,10 @@
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2 0
 #endif

+#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
+#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0
+#endif
+
 namespace ck {

 // This threadwise copy allow vector access of src and dst.
@@ -590,5 +594,313 @@ struct ThreadwiseGenericTensorSliceCopy_v2
     DstCoordinate mDstSliceOrigin;
 };

+#if 1
+// This threadwise copy allows vector access of src and dst.
+// It allows the dimension of vector access to be different on src and dst.
+// It also allows the vector size to be different on src and dst.
+// It also allows the order of access to be different on src and dst.
+// It uses registers as a buffer to hold all data moving from src to dst.
+// It is designed for copying a small amount of data, where src and dst are
+// device memory or LDS.
+// When copying a large amount of data, let's hope the compiler will reduce the
+// registers used for the buffer.
+template <class SrcDesc,
+          class DstDesc,
+          class SrcCoordinate,
+          class DstCoordinate,
+          class SliceLengths,
+          class SrcDimAccessOrder,
+          class DstDimAccessOrder,
+          index_t SrcVectorAccessDim,
+          index_t DstVectorAccessDim,
+          index_t SrcDataPerAccess,
+          index_t DstDataPerAccess>
+struct ThreadwiseGenericTensorSliceCopy_v2r1
+{
+    static constexpr index_t nDim = SliceLengths::GetSize();
+
+    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v2r1(SrcCoordinate src_slice_origin,
+                                                               DstCoordinate dst_slice_origin)
+        : mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin)
+    {
+    }
+
+    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v2r1()
+        : ThreadwiseGenericTensorSliceCopy_v2r1(make_zero_array<index_t, nDim>(),
+                                                make_zero_array<index_t, nDim>())
+    {
+    }
+
+    __device__ void SetSrcSliceOrigin(SrcCoordinate src_slice_origin)
+    {
+        mSrcSliceOrigin = src_slice_origin;
+    }
+
+    __device__ void SetDstSliceOrigin(DstCoordinate dst_slice_origin)
+    {
+        mDstSliceOrigin = dst_slice_origin;
+    }
+
+    template <class TDesc, class Lengths>
+    struct IsolateMergedDimLengths
+    {
+        template <class IDim>
+        __device__ constexpr index_t operator()(IDim idim) const
+        {
+            return TDesc::ContainMultipleOriginalDimensions(idim) ? Lengths{}[idim] : 1;
+        }
+    };
+
+    template <class TData>
+    __device__ void Run(const TData* p_src, TData* p_dst) const
+    {
+        constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{});
+
+        TData p_buffer_[buffer_desc.GetElementSpace()];
+        TData* p_buffer = p_buffer_;
+
+        // copy data from src into buffer
+        {
+            using src_vector_t = typename vector_type<TData, SrcDataPerAccess>::MemoryType;
+
+            constexpr auto src_vector_access_dim = Number<SrcVectorAccessDim>{};
+            constexpr auto src_data_per_access   = Number<SrcDataPerAccess>{};
+
+            constexpr auto src_access_lengths = SliceLengths::Modify(
+                src_vector_access_dim,
+                SliceLengths::Get(src_vector_access_dim) / src_data_per_access);
+
+            // Offsets w.r.t. merged dimensions need to be calculated at run-time; offsets
+            // w.r.t. normal dimensions are known at compile time.
+            // Below is a hack to isolate merged dimension ids from normal dimension ids, so
+            // the corresponding offsets can be calculated separately at run-time and
+            // compile-time.
+            // src_merged_dim_access_lengths has the same value as src_access_lengths on src's
+            // merged dimensions, and has value = 1 on normal dimensions;
+            // src_normal_dim_access_lengths has the same value as src_access_lengths on src's
+            // normal dimensions, and has value = 1 on merged dimensions.
+            constexpr auto src_merged_dim_access_lengths = typename sequence_gen<
+                nDim, IsolateMergedDimLengths<SrcDesc, decltype(src_access_lengths)>>::type{};
+
+            constexpr auto src_normal_dim_access_lengths =
+                src_access_lengths + Number<1>{} - src_merged_dim_access_lengths;
+
+#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
+            // offset w.r.t. merged dimensions needs to be computed at run-time
+            static_ford<decltype(src_merged_dim_access_lengths), SrcDimAccessOrder>{}(
+                [&](auto src_merged_dim_access_id_) {
+                    constexpr auto src_merged_dim_access_id =
+                        decltype(src_merged_dim_access_id_){};
+
+                    constexpr auto src_merged_dim_data_id = src_merged_dim_access_id.Modify(
+                        src_vector_access_dim,
+                        src_merged_dim_access_id[src_vector_access_dim] * src_data_per_access);
+
+                    const TData* p_src_tmp =
+                        p_src + (mSrcSliceOrigin + src_merged_dim_data_id).GetOffset();
+
+                    // offset w.r.t. normal dimensions can be computed at compile-time
+                    static_ford<decltype(src_normal_dim_access_lengths), SrcDimAccessOrder>{}(
+                        [&](auto src_normal_dim_access_id_) {
+                            constexpr auto src_normal_dim_access_id =
+                                decltype(src_normal_dim_access_id_){};
+
+                            constexpr auto src_normal_dim_data_id =
+                                src_normal_dim_access_id.Modify(
+                                    src_vector_access_dim,
+                                    src_normal_dim_access_id[src_vector_access_dim] *
+                                        src_data_per_access);
+
+                            constexpr index_t src_normal_offset =
+                                SrcDesc::GetOffsetFromMultiIndex(src_normal_dim_data_id);
+
+                            // load vector from src
+                            const src_vector_t vector_data =
+                                *reinterpret_cast<const src_vector_t*>(
+                                    &p_src_tmp[src_normal_offset]);
+
+                            // unpack vector into buffer
+                            static_for<0, SrcDataPerAccess, 1>{}([&](auto i) {
+                                constexpr auto scalar_id =
+                                    typename uniform_sequence_gen<nDim, 0>::type{}.Modify(
+                                        src_vector_access_dim, i);
+
+                                constexpr index_t buffer_offset =
+                                    buffer_desc.GetOffsetFromMultiIndex(
+                                        src_merged_dim_data_id + src_normal_dim_data_id +
+                                        scalar_id);
+
+                                p_buffer[buffer_offset] =
+                                    reinterpret_cast<const TData*>(&vector_data)[i];
+                            });
+                        });
+                });
+#else
+            ford<decltype(src_merged_dim_access_lengths), SrcDimAccessOrder>{}(
+                [&](auto src_merged_dim_access_id) {
+                    auto src_merged_dim_data_id = src_merged_dim_access_id;
+                    src_merged_dim_data_id(src_vector_access_dim) =
+                        src_merged_dim_access_id[src_vector_access_dim] * src_data_per_access;
+
+                    const TData* p_src_tmp =
+                        p_src + (mSrcSliceOrigin + src_merged_dim_data_id).GetOffset();
+
+                    // these should be compile-time known
+                    ford<decltype(src_normal_dim_access_lengths), SrcDimAccessOrder>{}(
+                        [&](auto src_normal_dim_access_id) {
+                            auto src_normal_dim_data_id = src_normal_dim_access_id;
+                            src_normal_dim_data_id(src_vector_access_dim) =
+                                src_normal_dim_access_id[src_vector_access_dim] *
+                                src_data_per_access;
+
+                            const index_t src_normal_offset =
+                                SrcDesc::GetOffsetFromMultiIndex(src_normal_dim_data_id);
+
+                            // load vector from src
+                            const src_vector_t vector_data =
+                                *reinterpret_cast<const src_vector_t*>(
+                                    &p_src_tmp[src_normal_offset]);
+
+                            // unpack vector into buffer
+                            for(index_t i = 0; i < SrcDataPerAccess; ++i)
+                            {
+                                auto scalar_id = make_zero_array<index_t, nDim>();
+                                scalar_id(src_vector_access_dim) = i;
+
+                                const index_t buffer_offset =
+                                    buffer_desc.GetOffsetFromMultiIndex(
+                                        src_merged_dim_data_id + src_normal_dim_data_id +
+                                        scalar_id);
+
+                                p_buffer[buffer_offset] =
+                                    reinterpret_cast<const TData*>(&vector_data)[i];
+                            }
+                        });
+                });
+#endif
+        }
+
+        // copy data from buffer into dst
+        {
+            using dst_vector_t = typename vector_type<TData, DstDataPerAccess>::MemoryType;
+
+            constexpr auto dst_vector_access_dim = Number<DstVectorAccessDim>{};
+            constexpr auto dst_data_per_access   = Number<DstDataPerAccess>{};
+
+            constexpr auto dst_access_lengths = SliceLengths::Modify(
+                dst_vector_access_dim,
+                SliceLengths::Get(dst_vector_access_dim) / dst_data_per_access);
+
+            constexpr auto dst_merged_dim_access_lengths = typename sequence_gen<
+                nDim, IsolateMergedDimLengths<DstDesc, decltype(dst_access_lengths)>>::type{};
+
+            constexpr auto dst_normal_dim_access_lengths =
+                dst_access_lengths + Number<1>{} - dst_merged_dim_access_lengths;
+
+#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
+            // offset w.r.t. merged dimensions needs to be computed at run-time
+            static_ford<decltype(dst_merged_dim_access_lengths), DstDimAccessOrder>{}(
+                [&](auto dst_merged_dim_access_id_) {
+                    constexpr auto dst_merged_dim_access_id =
+                        decltype(dst_merged_dim_access_id_){};
+
+                    constexpr auto dst_merged_dim_data_id = dst_merged_dim_access_id.Modify(
+                        dst_vector_access_dim,
+                        dst_merged_dim_access_id[dst_vector_access_dim] * dst_data_per_access);
+
+                    TData* p_dst_tmp =
+                        p_dst + (mDstSliceOrigin + dst_merged_dim_data_id).GetOffset();
+
+                    // offset w.r.t. normal dimensions can be computed at compile-time
+                    static_ford<decltype(dst_normal_dim_access_lengths), DstDimAccessOrder>{}(
+                        [&](auto dst_normal_dim_access_id_) {
+                            constexpr auto dst_normal_dim_access_id =
+                                decltype(dst_normal_dim_access_id_){};

+                            constexpr auto dst_normal_dim_data_id =
+                                dst_normal_dim_access_id.Modify(
+                                    dst_vector_access_dim,
+                                    dst_normal_dim_access_id[dst_vector_access_dim] *
+                                        dst_data_per_access);
+
+                            dst_vector_t vector_data;
+
+                            // pack vector from buffer
+                            static_for<0, DstDataPerAccess, 1>{}([&](auto i) {
+                                constexpr auto scalar_id =
+                                    typename uniform_sequence_gen<nDim, 0>::type{}.Modify(
+                                        dst_vector_access_dim, i);
+
+                                constexpr index_t buffer_offset =
+                                    buffer_desc.GetOffsetFromMultiIndex(
+                                        dst_merged_dim_data_id + dst_normal_dim_data_id +
+                                        scalar_id);
+
+                                reinterpret_cast<TData*>(&vector_data)[i] =
+                                    p_buffer[buffer_offset];
+                            });
+
+                            constexpr index_t dst_normal_offset =
+                                DstDesc::GetOffsetFromMultiIndex(dst_normal_dim_data_id);
+
+                            // write vector into dst
+                            *reinterpret_cast<dst_vector_t*>(&p_dst_tmp[dst_normal_offset]) =
+                                vector_data;
+                        });
+                });
+#else
+            // offset w.r.t. merged dimensions needs to be computed at run-time
+            ford<decltype(dst_merged_dim_access_lengths), DstDimAccessOrder>{}(
+                [&](auto dst_merged_dim_access_id) {
+                    auto dst_merged_dim_data_id = dst_merged_dim_access_id;
+                    dst_merged_dim_data_id(dst_vector_access_dim) =
+                        dst_merged_dim_access_id[dst_vector_access_dim] * dst_data_per_access;
+
+                    TData* p_dst_tmp =
+                        p_dst + (mDstSliceOrigin + dst_merged_dim_data_id).GetOffset();
+
+                    // offset w.r.t. normal dimensions can be computed at compile-time
+                    ford<decltype(dst_normal_dim_access_lengths), DstDimAccessOrder>{}(
+                        [&](auto dst_normal_dim_access_id) {
+                            auto dst_normal_dim_data_id = dst_normal_dim_access_id;
+                            dst_normal_dim_data_id(dst_vector_access_dim) =
+                                dst_normal_dim_access_id[dst_vector_access_dim] *
+                                dst_data_per_access;
+
+                            dst_vector_t vector_data;
+
+                            // pack vector from buffer
+                            for(index_t i = 0; i < DstDataPerAccess; ++i)
+                            {
+                                auto scalar_id = make_zero_array<index_t, nDim>();
+                                scalar_id(dst_vector_access_dim) = i;
+
+                                const index_t buffer_offset =
+                                    buffer_desc.GetOffsetFromMultiIndex(
+                                        dst_merged_dim_data_id + dst_normal_dim_data_id +
+                                        scalar_id);
+
+                                reinterpret_cast<TData*>(&vector_data)[i] =
+                                    p_buffer[buffer_offset];
+                            }
+
+                            const index_t dst_normal_offset =
+                                DstDesc::GetOffsetFromMultiIndex(dst_normal_dim_data_id);
+
+                            // write vector into dst
+                            *reinterpret_cast<dst_vector_t*>(&p_dst_tmp[dst_normal_offset]) =
+                                vector_data;
+                        });
+                });
+#endif
+        }
+    }
+
+    // T can be Sequence or Array
+    template <class T, bool PositiveDirection>
+    __device__ void MoveSrcSlicingWindow(T step_sizes,
+                                         integral_constant<bool, PositiveDirection>)
+    {
+        static_if<PositiveDirection>{}([&](auto) { mSrcSliceOrigin += step_sizes; })
+            .Else([&](auto) { mSrcSliceOrigin -= step_sizes; });
+    }
+
+    template <class T, bool PositiveDirection>
+    __device__ void MoveDstSlicingWindow(T step_sizes,
+                                         integral_constant<bool, PositiveDirection>)
+    {
+        static_if<PositiveDirection>{}([&](auto) { mDstSliceOrigin += step_sizes; })
+            .Else([&](auto) { mDstSliceOrigin -= step_sizes; });
+    }
+
+    private:
+    SrcCoordinate mSrcSliceOrigin;
+    DstCoordinate mDstSliceOrigin;
+};
+#endif
+
 } // namespace ck
 #endif
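
To make the new copy's interface concrete, here is a minimal usage sketch (not part of the commit: the function name, descriptor shapes, and parameter values are made up for illustration, and it assumes the ck utility headers used above are available). It stages a 2x4 slice of a packed 8x16 source through the register buffer into a packed 2x4 destination, vectorizing reads along dimension 1 on the source side while keeping destination writes scalar, then advances the source slicing window; this is the same pattern the gridwise kernels above use for their output copies.

// Illustrative sketch only; names and shapes are hypothetical.
#include "threadwise_generic_tensor_slice_copy.hpp"

namespace ck {

template <class Float>
__device__ void example_threadwise_copy_v2r1(const Float* p_src, Float* p_dst)
{
    // two packed (non-merged) descriptors; a 2x4 window of the 8x16 source
    // is staged through registers into the 2x4 destination
    constexpr auto src_desc = make_ConstantTensorDescriptor_packed(Sequence<8, 16>{});
    constexpr auto dst_desc = make_ConstantTensorDescriptor_packed(Sequence<2, 4>{});

    using SliceLengths = Sequence<2, 4>;

    auto copy = ThreadwiseGenericTensorSliceCopy_v2r1<
        decltype(src_desc),
        decltype(dst_desc),
        NormalTensorCoordinate<decltype(src_desc)>,
        NormalTensorCoordinate<decltype(dst_desc)>,
        SliceLengths,
        Sequence<0, 1>, // SrcDimAccessOrder
        Sequence<0, 1>, // DstDimAccessOrder
        1,              // SrcVectorAccessDim: vectorize along dim 1 of src
        1,              // DstVectorAccessDim
        4,              // SrcDataPerAccess: 4-wide vector loads
        1               // DstDataPerAccess: scalar stores
        >({0, 0}, {0, 0});

    copy.Run(p_src, p_dst);

    // move the source window 4 elements along dim 1 and copy the next slice
    copy.MoveSrcSlicingWindow(Sequence<0, 4>{}, integral_constant<bool, true>{});
    copy.Run(p_src, p_dst);
}

} // namespace ck

The separate src/dst access orders and vector-access parameters are what distinguish v2r1 from v2: the source and destination sides can each pick the dimension, width, and traversal order that keeps their own accesses contiguous.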
composable_kernel/include/utility/config_amd.hpp.in

@@ -11,6 +11,7 @@
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2 0
+#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0

 namespace ck {
composable_kernel/include/utility/config_nvidia.hpp.in

@@ -13,6 +13,7 @@
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2 0
+#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0

 namespace ck {
driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp

@@ -3,7 +3,7 @@
 #include "device.hpp"
 #include "tensor.hpp"
 #include "gridwise_convolution_kernel_wrapper.hpp"
-#include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
+// #include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
 #include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp"

 template <class T,
@@ -94,6 +94,8 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
     constexpr index_t WeiBlockCopySrcDataPerRead_E  = 4;
     constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
+
+    constexpr index_t OutThreadCopyDataPerAccess_W = 1;
 #elif 1
     // each thread hold 64 data
     constexpr index_t BlockSize = 256;
@@ -214,7 +216,8 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
         WeiBlockCopySrcAccessOrder,
         WeiBlockCopyDstAccessOrder,
         WeiBlockCopySrcDataPerRead_E,
-        WeiBlockCopyDstDataPerWrite_K>{};
+        WeiBlockCopyDstDataPerWrite_K,
+        OutThreadCopyDataPerAccess_W>{};

     float time = launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
                                dim3(GridSize),
driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp

@@ -3,7 +3,7 @@
 #include "device.hpp"
 #include "tensor.hpp"
 #include "gridwise_convolution_kernel_wrapper.hpp"
-#include "gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
+// #include "gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
 #include "gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp"

 using namespace ck;
@@ -55,7 +55,6 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
     out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());

 #if 1
-    // 1x1 filter, 8x8 image
     constexpr index_t BlockSize = 256;

     constexpr index_t BPerBlock = 128;
@@ -86,8 +85,45 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
     using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E]
     using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K]

-    constexpr index_t WeiBlockCopySrcDataPerRead_E  = 1;
+    constexpr index_t WeiBlockCopySrcDataPerRead_E  = 4;
     constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
+
+    constexpr index_t OutThreadCopyDataPerAccess_B = 1;
+#elif 0 // debug
+    constexpr index_t BlockSize = 256;
+
+    constexpr index_t BPerBlock = 128;
+    constexpr index_t KPerBlock = 128;
+    constexpr index_t EPerBlock = 8;
+
+    constexpr index_t GemmMPerThreadSubC = 4;
+    constexpr index_t GemmNPerThreadSubC = 4;
+    constexpr index_t GemmMLevel0Cluster = 4;
+    constexpr index_t GemmNLevel0Cluster = 4;
+    constexpr index_t GemmMLevel1Cluster = 4;
+    constexpr index_t GemmNLevel1Cluster = 4;
+    constexpr index_t GemmKPerThreadLoop = 1;
+    constexpr index_t GemmDataPerReadA   = 4;
+    constexpr index_t GemmDataPerReadB   = 4;
+
+    using InBlockCopySubLengths_E_B     = Sequence<1, 4>;
+    using InBlockCopyClusterLengths_E_B = Sequence<8, 32>;
+
+    using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1>; // [E, B]
+    using InBlockCopySrcAccessOrder            = Sequence<0, 1>; // [E, B]
+    using InBlockCopyDstAccessOrder            = Sequence<0, 1>; // [E, B]
+
+    constexpr index_t InBlockCopyDataPerAccess_B = 1;
+
+    using WeiBlockCopySubLengths_E_K     = Sequence<4, 1>;
+    using WeiBlockCopyClusterLengths_E_K = Sequence<2, 128>;
+
+    using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E]
+    using WeiBlockCopySrcAccessOrder            = Sequence<1, 0>; // [K, E]
+    using WeiBlockCopyDstAccessOrder            = Sequence<0, 1>; // [E, K]
+
+    constexpr index_t WeiBlockCopySrcDataPerRead_E  = 4;
+    constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
+
+    constexpr index_t OutThreadCopyDataPerAccess_B = 1;
 #elif 1
     // 1x1 filter, 8x8 image
     constexpr index_t BlockSize = 256;
@@ -106,13 +142,13 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
     constexpr index_t GemmDataPerReadA = 4;
     constexpr index_t GemmDataPerReadB = 4;

-    using InBlockCopySubLengths_E_B     = Sequence<2, 2>;
-    using InBlockCopyClusterLengths_E_B = Sequence<4, 64>;
+    using InBlockCopySubLengths_E_B     = Sequence<1, 4>;
+    using InBlockCopyClusterLengths_E_B = Sequence<8, 32>;

     using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1>; // [E, B]
     using InBlockCopySrcAccessOrder            = Sequence<0, 1>; // [E, B]
     using InBlockCopyDstAccessOrder            = Sequence<0, 1>; // [E, B]

-    constexpr index_t InBlockCopyDataPerAccess_B = 1;
+    constexpr index_t InBlockCopyDataPerAccess_B = 4;

     using WeiBlockCopySubLengths_E_K     = Sequence<4, 1>;
     using WeiBlockCopyClusterLengths_E_K = Sequence<2, 128>;
@@ -120,8 +156,10 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
     using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E]
     using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K]

-    constexpr index_t WeiBlockCopySrcDataPerRead_E  = 1;
+    constexpr index_t WeiBlockCopySrcDataPerRead_E  = 4;
     constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
+
+    constexpr index_t OutThreadCopyDataPerAccess_B = 4;
 #endif

     constexpr index_t B = N * Ho * Wo;
@@ -169,7 +207,8 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
         WeiBlockCopySrcAccessOrder,
         WeiBlockCopyDstAccessOrder,
         WeiBlockCopySrcDataPerRead_E,
-        WeiBlockCopyDstDataPerWrite_K>{};
+        WeiBlockCopyDstDataPerWrite_K,
+        OutThreadCopyDataPerAccess_B>{};

     for(index_t i = 0; i < nrepeat; ++i)
     {
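
A side note on the retuned v4r4 parameters above: with BlockSize = 256, EPerBlock = 8 and BPerBlock = 128, the new per-thread sub-lengths Sequence<1, 4> and thread-cluster lengths Sequence<8, 32> tile the EPerBlock x BPerBlock input block exactly, and InBlockCopyDataPerAccess_B = 4 lets each thread move its 4 contiguous B-elements as one vector access. A small compile-time sanity check of that arithmetic, written as a standalone sketch rather than code from the commit:

// Illustrative sketch; plain C++ constants mirroring the values chosen above.
using index_t = int;

constexpr index_t BlockSize = 256;
constexpr index_t EPerBlock = 8;
constexpr index_t BPerBlock = 128;

// InBlockCopySubLengths_E_B     = Sequence<1, 4>
// InBlockCopyClusterLengths_E_B = Sequence<8, 32>
constexpr index_t SubE = 1, SubB = 4;
constexpr index_t ClusterE = 8, ClusterB = 32;

constexpr index_t InBlockCopyDataPerAccess_B = 4;

// each thread copies a 1x4 sub-tile; the 8x32 thread cluster covers the
// EPerBlock x BPerBlock tile exactly and uses every thread in the block
static_assert(SubE * ClusterE == EPerBlock, "E tiling must cover EPerBlock");
static_assert(SubB * ClusterB == BPerBlock, "B tiling must cover BPerBlock");
static_assert(ClusterE * ClusterB == BlockSize, "one sub-tile per thread");

// the 4 B-elements each thread owns are moved as whole vectorized accesses
static_assert(SubB % InBlockCopyDataPerAccess_B == 0, "whole vectors per access");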