gaoqiong / composable_kernel

Commit 1b3c2e40
authored Aug 07, 2019 by Chao Liu
parent 41cdde99

reworked ThreadwiseGenericTensorSliceCopy_v1

Showing 6 changed files with 284 additions and 137 deletions
+18  -3    composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp
+113 -86   composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
+106 -38   composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
+5   -3    composable_kernel/include/utility/functional2.hpp
+35  -0    driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
+7   -7    driver/src/driver.cpp
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp

...
@@ -170,6 +170,8 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
             InBlockCopyThreadClusterArrangeOrder,
             InBlockCopySrcAccessOrder,
             InBlockCopyDstAccessOrder,
+            2,
+            3,
             InBlockCopySrcDataPerRead_B,
             InBlockCopyDstDataPerWrite_N2>(
             {0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0});
...
@@ -213,6 +215,8 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
             WeiBlockCopyThreadClusterArrangeOrder,
             WeiBlockCopySrcAccessOrder,
             WeiBlockCopyDstAccessOrder,
+            0,
+            1,
             WeiBlockCopySrcDataPerRead_E,
             WeiBlockCopyDstDataPerWrite_K>(
             {0, k_block_data_on_global}, {0, 0});
...
@@ -434,7 +438,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
             out_k_n1_b_n2_global_merged_desc.GetOffsetFromMultiIndex(
                 k_thread_data_on_global, 0, b_thread_data_on_global, 0);

-#if 1
+#if 0
         threadwise_generic_tensor_slice_copy_v1(
             out_n0_n1_n2_k0_k1_k2_h_w_thread_desc,
             p_out_thread,
...
@@ -445,9 +449,20 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
             out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths(),
             arithmetic_sequence_gen<0, 8, 1>::type{},
             Number<1>{});
-#else
+#elif 1
+        ThreadwiseGenericTensorSliceCopy_v1<
+            decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc),
+            decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc),
+            decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths()),
+            arithmetic_sequence_gen<0, 8, 1>::type,
+            arithmetic_sequence_gen<0, 8, 1>::type,
+            0,
+            0,
+            1,
+            1>({0, 0, 0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 0, 0, 0})
+            .Run(p_out_thread, p_out_thread_on_global);
+#elif 0
         ThreadwiseGenericTensorSliceCopy_v2<Float,
                                             decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc),
                                             decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc),
                                             NormalTensorCoordinate<decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc)>,
...
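Note: the hunk above switches the output write-back from the free function threadwise_generic_tensor_slice_copy_v1(...) to an object that is constructed with a source and a destination slice origin and then executes Run(src, dst). A minimal host-side sketch of that construct-then-Run pattern, with hypothetical names and a scalar-only copy (this is not the repo's API, only an illustration of the calling style):

#include <array>
#include <cstddef>
#include <iostream>
#include <vector>

template <std::size_t NDim>
struct TensorSliceCopySketch
{
    std::array<std::size_t, NDim> src_origin;   // where the slice starts in the source
    std::array<std::size_t, NDim> dst_origin;   // where the slice starts in the destination
    std::array<std::size_t, NDim> lengths;      // slice lengths
    std::array<std::size_t, NDim> src_strides;  // row-major strides of the source
    std::array<std::size_t, NDim> dst_strides;  // row-major strides of the destination

    // copy the slice element by element (scalar path only)
    void Run(const float* p_src, float* p_dst) const
    {
        std::array<std::size_t, NDim> id{}; // multi-index within the slice
        for(;;)
        {
            std::size_t src_off = 0, dst_off = 0;
            for(std::size_t d = 0; d < NDim; ++d)
            {
                src_off += (src_origin[d] + id[d]) * src_strides[d];
                dst_off += (dst_origin[d] + id[d]) * dst_strides[d];
            }
            p_dst[dst_off] = p_src[src_off];

            // advance the multi-index, last dimension fastest
            std::size_t d = NDim;
            while(d > 0 && ++id[d - 1] == lengths[d - 1]) { id[d - 1] = 0; --d; }
            if(d == 0) break;
        }
    }
};

int main()
{
    // copy a 2x3 window at origin (1,2) out of a 4x5 tensor into a packed 2x3 tensor
    std::vector<float> src(4 * 5), dst(2 * 3, 0.f);
    for(std::size_t i = 0; i < src.size(); ++i) src[i] = float(i);

    TensorSliceCopySketch<2> copy{{1, 2}, {0, 0}, {2, 3}, {5, 1}, {3, 1}};
    copy.Run(src.data(), dst.data());

    for(float v : dst) std::cout << v << ' '; // prints: 7 8 9 12 13 14
    std::cout << '\n';
}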
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp

...
@@ -19,7 +19,8 @@ namespace ck {
 // to simplify index calculations. To satisfy this assumption, the user need to make sure
 // that, on a merged dimension that constains multiple original dimensions, the length of
 // the last original dimension need to be evenly dividable by its sub-lengths. Also, the
-// repeat-length on the merged dimension need to be 1.
+// repeat-length on the merged dimension need to be 1. These sanity checks are performed
+// in constructor of BlockwiseGenericTensorSliceCopy_v1
 template <index_t BlockSize,
           class Float,
           class SrcDesc,
...
@@ -28,10 +29,12 @@ template <index_t BlockSize,
           class SubLengths,
           class ThreadClusterLengths,
           class ThreadClusterArrangeOrder,
-          class SrcAccessOrder,
-          class DstAccessOrder,
-          index_t SrcDataPerRead,
-          index_t DstDataPerWrite>
+          class SrcDimAccessOrder,
+          class DstDimAccessOrder,
+          index_t SrcVectorAccessDim,
+          index_t DstVectorAccessDim,
+          index_t SrcDataPerAccess,
+          index_t DstDataPerAccess>
 struct BlockwiseGenericTensorSliceCopy_v1
 {
     static constexpr index_t nDim = SrcDesc::GetNumOfDimension();
...
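Note: the renamed parameters separate, for each side of the copy, a dimension-access order (Src/DstDimAccessOrder), the dimension along which accesses are vectorized (Src/DstVectorAccessDim), and the scalar count per vectorized access (Src/DstDataPerAccess). A tiny hedged sketch of the divisibility requirement this implies, mirroring the "cannot evenly divide" assert added later in this commit (assumption-level illustration, not the repo's code):

#include <cstddef>

// the slice length along the vector-access dimension must be a whole number of accesses
template <std::size_t LengthOnVectorDim, std::size_t DataPerAccess>
constexpr bool divides_evenly()
{
    return LengthOnVectorDim % DataPerAccess == 0;
}

static_assert(divides_evenly<8, 4>(), "8 scalars can be moved as two 4-wide accesses");
static_assert(!divides_evenly<6, 4>(), "6 scalars cannot be split into 4-wide accesses");

int main() {}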
@@ -60,23 +63,22 @@ struct BlockwiseGenericTensorSliceCopy_v1
     Array<index_t, nOriginalDimSrc> mThreadSrcOriginalMultiId;
     Array<index_t, nOriginalDimDst> mThreadDstOriginalMultiId;

-    __device__ BlockwiseGenericTensorSliceCopy_v1(Array<index_t, nDim> src_block_data_multi_id_begin,
-                                                  Array<index_t, nDim> dst_block_data_multi_id_begin)
+    __device__ BlockwiseGenericTensorSliceCopy_v1(Array<index_t, nDim> src_block_data_id_begin,
+                                                  Array<index_t, nDim> dst_block_data_id_begin)
     {
         // check NDim consistency
         static_assert(nDim == SrcDesc::GetNumOfDimension() && nDim == DstDesc::GetNumOfDimension() &&
                           nDim == SliceLengths::GetSize() && nDim == SubLengths::GetSize() &&
                           nDim == ThreadClusterLengths::GetSize() &&
                           nDim == ThreadClusterArrangeOrder::GetSize() &&
-                          nDim == SrcAccessOrder::GetSize() && nDim == DstAccessOrder::GetSize(),
+                          nDim == SrcDimAccessOrder::GetSize() && nDim == DstDimAccessOrder::GetSize(),
                       "wrong");

         // check thread arrange order and read/write access order are valid
         static_assert(is_valid_sequence_map<ThreadClusterArrangeOrder>::value &&
-                          is_valid_sequence_map<SrcAccessOrder>::value &&
-                          is_valid_sequence_map<DstAccessOrder>::value,
+                          is_valid_sequence_map<SrcDimAccessOrder>::value &&
+                          is_valid_sequence_map<DstDimAccessOrder>::value,
                       "wrong!");

         // thread cluster
...
@@ -142,20 +144,20 @@ struct BlockwiseGenericTensorSliceCopy_v1
         });

         // calculate mThreadSrcOffset, mThreadDstOffset
-        const auto thread_cluster_multi_id =
+        const auto thread_cluster_id =
             thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id());

-        const auto data_cluster_multi_id =
-            reorder_array_given_old2new(thread_cluster_multi_id, ThreadClusterArrangeOrder{});
+        const auto data_cluster_id =
+            reorder_array_given_old2new(thread_cluster_id, ThreadClusterArrangeOrder{});

-        const auto thread_data_multi_id_begin = data_cluster_multi_id * SubLengths{};
+        const auto thread_data_id_begin = data_cluster_id * SubLengths{};

         // original multi-id
         mThreadSrcOriginalMultiId = SrcDesc::GetOriginalMultiIndexFromMultiIndex(
-            src_block_data_multi_id_begin + thread_data_multi_id_begin);
+            src_block_data_id_begin + thread_data_id_begin);

         mThreadDstOriginalMultiId = DstDesc::GetOriginalMultiIndexFromMultiIndex(
-            dst_block_data_multi_id_begin + thread_data_multi_id_begin);
+            dst_block_data_id_begin + thread_data_id_begin);

         // partial offset on each dimension
         static_for<0, nDim, 1>{}([&](auto IDim) {
...
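Note: a hedged, host-side illustration of the index bookkeeping in this constructor — decompose the flat thread id over the thread-cluster lengths, then scale by the per-thread sub-lengths to find the first data element that thread owns. The real code additionally permutes the cluster index by ThreadClusterArrangeOrder, which is omitted here; the names below are illustrative, not the repo's:

#include <array>
#include <cstddef>
#include <iostream>

template <std::size_t N>
std::array<std::size_t, N> multi_index_from_1d(std::size_t id,
                                               const std::array<std::size_t, N>& lengths)
{
    std::array<std::size_t, N> mi{};
    for(std::size_t d = N; d-- > 0;) // last dimension varies fastest
    {
        mi[d] = id % lengths[d];
        id /= lengths[d];
    }
    return mi;
}

int main()
{
    // cluster of 8 x 2 x 16 x 1 = 256 threads, each owning a 1 x 1 x 1 x 4 sub-tensor
    // (values taken from the driver configuration added later in this commit)
    const std::array<std::size_t, 4> cluster_lengths{8, 2, 16, 1};
    const std::array<std::size_t, 4> sub_lengths{1, 1, 1, 4};

    const std::size_t tid        = 37;
    const auto        cluster_id = multi_index_from_1d(tid, cluster_lengths);

    std::array<std::size_t, 4> data_begin{};
    for(std::size_t d = 0; d < 4; ++d)
        data_begin[d] = cluster_id[d] * sub_lengths[d];

    std::cout << "thread " << tid << " -> cluster index (";
    for(auto v : cluster_id) std::cout << v << ' ';
    std::cout << ") -> data begin (";
    for(auto v : data_begin) std::cout << v << ' ';
    std::cout << ")\n"; // thread 37 -> (1 0 5 0) -> (1 0 5 0)
}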
@@ -188,14 +190,16 @@ struct BlockwiseGenericTensorSliceCopy_v1
             mThreadDstPartialOffsets, math::plus<index_t>{}, static_cast<index_t>(0));
     }

-    __device__ static constexpr index_t GetRegisterBufferSize()
+    __device__ static constexpr auto GetRegisterBufferDescriptor()
     {
         constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ThreadClusterLengths{});

-        constexpr auto thread_tensor_desc =
-            make_ConstantTensorDescriptor_packed(SubLengths{} * repeat_lengths);
+        return make_ConstantTensorDescriptor_packed(SubLengths{} * repeat_lengths);
+    }

-        return thread_tensor_desc.GetElementSpace();
+    __device__ static constexpr index_t GetRegisterBufferSize()
+    {
+        return GetRegisterBufferDescriptor().GetElementSpace();
     }

     __device__ void RunLoadRegisterBuffer(const Float* __restrict__ p_src,
...
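Note: with GetRegisterBufferDescriptor() the per-thread register buffer is described by a packed descriptor over SubLengths * repeat_lengths, and its size is just that descriptor's element space. A hedged arithmetic sketch of that relationship, using the tuning values from the driver #elif 1 block added later in this commit (assumed convention, not the repo's code):

#include <array>
#include <cstddef>
#include <iostream>

int main()
{
    const std::array<std::size_t, 4> slice_lengths{8, 2, 16, 4};   // E, N1, B, N2 per block
    const std::array<std::size_t, 4> sub_lengths{1, 1, 1, 4};      // per thread, per repeat
    const std::array<std::size_t, 4> cluster_lengths{8, 2, 16, 1}; // threads per dimension

    std::size_t buffer_size = 1;
    for(std::size_t d = 0; d < 4; ++d)
    {
        // repeat_lengths = SliceLengths / (SubLengths * ThreadClusterLengths)
        const std::size_t repeat = slice_lengths[d] / (sub_lengths[d] * cluster_lengths[d]);
        buffer_size *= sub_lengths[d] * repeat;
    }
    std::cout << "per-thread register buffer: " << buffer_size << " elements\n"; // 4
}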
@@ -208,50 +212,62 @@ struct BlockwiseGenericTensorSliceCopy_v1
         constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ThreadClusterLengths{});

-        constexpr auto thread_tensor_desc =
-            make_ConstantTensorDescriptor_packed(thread_sub_tensor_lengths * repeat_lengths);
+        constexpr auto thread_buffer_desc = GetRegisterBufferDescriptor();

 #if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
-        static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id) {
-            constexpr auto src_thread_data_multi_id_begin =
-                repeat_multi_id * data_per_cluster_per_dims;
+        static_ford<decltype(repeat_lengths)>{}([&](auto repeat_id) {
+            constexpr auto src_thread_data_id_begin = repeat_id * data_per_cluster_per_dims;

-            constexpr auto buffer_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;
+            constexpr auto buffer_data_id_begin = repeat_id * thread_sub_tensor_lengths;

             constexpr index_t src_offset =
-                SrcDesc::GetOffsetFromMultiIndex(src_thread_data_multi_id_begin);
+                SrcDesc::GetOffsetFromMultiIndex(src_thread_data_id_begin);

             constexpr index_t buffer_offset =
-                thread_tensor_desc.GetOffsetFromMultiIndex(buffer_data_multi_id_begin);
+                thread_buffer_desc.GetOffsetFromMultiIndex(buffer_data_id_begin);
 #else
-        ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id) {
-            const auto src_thread_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims;
+        ford<decltype(repeat_lengths)>{}([&](auto repeat_id) {
+            const auto src_thread_data_id_begin = repeat_id * data_per_cluster_per_dims;

-            const auto buffer_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;
+            const auto buffer_data_id_begin = repeat_id * thread_sub_tensor_lengths;

             const index_t src_offset =
-                SrcDesc::GetOffsetFromMultiIndex(src_thread_data_multi_id_begin);
+                SrcDesc::GetOffsetFromMultiIndex(src_thread_data_id_begin);

             const index_t buffer_offset =
-                thread_tensor_desc.GetOffsetFromMultiIndex(buffer_data_multi_id_begin);
+                thread_buffer_desc.GetOffsetFromMultiIndex(buffer_data_id_begin);
 #endif

             // By position the origin of the per-thread window at the point, where multi-index
             // of the SrcDesc (might be a merged tensor) is all-zero. This threadwise slice copy
             // is assuming each thread is copy a noraml (not merged) tensor.
-            // User need to guarantee this is true.
-            // By setting SubLengths = 1 at the merged dimension, this is always true;
-            // If in the future, you want to enable SubLengths > 1 at the merged dimension,
-            // special care in implementation is needed
+            // To satisfy this assumption, the user need to make sure that, on a merged dimension
+            // that constains multiple original dimensions, the length of the last original
+            // dimension need to be evenly dividable by its sub-lengths. Also, the repeat-length on
+            // the merged dimension need to be 1. These sanity checks are performed in constructor
+            // of BlockwiseGenericTensorSliceCopy_v1
+#if 0 // debug
             threadwise_generic_tensor_slice_copy_v1(SrcDesc{},
                                                     p_src + src_offset + mThreadSrcOffset,
                                                     make_zero_array<index_t, nDim>(),
-                                                    thread_tensor_desc,
+                                                    thread_buffer_desc,
                                                     p_buffer + buffer_offset,
                                                     make_zero_array<index_t, nDim>(),
                                                     thread_sub_tensor_lengths,
-                                                    SrcAccessOrder{},
-                                                    Number<SrcDataPerRead>{});
+                                                    SrcDimAccessOrder{},
+                                                    Number<SrcDataPerAccess>{});
+#else
+            ThreadwiseGenericTensorSliceCopy_v1<SrcDesc,
+                                                decltype(thread_buffer_desc),
+                                                SubLengths,
+                                                SrcDimAccessOrder,
+                                                typename arithmetic_sequence_gen<0, nDim, 1>::type,
+                                                SrcVectorAccessDim,
+                                                0,
+                                                SrcDataPerAccess,
+                                                1>(make_zero_array<index_t, nDim>(),
+                                                   make_zero_array<index_t, nDim>())
+                .Run(p_src + src_offset + mThreadSrcOffset, p_buffer + buffer_offset);
+#endif
         });
     }
...
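Note: the load path visits every repeat_id in repeat_lengths and runs one threadwise copy per repeat. A hedged host-side analogue of that ford / static_ford style traversal (the repo's utilities generate these loops at compile time on the device; this sketch only shows the visiting order and is not the repo's code):

#include <array>
#include <cstddef>
#include <iostream>

template <std::size_t N, class F>
void ford_like(const std::array<std::size_t, N>& lengths, F&& f)
{
    std::array<std::size_t, N> id{};
    for(;;)
    {
        f(id); // hand each multi-index to the callback, last dimension fastest
        std::size_t d = N;
        while(d > 0 && ++id[d - 1] == lengths[d - 1]) { id[d - 1] = 0; --d; }
        if(d == 0) break;
    }
}

int main()
{
    // e.g. repeat_lengths = {2, 1, 2}: four repeats per thread
    ford_like<3>({2, 1, 2}, [](const std::array<std::size_t, 3>& repeat_id) {
        std::cout << "repeat (" << repeat_id[0] << ", " << repeat_id[1] << ", "
                  << repeat_id[2] << ")\n";
    });
}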
@@ -265,48 +281,60 @@ struct BlockwiseGenericTensorSliceCopy_v1
         constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ThreadClusterLengths{});

-        constexpr auto thread_tensor_desc =
-            make_ConstantTensorDescriptor_packed(thread_sub_tensor_lengths * repeat_lengths);
+        constexpr auto thread_buffer_desc = GetRegisterBufferDescriptor();

 #if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
-        static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id) {
-            constexpr auto buffer_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;
+        static_ford<decltype(repeat_lengths)>{}([&](auto repeat_id) {
+            constexpr auto buffer_data_id_begin = repeat_id * thread_sub_tensor_lengths;

-            constexpr auto dst_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims;
+            constexpr auto dst_data_id_begin = repeat_id * data_per_cluster_per_dims;

             constexpr index_t buffer_offset =
-                thread_tensor_desc.GetOffsetFromMultiIndex(buffer_data_multi_id_begin);
+                thread_buffer_desc.GetOffsetFromMultiIndex(buffer_data_id_begin);

-            constexpr index_t dst_offset = DstDesc::GetOffsetFromMultiIndex(dst_data_multi_id_begin);
+            constexpr index_t dst_offset = DstDesc::GetOffsetFromMultiIndex(dst_data_id_begin);
 #else
-        ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id) {
-            const auto buffer_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;
+        ford<decltype(repeat_lengths)>{}([&](auto repeat_id) {
+            const auto buffer_data_id_begin = repeat_id * thread_sub_tensor_lengths;

-            const auto dst_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims;
+            const auto dst_data_id_begin = repeat_id * data_per_cluster_per_dims;

             const index_t buffer_offset =
-                thread_tensor_desc.GetOffsetFromMultiIndex(buffer_data_multi_id_begin);
+                thread_buffer_desc.GetOffsetFromMultiIndex(buffer_data_id_begin);

-            const index_t dst_offset = DstDesc::GetOffsetFromMultiIndex(dst_data_multi_id_begin);
+            const index_t dst_offset = DstDesc::GetOffsetFromMultiIndex(dst_data_id_begin);
 #endif

             // By position the origin of the per-thread window at the point, where multi-index
             // of the SrcDesc (might be a merged tensor) is all-zero. This threadwise slice copy
             // is assuming each thread is copy a noraml (not merged) tensor.
             // User need to guarantee this is true.
             // By setting SubLengths = 1 at the merged dimension, this is always true;
             // If in the future, you want to enable SubLengths > 1 at the merged dimension,
             // special care in implementation is needed
-            threadwise_generic_tensor_slice_copy_v1(thread_tensor_desc,
+#if 0 // debug
+            threadwise_generic_tensor_slice_copy_v1(thread_buffer_desc,
                                                     p_buffer + buffer_offset,
                                                     make_zero_array<index_t, nDim>(),
                                                     DstDesc{},
                                                     p_dst + dst_offset + mThreadDstOffset,
                                                     make_zero_array<index_t, nDim>(),
                                                     thread_sub_tensor_lengths,
-                                                    DstAccessOrder{},
-                                                    Number<DstDataPerWrite>{});
+                                                    DstDimAccessOrder{},
+                                                    Number<DstDataPerAccess>{});
+#else
+            ThreadwiseGenericTensorSliceCopy_v1<decltype(thread_buffer_desc),
+                                                DstDesc,
+                                                SubLengths,
+                                                typename arithmetic_sequence_gen<0, nDim, 1>::type,
+                                                DstDimAccessOrder,
+                                                0,
+                                                DstVectorAccessDim,
+                                                1,
+                                                DstDataPerAccess>(make_zero_array<index_t, nDim>(),
+                                                                  make_zero_array<index_t, nDim>())
+                .Run(p_buffer + buffer_offset, p_dst + dst_offset + mThreadDstOffset);
+#endif
         });
     }
...
@@ -346,26 +374,25 @@ struct BlockwiseGenericTensorSliceCopy_v1
                 SrcDesc::GetOriginalTensorDescriptor().Extract(src_partial_original_dims);

             // calculate new partial original multi-id
-            auto old_src_partial_original_multi_id =
+            auto old_src_partial_original_id =
                 extract_array(mThreadSrcOriginalMultiId, src_partial_original_dims);

-            auto new_src_partial_original_multi_id =
+            auto new_src_partial_original_id =
                 src_partial_original_desc.UpdateMultiIndexGivenStepSizeOf1dIndex(
-                    old_src_partial_original_multi_id, StepSize, direction);
+                    old_src_partial_original_id, StepSize, direction);

             // update "mThreadSrcOriginalMultiId"
             static_for<0, decltype(src_partial_original_dims)::GetSize(), 1>{}([&](auto I) {
                 constexpr auto IDimOriginal = src_partial_original_dims[I];

-                mThreadSrcOriginalMultiId(IDimOriginal) = new_src_partial_original_multi_id[I];
+                mThreadSrcOriginalMultiId(IDimOriginal) = new_src_partial_original_id[I];
             });

             // calculate new partial offset on this merged dimension
             const index_t old_src_partial_offset = mThreadSrcPartialOffsets[IDim];

             const index_t new_src_partial_offset =
-                src_partial_original_desc.GetOffsetFromMultiIndex(new_src_partial_original_multi_id);
+                src_partial_original_desc.GetOffsetFromMultiIndex(new_src_partial_original_id);

             // update "mThreadSrcPartialOffsets"
             mThreadSrcPartialOffsets(IDim) = new_src_partial_offset;
...
@@ -434,19 +461,19 @@ struct BlockwiseGenericTensorSliceCopy_v2
         static_assert(BlockSize == thread_cluster_desc.GetElementSize(),
                       "wrong! BlockSize not consistent with ThreadClusterLengths");

-        const auto thread_cluster_multi_id =
+        const auto thread_cluster_id =
             thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id());

-        const auto data_cluster_multi_id =
-            reorder_array_given_old2new(thread_cluster_multi_id, ThreadClusterArrangeOrder{});
+        const auto data_cluster_id =
+            reorder_array_given_old2new(thread_cluster_id, ThreadClusterArrangeOrder{});

-        const auto thread_data_multi_id_begin = data_cluster_multi_id * SubLengths{};
+        const auto thread_data_id_begin = data_cluster_id * SubLengths{};

-        mThreadwiseLoad.SetSrcSliceOrigin(src_block_slice_origin + thread_data_multi_id_begin);
+        mThreadwiseLoad.SetSrcSliceOrigin(src_block_slice_origin + thread_data_id_begin);
         mThreadwiseLoad.SetDstSliceOrigin(make_zero_array<index_t, nDim>());

         mThreadwiseStore.SetSrcSliceOrigin(make_zero_array<index_t, nDim>());
-        mThreadwiseStore.SetDstSliceOrigin(dst_block_slice_origin + thread_data_multi_id_begin);
+        mThreadwiseStore.SetDstSliceOrigin(dst_block_slice_origin + thread_data_id_begin);
     }

     __device__ static constexpr index_t GetRegisterBufferSize()
...
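Note: BlockwiseGenericTensorSliceCopy_v2 composes a per-thread load (global memory to registers) and a per-thread store (registers to LDS) whose slice origins are offset by the thread's position in the cluster. A hedged sketch of that composition with hypothetical types (not the repo's API):

#include <array>
#include <cstddef>

template <std::size_t NDim>
struct ThreadwiseCopySketch
{
    std::array<std::size_t, NDim> src_origin{};
    std::array<std::size_t, NDim> dst_origin{};

    void SetSrcSliceOrigin(std::array<std::size_t, NDim> o) { src_origin = o; }
    void SetDstSliceOrigin(std::array<std::size_t, NDim> o) { dst_origin = o; }
};

template <std::size_t NDim>
struct BlockwiseCopySketch
{
    ThreadwiseCopySketch<NDim> load;  // global memory -> thread registers
    ThreadwiseCopySketch<NDim> store; // thread registers -> LDS

    BlockwiseCopySketch(std::array<std::size_t, NDim> src_block_origin,
                        std::array<std::size_t, NDim> dst_block_origin,
                        std::array<std::size_t, NDim> thread_data_begin)
    {
        std::array<std::size_t, NDim> src{}, dst{};
        for(std::size_t d = 0; d < NDim; ++d)
        {
            src[d] = src_block_origin[d] + thread_data_begin[d];
            dst[d] = dst_block_origin[d] + thread_data_begin[d];
        }
        load.SetSrcSliceOrigin(src);
        load.SetDstSliceOrigin({}); // the register buffer is indexed from zero
        store.SetSrcSliceOrigin({});
        store.SetDstSliceOrigin(dst);
    }
};

int main()
{
    BlockwiseCopySketch<2> copy({0, 128}, {0, 0}, {3, 4});
    (void)copy;
}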
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp

...
@@ -106,7 +106,7 @@ __device__ void threadwise_generic_tensor_slice_copy_v1(
 #endif
 }

-#if 0
+#if 1
 template <class SrcDesc,
           class DstDesc,
           class SliceLengths,
...
@@ -118,7 +118,7 @@ template <class SrcDesc,
           index_t DstDataPerAccess>
 struct ThreadwiseGenericTensorSliceCopy_v1
 {
-    static constexpr index_t nDim = SliceLengths::GetNumOfDimension();
+    static constexpr index_t nDim = SliceLengths::GetSize();

     __device__ constexpr ThreadwiseGenericTensorSliceCopy_v1(Array<index_t, nDim> src_slice_origin,
                                                              Array<index_t, nDim> dst_slice_origin)
...
@@ -130,39 +130,43 @@ struct ThreadwiseGenericTensorSliceCopy_v1
...
@@ -130,39 +130,43 @@ struct ThreadwiseGenericTensorSliceCopy_v1
nDim
==
DstDimAccessOrder
::
GetSize
(),
nDim
==
DstDimAccessOrder
::
GetSize
(),
"wrong! # of dimensions not the same"
);
"wrong! # of dimensions not the same"
);
static_assert(is_valid_sequence_map<SrcDimAccessOrder>::
{}
&&
static_assert
(
is_valid_sequence_map
<
SrcDimAccessOrder
>::
value
&&
is_valid_sequence_map<DstDimAccessOrder>::
{}
,
is_valid_sequence_map
<
DstDimAccessOrder
>::
value
,
"wrong! map is not valid"
);
"wrong! map is not valid"
);
static_assert(SliceLengths{}[SrcVectorDim] % SrcDataPerAccess == 0 &&
static_assert
(
SliceLengths
{}[
SrcVector
Access
Dim
]
%
SrcDataPerAccess
==
0
&&
SliceLengths{DstVectorDim
}
% DstDataPerAccess == 0,
SliceLengths
{
}[
DstVector
Access
Dim
]
%
DstDataPerAccess
==
0
,
"wrong! cannot evenly divide"
);
"wrong! cannot evenly divide"
);
// check vectorized memory access
// check vectorized memory access
constexpr auto src_vector_access_dim = Number<SrcVectorAccessDIm>{};
constexpr
auto
src_vector_access_dim
=
Number
<
SrcVectorAccessDim
>
{};
constexpr auto dst_vector_access_dim = Number<DstVectorAccessDIm>{};
constexpr
auto
dst_vector_access_dim
=
Number
<
DstVectorAccessDim
>
{};
static_if<!SrcDesc::ContainMultipleOriginalDimensions(
static_if
<!
SrcDesc
::
ContainMultipleOriginalDimensions
(
src_vector_access_dim
)
>
{}(
src_vector_access_dim)>{}([&](auto fwd) {
[
&
](
auto
fwd
)
{
static_assert(
static_assert
(
(fwd(SrcDesc{}).GetStrides()[SrcVectorAccessDim] == 1 || SrcDataPerAccess == 1),
(
fwd
(
SrcDesc
{}).
GetStrides
()[
SrcVectorAccessDim
]
==
1
||
SrcDataPerAccess
==
1
),
"wrong! vectorized access is allowed only if stride == 1");
"wrong! vectorized access is allowed only if stride == 1"
);
}).Else{}([&](auto fwd) {
})
static_assert((SrcDesc::GetLastOriginalDimensionStride(src_vector_access_dim) == 1 ||
.
Else
([
&
](
auto
fwd
)
{
SrcDataPerAccess == 1),
static_assert
(
"wrong! vectorized access is allowed only if stride == 1");
(
fwd
(
SrcDesc
{}).
GetLastOriginalDimensionStride
(
src_vector_access_dim
)
==
1
||
});
SrcDataPerAccess
==
1
),
"wrong! vectorized access is allowed only if stride == 1"
);
});
static_if<!DstDesc::ContainMultipleOriginalDimensions(
static_if
<!
DstDesc
::
ContainMultipleOriginalDimensions
(
dst_vector_access_dim
)
>
{}(
dst_vector_access_dim)>{}([&](auto fwd) {
[
&
](
auto
fwd
)
{
static_assert(
static_assert
(
(fwd(DstDesc{}).GetStrides()[DstVectorAccessDim] == 1 || DstDataPerAccess == 1),
(
fwd
(
DstDesc
{}).
GetStrides
()[
DstVectorAccessDim
]
==
1
||
DstDataPerAccess
==
1
),
"wrong! vectorized access is allowed only if stride == 1");
"wrong! vectorized access is allowed only if stride == 1"
);
}).Else{}([&](auto fwd) {
})
static_assert((DstDesc::GetLastOriginalDimensionStride(dst_vector_access_dim) == 1 ||
.
Else
([
&
](
auto
fwd
)
{
DstDataPerAccess == 1),
static_assert
(
"wrong! vectorized access is allowed only if stride == 1");
(
fwd
(
DstDesc
{}).
GetLastOriginalDimensionStride
(
dst_vector_access_dim
)
==
1
||
});
DstDataPerAccess
==
1
),
"wrong! vectorized access is allowed only if stride == 1"
);
});
}
}
__device__
constexpr
ThreadwiseGenericTensorSliceCopy_v1
()
__device__
constexpr
ThreadwiseGenericTensorSliceCopy_v1
()
...
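Note: the ck static_if<...>{}(...).Else(...) idiom above selects which static_assert is instantiated at compile time, depending on whether the vector-access dimension is a plain or a merged dimension. A hedged modern-C++ analogue using if constexpr (C++17; not the repo's utility, which targets device code that predates it):

#include <cstddef>

// which stride is consulted depends on whether the dimension is merged
template <bool IsMergedDim,
          std::size_t DimStride,
          std::size_t LastOriginalDimStride,
          std::size_t DataPerAccess>
constexpr void check_vectorized_access()
{
    if constexpr(!IsMergedDim)
    {
        static_assert(DimStride == 1 || DataPerAccess == 1,
                      "vectorized access is allowed only if stride == 1");
    }
    else
    {
        static_assert(LastOriginalDimStride == 1 || DataPerAccess == 1,
                      "vectorized access is allowed only if stride == 1");
    }
}

int main()
{
    check_vectorized_access<false, 1, 0, 4>();  // plain contiguous dim, 4-wide access: ok
    check_vectorized_access<true, 16, 1, 4>();  // merged dim whose last original dim is unit-stride: ok
}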
@@ -186,23 +190,87 @@ struct ThreadwiseGenericTensorSliceCopy_v1
     {
         constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{});

-        TData p_buffer[buffer_desc.GetElementSpace()];
+        TData p_buffer_[buffer_desc.GetElementSpace()];
+        TData* p_buffer = p_buffer_;

         // copy data from src into buffer
-        constexpr auto src_vector_access_dim = Number<SrcVectorAccessDIm>{};
-
-        constexpr auto src_access_lengths = SliceLengths::Modify(
-            src_vector_access_dim, SliceLengths::Get(src_vector_access_dim) / SrcDataPerAccess);
-
-        constexpr auto src_access_lengths_in_src_access_order =
-            src_access_lengths.ReorderGivenNew2Old(SrcDimAccessOrder{});
-
-        static_ford<decltype(src_access_lengths_in_src_access_order)>{}([&](auto src_access_id) {});
+        {
+            using vector_t = typename vector_type<TData, SrcDataPerAccess>::MemoryType;
+
+            constexpr auto src_vector_access_dim = Number<SrcVectorAccessDim>{};
+            constexpr auto src_data_per_access   = Number<SrcDataPerAccess>{};
+
+            constexpr auto src_access_lengths = SliceLengths::Modify(
+                src_vector_access_dim,
+                SliceLengths::Get(src_vector_access_dim) / src_data_per_access);
+
+            static_ford<decltype(src_access_lengths), SrcDimAccessOrder>{}([&](auto src_access_id) {
+                constexpr auto src_data_id = src_access_id.Modify(
+                    src_vector_access_dim,
+                    src_access_id[src_vector_access_dim] * src_data_per_access);
+
+                const index_t src_offset =
+                    SrcDesc::GetOffsetFromMultiIndex(mSrcSliceOrigin + src_data_id);
+
+                // load vector from src
+                const vector_t vector_data =
+                    *reinterpret_cast<const vector_t*>(&p_src[src_offset]);
+
+                // unpack vector into buffer
+                static_for<0, SrcDataPerAccess, 1>{}([&](auto i) {
+                    constexpr auto scalar_id =
+                        typename uniform_sequence_gen<nDim, 0>::type{}.Modify(src_vector_access_dim, i);
+
+                    constexpr index_t buffer_offset =
+                        buffer_desc.GetOffsetFromMultiIndex(src_data_id + scalar_id);
+
+                    p_buffer[buffer_offset] = reinterpret_cast<const TData*>(&vector_data)[i];
+                });
+            });
+        }
+
+        // copy data from buffer to dst
+        {
+            using vector_t = typename vector_type<TData, DstDataPerAccess>::MemoryType;
+
+            constexpr auto dst_vector_access_dim = Number<DstVectorAccessDim>{};
+            constexpr auto dst_data_per_access   = Number<DstDataPerAccess>{};
+
+            constexpr auto dst_access_lengths = SliceLengths::Modify(
+                dst_vector_access_dim,
+                SliceLengths::Get(dst_vector_access_dim) / dst_data_per_access);
+
+            static_ford<decltype(dst_access_lengths), DstDimAccessOrder>{}([&](auto dst_access_id) {
+                constexpr auto dst_data_id = dst_access_id.Modify(
+                    dst_vector_access_dim,
+                    dst_access_id[dst_vector_access_dim] * dst_data_per_access);
+
+                vector_t vector_data;
+
+                // pack vector from buffer
+                static_for<0, DstDataPerAccess, 1>{}([&](auto i) {
+                    constexpr auto scalar_id =
+                        typename uniform_sequence_gen<nDim, 0>::type{}.Modify(dst_vector_access_dim, i);
+
+                    constexpr index_t buffer_offset =
+                        buffer_desc.GetOffsetFromMultiIndex(dst_data_id + scalar_id);
+
+                    reinterpret_cast<TData*>(&vector_data)[i] = p_buffer[buffer_offset];
+                });
+
+                const index_t dst_offset =
+                    DstDesc::GetOffsetFromMultiIndex(mDstSliceOrigin + dst_data_id);
+
+                // store vector into dst
+                *reinterpret_cast<vector_t*>(&p_dst[dst_offset]) = vector_data;
+            });
+        }
     }

 private:
-    Array<index_t, TData> mSrcSliceOrigin;
-    Array<index_t, TData> mDstSliceOrigin;
+    Array<index_t, nDim> mSrcSliceOrigin;
+    Array<index_t, nDim> mDstSliceOrigin;
 };
 #endif
...
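Note: the reworked Run() stages data through a per-thread scalar buffer so that the source can be read with SrcDataPerAccess-wide vector loads and the destination written with DstDataPerAccess-wide vector stores, each along its own vector-access dimension. A hedged, 1-D host-side sketch of that strategy (plain C++, not the device code; the staging buffer stands in for the register buffer):

#include <cstring>
#include <iostream>
#include <vector>

template <int SrcDataPerAccess, int DstDataPerAccess>
void buffered_copy_1d(const float* p_src, float* p_dst, int length)
{
    std::vector<float> buffer(length); // stand-in for the per-thread register buffer

    // "load vector from src" / "unpack vector into buffer"
    for(int i = 0; i < length; i += SrcDataPerAccess)
    {
        float chunk[SrcDataPerAccess];
        std::memcpy(chunk, p_src + i, sizeof(chunk)); // models one vectorized load
        for(int j = 0; j < SrcDataPerAccess; ++j)
            buffer[i + j] = chunk[j];
    }

    // "pack vector from buffer" / "store vector into dst"
    for(int i = 0; i < length; i += DstDataPerAccess)
    {
        float chunk[DstDataPerAccess];
        for(int j = 0; j < DstDataPerAccess; ++j)
            chunk[j] = buffer[i + j];
        std::memcpy(p_dst + i, chunk, sizeof(chunk)); // models one vectorized store
    }
}

int main()
{
    std::vector<float> src(8), dst(8, 0.f);
    for(int i = 0; i < 8; ++i) src[i] = float(i);

    buffered_copy_1d<4, 2>(src.data(), dst.data(), 8); // e.g. 4-wide reads, 2-wide writes

    for(float v : dst) std::cout << v << ' ';
    std::cout << '\n';
}

The slice length along each vector-access dimension must be divisible by the corresponding data-per-access, which is exactly what the "cannot evenly divide" static_assert in the constructor enforces.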
composable_kernel/include/utility/functional2.hpp

...
@@ -23,14 +23,16 @@ struct static_for_impl<Sequence<Is...>>
 template <index_t NBegin, index_t NEnd, index_t Increment>
 struct static_for
 {
-    template <class F>
-    __host__ __device__ constexpr void operator()(F f) const
+    __host__ __device__ constexpr static_for()
     {
         static_assert(NBegin <= NEnd, "wrongs! should have NBegin <= NEnd");
         static_assert((NEnd - NBegin) % Increment == 0,
                       "Wrong! should satisfy (NEnd - NBegin) % Increment == 0");
+    }

+    template <class F>
+    __host__ __device__ constexpr void operator()(F f) const
+    {
         static_for_impl<typename arithmetic_sequence_gen<NBegin, NEnd, Increment>::type>{}(f);
     }
 };
...
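Note: the change gives static_for a constructor that carries the bounds checks, so they fire as soon as a static_for object is built, while operator() only dispatches to static_for_impl. A hedged standard-C++17 analogue of the utility (not the repo's implementation), shown as a free function:

#include <cstddef>
#include <iostream>
#include <type_traits>
#include <utility>

namespace detail {
template <std::size_t NBegin, std::size_t Increment, class F, std::size_t... Is>
void static_for_impl_sketch(F&& f, std::index_sequence<Is...>)
{
    // fold over the generated indices; each call sees its index as a compile-time constant
    (f(std::integral_constant<std::size_t, NBegin + Is * Increment>{}), ...);
}
} // namespace detail

template <std::size_t NBegin, std::size_t NEnd, std::size_t Increment, class F>
void static_for_sketch(F&& f)
{
    static_assert(NBegin <= NEnd, "should have NBegin <= NEnd");
    static_assert((NEnd - NBegin) % Increment == 0,
                  "should satisfy (NEnd - NBegin) % Increment == 0");

    detail::static_for_impl_sketch<NBegin, Increment>(
        std::forward<F>(f), std::make_index_sequence<(NEnd - NBegin) / Increment>{});
}

int main()
{
    static_for_sketch<0, 8, 2>([](auto i) { std::cout << i() << ' '; }); // prints: 0 2 4 6
    std::cout << '\n';
}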
driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp

...
@@ -94,6 +94,41 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
     constexpr index_t WeiBlockCopySrcDataPerRead_E  = 4;
     constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
+#elif 1
+    // each thread hold 64 data
+    constexpr index_t BlockSize = 256;
+
+    constexpr index_t BPerBlock = 16;
+    constexpr index_t KPerBlock = 128;
+    constexpr index_t EPerBlock = 8;
+
+    constexpr index_t GemmMPerThreadSubC = 4;
+    constexpr index_t GemmNPerThreadSubC = 4;
+    constexpr index_t GemmMLevel0Cluster = 4;
+    constexpr index_t GemmNLevel0Cluster = 4;
+    constexpr index_t GemmMLevel1Cluster = 4;
+    constexpr index_t GemmNLevel1Cluster = 4;
+    constexpr index_t GemmKPerThreadLoop = 1;
+    constexpr index_t GemmDataPerReadA   = 4;
+    constexpr index_t GemmDataPerReadB   = 4;
+
+    using InBlockCopySubLengths_E_N1_B_N2      = Sequence<1, 1, 1, 4>;
+    using InBlockCopyClusterLengths_E_N1_B_N2  = Sequence<8, 2, 16, 1>;
+    using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1, 3, 2>; // [E, N1, N2, B]
+    using InBlockCopySrcAccessOrder            = Sequence<0, 1, 3, 2>; // [E, N1, N2, B]
+    using InBlockCopyDstAccessOrder            = Sequence<0, 1, 2, 3>; // [E, N1, B, N2]
+
+    constexpr index_t InBlockCopySrcDataPerRead_B   = 1;
+    constexpr index_t InBlockCopyDstDataPerWrite_N2 = 4;
+
+    using WeiBlockCopySubLengths_E_K            = Sequence<2, 2>;
+    using WeiBlockCopyClusterLengths_E_K        = Sequence<4, 64>;
+    using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E]
+    using WeiBlockCopySrcAccessOrder            = Sequence<1, 0>; // [K, E]
+    using WeiBlockCopyDstAccessOrder            = Sequence<0, 1>; // [E, K]
+
+    constexpr index_t WeiBlockCopySrcDataPerRead_E  = 2;
+    constexpr index_t WeiBlockCopyDstDataPerWrite_K = 2;
 #elif 0
     // each thread hold 32 data
     constexpr index_t BlockSize = 256;
...
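Note: the tuning block added above has to be internally consistent. A hedged compile-time check of the relationships it implies, derived directly from the sequences shown (assumed convention: cluster lengths multiply to BlockSize, and sub-lengths times cluster lengths tile the per-block copy slice):

#include <cstddef>

constexpr std::size_t BlockSize = 256;

// From the #elif 1 block above:
//   InBlockCopySubLengths_E_N1_B_N2     = Sequence<1, 1, 1, 4>
//   InBlockCopyClusterLengths_E_N1_B_N2 = Sequence<8, 2, 16, 1>
constexpr std::size_t sub[4]     = {1, 1, 1, 4};
constexpr std::size_t cluster[4] = {8, 2, 16, 1};

static_assert(cluster[0] * cluster[1] * cluster[2] * cluster[3] == BlockSize,
              "the thread cluster holds exactly BlockSize threads");

// the per-block slice each copy covers is sub * cluster in every dimension,
// matching EPerBlock = 8 and BPerBlock = 16 from the same block
static_assert(sub[0] * cluster[0] == 8 && sub[1] * cluster[1] == 2 &&
                  sub[2] * cluster[2] == 16 && sub[3] * cluster[3] == 4,
              "the block-wide E x N1 x B x N2 slice is 8 x 2 x 16 x 4");

int main() {}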
driver/src/driver.cpp

...
@@ -9,14 +9,14 @@
 #include "conv_common.hpp"
 #include "host_conv.hpp"
 #include "device_convolution_direct_v2_nchw_kcyx_nkhw.hpp"
-#include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp"
-#include "device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp"
-#include "device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"
-#include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
+// #include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp"
+// #include "device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp"
+// #include "device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"
+// #include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
 #include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
-#include "device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw.hpp"
-#include "device_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw.hpp"
-#include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
+// #include "device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw.hpp"
+// #include "device_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw.hpp"
+// #include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"

 struct GeneratorTensor_1
 {
...